mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

Compare commits

master...article_scraper-v2.1.1

No commits in common. "master" and "article_scraper-v2.1.1" have entirely different histories.

11 changed files with 97 additions and 154 deletions

.gitlab-ci.yml

@@ -1,16 +1,15 @@
 stages:
   - build
 
 run-build:
   stage: build
-  image: rust:1.86
+  image: rust:1.79
   before_script:
     - rustup component add rustfmt
     - rustup component add clippy
-    - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
   script:
     - rustc --version && cargo --version
-    - echo $LIBXML2
     - cargo fmt -- --check
     - cargo clippy --all-targets --all-features -- -D warnings
     - cargo build --release

Cargo.toml

@@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"]
 resolver = "2"
 
 [workspace.package]
-version = "2.1.3"
+version = "2.1.1"
 authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
 edition = "2021"
 license = "GPL-3.0-or-later"

article_scraper/Cargo.toml

@@ -14,17 +14,17 @@ exclude = ["resources/tests"]
 [dependencies]
 thiserror = "2.0"
 libxml = "0.3"
-reqwest = { version = "0.12", features = ["stream"] }
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
 tokio = { version = "1", features = ["macros", "fs", "io-util"] }
 url = "2.5"
-regex = "1.11"
+regex = "1.10"
 encoding_rs = "0.8"
 chrono = "0.4"
 base64 = "0.22"
 image = "0.25"
 log = "0.4"
-rust-embed = "8.7"
-once_cell = "1.21"
+rust-embed="8.4"
+once_cell = "1.19"
 escaper = "0.1"
 futures = "0.3"
 unic-emoji-char = "0.9"
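Note: the reqwest feature trim is the functional change in this hunk. A minimal sketch of what the extra features on the v2.1.1 side provide; only bytes_stream() needs the "stream" feature that master keeps:

// Sketch: "gzip"/"brotli" enable transparent response decompression,
// "json" adds Response::json(), and "stream" exposes bytes_stream()
// for incremental downloads.
use futures::StreamExt;

async fn fetch(url: &str) -> Result<(), reqwest::Error> {
    let client = reqwest::Client::builder()
        .gzip(true)   // requires the "gzip" feature
        .brotli(true) // requires the "brotli" feature
        .build()?;

    let mut stream = client.get(url).send().await?.bytes_stream(); // "stream"
    while let Some(chunk) = stream.next().await {
        println!("read {} bytes", chunk?.len());
    }
    Ok(())
}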

@@ -1 +1 @@
-Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532
+Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024

File diff suppressed because one or more lines are too long


@@ -6,10 +6,10 @@ use thiserror::Error;
 
 #[derive(Error, Debug)]
 pub enum ScraperError {
-    #[error("Configerror {0}")]
+    #[error("")]
     Config(#[from] ConfigError),
-    #[error("ImageDownloadError {0}")]
+    #[error("")]
     Image(#[from] ImageDownloadError),
-    #[error("FullTextParserError {0}")]
+    #[error("")]
     Scrap(#[from] FullTextParserError),
 }
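Note: master's side restores non-empty Display messages for the wrapped errors. A minimal sketch of the difference, using a hypothetical stand-in for the crate's ConfigError:

use thiserror::Error;

// Hypothetical stand-in for the crate's ConfigError.
#[derive(Error, Debug)]
#[error("config file is malformed")]
struct ConfigError;

#[derive(Error, Debug)]
enum ScraperError {
    // master: "Configerror {0}" interpolates the wrapped error's message;
    // v2.1.1's #[error("")] renders an empty string instead.
    #[error("Configerror {0}")]
    Config(#[from] ConfigError),
}

fn main() {
    let err = ScraperError::from(ConfigError);
    assert_eq!(err.to_string(), "Configerror config file is malformed");
}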


@@ -158,7 +158,7 @@ fn extract_date(
 fn get_meta(context: &Context, name: &str) -> Option<String> {
     Util::get_attribute(
         context,
-        &format!("//meta[contains(@name, '{name}')]"),
+        &format!("//meta[contains(@name, '{}')]", name),
         "content",
     )
     .ok()
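Note: this and several hunks below are the same mechanical change: master switches format! calls to inline format arguments (stable since Rust 1.58). Both spellings produce identical strings:

fn main() {
    let name = "author";
    assert_eq!(
        format!("//meta[contains(@name, '{name}')]"),
        format!("//meta[contains(@name, '{}')]", name),
    );
}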


@@ -69,11 +69,6 @@ impl FullTextParser {
         let html = Self::get_body(response).await?;
 
-        if html.is_empty() {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
         // check for fingerprints
         let config = if config.is_none() {
             if let Some(url) = Fingerprints::detect(&html) {
@@ -269,17 +264,10 @@ impl FullTextParser {
     }
 
         // parse html
-        let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
+        Self::parse_html_string_patched(html.as_str()).map_err(|err| {
             log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
             FullTextParserError::Xml
-        })?;
-
-        if document.get_root_element().is_none() {
-            log::error!("document without root");
-            Err(FullTextParserError::Xml)
-        } else {
-            Ok(document)
-        }
+        })
     }
 
     /// FIXME: Here are some patched functions of libxml crate.
@@ -287,7 +275,7 @@ impl FullTextParser {
     /// See:
     /// - <https://github.com/KWARC/rust-libxml/issues/111>
     /// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
     /// These two functions should be removed when the issue is fixed in libxml crate.
     fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
         if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
             // Cannot safely use our value comparison, but the conversion if always safe.
@@ -366,73 +354,63 @@ impl FullTextParser {
             .send()
             .await
             .map_err(|err| {
-                log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
+                log::error!(
+                    "Downloading HTML failed: GET '{}' - '{}'",
+                    url.as_str(),
+                    err
+                );
                 FullTextParserError::Http
             })?;
         Ok(response)
     }
 
     async fn get_body(response: Response) -> Result<String, FullTextParserError> {
-        let status = response.status();
-        if !status.is_success() {
-            log::error!("status code: {status}");
-            return Err(FullTextParserError::Http);
-        }
-
-        let headers = response.headers().clone();
-
-        if headers
-            .get(reqwest::header::CONTENT_LENGTH)
-            .and_then(|hv| hv.to_str().ok())
-            .and_then(|str| str.parse::<i64>().ok())
-            .map(|content_length| content_length == 0)
-            .unwrap_or(false)
-        {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
-        let bytes = response
-            .bytes()
-            .await
-            .map_err(|_| FullTextParserError::Http)?;
-
-        match from_utf8(&bytes) {
-            Ok(utf8_str) => {
-                log::debug!("Valid utf-8 string");
-                Ok(utf8_str.into())
-            }
-            Err(error) => {
-                log::debug!("Invalid utf-8 string");
-                let lossy_string = std::string::String::from_utf8_lossy(&bytes);
-
-                if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
-                    log::debug!("Encoding extracted from HTML: '{encoding}'");
-                    if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
-                        let decoded_html = decoded_html.replacen(
-                            &format!("charset=\"{encoding}\""),
-                            "charset=\"utf-8\"",
-                            1,
-                        );
-                        return Ok(decoded_html);
-                    }
-                }
-
-                if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
-                    log::debug!("Encoding extracted from headers: '{encoding}'");
-                    if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
-                        let decoded_html = decoded_html.replacen(
-                            &format!("charset=\"{encoding}\""),
-                            "charset=\"utf-8\"",
-                            1,
-                        );
-                        return Ok(decoded_html);
-                    }
-                }
-
-                Err(FullTextParserError::Utf8(error))
-            }
-        }
+        if response.status().is_success() {
+            let headers = response.headers().clone();
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|_| FullTextParserError::Http)?;
+
+            match from_utf8(&bytes) {
+                Ok(utf8_str) => {
+                    log::debug!("Valid utf-8 string");
+                    return Ok(utf8_str.into());
+                }
+                Err(error) => {
+                    log::debug!("Invalid utf-8 string");
+                    let lossy_string = std::string::String::from_utf8_lossy(&bytes);
+
+                    if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
+                        log::debug!("Encoding extracted from HTML: '{}'", encoding);
+                        if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
+                            let decoded_html = decoded_html.replacen(
+                                &format!("charset=\"{encoding}\""),
+                                "charset=\"utf-8\"",
+                                1,
+                            );
+                            return Ok(decoded_html);
+                        }
+                    }
+
+                    if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
+                        log::debug!("Encoding extracted from headers: '{}'", encoding);
+                        if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
+                            let decoded_html = decoded_html.replacen(
+                                &format!("charset=\"{encoding}\""),
+                                "charset=\"utf-8\"",
+                                1,
+                            );
+                            return Ok(decoded_html);
+                        }
+                    }
+
+                    return Err(FullTextParserError::Utf8(error));
+                }
+            }
+        }
+
+        Err(FullTextParserError::Http)
     }
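Note: besides flattening the nested if into early returns, master adds an up-front guard against responses that declare an empty body. A sketch of just that guard, using the same reqwest types as the hunk above:

// Returns true when the response headers explicitly announce a zero-length
// body, which master treats as an HTTP error before reading any bytes.
fn declares_empty_body(headers: &reqwest::header::HeaderMap) -> bool {
    headers
        .get(reqwest::header::CONTENT_LENGTH)
        .and_then(|hv| hv.to_str().ok())
        .and_then(|s| s.parse::<i64>().ok())
        .map(|len| len == 0)
        .unwrap_or(false)
}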
@@ -444,12 +422,7 @@ pub async fn download(
         let headers = Util::generate_headers(config, global_config)?;
         let response = Self::get_response(url, client, headers).await?;
         let body = Self::get_body(response).await?;
-        if body.is_empty() {
-            log::error!("Empty response body");
-            Err(FullTextParserError::Http)
-        } else {
-            Ok(body)
-        }
+        Ok(body)
     }
 
     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
@@ -722,7 +695,7 @@ impl FullTextParser {
     }
 
     fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
-        let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
+        let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.is_null() {
@@ -761,7 +734,7 @@ impl FullTextParser {
     ) -> Result<(), FullTextParserError> {
         let xpath_tag = tag.unwrap_or("*");
-        let xpath = &format!("//{xpath_tag}[@{attribute}]");
+        let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Err(err) = node.remove_property(attribute) {
@@ -791,16 +764,17 @@ impl FullTextParser {
         if let Some(url) = node.get_attribute(attribute) {
             let trimmed_url = url.trim();
 
-            if url.starts_with('#') || url.starts_with("\\#") {
-                continue;
-            }
+            let is_hash_url = url.starts_with('#');
             let is_relative_url = url::Url::parse(&url)
                 .err()
                 .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
                 .unwrap_or(false);
             let is_javascript = trimmed_url.contains("javascript:");
 
+            if !is_hash_url && node.get_name().to_uppercase() == "A" {
+                _ = node.set_attribute("target", "_blank");
+            }
+
             if let Some(srcset) = node.get_attribute("srcset") {
                 let res = constants::SRC_SET_URL
                     .captures_iter(&srcset)
@@ -831,7 +805,9 @@ impl FullTextParser {
                 _ = node.set_attribute("srcset", res.as_str());
             }
 
-            if is_relative_url {
+            if is_hash_url {
+                _ = node.set_attribute(attribute, trimmed_url);
+            } else if is_relative_url {
                 let completed_url = match article_url.join(trimmed_url) {
                     Ok(joined_url) => joined_url,
                     Err(_) => continue,
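Note: the behavioral change in these two hunks: v2.1.1 rewrites fragment-only links and adds target="_blank" to anchors, while master skips pure fragment URLs (# and escaped \# forms) entirely. The relative-URL test both sides share relies on url::Url::parse failing with RelativeUrlWithoutBase; a minimal sketch:

// Fragment-only and path-only hrefs both fail absolute parsing with
// RelativeUrlWithoutBase, which is the signal for joining them onto the
// article's base URL.
fn is_relative(url: &str) -> bool {
    url::Url::parse(url)
        .err()
        .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
        .unwrap_or(false)
}

fn main() {
    assert!(is_relative("/images/figure-1.png"));
    assert!(is_relative("#section-2"));
    assert!(!is_relative("https://example.com/figure-1.png"));
}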
@@ -944,7 +920,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
     }
@@ -952,7 +928,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &global_config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
@@ -1267,7 +1243,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    pub(crate) fn remove_single_cell_tables(root: &mut Node) {
+    fn remove_single_cell_tables(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(node) = node_iter {
@@ -1305,7 +1281,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
+    fn remove_extra_p_and_div(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1327,7 +1303,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn remove_share_elements(root: &mut Node) {
+    fn remove_share_elements(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1347,7 +1323,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
+    fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1436,7 +1412,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    pub(crate) fn remove_empty_nodes(root: &mut Node) {
+    fn remove_empty_nodes(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {


@@ -219,9 +219,7 @@ impl ImageDownloader {
         let mut image_urls = Vec::new();
         for node in node_vec {
-            if let Ok(url) = Self::harvest_image_urls_from_node(node) {
-                image_urls.push(url);
-            }
+            image_urls.push(Self::harvest_image_urls_from_node(node)?);
         }
 
         Ok(image_urls)
@@ -276,7 +274,7 @@ impl ImageDownloader {
         }
 
         let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
-        let image_string = format!("data:{content_type};base64,{image_base64}");
+        let image_string = format!("data:{};base64,{}", content_type, image_base64);
         let image_data_base64 = ImageDataBase64 {
             url: image.url,
             data: image_string,
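Note: aside from the inline format args, the data-URI assembly is unchanged on both sides. A self-contained sketch with illustrative bytes:

use base64::Engine;

fn main() {
    // Illustrative values; the real code uses the downloaded image's
    // content type and bytes.
    let content_type = "image/png";
    let data = [0x89u8, 0x50, 0x4e, 0x47];

    let image_base64 = base64::engine::general_purpose::STANDARD.encode(data);
    let image_string = format!("data:{content_type};base64,{image_base64}");
    assert_eq!(image_string, "data:image/png;base64,iVBORw==");
}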


@@ -264,15 +264,17 @@ impl Util {
         context: &Context,
         id_or_class: &str,
     ) -> Result<(), FullTextParserError> {
-        let xpath =
-            &format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");
+        let xpath = &format!(
+            "//*[contains(@class, '{}') or contains(@id, '{}')]",
+            id_or_class, id_or_class
+        );
 
         let mut ancestor = xpath.clone();
         if ancestor.starts_with("//") {
             ancestor = ancestor.chars().skip(2).collect();
         }
 
-        let query = &format!("{xpath}[not(ancestor::{ancestor})]");
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
         let node_vec = Util::evaluate_xpath(context, query, false)?;
         for mut node in node_vec {
             if node.is_null() {
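Note: both sides build the same two-step XPath; only the format! style differs. A sketch showing the query this produces, so the ancestor:: exclusion is visible:

fn main() {
    let id_or_class = "share";
    let xpath = format!(
        "//*[contains(@class, '{}') or contains(@id, '{}')]",
        id_or_class, id_or_class
    );
    // Drop the leading "//" so the expression can be reused as an ancestor test.
    let ancestor: String = xpath.chars().skip(2).collect();
    // Only strip the outermost matching nodes: anything whose ancestor
    // already matches is skipped.
    let query = format!("{}[not(ancestor::{})]", xpath, ancestor);
    assert_eq!(
        query,
        "//*[contains(@class, 'share') or contains(@id, 'share')]\
         [not(ancestor::*[contains(@class, 'share') or contains(@id, 'share')])]"
    );
}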

article_scraper_cli/Cargo.toml

@@ -9,22 +9,11 @@ repository.workspace = true
 
 [dependencies]
 article_scraper = { path = "../article_scraper/" }
-clap = { version = "4.5", features = ["derive"] }
+clap = { version = "4.5", features = [ "derive" ] }
 simplelog = "0.12"
 log = "0.4"
 url = "2.5"
-reqwest = { version = "0.12", features = [
-    "json",
-    "native-tls",
-    "gzip",
-    "brotli",
-    "stream",
-] }
-tokio = { version = "1", features = [
-    "macros",
-    "fs",
-    "io-util",
-    "rt-multi-thread",
-] }
-indicatif = "0.18"
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
+tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
+indicatif = "0.17"