bump version

clean html fragment: don't remove same page links & footnotes
2025-07-08 08:30:00 +02:00 · 2025-07-07 18:56:16 +02:00 · 2025-07-07 18:03:45 +02:00
9 changed files with 69 additions and 43 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,17 +1,16 @@
 stages:
  - build
-  

 run-build:
  stage: build
  image: rust:1.86
  before_script:
-  - rustup component add rustfmt
-  - rustup component add clippy
-  - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
+    - rustup component add rustfmt
+    - rustup component add clippy
+    - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
  script:
-  - rustc --version && cargo --version
-  - echo $LIBXML2
-  - cargo fmt -- --check
-  - cargo clippy --all-targets --all-features -- -D warnings
-  - cargo build --release
+    - rustc --version && cargo --version
+    - echo $LIBXML2
+    - cargo fmt -- --check
+    - cargo clippy --all-targets --all-features -- -D warnings
+    - cargo build --release
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,8 +3,8 @@ members = ["article_scraper", "article_scraper_cli"]
 resolver = "2"

 [workspace.package]
-version = "2.1.2"
+version = "2.1.3"
 authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
 edition = "2021"
 license = "GPL-3.0-or-later"
-repository = "https://gitlab.com/news-flash/article_scraper"
+repository = "https://gitlab.com/news-flash/article_scraper"
--- a/article_scraper/Cargo.toml
+++ b/article_scraper/Cargo.toml
@@ -23,8 +23,8 @@ chrono = "0.4"
 base64 = "0.22"
 image = "0.25"
 log = "0.4"
-rust-embed="8.6"
-once_cell = "1.20"
+rust-embed = "8.7"
+once_cell = "1.21"
 escaper = "0.1"
 futures = "0.3"
 unic-emoji-char = "0.9"
--- a/article_scraper/src/clean.rs
+++ b/article_scraper/src/clean.rs
--- a/article_scraper/src/full_text_parser/metadata.rs
+++ b/article_scraper/src/full_text_parser/metadata.rs
@@ -158,7 +158,7 @@ fn extract_date(
 fn get_meta(context: &Context, name: &str) -> Option<String> {
    Util::get_attribute(
        context,
-        &format!("//meta[contains(@name, '{}')]", name),
+        &format!("//meta[contains(@name, '{name}')]"),
        "content",
    )
    .ok()
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -722,7 +722,7 @@ impl FullTextParser {
    }

    fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
-        let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
+        let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
        for mut node in node_vec {
            if node.is_null() {
@@ -761,7 +761,7 @@ impl FullTextParser {
    ) -> Result<(), FullTextParserError> {
        let xpath_tag = tag.unwrap_or("*");

-        let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
+        let xpath = &format!("//{xpath_tag}[@{attribute}]");
        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
        for mut node in node_vec {
            if let Err(err) = node.remove_property(attribute) {
@@ -791,17 +791,16 @@ impl FullTextParser {
            if let Some(url) = node.get_attribute(attribute) {
                let trimmed_url = url.trim();

-                let is_hash_url = url.starts_with('#');
+                if url.starts_with('#') || url.starts_with("\\#") {
+                    continue;
+                }
+
                let is_relative_url = url::Url::parse(&url)
                    .err()
                    .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
                    .unwrap_or(false);
                let is_javascript = trimmed_url.contains("javascript:");

-                if !is_hash_url && node.get_name().to_uppercase() == "A" {
-                    _ = node.set_attribute("target", "_blank");
-                }
-
                if let Some(srcset) = node.get_attribute("srcset") {
                    let res = constants::SRC_SET_URL
                        .captures_iter(&srcset)
@@ -832,9 +831,7 @@ impl FullTextParser {
                    _ = node.set_attribute("srcset", res.as_str());
                }

-                if is_hash_url {
-                    _ = node.set_attribute(attribute, trimmed_url);
-                } else if is_relative_url {
+                if is_relative_url {
                    let completed_url = match article_url.join(trimmed_url) {
                        Ok(joined_url) => joined_url,
                        Err(_) => continue,
@@ -947,7 +944,7 @@ impl FullTextParser {
            for xpath_strip_img_src in &config.strip_image_src {
                _ = Util::strip_node(
                    context,
-                    &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+                    &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
                );
            }
        }
@@ -955,7 +952,7 @@ impl FullTextParser {
        for xpath_strip_img_src in &global_config.strip_image_src {
            _ = Util::strip_node(
                context,
-                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
            );
        }

@@ -1270,7 +1267,7 @@ impl FullTextParser {
        Ok(())
    }

-    fn remove_single_cell_tables(root: &mut Node) {
+    pub(crate) fn remove_single_cell_tables(root: &mut Node) {
        let mut node_iter = Some(root.clone());

        while let Some(node) = node_iter {
@@ -1308,7 +1305,7 @@ impl FullTextParser {
        }
    }

-    fn remove_extra_p_and_div(root: &mut Node) {
+    pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
        let mut node_iter = Some(root.clone());

        while let Some(mut node) = node_iter {
@@ -1330,7 +1327,7 @@ impl FullTextParser {
        }
    }

-    fn remove_share_elements(root: &mut Node) {
+    pub(crate) fn remove_share_elements(root: &mut Node) {
        let mut node_iter = Some(root.clone());

        while let Some(mut node) = node_iter {
@@ -1350,7 +1347,7 @@ impl FullTextParser {
        }
    }

-    fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
+    pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
        let mut node_iter = Some(root.clone());

        while let Some(mut node) = node_iter {
@@ -1439,7 +1436,7 @@ impl FullTextParser {
        Ok(())
    }

-    fn remove_empty_nodes(root: &mut Node) {
+    pub(crate) fn remove_empty_nodes(root: &mut Node) {
        let mut node_iter = Some(root.clone());

        while let Some(mut node) = node_iter {
--- a/article_scraper/src/images/mod.rs
+++ b/article_scraper/src/images/mod.rs
@@ -276,7 +276,7 @@ impl ImageDownloader {
        }

        let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
-        let image_string = format!("data:{};base64,{}", content_type, image_base64);
+        let image_string = format!("data:{content_type};base64,{image_base64}");
        let image_data_base64 = ImageDataBase64 {
            url: image.url,
            data: image_string,
--- a/article_scraper/src/util.rs
+++ b/article_scraper/src/util.rs
@@ -264,17 +264,15 @@ impl Util {
        context: &Context,
        id_or_class: &str,
    ) -> Result<(), FullTextParserError> {
-        let xpath = &format!(
-            "//*[contains(@class, '{}') or contains(@id, '{}')]",
-            id_or_class, id_or_class
-        );
+        let xpath =
+            &format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");

        let mut ancestor = xpath.clone();
        if ancestor.starts_with("//") {
            ancestor = ancestor.chars().skip(2).collect();
        }

-        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
+        let query = &format!("{xpath}[not(ancestor::{ancestor})]");
        let node_vec = Util::evaluate_xpath(context, query, false)?;
        for mut node in node_vec {
            if node.is_null() {
--- a/article_scraper_cli/Cargo.toml
+++ b/article_scraper_cli/Cargo.toml
@@ -9,11 +9,22 @@ repository.workspace = true


 [dependencies]
-article_scraper = { path  = "../article_scraper/" }
-clap =  { version = "4.5", features = [ "derive" ] }
+article_scraper = { path = "../article_scraper/" }
+clap = { version = "4.5", features = ["derive"] }
 simplelog = "0.12"
 log = "0.4"
 url = "2.5"
-reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
-tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
-indicatif = "0.17"
+reqwest = { version = "0.12", features = [
+    "json",
+    "native-tls",
+    "gzip",
+    "brotli",
+    "stream",
+] }
+tokio = { version = "1", features = [
+    "macros",
+    "fs",
+    "io-util",
+    "rt-multi-thread",
+] }
+indicatif = "0.18"
Author	SHA1	Message	Date
Jan Lukas Gernert	4c9709e292	bump version	2025-07-07 18:56:16 +02:00
Jan Lukas Gernert	a23a691c31	clean html fragment: don't remove same page links & footnotes	2025-07-07 18:03:45 +02:00