Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-10 01:15:31 +02:00)

Commit cce912c354 (parent 2c76a869e7): "first content extraction kinda working"
8 changed files with 363 additions and 58 deletions
@@ -34,6 +34,24 @@ pub struct ConfigEntry {
     pub next_page_link: Option<String>,
 }
 
+impl Default for ConfigEntry {
+    fn default() -> Self {
+        Self {
+            xpath_title: Vec::new(),
+            xpath_author: Vec::new(),
+            xpath_date: Vec::new(),
+            xpath_body: Vec::new(),
+            xpath_strip: Vec::new(),
+            strip_id_or_class: Vec::new(),
+            strip_image_src: Vec::new(),
+            replace: Vec::new(),
+            header: Vec::new(),
+            single_page_link: None,
+            next_page_link: None,
+        }
+    }
+}
+
 impl ConfigEntry {
     pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
         let mut file = fs::File::open(&config_path).await?;
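Note: every field here defaults to an empty `Vec` or `None`, which is exactly what `#[derive(Default)]` would generate, so the manual impl mainly makes the neutral configuration explicit. A minimal usage sketch (inside the crate, assuming the fields shown above):

    // An entry that matches no xpaths and strips nothing — a neutral
    // fallback when no site-specific config exists.
    let empty = ConfigEntry::default();
    assert!(empty.xpath_body.is_empty());
    assert!(empty.single_page_link.is_none());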
@@ -181,8 +181,8 @@ impl FullTextParser {
         Self::strip_junk(&xpath_ctx, config, global_config, url);
         let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
 
-        if found_body {
-            if let Err(error) = Readability::extract_body_readability(document, root) {
+        if !found_body {
+            if let Err(error) = Readability::extract_body(document, root) {
                 log::error!("Both ftr and readability failed to find content: {}", error);
                 return Err(error);
             }
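Note: besides the rename, the inverted condition is the actual fix — readability should run only when the config-driven (ftr) extraction found nothing, not when it succeeded. A reduced sketch of the intended two-stage fallback (function and type names hypothetical, for illustration only):

    // Site-specific rules first; generic readability heuristics as fallback.
    fn extract(doc: &Document) -> Result<Body, Error> {
        if let Some(body) = config_based_extract(doc) {
            return Ok(body);
        }
        readability_extract(doc)
    }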
@@ -1,9 +1,12 @@
 mod constants;
 mod state;
 
+#[cfg(test)]
+mod tests;
+
 use std::cmp::Ordering;
 
-use libxml::tree::{Document, Node, NodeType};
+use libxml::tree::{node, Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
@@ -11,13 +14,12 @@ use super::error::FullTextParserError;
 pub struct Readability;
 
 impl Readability {
-    pub fn extract_body_readability(
-        document: Document,
-        root: &mut Node,
-    ) -> Result<bool, FullTextParserError> {
+    pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
+        node::set_node_rc_guard(6);
+
         let mut state = State::default();
-        let mut attempts: Vec<(Node, usize)> = Vec::new();
+        let mut document = document;
+        let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
         let document_cache = document
             .dup()
             .map_err(|()| FullTextParserError::Readability)?;
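Note (my reading of this change): `node::set_node_rc_guard(6)` raises the libxml crate's Rc-reference guard so a node may still be mutated while several handles to it exist, and each stored attempt now carries its `Document`, since a `Node` handle does not keep its backing tree alive by itself. A hypothetical reduction of the ownership point:

    // Storing the document next to the node prevents the tree the node
    // points into from being dropped while the attempt is still held.
    struct Attempt {
        node: Node,    // points into `doc`
        text_len: usize,
        doc: Document, // keeps the backing tree alive
    }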
@@ -75,7 +77,7 @@ impl Readability {
             }
 
             // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
-            if tag_name == "DIV"
+            if (tag_name == "DIV"
                 || tag_name == "SECTION"
                 || tag_name == "HEADER"
                 || tag_name == "H1"
@@ -83,7 +85,8 @@
                 || tag_name == "H3"
                 || tag_name == "H4"
                 || tag_name == "H5"
-                || tag_name == "H6" && Self::is_element_without_content(node_ref)
+                || tag_name == "H6")
+                && Self::is_element_without_content(node_ref)
             {
                 node = Self::remove_and_next(node_ref);
                 continue;
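Note: this is an operator-precedence fix. In Rust, `&&` binds tighter than `||`, so without the parentheses the emptiness check applied only to the `H6` arm, and every empty-or-not DIV/SECTION/HEADER/H1–H5 matched unconditionally. A self-contained illustration:

    fn main() {
        let (a, b, c) = (true, false, false);
        assert_eq!(a || b && c, true);    // parsed as a || (b && c)
        assert_eq!((a || b) && c, false); // the intended grouping
    }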
@@ -159,7 +162,7 @@
         // Loop through all paragraphs, and assign a score to them based on how content-y they look.
         // Then add their score to their parent node.
         // A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
-        for element_to_score in elements_to_score {
+        for element_to_score in elements_to_score.drain(..) {
             if element_to_score.get_parent().is_none() {
                 continue;
             }
@@ -195,7 +198,7 @@
             }
 
             if Self::get_content_score(&ancestor).is_none() {
-                Self::initialize_node(&mut ancestor, &state);
+                Self::initialize_node(&mut ancestor, &state)?;
                 candidates.push(ancestor.clone());
             }
@@ -213,7 +216,7 @@
 
             if let Some(mut score) = Self::get_content_score(&ancestor) {
                 score += content_score / score_divider;
-                Self::set_content_score(&mut ancestor, score);
+                Self::set_content_score(&mut ancestor, score)?;
             }
         }
     }
@@ -226,7 +229,7 @@
         // unaffected by this operation.
         if let Some(content_score) = Self::get_content_score(candidate) {
             let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
-            Self::set_content_score(candidate, candidate_score);
+            Self::set_content_score(candidate, candidate_score)?;
         }
     }
@@ -244,11 +247,11 @@
         let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
             // If we still have no top candidate, just use the body as a last resort.
             // We also have to copy the body node so it is something we can modify.
-            Self::initialize_node(root, &state);
+            let mut rt = document.get_root_element().unwrap();
+            Self::initialize_node(&mut rt, &state).unwrap();
             needed_to_create_top_candidate = true;
-            root.clone()
+            rt
         });
+        #[allow(unused_assignments)]
         let mut parent_of_top_candidate = None;
 
         let mut alternative_candidate_ancestors = Vec::new();
@@ -257,8 +260,9 @@
         for top_candidate in &top_candidates {
             if let Some(score) = Self::get_content_score(top_candidate) {
                 if score >= 0.75 {
-                    alternative_candidate_ancestors
-                        .push(Self::get_node_ancestors(top_candidate, 0));
+                    if let Some(ancestor) = top_candidate.get_parent() {
+                        alternative_candidate_ancestors.push(ancestor);
+                    }
                 }
             }
         }
@@ -273,15 +277,16 @@
                     alternative_candidate_ancestors.len(),
                     constants::MINIMUM_TOPCANDIDATES,
                 );
-                for item in alternative_candidate_ancestors.iter().take(tmp) {
-                    let tmp = item.iter().any(|n| n == parent);
-                    lists_containing_this_ancestor += if tmp { 1 } else { 0 };
+                for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
+                    lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
                 }
 
                 if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
                     top_candidate = parent.clone();
                     break;
                 }
             } else {
                 break;
             }
 
             parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
@@ -289,7 +294,7 @@
         }
 
         if Self::get_content_score(&top_candidate).is_none() {
-            Self::initialize_node(&mut top_candidate, &state);
+            Self::initialize_node(&mut top_candidate, &state)?;
         }
 
         // Because of our bonus system, parents of candidates might have scores
@@ -353,7 +358,7 @@
         }
 
         if Self::get_content_score(&top_candidate).is_none() {
-            Self::initialize_node(&mut top_candidate, &state);
+            Self::initialize_node(&mut top_candidate, &state)?;
         }
 
         // Now that we have the top candidate, look through its siblings for content
@@ -432,6 +437,7 @@
                 })?;
             }
 
+            sibling.unlink();
             article_content.add_child(&mut sibling).map_err(|error| {
                 log::error!("{error}");
                 FullTextParserError::Readability
@@ -471,6 +477,7 @@
         })?;
 
         for mut child in article_content.get_child_nodes() {
+            child.unlink();
             div.add_child(&mut child).map_err(|error| {
                 log::error!("{error}");
                 FullTextParserError::Readability
@@ -489,33 +496,31 @@
         // grabArticle with different flags set. This gives us a higher likelihood of
         // finding the content, and the sieve approach gives us a higher likelihood of
        // finding the -right- content.
-        let text_length = Self::get_inner_text(&article_content, true).len();
+        let text = Self::get_inner_text(&article_content, true);
+        let text_length = text.len();
 
         if text_length < constants::DEFAULT_CHAR_THRESHOLD {
             parse_successful = false;
-            document = document_cache
-                .dup()
-                .map_err(|()| FullTextParserError::Readability)?;
 
             if state.strip_unlikely {
                 state.strip_unlikely = false;
-                attempts.push((article_content, text_length));
+                attempts.push((article_content, text_length, document));
             } else if state.weigh_classes {
                 state.weigh_classes = false;
-                attempts.push((article_content, text_length));
+                attempts.push((article_content, text_length, document));
             } else if state.clean_conditionally {
                 state.clean_conditionally = false;
-                attempts.push((article_content, text_length));
+                attempts.push((article_content, text_length, document));
             } else {
-                attempts.push((article_content, text_length));
+                attempts.push((article_content, text_length, document));
                 // No luck after removing flags, just return the longest text we found during the different loops
-                attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
+                attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
 
                 // But first check if we actually have something
-                if let Some((best_attempt, _len)) = attempts.first() {
-                    article_content = best_attempt.clone();
-                    root.add_child(&mut article_content).map_err(|error| {
+                if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
+                    best_attempt.unlink();
+                    root.add_child(&mut best_attempt).map_err(|error| {
                         log::error!("{error}");
                         FullTextParserError::Readability
                     })?;
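Note: the old code sorted the attempts ascending by text length and then took `.first()`, i.e. the *shortest* attempt; sorting ascending and taking `.pop()` yields the longest, which is what the comment asks for, and popping moves the node out so no clone is needed. The mechanics in isolation:

    fn main() {
        let mut attempts = vec![("a", 200), ("b", 950), ("c", 400)];
        attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
        assert_eq!(attempts.first(), Some(&("a", 200))); // old: shortest
        assert_eq!(attempts.pop(), Some(("b", 950)));    // new: longest
    }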
@@ -524,6 +529,10 @@
 
                     return Ok(parse_successful);
                 }
+
+                document = document_cache
+                    .dup()
+                    .map_err(|()| FullTextParserError::Readability)?;
             } else {
                 root.add_child(&mut article_content).map_err(|error| {
                     log::error!("{error}");
@@ -539,9 +548,12 @@
             .and_then(|a| a.parse::<f64>().ok())
     }
 
-    fn set_content_score(node: &mut Node, score: f64) {
+    fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
         node.set_attribute(constants::SCORE_ATTR, &score.to_string())
-            .expect("Failed to set content score");
+            .map_err(|err| {
+                log::error!("failed to set content score: {err}");
+                FullTextParserError::Readability
+            })
     }
 
     fn is_probably_visible(node: &Node) -> bool {
@@ -580,6 +592,8 @@
     }
 
     fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
+        let mut node = node.clone();
+
         // First check for kids if those aren't being ignored
         let first_child = node.get_first_child();
         if !ignore_self_and_kids && first_child.is_some() {
@@ -602,9 +616,16 @@
             }
 
             if let Some(parent) = parent {
+                let parent_name = parent.get_name().to_uppercase();
+                if parent_name == "HTML" {
+                    break;
+                }
+
                 let next_sibling = parent.get_next_sibling();
                 if next_sibling.is_some() {
                     return next_sibling;
+                } else {
+                    node = parent;
                 }
             }
         }
@@ -649,11 +670,11 @@
     // the same as the article title.
     fn header_duplicates_title(node: &Node) -> bool {
         let name = node.get_name().to_lowercase();
-        if name != "h1" || name != "h2" {
+        if name != "h1" && name != "h2" {
             return false;
         }
         let heading = Self::get_inner_text(node, false);
-        Self::text_similarity(&heading, "FIXME") > 0.75
+        Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
     }
 
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
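Note: `name != "h1" || name != "h2"` is true for every possible name (no string equals both), so the function unconditionally returned false; with `&&` it bails out only when the node is neither h1 nor h2. A self-contained check:

    fn main() {
        let name = "h2";
        assert!(name != "h1" || name != "h2");    // always true, for any name
        assert!(!(name != "h1" && name != "h2")); // false exactly for h1/h2
    }

The hardcoded comparison string is plainly still a stand-in for the real article title (in keeping with the commit message); it appears to be the heading of a readability test fixture.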
@@ -759,10 +780,10 @@
 
     fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
         for child in node.get_child_elements() {
-            if all_tags || child.get_name() == tag {
-                vec.push(child);
+            if all_tags || child.get_name().to_uppercase() == tag {
+                vec.push(child.clone());
             }
-            get_elems(node, tag, vec, all_tags);
+            get_elems(&child, tag, vec, all_tags);
         }
     }
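Note: two fixes here. Recursing on `node` instead of `child` called the function with unchanged arguments — unbounded recursion that would overflow the stack on any node with children — and the uppercased comparison matches how tag names are passed elsewhere in this module. The corrected shape is a plain depth-first walk; a reduced sketch over a hypothetical tree type:

    struct Tree { name: String, children: Vec<Tree> }

    fn collect<'a>(node: &'a Tree, tag: &str, out: &mut Vec<&'a Tree>) {
        for child in &node.children {
            if child.name.eq_ignore_ascii_case(tag) {
                out.push(child);
            }
            collect(child, tag, out); // recurse into the child, not `node`
        }
    }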
@@ -823,7 +844,7 @@
         let mut ancestors = Vec::new();
         let mut node = node.clone();
 
-        for _ in 0..max_depth {
+        for _ in 0..=max_depth {
             let parent = node.get_parent();
             match parent {
                 Some(parent) => {
@@ -839,7 +860,7 @@
 
     // Initialize a node with the readability object. Also checks the
     // className/id for special names to add to its score.
-    fn initialize_node(node: &mut Node, state: &State) {
+    fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
         let score = match node.get_name().to_uppercase().as_str() {
             "DIV" => 5,
             "PRE" | "TD" | "BLOCKQUITE" => 3,
@@ -848,7 +869,8 @@
             _ => 0,
         };
         let score = score + Self::get_class_weight(node, state);
-        Self::set_content_score(node, score as f64);
+        Self::set_content_score(node, score as f64)?;
+        Ok(())
     }
 
     fn get_class_weight(node: &Node, state: &State) -> i64 {
src/full_text_parser/readability/tests.rs (new file, 27 lines)
@@ -0,0 +1,27 @@
+use libxml::tree::{Document, Node};
+use reqwest::Url;
+
+use crate::full_text_parser::config::ConfigEntry;
+
+async fn prepare(html: &str, url: &Url) -> Document {
+    let empty_config = ConfigEntry::default();
+    let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
+    let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
+    crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
+    document
+}
+
+
+#[tokio::test]
+async fn test_1() {
+    let _ = env_logger::builder().is_test(true).try_init();
+
+    let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
+        .expect("Failed to read HTML");
+    let url = Url::parse("http://google.com").unwrap();
+    let document = prepare(&html, &url).await;
+
+    let mut root = Node::new("article", None, &document).unwrap();
+
+    super::Readability::extract_body(document, &mut root).unwrap();
+}
@@ -2,7 +2,7 @@ use super::FullTextParser;
 use reqwest::Client;
 use std::path::PathBuf;
 
-#[tokio::test(flavor = "current_thread")]
+#[tokio::test]
 async fn golem() {
     let out_path = PathBuf::from(r"./test_output");
     let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();

@@ -26,7 +26,7 @@ async fn golem() {
     assert_eq!(article.author, Some(String::from("Hauke Gierow")));
 }
 
-#[tokio::test(flavor = "current_thread")]
+#[tokio::test]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
     let url =

@@ -45,7 +45,7 @@ async fn phoronix() {
     );
 }
 
-#[tokio::test(flavor = "current_thread")]
+#[tokio::test]
 async fn youtube() {
     let out_path = PathBuf::from(r"./test_output");
     let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap();

@@ -64,7 +64,7 @@ async fn youtube() {
         .unwrap_or(false));
 }
 
-#[tokio::test(flavor = "current_thread")]
+#[tokio::test]
 async fn encoding_windows_1252() {
     let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
     let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new())
@@ -1,5 +1,6 @@
 pub use self::error::ImageDownloadError;
 use crate::util::Util;
+use base64::Engine;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
@@ -143,8 +144,8 @@ impl ImageDownloader {
             }
         }
 
-        let small_image_base64 = base64::encode(&small_image);
-        let big_image_base64 = big_image.map(base64::encode);
+        let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
+        let big_image_base64 = big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
         let small_image_string =
             format!("data:{};base64,{}", content_type_small, small_image_base64);
         let big_image_string = match big_image_base64 {
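Note: this is the standard migration to base64 0.21, which removed the free `base64::encode`/`decode` functions in favor of the `Engine` trait (hence the new `use base64::Engine;` import above). A self-contained example of the new API:

    use base64::{engine::general_purpose::STANDARD, Engine as _};

    fn main() {
        let encoded = STANDARD.encode(b"hello");
        assert_eq!(encoded, "aGVsbG8=");
        let decoded = STANDARD.decode(&encoded).unwrap();
        assert_eq!(decoded, b"hello");
    }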
@@ -290,13 +291,13 @@ mod tests {
     use std::fs;
     use std::io::Write;
 
-    #[tokio::test(flavor = "current_thread")]
+    #[tokio::test]
     async fn close_tags() {
         let image_dowloader = ImageDownloader::new((2048, 2048));
-        let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
+        let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
         let result = image_dowloader
-            .download_images_from_string(&hdyleaflet, &Client::new())
+            .download_images_from_string(&html, &Client::new())
             .await
             .expect("Failed to downalod images");
         let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")