somewhat complete readability algorithm

2025-07-07 16:15:32 +02:00 · 2023-02-17 14:16:01 +01:00 · 2023-02-17 14:16:01 +01:00 · 71a8816747
commit 71a8816747
parent 979358fd35
5 changed files with 620 additions and 92 deletions
--- a/src/article.rs
+++ b/src/article.rs
@ -42,7 +42,7 @@ impl Article {
                };
                file_name.push_str(".html");
                let path = path.join(file_name);
-                let mut html_file = File::create(&path)?;
+                let mut html_file = File::create(path)?;
                html_file.write_all(html.as_bytes())?;
                return Ok(());
            }
--- a/src/full_text_parser/error.rs
+++ b/src/full_text_parser/error.rs
@ -18,6 +18,8 @@ pub enum FullTextParserError {
    ContentType,
    #[error("Invalid UTF8 Text")]
    Utf8(#[from] std::str::Utf8Error),
+    #[error("Readability Error")]
+    Readability,
    #[error("Unknown Error")]
    Unknown,
 }
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@ -182,7 +182,7 @@ impl FullTextParser {
        let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;

        if found_body {
-            if let Err(error) = Readability::extract_body_readability(&document, root) {
+            if let Err(error) = Readability::extract_body_readability(document, root) {
                log::error!("Both ftr and readability failed to find content: {}", error);
                return Err(error);
            }
--- a/src/full_text_parser/readability/constants.rs
+++ b/src/full_text_parser/readability/constants.rs
@ -1,6 +1,11 @@
+use std::collections::HashSet;
+
 use once_cell::sync::Lazy;
 use regex::Regex;

+/// Minimum length (as reported by `str::len`) the extracted article text must reach
+/// before an extraction attempt is accepted without retrying with relaxed flags.
+pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
+/// Matches a period that is followed by a space or by the end of the text.
+///
+/// The `regex` crate takes the bare pattern: JavaScript-style `/.../` delimiters would be
+/// matched as literal slashes, so they are deliberately not part of the pattern.
+pub static SIBLING_CONTENT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"\.( |$)"#).expect("SIBLING_CONTENT regex"));
 pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
 });
@ -17,7 +22,17 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
 pub static HAS_CONTENT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
 pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
+/// Class/id fragments that suggest an element holds article content.
+///
+/// Matched case-insensitively through the inline `(?i)` flag. The `regex` crate has no
+/// `/.../i` literal syntax, so surrounding slashes would be matched as literal characters.
+pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        r#"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"#,
+    )
+    .expect("POSITIVE regex")
+});
+/// Class/id fragments (`-ad-`, `hidden`, `hid`) that count against an element.
+///
+/// Written as a bare pattern with the inline `(?i)` flag so it is matched the same way as
+/// `POSITIVE`; a leading `/` would be treated as a literal slash by the `regex` crate.
+// NOTE(review): the alternation looks shorter than the upstream Readability list — confirm
+// whether further terms are intended here.
+pub static NEGATIVE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"(?i)-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex")
+});

+/// Name of the node attribute in which the readability content score is stored.
+pub const SCORE_ATTR: &str = "content_score";
+/// Number of candidate ancestor lists that must exist, and must contain a common ancestor,
+/// before that ancestor replaces the current top candidate.
+pub const MINIMUM_TOPCANDIDATES: usize = 3;
 pub const UNLIKELY_ROLES: &[&str] = &[
    "menu",
    "menubar",
@ -30,6 +45,22 @@ pub const UNLIKELY_ROLES: &[&str] = &[

 pub const DEFAULT_TAGS_TO_SCORE: &[&str] =
    &["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"];
+/// Upper-case tag names that count as block-level children when deciding whether an
+/// element contains block content (see `has_child_block_element`).
+pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    const BLOCK_TAGS: [&str; 9] = [
+        "BLOCKQUOTE",
+        "DL",
+        "DIV",
+        "IMG",
+        "OL",
+        "P",
+        "PRE",
+        "TABLE",
+        "UL",
+    ];
+
+    BLOCK_TAGS.iter().copied().collect()
+});
+
+/// Upper-case tag names that are kept as-is when a sibling is appended to the article
+/// content; a sibling whose name is not in this set is renamed to `DIV` first.
+pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    ["DIV", "ARTICLE", "SECTION", "P"]
+        .iter()
+        .copied()
+        .collect()
+});

 pub const PHRASING_ELEMS: &[&str] = &[
    // "CANVAS", "IFRAME", "SVG", "VIDEO",
--- a/src/full_text_parser/readability/mod.rs
+++ b/src/full_text_parser/readability/mod.rs
@ -1,6 +1,8 @@
 mod constants;
 mod state;

+use std::cmp::Ordering;
+
 use libxml::tree::{Document, Node, NodeType};

 use self::state::State;
@ -10,10 +12,17 @@ pub struct Readability;

 impl Readability {
    pub fn extract_body_readability(
-        document: &Document,
-        _root: &mut Node,
+        document: Document,
+        root: &mut Node,
    ) -> Result<bool, FullTextParserError> {
        let mut state = State::default();
+        let mut document = document;
+        let mut attempts: Vec<(Node, usize)> = Vec::new();
+        let document_cache = document
+            .dup()
+            .map_err(|()| FullTextParserError::Readability)?;
+
+        loop {
            let mut elements_to_score = Vec::new();
            let mut node: Option<Node> = document.clone().get_root_element();

@ -93,11 +102,18 @@ impl Readability {
                            if let Some(p) = p.as_mut() {
                                let _ = p.add_child(&mut child_node);
                            } else if !Self::is_whitespace(&child_node) {
-                            let mut new_node = Node::new("p", None, document).unwrap();
+                                let mut new_node = Node::new("p", None, &document)
+                                    .map_err(|()| FullTextParserError::Readability)?;
                                node_ref
                                    .replace_child_node(new_node.clone(), child_node.clone())
-                                .unwrap();
-                            new_node.add_child(&mut child_node).unwrap();
+                                    .map_err(|error| {
+                                        log::error!("{error}");
+                                        FullTextParserError::Readability
+                                    })?;
+                                new_node.add_child(&mut child_node).map_err(|error| {
+                                    log::error!("{error}");
+                                    FullTextParserError::Readability
+                                })?;
                                p.replace(new_node);
                            }
                        } else if let Some(p) = p.as_mut() {
@ -115,13 +131,415 @@ impl Readability {
                    // algorithm with DIVs with are, in practice, paragraphs.
                    if Self::has_single_tag_inside_element(node_ref, "P")
                        && Self::get_link_density(node_ref) < 0.25
-                {}
+                    {
+                        if let Some(new_node) = node_ref.get_child_nodes().first() {
+                            if let Some(mut parent) = node_ref.get_parent() {
+                                parent
+                                    .replace_child_node(new_node.clone(), node_ref.clone())
+                                    .map_err(|error| {
+                                        log::error!("{error}");
+                                        FullTextParserError::Readability
+                                    })?;
+                                node = Some(new_node.clone());
+                                elements_to_score.push(new_node.clone());
+                                continue;
+                            }
+                        }
+                    } else if !Self::has_child_block_element(node_ref) && node_ref.set_name("P").is_ok() {
+                        elements_to_score.push(node_ref.clone());
+                    }
                }

                node = Self::next_node(node_ref, false);
            }

-        unimplemented!()
+            let mut candidates = Vec::new();
+            // Loop through all paragraphs, and assign a score to them based on how content-y they look.
+            // Then add their score to their parent node.
+            // A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+            for element_to_score in elements_to_score {
+                if element_to_score.get_parent().is_none() {
+                    continue;
+                }
+
+                let inner_text = Self::get_inner_text(&element_to_score, true);
+
+                // If this paragraph is less than 25 characters, don't even count it.
+                if inner_text.len() < 25 {
+                    continue;
+                }
+
+                // Exclude nodes with no ancestor.
+                let ancestors = Self::get_node_ancestors(&element_to_score, 5);
+                if ancestors.is_empty() {
+                    continue;
+                }
+
+                let mut content_score = 0.0;
+
+                // Add a point for the paragraph itself as a base.
+                content_score += 1.0;
+
+                // Add points for any commas within this paragraph.
+                content_score += inner_text.split(',').count() as f64;
+
+                // For every 100 characters in this paragraph, add another point. Up to 3 points.
+                content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);
+
+                // Initialize and score ancestors.
+                for (level, mut ancestor) in ancestors.into_iter().enumerate() {
+                    if ancestor.get_parent().is_none() {
+                        continue;
+                    }
+
+                    if Self::get_content_score(&ancestor).is_none() {
+                        Self::initialize_node(&mut ancestor, &state);
+                        candidates.push(ancestor.clone());
+                    }
+
+                    // Node score divider:
+                    // - parent:             1 (no division)
+                    // - grandparent:        2
+                    // - great grandparent+: ancestor level * 3
+                    let score_divider = if level == 0 {
+                        1.0
+                    } else if level == 1 {
+                        2.0
+                    } else {
+                        level as f64 * 3.0
+                    };
+
+                    if let Some(mut score) = Self::get_content_score(&ancestor) {
+                        score += content_score / score_divider;
+                        Self::set_content_score(&mut ancestor, score);
+                    }
+                }
+            }
+
+            // After we've calculated scores, loop through all of the possible
+            // candidate nodes we found and find the one with the highest score.
+            for candidate in candidates.iter_mut() {
+                // Scale the final candidates score based on link density. Good content
+                // should have a relatively small link density (5% or less) and be mostly
+                // unaffected by this operation.
+                if let Some(content_score) = Self::get_content_score(candidate) {
+                    let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
+                    Self::set_content_score(candidate, candidate_score);
+                }
+            }
+
+            candidates.sort_by(|a, b| {
+                if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
+                {
+                    a.partial_cmp(&b).unwrap_or(Ordering::Equal)
+                } else {
+                    Ordering::Equal
+                }
+            });
+
+            let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
+            let mut needed_to_create_top_candidate = false;
+            let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
+                // If we still have no top candidate, just use the body as a last resort.
+                // We also have to copy the body node so it is something we can modify.
+                Self::initialize_node(root, &state);
+                needed_to_create_top_candidate = true;
+                root.clone()
+            });
+            #[allow(unused_assignments)]
+            let mut parent_of_top_candidate = None;
+
+            let mut alternative_candidate_ancestors = Vec::new();
+            // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
+            // and whose scores are quite close to that of the current `topCandidate` node.
+            for top_candidate in &top_candidates {
+                if let Some(score) = Self::get_content_score(top_candidate) {
+                    if score >= 0.75 {
+                        alternative_candidate_ancestors
+                            .push(Self::get_node_ancestors(top_candidate, 0));
+                    }
+                }
+            }
+
+            if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
+                parent_of_top_candidate = top_candidate.get_parent();
+
+                loop {
+                    if let Some(parent) = &parent_of_top_candidate {
+                        let mut lists_containing_this_ancestor = 0;
+                        let tmp = usize::min(
+                            alternative_candidate_ancestors.len(),
+                            constants::MINIMUM_TOPCANDIDATES,
+                        );
+                        for item in alternative_candidate_ancestors.iter().take(tmp) {
+                            let tmp = item.iter().any(|n| n == parent);
+                            lists_containing_this_ancestor += if tmp { 1 } else { 0 };
+                        }
+
+                        if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
+                            top_candidate = parent.clone();
+                            break;
+                        }
+                    }
+
+                    parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
+                }
+            }
+
+            if Self::get_content_score(&top_candidate).is_none() {
+                Self::initialize_node(&mut top_candidate, &state);
+            }
+
+            // Because of our bonus system, parents of candidates might have scores
+            // themselves. They get half of the node. There won't be nodes with higher
+            // scores than our topCandidate, but if we see the score going *up* in the first
+            // few steps up the tree, that's a decent sign that there might be more content
+            // lurking in other places that we want to unify in. The sibling stuff
+            // below does some of that - but only if we've looked high enough up the DOM
+            // tree.
+            parent_of_top_candidate = top_candidate.get_parent();
+            let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
+
+            // The scores shouldn't get too low.
+            let score_threshold = last_score / 3.0;
+
+            while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
+                if parent_of_top_candidate
+                    .as_ref()
+                    .map(|n| Self::get_content_score(n).is_none())
+                    .unwrap_or(false)
+                {
+                    parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
+                    continue;
+                }
+
+                let parent_score = parent_of_top_candidate
+                    .as_ref()
+                    .and_then(Self::get_content_score)
+                    .unwrap_or(0.0);
+                if parent_score < score_threshold {
+                    break;
+                }
+
+                if parent_score > last_score {
+                    // Alright! We found a better parent to use.
+                    if let Some(parent) = parent_of_top_candidate {
+                        top_candidate = parent;
+                    }
+                    break;
+                }
+
+                last_score = parent_of_top_candidate
+                    .as_ref()
+                    .and_then(Self::get_content_score)
+                    .unwrap_or(0.0);
+                parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
+            }
+
+            // If the top candidate is the only child, use parent instead. This will help sibling
+            // joining logic when adjacent content is actually located in parent's sibling node.
+            parent_of_top_candidate = top_candidate.get_parent();
+
+            while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
+                && parent_of_top_candidate
+                    .as_ref()
+                    .map(|n| n.get_child_elements().len() == 1)
+                    .unwrap_or(false)
+            {
+                top_candidate = parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
+                parent_of_top_candidate = top_candidate.get_parent();
+            }
+
+            if Self::get_content_score(&top_candidate).is_none() {
+                Self::initialize_node(&mut top_candidate, &state);
+            }
+
+            // Now that we have the top candidate, look through its siblings for content
+            // that might also be related. Things like preambles, content split by ads
+            // that we removed, etc.
+            let mut article_content =
+                Node::new("DIV", None, &document).map_err(|()| FullTextParserError::Readability)?;
+
+            let sibling_score_threshold = f64::max(
+                10.0,
+                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
+            );
+            // Keep potential top candidate's parent node to try to get text direction of it later.
+            parent_of_top_candidate = top_candidate.get_parent();
+            let siblings = parent_of_top_candidate
+                .as_ref()
+                .map(|n| n.get_child_nodes());
+
+            if let Some(siblings) = siblings {
+                for mut sibling in siblings {
+                    let mut append = false;
+
+                    let score = Self::get_content_score(&sibling);
+                    log::debug!("Looking at sibling node: {sibling:?} with score {score:?}");
+
+                    if top_candidate == sibling {
+                        append = true;
+                    } else {
+                        let mut content_bonus = 0.0;
+
+                        // Give a bonus if sibling nodes and top candidates have the exact same classname
+                        let sibling_classes = sibling.get_class_names();
+                        let tc_classes = top_candidate.get_class_names();
+
+                        if sibling_classes
+                            .iter()
+                            .all(|class| tc_classes.contains(class))
+                            && !tc_classes.is_empty()
+                        {
+                            content_bonus +=
+                                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
+                        }
+
+                        if Self::get_content_score(&sibling).unwrap_or(0.0) + content_bonus
+                            >= sibling_score_threshold
+                        {
+                            append = true;
+                        } else if sibling.get_name().to_uppercase() == "P" {
+                            let link_density = Self::get_link_density(&sibling);
+                            let node_content = Self::get_inner_text(&sibling, false);
+                            let node_length = node_content.len();
+
+                            if node_length > 80
+                                && (link_density < 0.25
+                                    || (node_length > 0
+                                        && link_density == 0.0
+                                        && constants::SIBLING_CONTENT.is_match(&node_content)))
+                            {
+                                append = true;
+                            }
+                        }
+                    }
+
+                    if append {
+                        log::debug!("Appending node: {sibling:?}");
+
+                        if !constants::ALTER_TO_DIV_EXCEPTIONS.contains(sibling.get_name().as_str())
+                        {
+                            // We have a node that isn't a common block level element, like a form or td tag.
+                            // Turn it into a div so it doesn't get filtered out later by accident.
+                            log::debug!("Altering sibling: {sibling:?} to div.");
+
+                            sibling.set_name("DIV").map_err(|error| {
+                                log::error!("{error}");
+                                FullTextParserError::Readability
+                            })?;
+                        }
+
+                        article_content.add_child(&mut sibling).map_err(|error| {
+                            log::error!("{error}");
+                            FullTextParserError::Readability
+                        })?;
+                    }
+                }
+            }
+
+            if needed_to_create_top_candidate {
+                // We already created a fake div thing, and there wouldn't have been any siblings left
+                // for the previous loop, so there's no point trying to create a new div, and then
+                // move all the children over. Just assign IDs and class names here. No need to append
+                // because that already happened anyway.
+                top_candidate
+                    .set_property("id", "readability-page-1")
+                    .map_err(|error| {
+                        log::error!("{error}");
+                        FullTextParserError::Readability
+                    })?;
+                top_candidate
+                    .set_property("class", "page")
+                    .map_err(|error| {
+                        log::error!("{error}");
+                        FullTextParserError::Readability
+                    })?;
+            } else {
+                let mut div = Node::new("DIV", None, &document)
+                    .map_err(|()| FullTextParserError::Readability)?;
+                div.set_property("id", "readability-page-1")
+                    .map_err(|error| {
+                        log::error!("{error}");
+                        FullTextParserError::Readability
+                    })?;
+                div.set_property("class", "page").map_err(|error| {
+                    log::error!("{error}");
+                    FullTextParserError::Readability
+                })?;
+
+                for mut child in article_content.get_child_nodes() {
+                    div.add_child(&mut child).map_err(|error| {
+                        log::error!("{error}");
+                        FullTextParserError::Readability
+                    })?;
+                }
+                article_content.add_child(&mut div).map_err(|error| {
+                    log::error!("{error}");
+                    FullTextParserError::Readability
+                })?;
+            }
+
+            let mut parse_successful = true;
+
+            // Now that we've gone through the full algorithm, check to see if
+            // we got any meaningful content. If we didn't, we may need to re-run
+            // grabArticle with different flags set. This gives us a higher likelihood of
+            // finding the content, and the sieve approach gives us a higher likelihood of
+            // finding the -right- content.
+            let text_length = Self::get_inner_text(&article_content, true).len();
+
+            if text_length < constants::DEFAULT_CHAR_THRESHOLD {
+                parse_successful = false;
+                document = document_cache
+                    .dup()
+                    .map_err(|()| FullTextParserError::Readability)?;
+
+                if state.strip_unlikely {
+                    state.strip_unlikely = false;
+                    attempts.push((article_content, text_length));
+                } else if state.weigh_classes {
+                    state.weigh_classes = false;
+                    attempts.push((article_content, text_length));
+                } else if state.clean_conditionally {
+                    state.clean_conditionally = false;
+                    attempts.push((article_content, text_length));
+                } else {
+                    attempts.push((article_content, text_length));
+                    // No luck after removing flags, just return the longest text we found during the different loops
+
+                    attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
+
+                    // But first check if we actually have something
+                    if let Some((best_attempt, _len)) = attempts.first() {
+                        article_content = best_attempt.clone();
+                        root.add_child(&mut article_content).map_err(|error| {
+                            log::error!("{error}");
+                            FullTextParserError::Readability
+                        })?;
+                        parse_successful = true;
+                    }
+
+                    return Ok(parse_successful);
+                }
+            } else {
+                root.add_child(&mut article_content).map_err(|error| {
+                    log::error!("{error}");
+                    FullTextParserError::Readability
+                })?;
+                return Ok(parse_successful);
+            }
+        }
+    }
+
+    /// Reads the content score stored on `node` under `constants::SCORE_ATTR`.
+    ///
+    /// Returns `None` when the attribute is absent or its value does not parse as an `f64`.
+    fn get_content_score(node: &Node) -> Option<f64> {
+        let raw_score = node.get_attribute(constants::SCORE_ATTR)?;
+        raw_score.parse::<f64>().ok()
+    }
+
+    /// Stores `score` on `node` as the string value of `constants::SCORE_ATTR`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the attribute cannot be set on the node.
+    fn set_content_score(node: &mut Node, score: f64) {
+        let serialized = score.to_string();
+        node.set_attribute(constants::SCORE_ATTR, &serialized)
+            .expect("Failed to set content score");
    }

    fn is_probably_visible(node: &Node) -> bool {
@ -390,4 +808,81 @@ impl Readability {

        link_length / text_length as f64
    }
+
+    /// Determines whether `node` has any block-level descendant element.
+    ///
+    /// Child elements are checked recursively. Tag names are upper-cased before the lookup
+    /// because `DIV_TO_P_ELEMS` only holds upper-case names; this mirrors how
+    /// `initialize_node` and `has_tag_name` compare tag names.
+    fn has_child_block_element(node: &Node) -> bool {
+        node.get_child_elements().iter().any(|child| {
+            constants::DIV_TO_P_ELEMS.contains(child.get_name().to_uppercase().as_str())
+                || Self::has_child_block_element(child)
+        })
+    }
+
+    /// Collects the ancestors of `node`, nearest ancestor first.
+    ///
+    /// `max_depth` limits how many ancestors are returned. A value of `0` means "no limit":
+    /// the walk continues until a node without a parent is reached. Callers pass `0` when
+    /// they need the complete ancestor chain, so it must not yield an empty list.
+    fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
+        let mut ancestors = Vec::new();
+        let mut current = node.get_parent();
+        let mut depth: u64 = 0;
+
+        while let Some(parent) = current {
+            ancestors.push(parent.clone());
+            depth += 1;
+
+            // Only enforce the cap when one was requested.
+            if max_depth != 0 && depth >= max_depth {
+                break;
+            }
+
+            current = parent.get_parent();
+        }
+
+        ancestors
+    }
+
+    /// Seeds `node` with its initial content score.
+    ///
+    /// The base score is chosen from the upper-cased tag name, the class/id weight from
+    /// `get_class_weight` is added on top, and the sum is stored with `set_content_score`.
+    fn initialize_node(node: &mut Node, state: &State) {
+        let score = match node.get_name().to_uppercase().as_str() {
+            "DIV" => 5,
+            "PRE" | "TD" | "BLOCKQUOTE" => 3,
+            "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
+            "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
+            _ => 0,
+        };
+        let score = score + Self::get_class_weight(node, state);
+        Self::set_content_score(node, score as f64);
+    }
+
+    /// Computes a score adjustment from the node's `class` and `id` properties.
+    ///
+    /// Returns `0` when `state.weigh_classes` is disabled. Otherwise each of the two
+    /// properties that is present contributes `-25` if it matches `constants::NEGATIVE`
+    /// and `+25` if it matches `constants::POSITIVE`; both may apply to the same value.
+    fn get_class_weight(node: &Node, state: &State) -> i64 {
+        if !state.weigh_classes {
+            return 0;
+        }
+
+        let mut weight = 0;
+
+        // The class name is inspected first, then the ID; both are scored the same way.
+        for attribute in ["class", "id"] {
+            if let Some(value) = node.get_property(attribute) {
+                if constants::NEGATIVE.is_match(&value) {
+                    weight -= 25;
+                }
+
+                if constants::POSITIVE.is_match(&value) {
+                    weight += 25;
+                }
+            }
+        }
+
+        weight
+    }
+
+    /// Reports whether `node` is present and its tag name equals `tag_name`, comparing
+    /// both names after upper-casing them. A missing node never matches.
+    fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
+        match node {
+            Some(element) => element.get_name().to_uppercase() == tag_name.to_uppercase(),
+            None => false,
+        }
+    }
 }