mirror of https://gitlab.com/news-flash/article_scraper.git
commit 979358fd35 (parent 2750ad648d)
Jan Lukas Gernert, 2023-01-01 21:35:46 +01:00
6 changed files with 318 additions and 51 deletions

View file

@@ -1,9 +1,9 @@
+use super::config::ConfigEntry;
+use crate::{article::Article, util::Util};
 use chrono::{DateTime, Utc};
 use libxml::xpath::Context;
 use log::{debug, warn};
 use std::str::FromStr;
-use crate::{article::Article, util::Util};
-use super::config::ConfigEntry;
 
 pub fn extract(
     context: &Context,
@@ -11,19 +11,23 @@ pub fn extract(
     global_config: &ConfigEntry,
     article: &mut Article,
 ) {
     if article.title.is_none() {
-        article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) {
-            Ok(escaped_title) => escaped_title,
-            Err(_error) => title,
-        }));
+        article.title = extract_title(context, config, global_config).map(|title| {
+            match escaper::decode_html(&title) {
+                Ok(escaped_title) => escaped_title,
+                Err(_error) => title,
+            }
+        });
     }
 
     if article.author.is_none() {
-        article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) {
-            Ok(escaped_author) => escaped_author,
-            Err(_error) => author,
-        }));
+        article.author =
+            extract_author(context, config, global_config).map(
+                |author| match escaper::decode_html(&author) {
+                    Ok(escaped_author) => escaped_author,
+                    Err(_error) => author,
+                },
+            );
     }
 
     if article.date.is_none() {
@@ -34,7 +38,7 @@ pub fn extract(
 fn extract_title(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -67,7 +71,7 @@ fn extract_title(
 fn extract_author(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -96,7 +100,7 @@ fn extract_author(
 fn extract_date(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<DateTime<Utc>> {
     // check site specific config
     if let Some(config) = config {
@@ -128,5 +132,10 @@ fn extract_date(
 }
 
 fn get_meta(context: &Context, name: &str) -> Option<String> {
-    Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok()
+    Util::get_attribute(
+        context,
+        &format!("//meta[contains(@name, '{}')]", name),
+        "content",
+    )
+    .ok()
 }
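
Note: the title/author changes above swap `and_then(|x| Some(...))` for `map(...)`, which is the right combinator when the closure always produces a value (clippy flags the old form as `bind_instead_of_map`). A minimal standalone sketch of the equivalence; the `replace` call is just a stand-in for `escaper::decode_html`:

fn main() {
    let title: Option<String> = Some("Caf&eacute;".to_string());

    // Old shape: `and_then` wants an Option<T> back, so the result is wrapped in Some(...).
    let old = title.clone().and_then(|t| Some(t.replace("&eacute;", "é")));

    // New shape: `map` wants a T back, which is what the closure naturally produces.
    let new = title.map(|t| t.replace("&eacute;", "é"));

    assert_eq!(old, new);
}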

View file

@@ -1,8 +1,8 @@
 pub mod config;
 pub mod error;
 mod fingerprints;
-mod readability;
 mod metadata;
+mod readability;
 
 #[cfg(test)]
 mod tests;

View file

@@ -0,0 +1,40 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
+});
+
+pub static NORMALIZE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
+
+pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex"));
+
+pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
+});
+
+pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/and|article|body|column|content|main|shadow/i"#)
+        .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
+});
+
+pub static HAS_CONTENT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
+
+pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
+
+pub const UNLIKELY_ROLES: &[&str] = &[
+    "menu",
+    "menubar",
+    "complementary",
+    "navigation",
+    "alert",
+    "alertdialog",
+    "dialog",
+];
+
+pub const DEFAULT_TAGS_TO_SCORE: &[&str] =
+    &["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"];
+
+pub const PHRASING_ELEMS: &[&str] = &[
+    // "CANVAS", "IFRAME", "SVG", "VIDEO",
+    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
+    "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT",
+    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
+    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
+];
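
Note: hoisting every pattern into a `once_cell::sync::Lazy` static means each regex is compiled once, on first use, rather than on every call. One hedged caveat: these strings keep JavaScript's regex-literal syntax, and the Rust `regex` crate treats the `/` delimiters and the trailing `i`/`g` flags as literal characters, so the first and last alternatives of e.g. BYLINE can never fire and the rest match only case-sensitively. The Rust spelling of a case-insensitive pattern is an inline `(?i)` flag, as in this sketch:

use once_cell::sync::Lazy;
use regex::Regex;

// As committed: '/' and the trailing 'i' are matched literally.
static BYLINE_JS_STYLE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"/byline|author|dateline|writtenby|p-author/i").expect("regex"));

// Rust equivalent: (?i) applies case-insensitivity to the whole alternation.
static BYLINE_RUST_STYLE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("regex"));

fn main() {
    assert!(!BYLINE_JS_STYLE.is_match("ArticleAuthor")); // misses: 'author' is case-sensitive here
    assert!(BYLINE_RUST_STYLE.is_match("ArticleAuthor"));
}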

View file

@@ -1,7 +1,7 @@
-mod regex;
+mod constants;
 mod state;
 
-use libxml::tree::{Document, Node};
+use libxml::tree::{Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
@@ -11,21 +11,29 @@ pub struct Readability;
 impl Readability {
     pub fn extract_body_readability(
         document: &Document,
-        root: &mut Node,
+        _root: &mut Node,
     ) -> Result<bool, FullTextParserError> {
         let mut state = State::default();
         let mut elements_to_score = Vec::new();
         let mut node: Option<Node> = document.clone().get_root_element();
 
         while let Some(node_ref) = node.as_mut() {
-            let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));
+            let tag_name = node_ref.get_name().to_uppercase();
+            let match_string = node_ref
+                .get_class_names()
+                .iter()
+                .fold(String::new(), |a, b| format!("{a} {b}"));
+            let match_string = match node_ref.get_property("id") {
+                Some(id) => format!("{match_string} {id}"),
+                None => match_string,
+            };
 
             if !Self::is_probably_visible(node_ref) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
 
-            if Self::check_byline(node_ref, &match_string) {
+            if Self::check_byline(node_ref, &match_string, &mut state) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
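
Note: each visited element now gets a `match_string` built from its class names plus its `id`, which the candidate regexes below are run against. The fold keeps a leading space, which is harmless for substring-style matching. A libxml-free sketch of the same construction:

fn build_match_string(class_names: &[&str], id: Option<&str>) -> String {
    // Mirrors the fold in extract_body_readability: produces " class-a class-b".
    let match_string = class_names
        .iter()
        .fold(String::new(), |a, b| format!("{a} {b}"));
    match id {
        Some(id) => format!("{match_string} {id}"),
        None => match_string,
    }
}

fn main() {
    assert_eq!(
        build_match_string(&["sidebar", "widget"], Some("comments")),
        " sidebar widget comments"
    );
}
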
@@ -36,8 +44,78 @@ impl Readability {
                 continue;
             }
 
             // Remove unlikely candidates
+            if state.strip_unlikely {
+                if constants::UNLIELY_CANDIDATES.is_match(&match_string)
+                    && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
+                    && !Self::has_ancestor_tag(node_ref, "table", None)
+                    && !Self::has_ancestor_tag(node_ref, "code", None)
+                    && tag_name != "BODY"
+                    && tag_name != "A"
+                {
+                    node = Self::remove_and_next(node_ref);
+                    continue;
+                }
+
+                if let Some(role) = node_ref.get_attribute("role") {
+                    if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
+                        node = Self::remove_and_next(node_ref);
+                        continue;
+                    }
+                }
+            }
+
+            // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
+            if tag_name == "DIV"
+                || tag_name == "SECTION"
+                || tag_name == "HEADER"
+                || tag_name == "H1"
+                || tag_name == "H2"
+                || tag_name == "H3"
+                || tag_name == "H4"
+                || tag_name == "H5"
+                || tag_name == "H6" && Self::is_element_without_content(node_ref)
+            {
+                node = Self::remove_and_next(node_ref);
+                continue;
+            }
+
+            if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
+                elements_to_score.push(node_ref.clone());
+            }
+
+            // Turn all divs that don't have children block level elements into p's
+            if tag_name == "DIV" {
+                // Put phrasing content into paragraphs.
+                let mut p: Option<Node> = None;
+                for mut child_node in node_ref.get_child_nodes().into_iter() {
+                    if Self::is_phrasing_content(&child_node) {
+                        if let Some(p) = p.as_mut() {
+                            let _ = p.add_child(&mut child_node);
+                        } else if !Self::is_whitespace(&child_node) {
+                            let mut new_node = Node::new("p", None, document).unwrap();
+                            node_ref
+                                .replace_child_node(new_node.clone(), child_node.clone())
+                                .unwrap();
+                            new_node.add_child(&mut child_node).unwrap();
+                            p.replace(new_node);
+                        }
+                    } else if let Some(p) = p.as_mut() {
+                        for mut r_node in p.get_child_nodes().into_iter().rev() {
+                            if Self::is_whitespace(&r_node) {
+                                r_node.unlink();
+                            }
+                        }
+                    }
+                }
+
+                // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+                // element. DIVs with only a P element inside and no text content can be
+                // safely converted into plain P elements to avoid confusing the scoring
+                // algorithm with DIVs with are, in practice, paragraphs.
+                if Self::has_single_tag_inside_element(node_ref, "P")
+                    && Self::get_link_density(node_ref) < 0.25
+                {}
+            }
 
             node = Self::next_node(node_ref, false);
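
Note, hedged: in the empty-element check above, Rust's `&&` binds tighter than `||`, so `is_element_without_content` only guards the `H6` arm, and every DIV, SECTION, HEADER, and H1-H5 node would be removed regardless of content. Readability.js parenthesizes the tag test, so the intended shape is presumably the one sketched below. The `has_single_tag_inside_element` branch also ends in an empty `{}`, so the single-P-DIV flattening from Readability.js looks unfinished at this commit.

// Hypothetical standalone version of the presumably intended condition.
fn should_drop(tag_name: &str, element_without_content: bool) -> bool {
    matches!(
        tag_name,
        "DIV" | "SECTION" | "HEADER" | "H1" | "H2" | "H3" | "H4" | "H5" | "H6"
    ) && element_without_content
}

fn main() {
    assert!(!should_drop("DIV", false)); // a DIV that still has content survives
    assert!(should_drop("H6", true)); // an empty H6 is dropped
}
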
@@ -61,10 +139,24 @@ impl Readability {
         !display_none && !is_hidden && !aria_hidden || has_fallback_image
     }
 
+    fn is_whitespace(node: &Node) -> bool {
+        let is_text_node = node
+            .get_type()
+            .map(|t| t == NodeType::TextNode)
+            .unwrap_or(false);
+        let is_element_node = node
+            .get_type()
+            .map(|t| t == NodeType::ElementNode)
+            .unwrap_or(false);
+
+        (is_text_node && node.get_content().trim().is_empty())
+            || (is_element_node && node.get_name().to_uppercase() == "BR")
+    }
+
     fn remove_and_next(node: &mut Node) -> Option<Node> {
         let next_node = Self::next_node(node, true);
         node.unlink();
-        return next_node;
+        next_node
     }
 
     fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
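
Note: `remove_and_next` also drops a `needless_return`, and the new `is_whitespace` counts a node as whitespace when it is a text node with only blank content, or a `<br>` element. A toy, libxml-free sketch of that predicate:

// Toy stand-in for libxml's Node, just to express the predicate.
enum ToyNode {
    Text(String),
    Element(String),
}

fn is_whitespace(node: &ToyNode) -> bool {
    match node {
        // Text nodes count as whitespace when trimming leaves nothing.
        ToyNode::Text(content) => content.trim().is_empty(),
        // <br> elements count as whitespace too: line breaks between
        // phrasing content shouldn't start a new paragraph.
        ToyNode::Element(name) => name.to_uppercase() == "BR",
    }
}

fn main() {
    assert!(is_whitespace(&ToyNode::Text("  \n\t".into())));
    assert!(is_whitespace(&ToyNode::Element("br".into())));
    assert!(!is_whitespace(&ToyNode::Text("hello".into())));
    assert!(!is_whitespace(&ToyNode::Element("p".into())));
}
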
@@ -100,7 +192,11 @@ impl Readability {
         None
     }
 
-    fn check_byline(node: &Node, matchstring: &str) -> bool {
+    fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
+        if state.byline.is_some() {
+            return false;
+        }
+
         let rel = node
             .get_attribute("rel")
             .map(|rel| rel == "author")
@@ -111,8 +207,11 @@ impl Readability {
             .unwrap_or(false);
         let content = node.get_content();
 
-        if rel || itemprop || regex::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content) {
-            // FIXME
+        if rel
+            || itemprop
+            || constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content)
+        {
+            state.byline = Some(content.trim().into());
             true
         } else {
             false
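
Note: threading `&mut State` through `check_byline` resolves the old `// FIXME`: the first plausible byline is now stored in `state.byline`, and the early `return false` makes the capture once-only. A reduced sketch of the pattern, where `looks_like_byline` stands in for the rel/itemprop/regex tests of the real code:

#[derive(Default)]
struct State {
    byline: Option<String>,
}

fn check_byline(content: &str, looks_like_byline: bool, state: &mut State) -> bool {
    if state.byline.is_some() {
        return false; // a byline was already captured; leave this node alone
    }
    if looks_like_byline {
        state.byline = Some(content.trim().into());
        true
    } else {
        false
    }
}

fn main() {
    let mut state = State::default();
    assert!(check_byline("  Jane Doe  ", true, &mut state));
    assert_eq!(state.byline.as_deref(), Some("Jane Doe"));
    // A second candidate is ignored once state.byline is set.
    assert!(!check_byline("John Roe", true, &mut state));
}
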
@@ -140,7 +239,7 @@ impl Readability {
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
         let content = node.get_content().trim().to_owned();
         if normalize_spaces {
-            regex::NORMALIZE.replace(&content, " ").into()
+            constants::NORMALIZE.replace(&content, " ").into()
         } else {
             content
         }
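
Note, hedged: the NORMALIZE pattern still carries JavaScript's `/g` flag as literal text, and `Regex::replace` only rewrites the leftmost match anyway; the `regex`-crate counterpart of `replace(/\s{2,}/g, " ")` would be `replace_all` with a plain `\s{2,}` pattern:

use regex::Regex;

fn main() {
    let normalize = Regex::new(r"\s{2,}").expect("NORMALIZE regex");
    let content = "too   many\n\n  spaces   here";

    // `replace` rewrites only the leftmost match...
    assert_eq!(
        normalize.replace(content, " "),
        "too many\n\n  spaces   here"
    );
    // ...while `replace_all` is the counterpart of JavaScript's /g flag.
    assert_eq!(normalize.replace_all(content, " "), "too many spaces here");
}
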
@@ -149,17 +248,146 @@ impl Readability {
     fn text_similarity(a: &str, b: &str) -> f64 {
         let a = a.to_lowercase();
         let b = b.to_lowercase();
-        let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
-        let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
-        if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 {
+        let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
+        if tokens_a.is_empty() || tokens_b.is_empty() {
             return 0.0;
         }
-        let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
-        let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
+        let tokens_b_total: f64 = tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b = tokens_b
+            .into_iter()
+            .filter(|token| !tokens_a.iter().any(|t| t == token))
+            .collect::<Vec<_>>();
+        let uniq_tokens_b_total: f64 = uniq_tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
         let distance_b = uniq_tokens_b_total / tokens_b_total;
         1.0 - distance_b
     }
 
+    fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
+        let max_depth = max_depth.unwrap_or(3);
+        let tag_name = tag_name.to_uppercase();
+        let mut depth = 0;
+        let mut node = node.get_parent();
+
+        loop {
+            if depth > max_depth {
+                return false;
+            }
+
+            let tmp_node = match node {
+                Some(node) => node,
+                None => return false,
+            };
+
+            if tmp_node.get_name() == tag_name {
+                return true;
+            }
+
+            node = tmp_node.get_parent();
+            depth += 1;
+        }
+    }
+
+    fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
+        // There should be exactly 1 element child with given tag
+        if node.get_child_nodes().len() == 1
+            || node
+                .get_child_nodes()
+                .first()
+                .map(|n| n.get_name().to_uppercase() == tag)
+                .unwrap_or(false)
+        {
+            return false;
+        }
+
+        // And there should be no text nodes with real content
+        node.get_child_nodes().iter().any(|n| {
+            n.get_type()
+                .map(|t| t == NodeType::TextNode)
+                .unwrap_or(false)
+                && constants::HAS_CONTENT.is_match(&n.get_content())
+        })
+    }
+
+    fn is_element_without_content(node: &Node) -> bool {
+        if let Some(node_type) = node.get_type() {
+            let len = node.get_child_nodes().len();
+            return node_type == NodeType::ElementNode
+                && node.get_content().trim().is_empty()
+                && (len == 0
+                    || len
+                        == Self::get_elements_by_tag_name(node, "br").len()
+                            + Self::get_elements_by_tag_name(node, "hr").len());
+        }
+
+        false
+    }
+
+    fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
+        let tag = tag.to_uppercase();
+        let all_tags = tag == "*";
+        let mut vec = Vec::new();
+
+        fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
+            for child in node.get_child_elements() {
+                if all_tags || child.get_name() == tag {
+                    vec.push(child);
+                }
+                get_elems(node, tag, vec, all_tags);
+            }
+        }
+
+        get_elems(node, &tag, &mut vec, all_tags);
+        vec
+    }
+
+    fn is_phrasing_content(node: &Node) -> bool {
+        let tag_name = node.get_name().to_uppercase();
+        let is_text_node = node
+            .get_type()
+            .map(|t| t == NodeType::TextNode)
+            .unwrap_or(false);
+        is_text_node
+            || constants::PHRASING_ELEMS.contains(&tag_name.as_str())
+            || (tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
+                && node
+                    .get_child_nodes()
+                    .iter()
+                    .map(Self::is_phrasing_content)
+                    .all(|val| val)
+    }
+
+    fn get_link_density(node: &Node) -> f64 {
+        let text_length = Self::get_inner_text(node, false).len();
+        if text_length == 0 {
+            return 0.0;
+        }
+
+        let mut link_length = 0.0;
+
+        // XXX implement _reduceNodeList?
+        let link_nodes = Self::get_elements_by_tag_name(node, "A");
+        for link_node in link_nodes {
+            if let Some(href) = link_node.get_attribute("href") {
+                let coefficient = if constants::HASH_URL.is_match(&href) {
+                    0.3
+                } else {
+                    1.0
+                };
+                link_length += Self::get_inner_text(&link_node, false).len() as f64 * coefficient;
+            }
+        }
+
+        link_length / text_length as f64
+    }
 }
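
Two hedged notes on the new helpers. First, the inner `get_elems` recurses on `node` rather than `child`, which never descends past the first level and loops forever on any element that has child elements; recursing into `child`, as in the sketch below, is presumably what was meant. Second, `get_link_density` computes sum(link-text length x coefficient) / total text length, with `#fragment` links discounted to 0.3: for example, 60 chars of normal link text plus 40 chars behind hash links in a 400-char node gives (60 x 1.0 + 40 x 0.3) / 400 = 0.18.

// Toy element tree standing in for libxml nodes.
struct Elem {
    name: &'static str,
    children: Vec<Elem>,
}

// Corrected sketch: recurse into `child`, not `node`, so the walk terminates
// and actually visits grandchildren.
fn collect<'a>(node: &'a Elem, tag: &str, out: &mut Vec<&'a Elem>) {
    for child in &node.children {
        if tag == "*" || child.name.eq_ignore_ascii_case(tag) {
            out.push(child);
        }
        collect(child, tag, out);
    }
}

fn main() {
    let tree = Elem {
        name: "div",
        children: vec![
            Elem { name: "p", children: vec![] },
            Elem {
                name: "section",
                children: vec![Elem { name: "a", children: vec![] }],
            },
        ],
    };
    let mut links = Vec::new();
    collect(&tree, "a", &mut links);
    assert_eq!(links.len(), 1); // finds the <a> nested inside <section>
}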

View file

@@ -1,12 +0,0 @@
-use once_cell::sync::Lazy;
-use regex::Regex;
-
-pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
-});
-pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")
-});
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex")
-});

View file

@@ -3,6 +3,7 @@ pub struct State {
     pub weigh_classes: bool,
     pub clean_conditionally: bool,
     pub should_remove_title_header: bool,
+    pub byline: Option<String>,
 }
 
 impl Default for State {
@@ -12,6 +13,7 @@ impl Default for State {
             weigh_classes: true,
             clean_conditionally: true,
             should_remove_title_header: true,
+            byline: None,
         }
     }
 }