1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

start implementing readability

This commit is contained in:
Jan Lukas Gernert 2023-01-01 14:51:34 +01:00
parent c08f5afa5d
commit 2750ad648d
10 changed files with 375 additions and 124 deletions

View file

@ -1,20 +1,10 @@
image: rust:latest
stages:
- lint
- build
run-build:
stage: build
image: rust:latest
script:
- rustc --version && cargo --version
- cargo build --release --jobs 1
run-lint:
stage: lint
image: rust:latest
before_script:
- rustup component add rustfmt
- rustup component add clippy
@ -22,3 +12,4 @@ run-lint:
- rustc --version && cargo --version
- cargo fmt -- --check
- cargo clippy --all-targets --all-features -- -D warnings
- cargo build --release --jobs 1

View file

@ -16,8 +16,9 @@ url = "2.3"
regex = "1.7"
encoding_rs = "0.8"
chrono = "0.4"
base64 = "0.13"
base64 = "0.20"
image = "0.24"
log = "0.4"
rust-embed="6.4"
once_cell = "1.16"
escaper = "0.1"

View file

@ -7,28 +7,26 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
let mut m = HashMap::with_capacity(4);
m.insert(
"fingerprint.blogspot.com",
regex::Regex::new(
Regex::new(
r#"/\\<meta\s*content=([\\'"])blogger([\\'"])\s*name=([\\'"])generator([\\'"])/i"#,
)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.blogspot.com",
regex::Regex::new(
Regex::new(
r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])Blogger([\\'"])/i"#,
)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.wordpress.com",
regex::Regex::new(
r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
)
Regex::new(r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.ippen.media",
regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
.expect("failed to build static regex"),
);
m

View file

@ -0,0 +1,132 @@
use chrono::{DateTime, Utc};
use libxml::xpath::Context;
use log::{debug, warn};
use std::str::FromStr;
use crate::{article::Article, util::Util};
use super::config::ConfigEntry;
/// Populate missing metadata (title, author, date) on `article` from the document.
///
/// Only fields that are still `None` are filled in; values already present on
/// `article` are never overwritten. Extracted title/author strings are HTML-entity
/// decoded via `escaper::decode_html`; if decoding fails the raw value is kept.
pub fn extract(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
    article: &mut Article,
) {
    if article.title.is_none() {
        // `map` instead of `and_then(|t| Some(..))`: the closure is infallible
        // (clippy: bind_instead_of_map).
        article.title = extract_title(context, config, global_config).map(|title| {
            match escaper::decode_html(&title) {
                Ok(decoded_title) => decoded_title,
                Err(_error) => title,
            }
        });
    }

    if article.author.is_none() {
        article.author = extract_author(context, config, global_config).map(|author| {
            match escaper::decode_html(&author) {
                Ok(decoded_author) => decoded_author,
                Err(_error) => author,
            }
        });
    }

    if article.date.is_none() {
        article.date = extract_date(context, config, global_config);
    }
}
/// Try to find the article title.
///
/// Order of precedence: site-specific config xpaths, then global config
/// xpaths, then a chain of generic `<meta>` tags (readability-style).
fn extract_title(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_title)
        .chain(&global_config.xpath_title);

    for xpath_title in xpaths {
        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
            debug!("Article title: '{}'", title);
            return Some(title);
        }
    }

    // Generic meta tags as a last resort (readability-style).
    [
        "dc:title",
        "dcterm:title",
        "og:title",
        "weibo:article:title",
        "weibo:webpage:title",
        "title",
        "twitter:title",
    ]
    .iter()
    .find_map(|name| get_meta(context, name))
}
/// Try to find the article author.
///
/// Order of precedence: site-specific config xpaths, then global config
/// xpaths, then generic `<meta>` tags (readability-style).
fn extract_author(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_author)
        .chain(&global_config.xpath_author);

    for xpath_author in xpaths {
        if let Ok(author) = Util::extract_value(context, xpath_author) {
            debug!("Article author: '{}'", author);
            return Some(author);
        }
    }

    // Generic meta tags as a last resort (readability-style).
    ["dc:creator", "dcterm:creator", "author"]
        .iter()
        .find_map(|name| get_meta(context, name))
}
/// Try to find and parse the article date.
///
/// Site-specific config xpaths are checked before the global ones; the first
/// string that parses as a `DateTime<Utc>` wins. Unparsable candidates are
/// logged and skipped. There is no generic `<meta>` fallback for dates.
fn extract_date(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<DateTime<Utc>> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_date)
        .chain(&global_config.xpath_date);

    for xpath_date in xpaths {
        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
            debug!("Article date: '{}'", date_string);
            match DateTime::from_str(&date_string) {
                Ok(date) => return Some(date),
                Err(_) => warn!("Parsing the date string '{}' failed", date_string),
            }
        }
    }

    None
}
/// Read the `content` attribute of the first `<meta>` whose `name` contains `name`.
fn get_meta(context: &Context, name: &str) -> Option<String> {
    let xpath = format!("//meta[contains(@name, '{}')]", name);
    Util::get_attribute(context, &xpath, "content").ok()
}

View file

@ -1,15 +1,18 @@
pub mod config;
pub mod error;
mod fingerprints;
mod readability;
mod metadata;
#[cfg(test)]
mod tests;
use self::config::{ConfigCollection, ConfigEntry};
use self::error::FullTextParserError;
use self::readability::Readability;
use crate::article::Article;
use crate::util::Util;
use chrono::DateTime;
use encoding_rs::Encoding;
use fingerprints::Fingerprints;
use libxml::parser::Parser;
@ -19,7 +22,7 @@ use log::{debug, error, info, warn};
use reqwest::header::HeaderMap;
use reqwest::Client;
use std::path::Path;
use std::str::{from_utf8, FromStr};
use std::str::from_utf8;
pub struct FullTextParser {
config_files: ConfigCollection,
@ -154,7 +157,7 @@ impl FullTextParser {
// parse again with single page url
debug!("Single page link found '{}'", single_page_url);
return self
if let Err(error) = self
.parse_single_page(
article,
&single_page_url,
@ -163,16 +166,27 @@ impl FullTextParser {
global_config,
client,
)
.await;
.await
{
log::warn!("Single Page parsing: {}", error);
log::debug!("Continuing with regular parser.");
}
}
}
Self::extract_metadata(&xpath_ctx, config, global_config, article);
metadata::extract(&xpath_ctx, config, global_config, article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?;
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if found_body {
if let Err(error) = Readability::extract_body_readability(&document, root) {
log::error!("Both ftr and readability failed to find content: {}", error);
return Err(error);
}
}
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
let headers = Util::generate_headers(config, global_config)?;
@ -232,7 +246,7 @@ impl FullTextParser {
let html = Self::download(url, client, headers).await?;
let document = Self::parse_html(&html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::extract_metadata(&xpath_ctx, config, global_config, article);
metadata::extract(&xpath_ctx, config, global_config, article);
Self::check_for_thumbnail(&xpath_ctx, article);
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?;
@ -363,7 +377,7 @@ impl FullTextParser {
}
fn check_for_thumbnail(context: &Context, article: &mut Article) {
if let Ok(thumb) = Self::get_attribute(
if let Ok(thumb) = Util::get_attribute(
context,
"//meta[contains(@name, 'twitter:image')]",
"content",
@ -373,14 +387,14 @@ impl FullTextParser {
}
if let Ok(thumb) =
Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
{
article.thumbnail_url = Some(thumb);
return;
}
if let Ok(thumb) =
Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
{
article.thumbnail_url = Some(thumb);
}
@ -472,17 +486,6 @@ impl FullTextParser {
Ok(())
}
fn get_attribute(
context: &Context,
xpath: &str,
attribute: &str,
) -> Result<String, FullTextParserError> {
Util::evaluate_xpath(context, xpath, false)?
.iter()
.find_map(|node| node.get_attribute(attribute))
.ok_or(FullTextParserError::Xml)
}
fn repair_urls(
context: &Context,
xpath: &str,
@ -612,90 +615,12 @@ impl FullTextParser {
let _ = Util::strip_node(context, "//*[@type='text/css']");
}
fn extract_metadata(
context: &Context,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
article: &mut Article,
) {
// try to get title
if let Some(config) = config {
for xpath_title in &config.xpath_title {
if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title);
article.title = Some(title);
break;
}
}
}
if article.title.is_none() {
for xpath_title in &global_config.xpath_title {
if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title);
article.title = Some(title);
break;
}
}
}
// try to get the author
if let Some(config) = config {
for xpath_author in &config.xpath_author {
if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author);
article.author = Some(author);
break;
}
}
}
if article.author.is_none() {
for xpath_author in &global_config.xpath_author {
if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author);
article.author = Some(author);
break;
}
}
}
// try to get the date
if let Some(config) = config {
for xpath_date in &config.xpath_date {
if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date);
break;
} else {
warn!("Parsing the date string '{}' failed", date_string);
}
}
}
}
if article.date.is_none() {
for xpath_date in &global_config.xpath_date {
if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date);
break;
} else {
warn!("Parsing the date string '{}' failed", date_string);
}
}
}
}
}
fn extract_body(
context: &Context,
root: &mut Node,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
) -> Result<(), FullTextParserError> {
) -> Result<bool, FullTextParserError> {
let mut found_something = false;
if let Some(config) = config {
@ -712,10 +637,9 @@ impl FullTextParser {
if !found_something {
log::error!("no body found");
return Err(FullTextParserError::Scrape);
}
Ok(())
Ok(found_something)
}
fn extract_body_single(
@ -752,7 +676,7 @@ impl FullTextParser {
) -> Option<url::Url> {
if let Some(config) = config {
if let Some(next_page_xpath) = config.next_page_link.as_deref() {
if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href")
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
{
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
@ -760,7 +684,7 @@ impl FullTextParser {
}
}
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") {
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
}

View file

@ -0,0 +1,165 @@
mod regex;
mod state;
use libxml::tree::{Document, Node};
use self::state::State;
use super::error::FullTextParserError;
pub struct Readability;
impl Readability {
/// Entry point of the readability-based body extraction (work in progress).
///
/// Walks the whole document depth-first, pruning nodes that are hidden,
/// look like bylines, or are headings repeating the article title. The
/// actual content scoring is not implemented yet, hence the trailing
/// `unimplemented!()`.
///
/// NOTE(review): `root` is not used yet — presumably the extracted content
/// will be attached to it once scoring is implemented; confirm when wiring
/// this up.
pub fn extract_body_readability(
    document: &Document,
    root: &mut Node,
) -> Result<bool, FullTextParserError> {
    let mut state = State::default();
    let mut node: Option<Node> = document.clone().get_root_element();

    while let Some(node_ref) = node.as_mut() {
        // Space-joined class list, used by the regex-based heuristics below.
        let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));

        // Drop nodes that are hidden from the reader.
        if !Self::is_probably_visible(node_ref) {
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // Bylines are removed from the content.
        if Self::check_byline(node_ref, &match_string) {
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // Remove at most one header that duplicates the article title.
        if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
            state.should_remove_title_header = false;
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // TODO: strip unlikely candidates (class/id heuristics) — empty stub.
        if state.strip_unlikely {
        }

        node = Self::next_node(node_ref, false);
    }

    unimplemented!()
}
/// Port of readability.js `isProbablyVisible`.
///
/// A node counts as visible unless it is `display: none`, carries the
/// `hidden` attribute, or is `aria-hidden="true"`. As in readability.js, the
/// `fallback-image` class only overrides `aria-hidden` (for lazily-loaded
/// images), not an explicit `display`/`hidden` removal.
fn is_probably_visible(node: &Node) -> bool {
    let display_none = node
        .get_attribute("display")
        .map(|display| display == "none")
        .unwrap_or(false);
    let is_hidden = node.has_attribute("hidden");
    let aria_hidden = node
        .get_attribute("aria-hidden")
        .map(|attr| attr == "true")
        .unwrap_or(false);
    let has_fallback_image = node.get_class_names().contains("fallback-image");

    // Parenthesized explicitly: in the original, `&&` bound tighter than `||`,
    // so `fallback-image` resurrected even nodes hidden via `display`/`hidden`,
    // which deviates from readability.js.
    !display_none && !is_hidden && (!aria_hidden || has_fallback_image)
}
/// Unlink `node` from the tree and return the traversal's next node,
/// skipping the removed node's children.
fn remove_and_next(node: &mut Node) -> Option<Node> {
    // Compute the successor before unlinking so sibling/parent links are intact.
    let successor = Self::next_node(node, true);
    node.unlink();
    successor
}
/// Depth-first successor of `node`: first child (unless
/// `ignore_self_and_kids`), then next sibling, then the next sibling of the
/// nearest ancestor that has one. Returns `None` at the end of the document.
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    // First check for kids if those aren't being ignored.
    if !ignore_self_and_kids {
        if let Some(first_child) = node.get_first_child() {
            return Some(first_child);
        }
    }

    // Then for siblings...
    if let Some(next_sibling) = node.get_next_sibling() {
        return Some(next_sibling);
    }

    // And finally, move up the parent chain *and* find a sibling
    // (because this is depth-first traversal, we will have already
    // seen the parent nodes themselves).
    //
    // BUGFIX: the original looped on `node.get_parent()` without ever
    // ascending, so it spun forever whenever the direct parent had no next
    // sibling but some ancestor did. Walk an explicit cursor upwards instead.
    let mut ancestor = node.get_parent();
    while let Some(current) = ancestor {
        if let Some(sibling) = current.get_next_sibling() {
            return Some(sibling);
        }
        ancestor = current.get_parent();
    }

    None
}
/// Port of readability.js `_checkByline`: decide whether `node` is a byline.
///
/// The node must *look* like a byline (`rel="author"`, an `itemprop`
/// containing "author", or a class/id string matching the byline regex)
/// *and* its text content must be a plausible byline length.
fn check_byline(node: &Node, matchstring: &str) -> bool {
    let rel = node
        .get_attribute("rel")
        .map(|rel| rel == "author")
        .unwrap_or(false);
    let itemprop = node
        .get_attribute("itemprop")
        .map(|prop| prop.contains("author"))
        .unwrap_or(false);

    let looks_like_byline = rel || itemprop || regex::BYLINE.is_match(matchstring);

    // BUGFIX: the original `a || b || c && d` applied the length check only to
    // the regex branch (`&&` binds tighter than `||`); readability.js validates
    // the text content for every branch.
    // FIXME: store the byline text on parser state like readability.js does.
    looks_like_byline && Self::is_valid_byline(&node.get_content())
}
/// Check whether the input string could be a byline: non-empty after
/// trimming and shorter than 100 bytes.
fn is_valid_byline(line: &str) -> bool {
    (1..100).contains(&line.trim().len())
}
// Check if this node is an H1 or H2 element whose content is mostly
// the same as the article title.
fn header_duplicates_title(node: &Node) -> bool {
    let name = node.get_name().to_lowercase();
    // BUGFIX: the original used `name != "h1" || name != "h2"`, which is true
    // for every element name, so this function could never detect a duplicate.
    if name != "h1" && name != "h2" {
        return false;
    }
    let heading = Self::get_inner_text(node, false);
    // FIXME: compare against the real article title once it is tracked.
    Self::text_similarity(&heading, "FIXME") > 0.75
}
/// Trimmed text content of `node`; with `normalize_spaces`, runs of
/// whitespace are collapsed to single spaces.
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
    let content = node.get_content().trim().to_owned();
    if normalize_spaces {
        // BUGFIX: `replace` only rewrites the first match; the readability.js
        // pattern carried the `g` flag, i.e. replace *all* whitespace runs.
        regex::NORMALIZE.replace_all(&content, " ").into()
    } else {
        content
    }
}
/// Compare two texts, case-insensitively, word by word.
/// 1.0 = identical vocabulary, 0.0 = completely different.
///
/// The score is one minus the length-weighted share of `b`'s tokens that do
/// not occur in `a` (mirrors readability.js `_textSimilarity`).
fn text_similarity(a: &str, b: &str) -> f64 {
    let a = a.to_lowercase();
    let b = b.to_lowercase();
    let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
    let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
    // `is_empty` instead of `iter().count() == 0` (clippy: iter_count / len_zero).
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len() as f64).sum();
    // Guard: if `b` tokenizes to empty strings only (e.g. pure punctuation),
    // the original divided 0.0 / 0.0 and returned NaN.
    if tokens_b_total == 0.0 {
        return 0.0;
    }

    let uniq_tokens_b: Vec<_> = tokens_b
        .into_iter()
        .filter(|token| !tokens_a.contains(token))
        .collect();
    let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len() as f64).sum();

    1.0 - uniq_tokens_b_total / tokens_b_total
}
}

View file

@ -0,0 +1,12 @@
use once_cell::sync::Lazy;
use regex::Regex;
/// Matches class/id strings that suggest a byline element.
///
/// BUGFIX: the original pattern embedded JavaScript literal syntax (`/…/i`),
/// so the leading `/` and trailing `/i` were matched literally and the
/// case-insensitive flag was lost; Rust's regex crate uses the inline `(?i)`
/// flag instead.
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("BYLINE regex")
});
/// Matches runs of two or more whitespace characters (for collapsing to " ").
///
/// BUGFIX: the original kept the JavaScript delimiters/flags (`/…/g`) inside
/// the Rust pattern, making the literal slashes and `g` part of the regex so
/// it never matched; the `g` semantics belong to `replace_all` at call sites.
pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\s{2,}").expect("NORMALIZE regex")
});
/// Word-splitting delimiter: one or more non-word characters.
///
/// BUGFIX: the original kept the JavaScript delimiters/flags (`/…/g`) inside
/// the Rust pattern, so the literal slashes and `g` were matched instead of
/// acting as delimiters/flags.
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\W+").expect("TOKENIZE regex")
});

View file

@ -0,0 +1,17 @@
/// Tunable switches for the readability pass.
pub struct State {
    // NOTE(review): analogues of readability.js FLAG_* options — confirm
    // exact semantics as the algorithm gets implemented.
    pub strip_unlikely: bool,
    pub weigh_classes: bool,
    pub clean_conditionally: bool,
    pub should_remove_title_header: bool,
}

impl Default for State {
    /// Every heuristic starts enabled (hence a manual impl — the derived
    /// `Default` would yield `false` for each flag).
    fn default() -> Self {
        State {
            strip_unlikely: true,
            weigh_classes: true,
            clean_conditionally: true,
            should_remove_title_header: true,
        }
    }
}

View file

@ -36,7 +36,7 @@ impl ImageDownloader {
doc: &Document,
client: &Client,
) -> Result<String, ImageDownloadError> {
let xpath_ctx = Context::new(&doc).map_err(|()| {
let xpath_ctx = Context::new(doc).map_err(|()| {
error!("Failed to create xpath context for document");
ImageDownloadError::HtmlParse
})?;

View file

@ -145,6 +145,17 @@ impl Util {
None
}
/// Evaluate `xpath` and return the first matching node's `attribute` value.
///
/// Returns `FullTextParserError::Xml` when no matched node carries the
/// attribute (or nothing matched at all).
pub fn get_attribute(
    context: &Context,
    xpath: &str,
    attribute: &str,
) -> Result<String, FullTextParserError> {
    let nodes = Util::evaluate_xpath(context, xpath, false)?;
    nodes
        .iter()
        .find_map(|node| node.get_attribute(attribute))
        .ok_or(FullTextParserError::Xml)
}
pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
if let Some(val) = node_vec.get(0) {