Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)

Commit 2750ad648d: start implementing readability
Parent: c08f5afa5d

10 changed files with 375 additions and 124 deletions
.gitlab-ci.yml
@@ -1,20 +1,10 @@
-image: rust:latest
-
 stages:
-  - lint
   - build

 run-build:
   stage: build
   image: rust:latest
-  script:
-    - rustc --version && cargo --version
-    - cargo build --release --jobs 1
-
-run-lint:
-  stage: lint
-  image: rust:latest
   before_script:
     - rustup component add rustfmt
     - rustup component add clippy
@@ -22,3 +12,4 @@ run-lint:
     - rustc --version && cargo --version
     - cargo fmt -- --check
     - cargo clippy --all-targets --all-features -- -D warnings
+    - cargo build --release --jobs 1
Cargo.toml
@@ -16,8 +16,9 @@ url = "2.3"
 regex = "1.7"
 encoding_rs = "0.8"
 chrono = "0.4"
-base64 = "0.13"
+base64 = "0.20"
 image = "0.24"
 log = "0.4"
 rust-embed="6.4"
 once_cell = "1.16"
+escaper = "0.1"
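The new escaper dependency backs the metadata module added below: extracted titles and authors are HTML-entity-decoded, falling back to the raw string when decoding fails. A minimal standalone sketch of that pattern:

    // Sketch of the decode-with-fallback pattern metadata.rs uses below.
    fn decode_or_raw(raw: &str) -> String {
        match escaper::decode_html(raw) {
            Ok(decoded) => decoded,
            Err(_) => raw.to_owned(),
        }
    }

    fn main() {
        assert_eq!(decode_or_raw("Tom &amp; Jerry"), "Tom & Jerry");
    }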
src/full_text_parser/fingerprints.rs
@@ -7,28 +7,26 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
     let mut m = HashMap::with_capacity(4);
     m.insert(
         "fingerprint.blogspot.com",
-        regex::Regex::new(
+        Regex::new(
             r#"/\\<meta\s*content=([\\'"])blogger([\\'"])\s*name=([\\'"])generator([\\'"])/i"#,
         )
         .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.blogspot.com",
-        regex::Regex::new(
+        Regex::new(
             r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])Blogger([\\'"])/i"#,
         )
         .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.wordpress.com",
-        regex::Regex::new(
-            r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
-        )
-        .expect("failed to build static regex"),
+        Regex::new(r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#)
+            .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.ippen.media",
-        regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
+        Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
             .expect("failed to build static regex"),
     );
     m
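Note that these patterns keep their JavaScript-style /.../i delimiters from the upstream ftr sources; the Rust regex crate treats the slashes and the trailing i as literal characters rather than as a case-insensitivity flag. If case-insensitive matching is the intent, the equivalent idiom in the regex crate is an inline flag. A sketch of that idiom (an assumption about intent, not the committed code):

    use regex::Regex;

    fn main() {
        // (?i) applies case-insensitivity inline; the regex crate has no /.../i literals.
        let blogger = Regex::new(r#"(?i)<meta\s*content=['"]blogger['"]\s*name=['"]generator['"]"#)
            .expect("valid pattern");
        assert!(blogger.is_match(r#"<meta content="Blogger" name="generator">"#));
    }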
src/full_text_parser/metadata.rs (new file, 132 lines)
@@ -0,0 +1,132 @@
use chrono::{DateTime, Utc};
use libxml::xpath::Context;
use log::{debug, warn};
use std::str::FromStr;

use crate::{article::Article, util::Util};

use super::config::ConfigEntry;

pub fn extract(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
    article: &mut Article,
) {
    if article.title.is_none() {
        article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) {
            Ok(escaped_title) => escaped_title,
            Err(_error) => title,
        }));
    }

    if article.author.is_none() {
        article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) {
            Ok(escaped_author) => escaped_author,
            Err(_error) => author,
        }));
    }

    if article.date.is_none() {
        article.date = extract_date(context, config, global_config);
    }
}

fn extract_title(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // check site specific config
    if let Some(config) = config {
        for xpath_title in &config.xpath_title {
            if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
                debug!("Article title: '{}'", title);
                return Some(title);
            }
        }
    }

    // check global config
    for xpath_title in &global_config.xpath_title {
        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
            debug!("Article title: '{}'", title);
            return Some(title);
        }
    }

    // generic meta (readability)
    get_meta(context, "dc:title")
        .or_else(|| get_meta(context, "dcterm:title"))
        .or_else(|| get_meta(context, "og:title"))
        .or_else(|| get_meta(context, "weibo:article:title"))
        .or_else(|| get_meta(context, "weibo:webpage:title"))
        .or_else(|| get_meta(context, "title"))
        .or_else(|| get_meta(context, "twitter:title"))
}

fn extract_author(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // check site specific config
    if let Some(config) = config {
        for xpath_author in &config.xpath_author {
            if let Ok(author) = Util::extract_value(context, xpath_author) {
                debug!("Article author: '{}'", author);
                return Some(author);
            }
        }
    }

    // check global config
    for xpath_author in &global_config.xpath_author {
        if let Ok(author) = Util::extract_value(context, xpath_author) {
            debug!("Article author: '{}'", author);
            return Some(author);
        }
    }

    // generic meta (readability)
    get_meta(context, "dc:creator")
        .or_else(|| get_meta(context, "dcterm:creator"))
        .or_else(|| get_meta(context, "author"))
}

fn extract_date(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<DateTime<Utc>> {
    // check site specific config
    if let Some(config) = config {
        for xpath_date in &config.xpath_date {
            if let Ok(date_string) = Util::extract_value(context, xpath_date) {
                debug!("Article date: '{}'", date_string);
                if let Ok(date) = DateTime::from_str(&date_string) {
                    return Some(date);
                } else {
                    warn!("Parsing the date string '{}' failed", date_string);
                }
            }
        }
    }

    // check global config
    for xpath_date in &global_config.xpath_date {
        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
            debug!("Article date: '{}'", date_string);
            if let Ok(date) = DateTime::from_str(&date_string) {
                return Some(date);
            } else {
                warn!("Parsing the date string '{}' failed", date_string);
            }
        }
    }

    None
}

fn get_meta(context: &Context, name: &str) -> Option<String> {
    Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok()
}
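get_meta mirrors readability.js's metadata harvesting: it returns the content attribute of the first meta tag whose name contains the given key. A self-contained sketch of that lookup against the libxml crate, using the same XPath shape the module relies on:

    use libxml::parser::Parser;
    use libxml::xpath::Context;

    fn main() {
        let html = r#"<html><head><meta name="og:title" content="Hello"/></head><body></body></html>"#;
        let doc = Parser::default_html().parse_string(html).expect("parse");
        let ctx = Context::new(&doc).expect("xpath context");

        // Same XPath shape as get_meta: match on a substring of @name, read @content.
        let nodes = ctx
            .evaluate("//meta[contains(@name, 'og:title')]")
            .expect("evaluate")
            .get_nodes_as_vec();
        let title = nodes.iter().find_map(|n| n.get_attribute("content"));
        assert_eq!(title.as_deref(), Some("Hello"));
    }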
src/full_text_parser/mod.rs
@@ -1,15 +1,18 @@
 pub mod config;
 pub mod error;
 mod fingerprints;
+mod readability;
+mod metadata;

 #[cfg(test)]
 mod tests;

 use self::config::{ConfigCollection, ConfigEntry};
 use self::error::FullTextParserError;
+use self::readability::Readability;
 use crate::article::Article;
 use crate::util::Util;
-use chrono::DateTime;
 use encoding_rs::Encoding;
 use fingerprints::Fingerprints;
 use libxml::parser::Parser;
@@ -19,7 +22,7 @@ use log::{debug, error, info, warn};
 use reqwest::header::HeaderMap;
 use reqwest::Client;
 use std::path::Path;
-use std::str::{from_utf8, FromStr};
+use std::str::from_utf8;

 pub struct FullTextParser {
     config_files: ConfigCollection,
@@ -154,7 +157,7 @@ impl FullTextParser {
             // parse again with single page url
             debug!("Single page link found '{}'", single_page_url);

-            return self
+            if let Err(error) = self
                 .parse_single_page(
                     article,
                     &single_page_url,
@@ -163,16 +166,27 @@
                     global_config,
                     client,
                 )
-                .await;
+                .await
+            {
+                log::warn!("Single Page parsing: {}", error);
+                log::debug!("Continuing with regular parser.");
+            }
         }

-        Self::extract_metadata(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, global_config, article);
         if article.thumbnail_url.is_none() {
             Self::check_for_thumbnail(&xpath_ctx, article);
         }
         Self::strip_junk(&xpath_ctx, config, global_config, url);
-        Self::extract_body(&xpath_ctx, root, config, global_config)?;
+        let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
+
+        if found_body {
+            if let Err(error) = Readability::extract_body_readability(&document, root) {
+                log::error!("Both ftr and readability failed to find content: {}", error);
+                return Err(error);
+            }
+        }

         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let headers = Util::generate_headers(config, global_config)?;
@@ -232,7 +246,7 @@ impl FullTextParser {
             let html = Self::download(url, client, headers).await?;
             let document = Self::parse_html(&html, config, global_config)?;
             let xpath_ctx = Self::get_xpath_ctx(&document)?;
-            Self::extract_metadata(&xpath_ctx, config, global_config, article);
+            metadata::extract(&xpath_ctx, config, global_config, article);
             Self::check_for_thumbnail(&xpath_ctx, article);
             Self::strip_junk(&xpath_ctx, config, global_config, url);
             Self::extract_body(&xpath_ctx, root, config, global_config)?;
@@ -363,7 +377,7 @@ impl FullTextParser {
     }

     fn check_for_thumbnail(context: &Context, article: &mut Article) {
-        if let Ok(thumb) = Self::get_attribute(
+        if let Ok(thumb) = Util::get_attribute(
             context,
             "//meta[contains(@name, 'twitter:image')]",
             "content",
@@ -373,14 +387,14 @@ impl FullTextParser {
         }

         if let Ok(thumb) =
-            Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
+            Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
         {
             article.thumbnail_url = Some(thumb);
             return;
         }

         if let Ok(thumb) =
-            Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
+            Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
         {
             article.thumbnail_url = Some(thumb);
         }
@@ -472,17 +486,6 @@ impl FullTextParser {
         Ok(())
     }

-    fn get_attribute(
-        context: &Context,
-        xpath: &str,
-        attribute: &str,
-    ) -> Result<String, FullTextParserError> {
-        Util::evaluate_xpath(context, xpath, false)?
-            .iter()
-            .find_map(|node| node.get_attribute(attribute))
-            .ok_or(FullTextParserError::Xml)
-    }
-
     fn repair_urls(
         context: &Context,
         xpath: &str,
@@ -612,90 +615,12 @@ impl FullTextParser {
         let _ = Util::strip_node(context, "//*[@type='text/css']");
     }

-    fn extract_metadata(
-        context: &Context,
-        config: Option<&ConfigEntry>,
-        global_config: &ConfigEntry,
-        article: &mut Article,
-    ) {
-        // try to get title
-        if let Some(config) = config {
-            for xpath_title in &config.xpath_title {
-                if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-                    debug!("Article title: '{}'", title);
-                    article.title = Some(title);
-                    break;
-                }
-            }
-        }
-
-        if article.title.is_none() {
-            for xpath_title in &global_config.xpath_title {
-                if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-                    debug!("Article title: '{}'", title);
-                    article.title = Some(title);
-                    break;
-                }
-            }
-        }
-
-        // try to get the author
-        if let Some(config) = config {
-            for xpath_author in &config.xpath_author {
-                if let Ok(author) = Util::extract_value(context, xpath_author) {
-                    debug!("Article author: '{}'", author);
-                    article.author = Some(author);
-                    break;
-                }
-            }
-        }
-
-        if article.author.is_none() {
-            for xpath_author in &global_config.xpath_author {
-                if let Ok(author) = Util::extract_value(context, xpath_author) {
-                    debug!("Article author: '{}'", author);
-                    article.author = Some(author);
-                    break;
-                }
-            }
-        }
-
-        // try to get the date
-        if let Some(config) = config {
-            for xpath_date in &config.xpath_date {
-                if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-                    debug!("Article date: '{}'", date_string);
-                    if let Ok(date) = DateTime::from_str(&date_string) {
-                        article.date = Some(date);
-                        break;
-                    } else {
-                        warn!("Parsing the date string '{}' failed", date_string);
-                    }
-                }
-            }
-        }
-
-        if article.date.is_none() {
-            for xpath_date in &global_config.xpath_date {
-                if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-                    debug!("Article date: '{}'", date_string);
-                    if let Ok(date) = DateTime::from_str(&date_string) {
-                        article.date = Some(date);
-                        break;
-                    } else {
-                        warn!("Parsing the date string '{}' failed", date_string);
-                    }
-                }
-            }
-        }
-    }
-
     fn extract_body(
         context: &Context,
         root: &mut Node,
         config: Option<&ConfigEntry>,
         global_config: &ConfigEntry,
-    ) -> Result<(), FullTextParserError> {
+    ) -> Result<bool, FullTextParserError> {
         let mut found_something = false;

         if let Some(config) = config {
@@ -712,10 +637,9 @@ impl FullTextParser {

         if !found_something {
             log::error!("no body found");
-            return Err(FullTextParserError::Scrape);
         }

-        Ok(())
+        Ok(found_something)
     }

     fn extract_body_single(
@@ -752,7 +676,7 @@ impl FullTextParser {
     ) -> Option<url::Url> {
         if let Some(config) = config {
             if let Some(next_page_xpath) = config.next_page_link.as_deref() {
-                if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href")
+                if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
                 {
                     if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                         return Some(next_page_url);
@@ -760,7 +684,7 @@ impl FullTextParser {
                 }
             }
         } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
-            if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") {
+            if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
                 if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                     return Some(next_page_url);
                 }
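The behavioral core of the commit: extract_body now reports whether any ftr rule matched instead of hard-failing, so the caller can chain the new readability pass behind it. A minimal sketch of that two-stage shape with hypothetical stand-in functions, not the crate's actual signatures:

    // Sketch of the new two-stage flow with hypothetical stand-ins.
    // Note: the committed guard reads `if found_body`; the error message suggests
    // the readability pass is meant for the not-found case, as sketched here.
    fn ftr_pass(_html: &str) -> bool {
        false // config-driven xpath extraction found nothing
    }

    fn readability_pass(_html: &str) -> Result<(), String> {
        Err("no content".into()) // generic heuristics also failed
    }

    fn extract(html: &str) -> Result<(), String> {
        let found_body = ftr_pass(html);
        if !found_body {
            readability_pass(html)
                .map_err(|e| format!("Both ftr and readability failed to find content: {e}"))?;
        }
        Ok(())
    }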
src/full_text_parser/readability/mod.rs (new file, 165 lines)
@@ -0,0 +1,165 @@
mod regex;
mod state;

use libxml::tree::{Document, Node};

use self::state::State;
use super::error::FullTextParserError;

pub struct Readability;

impl Readability {
    pub fn extract_body_readability(
        document: &Document,
        root: &mut Node,
    ) -> Result<bool, FullTextParserError> {
        let mut state = State::default();
        let mut node: Option<Node> = document.clone().get_root_element();

        while let Some(node_ref) = node.as_mut() {
            let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));

            if !Self::is_probably_visible(node_ref) {
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if Self::check_byline(node_ref, &match_string) {
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
                state.should_remove_title_header = false;
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if state.strip_unlikely {

            }

            node = Self::next_node(node_ref, false);
        }

        unimplemented!()
    }

    fn is_probably_visible(node: &Node) -> bool {
        let display_none = node
            .get_attribute("display")
            .map(|display| display == "none")
            .unwrap_or(false);
        let is_hidden = node.has_attribute("hidden");
        let aria_hidden = node
            .get_attribute("aria-hidden")
            .map(|attr| attr == "true")
            .unwrap_or(false);
        let has_fallback_image = node.get_class_names().contains("fallback-image");

        !display_none && !is_hidden && !aria_hidden || has_fallback_image
    }

    fn remove_and_next(node: &mut Node) -> Option<Node> {
        let next_node = Self::next_node(node, true);
        node.unlink();
        return next_node;
    }

    fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
        // First check for kids if those aren't being ignored
        let first_child = node.get_first_child();
        if !ignore_self_and_kids && first_child.is_some() {
            return first_child;
        }

        // Then for siblings...
        let next_sibling = node.get_next_sibling();
        if next_sibling.is_some() {
            return next_sibling;
        }

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        loop {
            let parent = node.get_parent();
            if parent.is_none() {
                break;
            }

            if let Some(parent) = parent {
                let next_sibling = parent.get_next_sibling();
                if next_sibling.is_some() {
                    return next_sibling;
                }
            }
        }

        None
    }

    fn check_byline(node: &Node, matchstring: &str) -> bool {
        let rel = node
            .get_attribute("rel")
            .map(|rel| rel == "author")
            .unwrap_or(false);
        let itemprop = node
            .get_attribute("itemprop")
            .map(|prop| prop.contains("author"))
            .unwrap_or(false);

        let content = node.get_content();
        if rel || itemprop || regex::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content) {
            // FIXME
            true
        } else {
            false
        }
    }

    // Check whether the input string could be a byline.
    // This verifies that the input length is less than 100 chars.
    fn is_valid_byline(line: &str) -> bool {
        let len = line.trim().len();
        len > 0 && len < 100
    }

    // Check if this node is an H1 or H2 element whose content is mostly
    // the same as the article title.
    fn header_duplicates_title(node: &Node) -> bool {
        let name = node.get_name().to_lowercase();
        if name != "h1" || name != "h2" {
            return false;
        }
        let heading = Self::get_inner_text(node, false);
        Self::text_similarity(&heading, "FIXME") > 0.75
    }

    fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
        let content = node.get_content().trim().to_owned();
        if normalize_spaces {
            regex::NORMALIZE.replace(&content, " ").into()
        } else {
            content
        }
    }

    fn text_similarity(a: &str, b: &str) -> f64 {
        let a = a.to_lowercase();
        let b = b.to_lowercase();
        let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
        let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
        if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 {
            return 0.0;
        }

        let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
        let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>();
        let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);

        let distance_b = uniq_tokens_b_total / tokens_b_total;
        1.0 - distance_b
    }
}
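text_similarity is the token-overlap measure from readability.js: both strings are tokenized on non-word characters, and the result is 1 minus the length-weighted share of b's tokens that never appear in a. A worked, hypothetical standalone version of the same computation:

    // Hypothetical standalone version of the same token-overlap measure.
    fn similarity(a: &str, b: &str) -> f64 {
        let (a, b) = (a.to_lowercase(), b.to_lowercase());
        let ta: Vec<&str> = a.split(|c: char| !c.is_alphanumeric()).filter(|t| !t.is_empty()).collect();
        let tb: Vec<&str> = b.split(|c: char| !c.is_alphanumeric()).filter(|t| !t.is_empty()).collect();
        if ta.is_empty() || tb.is_empty() {
            return 0.0;
        }
        // Weight tokens by length; measure how much of b is NOT covered by a.
        let total: f64 = tb.iter().map(|t| t.len() as f64).sum();
        let uniq: f64 = tb.iter().filter(|t| !ta.contains(t)).map(|t| t.len() as f64).sum();
        1.0 - uniq / total
    }

    fn main() {
        // "jumps" (5 of 18 weighted chars) is unique to b: 1.0 - 5/18 ≈ 0.72,
        // just below the 0.75 threshold header_duplicates_title uses.
        let s = similarity("The Quick Brown Fox", "quick brown fox jumps");
        assert!((s - 13.0 / 18.0).abs() < 1e-9);
    }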
src/full_text_parser/readability/regex.rs (new file, 12 lines)
@@ -0,0 +1,12 @@
use once_cell::sync::Lazy;
use regex::Regex;

pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
});
pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")
});
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex")
});
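As with the fingerprint patterns, these are JavaScript regex literals copied verbatim. The Rust regex crate has no /.../ delimiters, the g flag does not exist (replace_all and split already operate on all matches), and i is written as an inline (?i) flag. A sketch of the delimiter-free equivalents, an assumption about intent rather than the committed code:

    use once_cell::sync::Lazy;
    use regex::Regex;

    pub static BYLINE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("BYLINE regex"));
    pub static NORMALIZE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\s{2,}").expect("NORMALIZE regex"));
    pub static TOKENIZE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\W+").expect("TOKENIZE regex"));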
src/full_text_parser/readability/state.rs (new file, 17 lines)
@@ -0,0 +1,17 @@
pub struct State {
    pub strip_unlikely: bool,
    pub weigh_classes: bool,
    pub clean_conditionally: bool,
    pub should_remove_title_header: bool,
}

impl Default for State {
    fn default() -> Self {
        Self {
            strip_unlikely: true,
            weigh_classes: true,
            clean_conditionally: true,
            should_remove_title_header: true,
        }
    }
}
src/images/mod.rs
@@ -36,7 +36,7 @@ impl ImageDownloader {
         doc: &Document,
         client: &Client,
     ) -> Result<String, ImageDownloadError> {
-        let xpath_ctx = Context::new(&doc).map_err(|()| {
+        let xpath_ctx = Context::new(doc).map_err(|()| {
             error!("Failed to create xpath context for document");
             ImageDownloadError::HtmlParse
         })?;
src/util.rs
@@ -145,6 +145,17 @@ impl Util {
         None
     }

+    pub fn get_attribute(
+        context: &Context,
+        xpath: &str,
+        attribute: &str,
+    ) -> Result<String, FullTextParserError> {
+        Util::evaluate_xpath(context, xpath, false)?
+            .iter()
+            .find_map(|node| node.get_attribute(attribute))
+            .ok_or(FullTextParserError::Xml)
+    }
+
     pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         if let Some(val) = node_vec.get(0) {
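get_attribute moves from FullTextParser into Util so the new metadata module can share it: it evaluates an XPath and returns the first matching node's attribute. A usage sketch mirroring the thumbnail lookup above (module paths assumed from the file layout in this commit):

    use libxml::xpath::Context;

    use crate::full_text_parser::error::FullTextParserError;
    use crate::util::Util;

    // Sketch: the same call check_for_thumbnail now makes through the shared helper.
    fn og_image(ctx: &Context) -> Result<String, FullTextParserError> {
        Util::get_attribute(ctx, "//meta[contains(@name, 'og:image')]", "content")
    }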