diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0cbd700..55d3464 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,20 +1,10 @@ -image: rust:latest - stages: - - lint - build run-build: stage: build image: rust:latest - script: - - rustc --version && cargo --version - - cargo build --release --jobs 1 - -run-lint: - stage: lint - image: rust:latest before_script: - rustup component add rustfmt - rustup component add clippy @@ -22,3 +12,4 @@ run-lint: - rustc --version && cargo --version - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings + - cargo build --release --jobs 1 diff --git a/Cargo.toml b/Cargo.toml index 9d2b801..63d842a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,8 +16,9 @@ url = "2.3" regex = "1.7" encoding_rs = "0.8" chrono = "0.4" -base64 = "0.13" +base64 = "0.20" image = "0.24" log = "0.4" rust-embed="6.4" -once_cell = "1.16" \ No newline at end of file +once_cell = "1.16" +escaper = "0.1" \ No newline at end of file diff --git a/src/full_text_parser/fingerprints.rs b/src/full_text_parser/fingerprints.rs index a65db91..4de0f21 100644 --- a/src/full_text_parser/fingerprints.rs +++ b/src/full_text_parser/fingerprints.rs @@ -7,28 +7,26 @@ static FINGERPRINT_REGEXES: Lazy> = Lazy::new(|| { let mut m = HashMap::with_capacity(4); m.insert( "fingerprint.blogspot.com", - regex::Regex::new( + Regex::new( r#"/\\/i"#) + Regex::new(r#"/\\/i"#) .expect("failed to build static regex"), ); m diff --git a/src/full_text_parser/metadata.rs b/src/full_text_parser/metadata.rs new file mode 100644 index 0000000..23120ed --- /dev/null +++ b/src/full_text_parser/metadata.rs @@ -0,0 +1,132 @@ +use chrono::{DateTime, Utc}; +use libxml::xpath::Context; +use log::{debug, warn}; +use std::str::FromStr; +use crate::{article::Article, util::Util}; +use super::config::ConfigEntry; + +pub fn extract( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + article: &mut Article, +) { + + if article.title.is_none() 
{ + article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) { + Ok(escaped_title) => escaped_title, + Err(_error) => title, + })); + } + + if article.author.is_none() { + article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) { + Ok(escaped_author) => escaped_author, + Err(_error) => author, + })); + } + + if article.date.is_none() { + article.date = extract_date(context, config, global_config); + } +} + +fn extract_title( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry +) -> Option { + // check site specific config + if let Some(config) = config { + for xpath_title in &config.xpath_title { + if let Ok(title) = Util::extract_value_merge(context, xpath_title) { + debug!("Article title: '{}'", title); + return Some(title); + } + } + } + + // check global config + for xpath_title in &global_config.xpath_title { + if let Ok(title) = Util::extract_value_merge(context, xpath_title) { + debug!("Article title: '{}'", title); + return Some(title); + } + } + + // generic meta (readablity) + get_meta(context, "dc:title") + .or_else(|| get_meta(context, "dcterm:title")) + .or_else(|| get_meta(context, "og:title")) + .or_else(|| get_meta(context, "weibo:article:title")) + .or_else(|| get_meta(context, "weibo:webpage:title")) + .or_else(|| get_meta(context, "title")) + .or_else(|| get_meta(context, "twitter:title")) +} + +fn extract_author( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry +) -> Option { + // check site specific config + if let Some(config) = config { + for xpath_author in &config.xpath_author { + if let Ok(author) = Util::extract_value(context, xpath_author) { + debug!("Article author: '{}'", author); + return Some(author); + } + } + } + + // check global config + for xpath_author in &global_config.xpath_author { + if let Ok(author) = Util::extract_value(context, 
xpath_author) { + debug!("Article author: '{}'", author); + return Some(author); + } + } + + // generic meta (readablity) + get_meta(context, "dc:creator") + .or_else(|| get_meta(context, "dcterm:creator")) + .or_else(|| get_meta(context, "author")) +} + +fn extract_date( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry +) -> Option> { + // check site specific config + if let Some(config) = config { + for xpath_date in &config.xpath_date { + if let Ok(date_string) = Util::extract_value(context, xpath_date) { + debug!("Article date: '{}'", date_string); + if let Ok(date) = DateTime::from_str(&date_string) { + return Some(date); + } else { + warn!("Parsing the date string '{}' failed", date_string); + } + } + } + } + + // check global config + for xpath_date in &global_config.xpath_date { + if let Ok(date_string) = Util::extract_value(context, xpath_date) { + debug!("Article date: '{}'", date_string); + if let Ok(date) = DateTime::from_str(&date_string) { + return Some(date); + } else { + warn!("Parsing the date string '{}' failed", date_string); + } + } + } + + None +} + +fn get_meta(context: &Context, name: &str) -> Option { + Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok() +} \ No newline at end of file diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 651b711..0d427bf 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -1,15 +1,18 @@ pub mod config; pub mod error; mod fingerprints; +mod readability; +mod metadata; #[cfg(test)] mod tests; use self::config::{ConfigCollection, ConfigEntry}; use self::error::FullTextParserError; +use self::readability::Readability; use crate::article::Article; use crate::util::Util; -use chrono::DateTime; + use encoding_rs::Encoding; use fingerprints::Fingerprints; use libxml::parser::Parser; @@ -19,7 +22,7 @@ use log::{debug, error, info, warn}; use reqwest::header::HeaderMap; use reqwest::Client; use 
std::path::Path; -use std::str::{from_utf8, FromStr}; +use std::str::from_utf8; pub struct FullTextParser { config_files: ConfigCollection, @@ -154,7 +157,7 @@ impl FullTextParser { // parse again with single page url debug!("Single page link found '{}'", single_page_url); - return self + if let Err(error) = self .parse_single_page( article, &single_page_url, @@ -163,16 +166,27 @@ global_config, client, ) - .await; + .await + { + log::warn!("Single Page parsing: {}", error); + log::debug!("Continuing with regular parser."); + } } } - Self::extract_metadata(&xpath_ctx, config, global_config, article); + metadata::extract(&xpath_ctx, config, global_config, article); if article.thumbnail_url.is_none() { Self::check_for_thumbnail(&xpath_ctx, article); } Self::strip_junk(&xpath_ctx, config, global_config, url); - Self::extract_body(&xpath_ctx, root, config, global_config)?; + let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; + + if !found_body { + if let Err(error) = Readability::extract_body_readability(&document, root) { + log::error!("Both ftr and readability failed to find content: {}", error); + return Err(error); + } + } while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { let headers = Util::generate_headers(config, global_config)?; @@ -232,7 +246,7 @@ let html = Self::download(url, client, headers).await?; let document = Self::parse_html(&html, config, global_config)?; let xpath_ctx = Self::get_xpath_ctx(&document)?; - Self::extract_metadata(&xpath_ctx, config, global_config, article); + metadata::extract(&xpath_ctx, config, global_config, article); Self::check_for_thumbnail(&xpath_ctx, article); Self::strip_junk(&xpath_ctx, config, global_config, url); Self::extract_body(&xpath_ctx, root, config, global_config)?; @@ -363,7 +377,7 @@ } fn check_for_thumbnail(context: &Context, article: &mut Article) { - if let Ok(thumb) = 
Self::get_attribute( + if let Ok(thumb) = Util::get_attribute( context, "//meta[contains(@name, 'twitter:image')]", "content", @@ -373,14 +387,14 @@ impl FullTextParser { } if let Ok(thumb) = - Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content") + Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content") { article.thumbnail_url = Some(thumb); return; } if let Ok(thumb) = - Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") + Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { article.thumbnail_url = Some(thumb); } @@ -472,17 +486,6 @@ impl FullTextParser { Ok(()) } - fn get_attribute( - context: &Context, - xpath: &str, - attribute: &str, - ) -> Result { - Util::evaluate_xpath(context, xpath, false)? - .iter() - .find_map(|node| node.get_attribute(attribute)) - .ok_or(FullTextParserError::Xml) - } - fn repair_urls( context: &Context, xpath: &str, @@ -612,90 +615,12 @@ impl FullTextParser { let _ = Util::strip_node(context, "//*[@type='text/css']"); } - fn extract_metadata( - context: &Context, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - article: &mut Article, - ) { - // try to get title - if let Some(config) = config { - for xpath_title in &config.xpath_title { - if let Ok(title) = Util::extract_value_merge(context, xpath_title) { - debug!("Article title: '{}'", title); - article.title = Some(title); - break; - } - } - } - - if article.title.is_none() { - for xpath_title in &global_config.xpath_title { - if let Ok(title) = Util::extract_value_merge(context, xpath_title) { - debug!("Article title: '{}'", title); - article.title = Some(title); - break; - } - } - } - - // try to get the author - if let Some(config) = config { - for xpath_author in &config.xpath_author { - if let Ok(author) = Util::extract_value(context, xpath_author) { - debug!("Article author: '{}'", author); - article.author = Some(author); - break; - } - } - } - - if 
article.author.is_none() { - for xpath_author in &global_config.xpath_author { - if let Ok(author) = Util::extract_value(context, xpath_author) { - debug!("Article author: '{}'", author); - article.author = Some(author); - break; - } - } - } - - // try to get the date - if let Some(config) = config { - for xpath_date in &config.xpath_date { - if let Ok(date_string) = Util::extract_value(context, xpath_date) { - debug!("Article date: '{}'", date_string); - if let Ok(date) = DateTime::from_str(&date_string) { - article.date = Some(date); - break; - } else { - warn!("Parsing the date string '{}' failed", date_string); - } - } - } - } - - if article.date.is_none() { - for xpath_date in &global_config.xpath_date { - if let Ok(date_string) = Util::extract_value(context, xpath_date) { - debug!("Article date: '{}'", date_string); - if let Ok(date) = DateTime::from_str(&date_string) { - article.date = Some(date); - break; - } else { - warn!("Parsing the date string '{}' failed", date_string); - } - } - } - } - } - fn extract_body( context: &Context, root: &mut Node, config: Option<&ConfigEntry>, global_config: &ConfigEntry, - ) -> Result<(), FullTextParserError> { + ) -> Result { let mut found_something = false; if let Some(config) = config { @@ -712,10 +637,9 @@ impl FullTextParser { if !found_something { log::error!("no body found"); - return Err(FullTextParserError::Scrape); } - Ok(()) + Ok(found_something) } fn extract_body_single( @@ -752,7 +676,7 @@ impl FullTextParser { ) -> Option { if let Some(config) = config { if let Some(next_page_xpath) = config.next_page_link.as_deref() { - if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") + if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") { if let Ok(next_page_url) = url::Url::parse(&next_page_string) { return Some(next_page_url); @@ -760,7 +684,7 @@ impl FullTextParser { } } } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() { - 
if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") { + if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") { if let Ok(next_page_url) = url::Url::parse(&next_page_string) { return Some(next_page_url); } diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs new file mode 100644 index 0000000..b34049e --- /dev/null +++ b/src/full_text_parser/readability/mod.rs @@ -0,0 +1,165 @@ +mod regex; +mod state; + +use libxml::tree::{Document, Node}; + +use self::state::State; +use super::error::FullTextParserError; + +pub struct Readability; + +impl Readability { + pub fn extract_body_readability( + document: &Document, + root: &mut Node, + ) -> Result { + let mut state = State::default(); + let mut node: Option = document.clone().get_root_element(); + + while let Some(node_ref) = node.as_mut() { + + let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}")); + + if !Self::is_probably_visible(node_ref) { + node = Self::remove_and_next(node_ref); + continue; + } + + if Self::check_byline(node_ref, &match_string) { + node = Self::remove_and_next(node_ref); + continue; + } + + if state.should_remove_title_header && Self::header_duplicates_title(node_ref) { + state.should_remove_title_header = false; + node = Self::remove_and_next(node_ref); + continue; + } + + if state.strip_unlikely { + + } + + node = Self::next_node(node_ref, false); + } + + unimplemented!() + } + + fn is_probably_visible(node: &Node) -> bool { + let display_none = node + .get_attribute("display") + .map(|display| display == "none") + .unwrap_or(false); + let is_hidden = node.has_attribute("hidden"); + let aria_hidden = node + .get_attribute("aria-hidden") + .map(|attr| attr == "true") + .unwrap_or(false); + let has_fallback_image = node.get_class_names().contains("fallback-image"); + + !display_none && !is_hidden && !aria_hidden || has_fallback_image + } + + fn 
remove_and_next(node: &mut Node) -> Option<Node> { + let next_node = Self::next_node(node, true); + node.unlink(); + return next_node; + } + + fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> { + // First check for kids if those aren't being ignored + let first_child = node.get_first_child(); + if !ignore_self_and_kids && first_child.is_some() { + return first_child; + } + + // Then for siblings... + let next_sibling = node.get_next_sibling(); + if next_sibling.is_some() { + return next_sibling; + } + + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + let mut ancestor = node.clone(); + loop { + let parent = ancestor.get_parent(); + if parent.is_none() { + break; + } + + if let Some(parent) = parent { + let next_sibling = parent.get_next_sibling(); + if next_sibling.is_some() { + return next_sibling; + } + ancestor = parent; + } + } + + None + } + + fn check_byline(node: &Node, matchstring: &str) -> bool { + let rel = node + .get_attribute("rel") + .map(|rel| rel == "author") + .unwrap_or(false); + let itemprop = node + .get_attribute("itemprop") + .map(|prop| prop.contains("author")) + .unwrap_or(false); + + let content = node.get_content(); + if (rel || itemprop || regex::BYLINE.is_match(matchstring)) && Self::is_valid_byline(&content) { + // FIXME + true + } else { + false + } + } + + // Check whether the input string could be a byline. + // This verifies that the input length is less than 100 chars. + fn is_valid_byline(line: &str) -> bool { + let len = line.trim().len(); + len > 0 && len < 100 + } + + // Check if this node is an H1 or H2 element whose content is mostly + // the same as the article title. 
+ fn header_duplicates_title(node: &Node) -> bool { + let name = node.get_name().to_lowercase(); + if name != "h1" && name != "h2" { + return false; + } + let heading = Self::get_inner_text(node, false); + Self::text_similarity(&heading, "FIXME") > 0.75 + } + + fn get_inner_text(node: &Node, normalize_spaces: bool) -> String { + let content = node.get_content().trim().to_owned(); + if normalize_spaces { + regex::NORMALIZE.replace(&content, " ").into() + } else { + content + } + } + + fn text_similarity(a: &str, b: &str) -> f64 { + let a = a.to_lowercase(); + let b = b.to_lowercase(); + let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>(); + let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>(); + if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 { + return 0.0; + } + + let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64); + let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>(); + let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64); + + let distance_b = uniq_tokens_b_total / tokens_b_total; + 1.0 - distance_b + } +} diff --git a/src/full_text_parser/readability/regex.rs b/src/full_text_parser/readability/regex.rs new file mode 100644 index 0000000..455b60a --- /dev/null +++ b/src/full_text_parser/readability/regex.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use regex::Regex; + +pub static BYLINE: Lazy<Regex> = Lazy::new(|| { + Regex::new(r#"(?i)byline|author|dateline|writtenby|p-author"#).expect("BYLINE regex") +}); +pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| { + Regex::new(r#"\s{2,}"#).expect("NORMALIZE regex") +}); +pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| { + Regex::new(r#"\W+"#).expect("TOKENIZE regex") +}); \ No newline at end of file diff --git a/src/full_text_parser/readability/state.rs b/src/full_text_parser/readability/state.rs new file mode 100644 index 0000000..e5de7a9 --- /dev/null +++ 
b/src/full_text_parser/readability/state.rs @@ -0,0 +1,17 @@ +pub struct State { + pub strip_unlikely: bool, + pub weigh_classes: bool, + pub clean_conditionally: bool, + pub should_remove_title_header: bool, +} + +impl Default for State { + fn default() -> Self { + Self { + strip_unlikely: true, + weigh_classes: true, + clean_conditionally: true, + should_remove_title_header: true, + } + } +} \ No newline at end of file diff --git a/src/images/mod.rs b/src/images/mod.rs index 526c0d4..9ab8131 100644 --- a/src/images/mod.rs +++ b/src/images/mod.rs @@ -36,7 +36,7 @@ impl ImageDownloader { doc: &Document, client: &Client, ) -> Result { - let xpath_ctx = Context::new(&doc).map_err(|()| { + let xpath_ctx = Context::new(doc).map_err(|()| { error!("Failed to create xpath context for document"); ImageDownloadError::HtmlParse })?; diff --git a/src/util.rs b/src/util.rs index 2e72486..c80a29f 100644 --- a/src/util.rs +++ b/src/util.rs @@ -145,6 +145,17 @@ impl Util { None } + pub fn get_attribute( + context: &Context, + xpath: &str, + attribute: &str, + ) -> Result { + Util::evaluate_xpath(context, xpath, false)? + .iter() + .find_map(|node| node.get_attribute(attribute)) + .ok_or(FullTextParserError::Xml) + } + pub fn extract_value(context: &Context, xpath: &str) -> Result { let node_vec = Util::evaluate_xpath(context, xpath, false)?; if let Some(val) = node_vec.get(0) {