diff --git a/src/full_text_parser/readability/constants.rs b/src/constants.rs
similarity index 82%
rename from src/full_text_parser/readability/constants.rs
rename to src/constants.rs
index 1d1ded8..d18338c 100644
--- a/src/full_text_parser/readability/constants.rs
+++ b/src/constants.rs
@@ -11,7 +11,7 @@ pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
 });
 pub static NORMALIZE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex"));
+pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
 pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
 });
@@ -31,6 +31,14 @@ pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
 pub static NEGATIVE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
+pub static TITLE_SEPARATOR: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex"));
+pub static TITLE_CUT_END: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
+pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
+pub static TITLE_CUT_FRONT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"(?i)[^-|\\/>»]*[-|\\/>»](.*)"#).expect("TITLE_CUT_FRONT regex"));
+
 pub const SCORE_ATTR: &str = "content_score";
 pub const MINIMUM_TOPCANDIDATES: usize = 3;
 pub const UNLIKELY_ROLES: &[&str] = &[
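For illustration, a minimal standalone sketch of how the new TITLE_* constants are meant to interact. Local `Regex` definitions and the helper `clean_title` are hypothetical stand-ins for the crate's `constants` module and the `.map` closure in metadata.rs below; trimming is added here for tidy output, while the patch itself keeps the raw capture:

```rust
use regex::Regex;

// Sketch of the title-cleaning flow introduced by this patch.
fn clean_title(title: &str) -> String {
    let separator = Regex::new(r"[-|\\/>»]").unwrap(); // TITLE_SEPARATOR
    let cut_end = Regex::new(r"(.*)[-|\\/>»] .*").unwrap(); // TITLE_CUT_END
    let cut_front = Regex::new(r"(?i)[^-|\\/>»]*[-|\\/>»](.*)").unwrap(); // TITLE_CUT_FRONT
    let word_count = Regex::new(r"\s+").unwrap(); // WORD_COUNT

    if !separator.is_match(title) {
        return title.to_string();
    }

    // Prefer dropping the trailing "- Site Name" segment; for very short
    // titles, drop the leading segment instead (word count is taken from
    // the original title, as in the patch).
    if word_count.split(title).count() < 3 {
        cut_front.replace(title, "$1").trim().to_string()
    } else {
        cut_end.replace(title, "$1").trim().to_string()
    }
}

fn main() {
    assert_eq!(
        clean_title("Rust 1.70 released - The Rust Blog"),
        "Rust 1.70 released"
    );
}
```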
diff --git a/src/full_text_parser/metadata.rs b/src/full_text_parser/metadata.rs
index 68b9d4a..dd859a4 100644
--- a/src/full_text_parser/metadata.rs
+++ b/src/full_text_parser/metadata.rs
@@ -1,5 +1,5 @@
 use super::config::ConfigEntry;
-use crate::{article::Article, util::Util};
+use crate::{article::Article, constants, util::Util};
 use chrono::{DateTime, Utc};
 use libxml::xpath::Context;
 use log::{debug, warn};
@@ -8,16 +8,29 @@ use std::str::FromStr;
 pub fn extract(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
     article: &mut Article,
 ) {
     if article.title.is_none() {
-        article.title = extract_title(context, config, global_config).map(|title| {
-            match escaper::decode_html(&title) {
+        article.title = extract_title(context, config, global_config)
+            .map(|title| match escaper::decode_html(&title) {
                 Ok(escaped_title) => escaped_title,
                 Err(_error) => title,
-            }
-        });
+            })
+            .map(|title| {
+                // clean titles that contain separators
+                if constants::TITLE_SEPARATOR.is_match(&title) {
+                    let new_title = constants::TITLE_CUT_END.replace(&title, "$1");
+                    let word_count = constants::WORD_COUNT.split(&title).count();
+                    if word_count < 3 {
+                        constants::TITLE_CUT_FRONT.replace(&title, "$1").to_string()
+                    } else {
+                        new_title.to_string()
+                    }
+                } else {
+                    title
+                }
+            });
     }

     if article.author.is_none() {
@@ -38,7 +51,7 @@
 fn extract_title(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -51,27 +64,30 @@
     }

     // check global config
-    for xpath_title in &global_config.xpath_title {
-        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-            debug!("Article title: '{}'", title);
-            return Some(title);
+    if let Some(global_config) = global_config {
+        for xpath_title in &global_config.xpath_title {
+            if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
+                debug!("Article title: '{}'", title);
+                return Some(title);
+            }
         }
     }

     // generic meta (readability)
-    get_meta(context, "dc:title")
+    Util::extract_value(context, "//title")
+        .ok()
+        .or_else(|| get_meta(context, "dc:title"))
         .or_else(|| get_meta(context, "dcterm:title"))
         .or_else(|| get_meta(context, "og:title"))
         .or_else(|| get_meta(context, "weibo:article:title"))
         .or_else(|| get_meta(context, "weibo:webpage:title"))
-        .or_else(|| get_meta(context, "title"))
         .or_else(|| get_meta(context, "twitter:title"))
 }

 fn extract_author(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -84,23 +100,26 @@
     }

     // check global config
-    for xpath_author in &global_config.xpath_author {
-        if let Ok(author) = Util::extract_value(context, xpath_author) {
-            debug!("Article author: '{}'", author);
-            return Some(author);
+    if let Some(global_config) = global_config {
+        for xpath_author in &global_config.xpath_author {
+            if let Ok(author) = Util::extract_value(context, xpath_author) {
+                debug!("Article author: '{}'", author);
+                return Some(author);
+            }
         }
     }

     // generic meta (readability)
-    get_meta(context, "dc:creator")
+    Util::extract_value(context, "//author")
+        .ok()
+        .or_else(|| get_meta(context, "dc:creator"))
         .or_else(|| get_meta(context, "dcterm:creator"))
-        .or_else(|| get_meta(context, "author"))
 }

 fn extract_date(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<DateTime<Utc>> {
     // check site specific config
     if let Some(config) = config {
@@ -117,13 +136,15 @@
     }

     // check global config
-    for xpath_date in &global_config.xpath_date {
-        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-            debug!("Article date: '{}'", date_string);
-            if let Ok(date) = DateTime::from_str(&date_string) {
-                return Some(date);
-            } else {
-                warn!("Parsing the date string '{}' failed", date_string);
+    if let Some(global_config) = global_config {
+        for xpath_date in &global_config.xpath_date {
+            if let Ok(date_string) = Util::extract_value(context, xpath_date) {
+                debug!("Article date: '{}'", date_string);
+                if let Ok(date) = DateTime::from_str(&date_string) {
+                    return Some(date);
+                } else {
+                    warn!("Parsing the date string '{}' failed", date_string);
+                }
             }
         }
     }
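The rewritten fallback chains rely on `Result::ok()` feeding `Option::or_else`, which evaluates lazily: later sources are only queried when earlier ones miss. A self-contained sketch with hypothetical stand-ins for `Util::extract_value` and `get_meta` (the real functions query the document via XPath):

```rust
// Hypothetical stand-ins for the XPath-backed lookups.
fn title_element() -> Result<String, ()> {
    Err(()) // pretend the page has no usable <title> element
}

fn meta(name: &str) -> Option<String> {
    (name == "og:title").then(|| "Example title".to_string())
}

fn main() {
    // Mirrors the shape of the chain in extract_title: the <title> element
    // wins if present, otherwise the meta tags are tried in order.
    let title = title_element()
        .ok()
        .or_else(|| meta("dc:title"))
        .or_else(|| meta("og:title"))
        .or_else(|| meta("twitter:title"));
    assert_eq!(title.as_deref(), Some("Example title"));
}
```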
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
index 2d5d0a7..3f69442 100644
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@@ -174,7 +174,7 @@ impl FullTextParser {
             }
         }

-        metadata::extract(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, Some(global_config), article);
         if article.thumbnail_url.is_none() {
             Self::check_for_thumbnail(&xpath_ctx, article);
         }
@@ -182,7 +182,8 @@
         let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
         if !found_body {
-            if let Err(error) = Readability::extract_body(document, root) {
+            if let Err(error) = Readability::extract_body(document, root, article.title.as_deref())
+            {
                 log::error!("Both ftr and readability failed to find content: {}", error);
                 return Err(error);
             }
         }
@@ -246,7 +247,7 @@
         let html = Self::download(url, client, headers).await?;
         let document = Self::parse_html(&html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
-        metadata::extract(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, Some(global_config), article);
         Self::check_for_thumbnail(&xpath_ctx, article);
         Self::strip_junk(&xpath_ctx, config, global_config, url);
         Self::extract_body(&xpath_ctx, root, config, global_config)?;
diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs
index 00a6f2d..de049f0 100644
--- a/src/full_text_parser/readability/mod.rs
+++ b/src/full_text_parser/readability/mod.rs
@@ -1,4 +1,3 @@
-mod constants;
 mod state;

 #[cfg(test)]
@@ -10,11 +9,16 @@ use libxml::tree::{node, Document, Node, NodeType};

 use self::state::State;
 use super::error::FullTextParserError;
+use crate::constants;

 pub struct Readability;

 impl Readability {
-    pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
+    pub fn extract_body(
+        document: Document,
+        root: &mut Node,
+        title: Option<&str>,
+    ) -> Result<bool, FullTextParserError> {
         node::set_node_rc_guard(6);

         let mut state = State::default();
@@ -49,7 +53,9 @@
                     continue;
                 }

-                if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
+                if state.should_remove_title_header
+                    && Self::header_duplicates_title(node_ref, title)
+                {
                     state.should_remove_title_header = false;
                     node = Self::remove_and_next(node_ref);
                     continue;
                 }
@@ -278,7 +284,8 @@
                         constants::MINIMUM_TOPCANDIDATES,
                     );
                     for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
-                        lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
+                        lists_containing_this_ancestor +=
+                            if ancestor == parent { 1 } else { 0 };
                     }

                     if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
@@ -668,13 +675,18 @@

     // Check if this node is an H1 or H2 element whose content is mostly
     // the same as the article title.
-    fn header_duplicates_title(node: &Node) -> bool {
+    fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
         let name = node.get_name().to_lowercase();
         if name != "h1" && name != "h2" {
             return false;
         }
         let heading = Self::get_inner_text(node, false);
-        Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
+
+        if let Some(title) = title {
+            Self::text_similarity(&heading, title) > 0.75
+        } else {
+            false
+        }
     }

     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
@@ -695,18 +707,12 @@
             return 0.0;
         }

-        let tokens_b_total: f64 = tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let tokens_b_total = tokens_b.join(" ").len() as f64;
         let uniq_tokens_b = tokens_b
             .into_iter()
             .filter(|token| !tokens_a.iter().any(|t| t == token))
             .collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
         let distance_b = uniq_tokens_b_total / tokens_b_total;

         1.0 - distance_b
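After this change `text_similarity` sums token lengths via `join(" ")`, so the single spaces between tokens now count toward both totals; the result is still a rough measure of how much of the second string is not covered by the first. A simplified standalone sketch (the crate tokenizes with its TOKENIZE regex; this version uses a plain split):

```rust
// Simplified version of Readability::text_similarity after the patch.
fn text_similarity(a: &str, b: &str) -> f64 {
    let tokenize = |s: &str| -> Vec<String> {
        s.split(|c: char| !c.is_alphanumeric())
            .filter(|t| !t.is_empty())
            .map(str::to_string)
            .collect()
    };
    let tokens_a = tokenize(a);
    let tokens_b = tokenize(b);
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    let tokens_b_total = tokens_b.join(" ").len() as f64;
    let uniq_tokens_b: Vec<String> = tokens_b
        .into_iter()
        .filter(|token| !tokens_a.contains(token))
        .collect();
    let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;

    // Share of b (by character length) that also appears in a.
    1.0 - uniq_tokens_b_total / tokens_b_total
}

fn main() {
    let heading = "My Great Article Today";
    let title = "My Great Article Today - News";
    // The heading covers most of the title, so it counts as a duplicate.
    assert!(text_similarity(heading, title) > 0.75);
}
```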
diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs
index 07bde03..e8547fb 100644
--- a/src/full_text_parser/readability/tests.rs
+++ b/src/full_text_parser/readability/tests.rs
@@ -1,17 +1,30 @@
-use libxml::tree::{Document, Node};
+use libxml::{
+    tree::{Document, Node},
+    xpath::Context,
+};
 use reqwest::Url;

-use crate::full_text_parser::config::ConfigEntry;
+use crate::{
+    article::Article,
+    full_text_parser::{config::ConfigEntry, metadata},
+};

-async fn prepare(html: &str, url: &Url) -> Document {
+async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
     let empty_config = ConfigEntry::default();
     let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
     let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
     crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
-    document
+    let article = Article {
+        title: None,
+        author: None,
+        url: url.clone(),
+        date: None,
+        thumbnail_url: None,
+        document: None,
+    };
+    (document, xpath_ctx, article)
 }

-
 #[tokio::test]
 async fn test_1() {
     let _ = env_logger::builder().is_test(true).try_init();
@@ -19,9 +32,11 @@
     let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
         .expect("Failed to read HTML");
     let url = Url::parse("http://google.com").unwrap();
-    let document = prepare(&html, &url).await;
+    let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
     let mut root = Node::new("article", None, &document).unwrap();
-    super::Readability::extract_body(document, &mut root).unwrap();
+    metadata::extract(&xpath_ctx, None, None, &mut article);
+
+    super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
 }
diff --git a/src/images/mod.rs b/src/images/mod.rs
index b1b9a41..96acd4f 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -145,7 +145,8 @@ impl ImageDownloader {
         }

         let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
-        let big_image_base64 = big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
+        let big_image_base64 =
+            big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
         let small_image_string =
             format!("data:{};base64,{}", content_type_small, small_image_base64);
         let big_image_string = match big_image_base64 {
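The images/mod.rs hunk is a pure reflow, but for reference this is the data-URI shape being assembled; a minimal sketch assuming the base64 0.21 `Engine` API that the diff already uses (`image_bytes` and the `image/png` content type are placeholder values):

```rust
use base64::Engine;

fn main() {
    let image_bytes = b"not really a png";
    // Same encode call as in ImageDownloader, followed by the format! above.
    let encoded = base64::engine::general_purpose::STANDARD.encode(image_bytes);
    let data_uri = format!("data:{};base64,{}", "image/png", encoded);
    assert!(data_uri.starts_with("data:image/png;base64,"));
}
```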
diff --git a/src/lib.rs b/src/lib.rs
index 08f6e76..76244d9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 mod article;
+mod constants;
 mod error;
 mod full_text_parser;
 pub mod images;
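Throughout the patch, call sites pass the already-extracted title via `Option::as_deref`, which borrows an `Option<String>` as `Option<&str>` without moving it; a minimal illustration:

```rust
fn main() {
    let title: Option<String> = Some("Article title".to_string());
    // Borrow as Option<&str>, as done at the Readability::extract_body call sites.
    let borrowed: Option<&str> = title.as_deref();
    assert_eq!(borrowed, Some("Article title"));
    // `title` remains usable afterwards because as_deref only borrows.
    assert!(title.is_some());
}
```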