diff --git a/src/full_text_parser/metadata.rs b/src/full_text_parser/metadata.rs
index 23120ed..68b9d4a 100644
--- a/src/full_text_parser/metadata.rs
+++ b/src/full_text_parser/metadata.rs
@@ -1,9 +1,9 @@
+use super::config::ConfigEntry;
+use crate::{article::Article, util::Util};
 use chrono::{DateTime, Utc};
 use libxml::xpath::Context;
 use log::{debug, warn};
 use std::str::FromStr;
-use crate::{article::Article, util::Util};
-use super::config::ConfigEntry;
 
 pub fn extract(
     context: &Context,
@@ -11,19 +11,23 @@ pub fn extract(
     global_config: &ConfigEntry,
     article: &mut Article,
 ) {
-
     if article.title.is_none() {
-        article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) {
-            Ok(escaped_title) => escaped_title,
-            Err(_error) => title,
-        }));
+        article.title = extract_title(context, config, global_config).map(|title| {
+            match escaper::decode_html(&title) {
+                Ok(escaped_title) => escaped_title,
+                Err(_error) => title,
+            }
+        });
     }
 
     if article.author.is_none() {
-        article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) {
-            Ok(escaped_author) => escaped_author,
-            Err(_error) => author,
-        }));
+        article.author =
+            extract_author(context, config, global_config).map(
+                |author| match escaper::decode_html(&author) {
+                    Ok(escaped_author) => escaped_author,
+                    Err(_error) => author,
+                },
+            );
     }
 
     if article.date.is_none() {
@@ -34,7 +38,7 @@ pub fn extract(
 fn extract_title(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -67,7 +71,7 @@ fn extract_title(
 fn extract_author(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -96,7 +100,7 @@ fn extract_author(
 fn extract_date(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<DateTime<Utc>> {
     // check site specific config
     if let Some(config) = config {
@@ -128,5 +132,10 @@ fn extract_date(
 }
 
 fn get_meta(context: &Context, name: &str) -> Option<String> {
-    Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok()
-}
\ No newline at end of file
+    Util::get_attribute(
+        context,
+        &format!("//meta[contains(@name, '{}')]", name),
+        "content",
+    )
+    .ok()
+}
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
index 0d427bf..6ffbf1a 100644
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@@ -1,8 +1,8 @@
 pub mod config;
 pub mod error;
 mod fingerprints;
-mod readability;
 mod metadata;
+mod readability;
 
 #[cfg(test)]
 mod tests;
@@ -387,14 +387,14 @@ impl FullTextParser {
         }
 
         if let Ok(thumb) =
-        Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
+            Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
         {
             article.thumbnail_url = Some(thumb);
             return;
         }
 
         if let Ok(thumb) =
-        Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
+            Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
         {
             article.thumbnail_url = Some(thumb);
         }
diff --git a/src/full_text_parser/readability/constants.rs b/src/full_text_parser/readability/constants.rs
new file mode 100644
index 0000000..557932c
--- /dev/null
+++ b/src/full_text_parser/readability/constants.rs
@@ -0,0 +1,55 @@
+//! Regular expressions and tag lists shared by the readability port.
+//!
+//! The patterns are written in `regex`-crate syntax: there are no `/.../`
+//! delimiters or trailing flags, and case-insensitivity is spelled `(?i)`.
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+/// Class/id fragments that mark a node as a byline (case-insensitive).
+pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"(?i)byline|author|dateline|writtenby|p-author"#).expect("BYLINE regex")
+});
+/// Runs of two or more whitespace characters.
+pub static NORMALIZE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"\s{2,}"#).expect("NORMALIZE regex"));
+/// Runs of non-word characters, used to split text into tokens.
+pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
+/// Class/id fragments of nodes that are unlikely to hold article content (case-insensitive).
+pub static UNLIKELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"#).expect("UNLIKELY_CANDIDATES regex")
+});
+/// Class/id fragments that rescue a node matched by [`UNLIKELY_CANDIDATES`] (case-insensitive).
+pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"(?i)and|article|body|column|content|main|shadow"#)
+        .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
+});
+/// Text that ends in a non-whitespace character.
+pub static HAS_CONTENT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"\S$"#).expect("HAS_CONTENT regex"));
+/// `href` values that point at an in-page anchor (`#...`).
+pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
+
+/// ARIA roles whose nodes are dropped while unlikely candidates are being stripped.
+pub const UNLIKELY_ROLES: &[&str] = &[
+    "menu",
+    "menubar",
+    "complementary",
+    "navigation",
+    "alert",
+    "alertdialog",
+    "dialog",
+];
+
+/// Upper-cased tag names of the elements collected for scoring.
+pub const DEFAULT_TAGS_TO_SCORE: &[&str] =
+    &["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"];
+
+/// Upper-cased tag names treated as phrasing (inline) content.
+pub const PHRASING_ELEMS: &[&str] = &[
+    // "CANVAS", "IFRAME", "SVG", "VIDEO",
+    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
+    "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT",
+    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
+    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
+];
diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs
index b34049e..7a9465a 100644
--- a/src/full_text_parser/readability/mod.rs
+++ b/src/full_text_parser/readability/mod.rs
@@ -1,7 +1,7 @@
-mod regex;
+mod constants;
 mod state;
 
-use libxml::tree::{Document, Node};
+use libxml::tree::{Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
@@ -11,21 +11,29 @@ pub struct Readability;
 impl Readability {
     pub fn extract_body_readability(
         document: &Document,
-        root: &mut Node,
+        _root: &mut Node,
     ) -> Result<bool, FullTextParserError> {
         let mut state = State::default();
+        let mut elements_to_score = Vec::new();
         let mut node: Option<Node> = document.clone().get_root_element();
 
         while let Some(node_ref) = node.as_mut() {
-
-            let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));
+            let tag_name = node_ref.get_name().to_uppercase();
+            let match_string = node_ref
+                .get_class_names()
+                .iter()
+                .fold(String::new(), |a, b| format!("{a} {b}"));
+            let match_string = match node_ref.get_property("id") {
+                Some(id) => format!("{match_string} {id}"),
+                None => match_string,
+            };
 
             if !Self::is_probably_visible(node_ref) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
 
-            if Self::check_byline(node_ref, &match_string) {
+            if Self::check_byline(node_ref, &match_string, &mut state) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
@@ -36,8 +44,79 @@ impl Readability {
                 continue;
             }
 
+            // Remove unlikely candidates
             if state.strip_unlikely {
-                
+                if constants::UNLIKELY_CANDIDATES.is_match(&match_string)
+                    && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
+                    && !Self::has_ancestor_tag(node_ref, "table", None)
+                    && !Self::has_ancestor_tag(node_ref, "code", None)
+                    && tag_name != "BODY"
+                    && tag_name != "A"
+                {
+                    node = Self::remove_and_next(node_ref);
+                    continue;
+                }
+
+                if let Some(role) = node_ref.get_attribute("role") {
+                    if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
+                        node = Self::remove_and_next(node_ref);
+                        continue;
+                    }
+                }
+            }
+
+            // Remove DIV, SECTION, HEADER and heading nodes without any content
+            // (e.g. text, image, video, or iframe).
+            if matches!(
+                tag_name.as_str(),
+                "DIV" | "SECTION" | "HEADER" | "H1" | "H2" | "H3" | "H4" | "H5" | "H6"
+            ) && Self::is_element_without_content(node_ref)
+            {
+                node = Self::remove_and_next(node_ref);
+                continue;
+            }
+
+            if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
+                elements_to_score.push(node_ref.clone());
+            }
+
+            // Turn all divs that don't have children block level elements into p's
+            if tag_name == "DIV" {
+                // Put phrasing content into paragraphs.
+                let mut p: Option<Node> = None;
+                for mut child_node in node_ref.get_child_nodes().into_iter() {
+                    if Self::is_phrasing_content(&child_node) {
+                        if let Some(p) = p.as_mut() {
+                            let _ = p.add_child(&mut child_node);
+                        } else if !Self::is_whitespace(&child_node) {
+                            let mut new_node = Node::new("p", None, document).unwrap();
+                            node_ref
+                                .replace_child_node(new_node.clone(), child_node.clone())
+                                .unwrap();
+                            new_node.add_child(&mut child_node).unwrap();
+                            p.replace(new_node);
+                        }
+                    } else if let Some(paragraph) = p.take() {
+                        // A block-level child ends the current paragraph: strip only its
+                        // trailing whitespace; `take` makes later siblings start a new one.
+                        for mut r_node in paragraph.get_child_nodes().into_iter().rev() {
+                            if !Self::is_whitespace(&r_node) {
+                                break;
+                            }
+                            r_node.unlink();
+                        }
+                    }
+                }
+
+                // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+                // element. DIVs with only a P element inside and no text content can be
+                // safely converted into plain P elements to avoid confusing the scoring
+                // algorithm with DIVs with are, in practice, paragraphs.
+                if Self::has_single_tag_inside_element(node_ref, "P")
+                    && Self::get_link_density(node_ref) < 0.25
+                {
+                    // TODO: hoist the single P child in place of this DIV and score it.
+                }
             }
 
             node = Self::next_node(node_ref, false);
@@ -61,10 +140,19 @@ impl Readability {
         !display_none && !is_hidden && !aria_hidden || has_fallback_image
     }
 
+    /// Returns `true` for whitespace-only text nodes and for `<br>` elements.
+    fn is_whitespace(node: &Node) -> bool {
+        match node.get_type() {
+            Some(NodeType::TextNode) => node.get_content().trim().is_empty(),
+            Some(NodeType::ElementNode) => node.get_name().to_uppercase() == "BR",
+            _ => false,
+        }
+    }
+
     fn remove_and_next(node: &mut Node) -> Option<Node> {
         let next_node = Self::next_node(node, true);
         node.unlink();
-        return next_node;
+        next_node
     }
 
     fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
@@ -100,7 +188,13 @@ impl Readability {
         None
     }
 
-    fn check_byline(node: &Node, matchstring: &str) -> bool {
+    /// Returns `true` if `node` looks like a byline, storing its trimmed text in
+    /// `state.byline`. Once a byline has been stored every later node is rejected.
+    fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
+        if state.byline.is_some() {
+            return false;
+        }
+
         let rel = node
             .get_attribute("rel")
             .map(|rel| rel == "author")
@@ -111,8 +205,10 @@ impl Readability {
             .unwrap_or(false);
 
         let content = node.get_content();
-        if rel || itemprop || regex::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content) {
-            // FIXME
+        if (rel || itemprop || constants::BYLINE.is_match(matchstring))
+            && Self::is_valid_byline(&content)
+        {
+            state.byline = Some(content.trim().into());
             true
         } else {
             false
@@ -140,7 +236,7 @@ impl Readability {
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
         let content = node.get_content().trim().to_owned();
         if normalize_spaces {
-            regex::NORMALIZE.replace(&content, " ").into()
+            constants::NORMALIZE.replace_all(&content, " ").into()
         } else {
             content
         }
@@ -149,17 +245,162 @@ impl Readability {
     fn text_similarity(a: &str, b: &str) -> f64 {
         let a = a.to_lowercase();
         let b = b.to_lowercase();
-        let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
-        let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
-        if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 {
+        // `split` yields empty tokens around leading/trailing separators; drop them so
+        // that separator-only input counts as "no tokens" instead of dividing by zero.
+        let tokens_a = constants::TOKENIZE
+            .split(&a)
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE
+            .split(&b)
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>();
+        if tokens_a.is_empty() || tokens_b.is_empty() {
             return 0.0;
         }
 
-        let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
-        let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
-        
+        let tokens_b_total: f64 = tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b = tokens_b
+            .into_iter()
+            .filter(|token| !tokens_a.iter().any(|t| t == token))
+            .collect::<Vec<_>>();
+        let uniq_tokens_b_total: f64 = uniq_tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
+
         let distance_b = uniq_tokens_b_total / tokens_b_total;
         1.0 - distance_b
     }
+
+    /// Returns `true` if one of the nearest ancestors of `node` is a `tag_name` element.
+    ///
+    /// Tag names are compared case-insensitively. The walk starts at the parent and gives up
+    /// once `depth` exceeds `max_depth`; `None` selects the default depth of 3.
+    fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
+        let max_depth = max_depth.unwrap_or(3);
+        let tag_name = tag_name.to_uppercase();
+        let mut depth = 0;
+        let mut node = node.get_parent();
+
+        loop {
+            if depth > max_depth {
+                return false;
+            }
+
+            let tmp_node = match node {
+                Some(node) => node,
+                None => return false,
+            };
+
+            // `tag_name` was upper-cased above, so upper-case the node name as well.
+            if tmp_node.get_name().to_uppercase() == tag_name {
+                return true;
+            }
+
+            node = tmp_node.get_parent();
+            depth += 1;
+        }
+    }
+
+    /// Returns `true` if `node` has exactly one element child, that child is a `tag`
+    /// element (case-insensitive), and no direct text child of `node` has real content.
+    fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
+        // There should be exactly 1 element child with given tag
+        let children = node.get_child_elements();
+        let is_single_tag = match children.as_slice() {
+            [only_child] => only_child.get_name().eq_ignore_ascii_case(tag),
+            _ => false,
+        };
+        if !is_single_tag {
+            return false;
+        }
+
+        // And there should be no text nodes with real content
+        !node.get_child_nodes().iter().any(|n| {
+            n.get_type() == Some(NodeType::TextNode)
+                && constants::HAS_CONTENT.is_match(&n.get_content())
+        })
+    }
+
+    /// Returns `true` for an element node without text content that has either no
+    /// element children or exactly as many as it has `<br>` and `<hr>` descendants.
+    fn is_element_without_content(node: &Node) -> bool {
+        // Count element children only: whitespace text nodes between `<br>`s must not
+        // make an otherwise empty element look populated.
+        let len = node.get_child_elements().len();
+
+        node.get_type() == Some(NodeType::ElementNode)
+            && node.get_content().trim().is_empty()
+            && (len == 0
+                || len
+                    == Self::get_elements_by_tag_name(node, "br").len()
+                        + Self::get_elements_by_tag_name(node, "hr").len())
+    }
+
+    /// Collects, in document order, every descendant element of `node` whose tag
+    /// matches `tag` (case-insensitive); `"*"` matches all elements.
+    fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
+        let tag = tag.to_uppercase();
+        let all_tags = tag == "*";
+        let mut vec = Vec::new();
+
+        fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
+            for child in node.get_child_elements() {
+                if all_tags || child.get_name().to_uppercase() == tag {
+                    vec.push(child.clone());
+                }
+                // Descend into the child, not `node` again, or the walk never terminates.
+                get_elems(&child, tag, vec, all_tags);
+            }
+        }
+
+        get_elems(node, &tag, &mut vec, all_tags);
+        vec
+    }
+
+    /// Returns `true` for text nodes, phrasing elements, and `A`/`DEL`/`INS` wrappers
+    /// whose children are all phrasing content themselves.
+    fn is_phrasing_content(node: &Node) -> bool {
+        let tag_name = node.get_name().to_uppercase();
+        let is_text_node = node.get_type() == Some(NodeType::TextNode);
+        let is_transparent = matches!(tag_name.as_str(), "A" | "DEL" | "INS");
+
+        is_text_node
+            || constants::PHRASING_ELEMS.contains(&tag_name.as_str())
+            || (is_transparent
+                && node
+                    .get_child_nodes()
+                    .iter()
+                    .all(Self::is_phrasing_content))
+    }
+
+    /// Returns the ratio of text length inside `<a>` descendants to `node`'s text length.
+    ///
+    /// In-page anchors (`href="#..."`) are weighted at 0.3, every other link at 1.0.
+    fn get_link_density(node: &Node) -> f64 {
+        let text_length = Self::get_inner_text(node, false).len();
+        if text_length == 0 {
+            return 0.0;
+        }
+
+        // XXX implement _reduceNodeList?
+        let link_length: f64 = Self::get_elements_by_tag_name(node, "A")
+            .iter()
+            .map(|link_node| {
+                // Links without an `href` still count in full; only in-page
+                // anchors are discounted.
+                let coefficient = match link_node.get_attribute("href") {
+                    Some(href) if constants::HASH_URL.is_match(&href) => 0.3,
+                    _ => 1.0,
+                };
+                Self::get_inner_text(link_node, false).len() as f64 * coefficient
+            })
+            .sum();
+
+        link_length / text_length as f64
+    }
 }
diff --git a/src/full_text_parser/readability/regex.rs b/src/full_text_parser/readability/regex.rs
deleted file mode 100644
index 455b60a..0000000
--- a/src/full_text_parser/readability/regex.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-use once_cell::sync::Lazy;
-use regex::Regex;
-
-pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
-});
-pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")
-});
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex")
-});
\ No newline at end of file
diff --git a/src/full_text_parser/readability/state.rs b/src/full_text_parser/readability/state.rs
index e5de7a9..1683058 100644
--- a/src/full_text_parser/readability/state.rs
+++ b/src/full_text_parser/readability/state.rs
@@ -3,6 +3,8 @@ pub struct State {
     pub weigh_classes: bool,
     pub clean_conditionally: bool,
     pub should_remove_title_header: bool,
+    /// Trimmed text of the first node recognised as a byline, if any.
+    pub byline: Option<String>,
 }
 
 impl Default for State {
@@ -12,6 +14,7 @@ impl Default for State {
             weigh_classes: true,
             clean_conditionally: true,
             should_remove_title_header: true,
+            byline: None,
         }
     }
-}
\ No newline at end of file
+}