use libxml::{ tree::{Node, NodeType}, xpath::Context, }; use reqwest::{ header::{HeaderMap, HeaderName, HeaderValue}, Response, }; use tokio::fs::DirEntry; use crate::{ constants, full_text_parser::{config::ConfigEntry, error::FullTextParserError}, }; pub struct Util; impl Util { pub fn check_extension(path: &DirEntry, extension: &str) -> bool { if let Some(ext) = path.path().extension() { ext.to_str() == Some(extension) } else { false } } pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str { let value = &line[identifier.len()..]; let value = value.trim(); match value.find('#') { Some(pos) => &value[..pos], None => value, } } pub fn split_values(values: &str) -> Vec<&str> { values.split('|').map(|s| s.trim()).collect() } pub fn select_rule<'a>( site_specific_rule: Option<&'a str>, global_rule: Option<&'a str>, ) -> Option<&'a str> { if site_specific_rule.is_some() { site_specific_rule } else { global_rule } } pub fn generate_headers( site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry, ) -> Result { let mut headers = HeaderMap::new(); if let Some(config) = site_specific_rule { for header in &config.header { let name = HeaderName::from_bytes(header.name.as_bytes()) .map_err(|_| FullTextParserError::Config)?; let value = header .value .parse::() .map_err(|_| FullTextParserError::Config)?; headers.insert(name, value); } } for header in &global_rule.header { let name = HeaderName::from_bytes(header.name.as_bytes()) .map_err(|_| FullTextParserError::Config)?; let value = header .value .parse::() .map_err(|_| FullTextParserError::Config)?; headers.insert(name, value); } Ok(headers) } pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option { let res = Self::evaluate_xpath(xpath_ctx, xpath_page_link, false).ok()?; let mut url = None; for node in res { let content = node.get_content(); let url_str = if content.trim().is_empty() && node.has_attribute("href") { node.get_attribute("href") .expect("already checked for href") } else { content }; if let Ok(parsed_url) = url::Url::parse(&url_str) { url = Some(parsed_url); break; } } url } pub fn evaluate_xpath( xpath_ctx: &Context, xpath: &str, thorw_if_empty: bool, ) -> Result, FullTextParserError> { let res = xpath_ctx.evaluate(xpath).map_err(|()| { log::debug!("Evaluation of xpath '{}' yielded no results", xpath); FullTextParserError::Xml })?; let node_vec = res.get_nodes_as_vec(); if node_vec.is_empty() { log::debug!("Evaluation of xpath '{}' yielded no results", xpath); if thorw_if_empty { return Err(FullTextParserError::Xml); } } Ok(node_vec) } pub fn check_content_type(response: &Response) -> Result { if response.status().is_success() { if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) { if let Ok(content_type) = content_type.to_str() { if content_type.contains("text/html") { return Ok(true); } } } log::error!("Content type is not text/HTML"); return Ok(false); } log::error!("Failed to determine content type"); Err(FullTextParserError::Http) } pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option { if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT { log::debug!("Article url redirects to '{}'", response.url().as_str()); return Some(response.url().clone()); } else if response.url() != original_url { return Some(response.url().clone()); } None } pub fn get_attribute( context: &Context, xpath: &str, attribute: &str, ) -> Result { Util::evaluate_xpath(context, xpath, false)? .iter() .find_map(|node| node.get_attribute(attribute)) .ok_or(FullTextParserError::Xml) } pub fn extract_value(context: &Context, xpath: &str) -> Result { let node_vec = Util::evaluate_xpath(context, xpath, false)?; if let Some(val) = node_vec.get(0) { return Ok(val.get_content()); } Err(FullTextParserError::Xml) } pub fn extract_value_merge( context: &Context, xpath: &str, ) -> Result { let node_vec = Util::evaluate_xpath(context, xpath, true)?; let mut val = String::new(); for node in node_vec { let part = node .get_content() .split_whitespace() .map(|s| format!("{} ", s)) .collect::(); val.push_str(&part); val.push(' '); } Ok(val.trim().to_string()) } pub fn strip_node(context: &Context, xpath: &str) -> Result<(), FullTextParserError> { let mut ancestor = xpath.to_string(); if ancestor.starts_with("//") { ancestor = ancestor.chars().skip(2).collect(); } let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let node_vec = Util::evaluate_xpath(context, query, false)?; for mut node in node_vec { let tag_name = node.get_name(); if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) && node .get_attributes() .iter() .any(|(_name, value)| constants::VIDEOS.is_match(value)) { continue; } node.unlink(); } Ok(()) } pub fn strip_id_or_class( context: &Context, id_or_class: &str, ) -> Result<(), FullTextParserError> { let xpath = &format!( "//*[contains(@class, '{}') or contains(@id, '{}')]", id_or_class, id_or_class ); let mut ancestor = xpath.clone(); if ancestor.starts_with("//") { ancestor = ancestor.chars().skip(2).collect(); } let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let node_vec = Util::evaluate_xpath(context, query, false)?; for mut node in node_vec { node.unlink(); } Ok(()) } pub fn is_probably_visible(node: &Node) -> bool { let is_hidden = node.has_attribute("hidden"); let aria_hidden = node .get_attribute("aria-hidden") .map(|attr| attr == "true") .unwrap_or(false); let has_fallback_image = node.get_class_names().contains("fallback-image"); !is_hidden && !aria_hidden || has_fallback_image } pub fn is_whitespace(node: &Node) -> bool { let content = node.get_content(); let tag_name = node.get_name().to_uppercase(); let is_text_node = node .get_type() .map(|t| t == NodeType::TextNode) .unwrap_or(false); let is_element_node = node .get_type() .map(|t| t == NodeType::ElementNode) .unwrap_or(false); (is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR") } pub fn remove_and_next(node: &mut Node) -> Option { let next_node = Self::next_node(node, true); node.unlink(); next_node } pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option { let mut node = node.clone(); // First check for kids if those aren't being ignored let first_child = node.get_first_child(); if !ignore_self_and_kids && first_child.is_some() { return first_child; } // Then for siblings... let next_sibling = node.get_next_sibling(); if next_sibling.is_some() { return next_sibling; } // And finally, move up the parent chain *and* find a sibling // (because this is depth-first traversal, we will have already // seen the parent nodes themselves). while let Some(parent) = node.get_parent() { let parent_name = parent.get_name().to_uppercase(); if parent_name == "HTML" { break; } let next_sibling = parent.get_next_sibling(); if next_sibling.is_some() { return next_sibling; } else { node = parent; } } None } pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String { let content = node.get_content().trim().to_owned(); if normalize_spaces { constants::NORMALIZE.replace(&content, " ").into() } else { content } } pub fn text_similarity(a: &str, b: &str) -> f64 { let a = a.to_lowercase(); let b = b.to_lowercase(); let tokens_a = constants::TOKENIZE.split(&a).collect::>(); let tokens_b = constants::TOKENIZE.split(&b).collect::>(); if tokens_a.is_empty() || tokens_b.is_empty() { return 0.0; } let tokens_b_total = tokens_b.join(" ").len() as f64; let uniq_tokens_b = tokens_b .into_iter() .filter(|token| !tokens_a.iter().any(|t| t == token)) .collect::>(); let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64; let distance_b = uniq_tokens_b_total / tokens_b_total; 1.0 - distance_b } pub fn has_ancestor_tag( node: &Node, tag_name: &str, max_depth: Option, filter: Option, ) -> bool where F: Fn(&Node) -> bool, { let max_depth = max_depth.unwrap_or(3); let tag_name = tag_name.to_uppercase(); let mut depth = 0; let mut node = node.get_parent(); loop { if depth > max_depth { return false; } let tmp_node = match node { Some(node) => node, None => return false, }; if tmp_node.get_name() == tag_name && filter .as_ref() .map(|filter| filter(&tmp_node)) .unwrap_or(true) { return true; } node = tmp_node.get_parent(); depth += 1; } } pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool { // There should be exactly 1 element child with given tag if node.get_child_elements().len() != 1 || node .get_child_elements() .first() .map(|n| n.get_name().to_uppercase() != tag) .unwrap_or(false) { return false; } // And there should be no text nodes with real content !node.get_child_nodes().iter().any(|n| { n.get_type() .map(|t| t == NodeType::TextNode) .unwrap_or(false) && constants::HAS_CONTENT.is_match(&n.get_content()) }) } pub fn is_element_without_content(node: &Node) -> bool { if let Some(node_type) = node.get_type() { let len = node.get_child_nodes().len(); node_type == NodeType::ElementNode && (len == 0 || len == Self::get_elements_by_tag_name(node, "br").len() + Self::get_elements_by_tag_name(node, "hr").len()) && node.get_content().trim().is_empty() } else { false } } pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec { let tag = tag.to_uppercase(); let all_tags = tag == "*"; let mut vec = Vec::new(); fn get_elems(node: &Node, tag: &str, vec: &mut Vec, all_tags: bool) { for child in node.get_child_elements() { if all_tags || child.get_name().to_uppercase() == tag { vec.push(child.clone()); } get_elems(&child, tag, vec, all_tags); } } get_elems(node, &tag, &mut vec, all_tags); vec } pub fn get_link_density(node: &Node) -> f64 { let text_length = Util::get_inner_text(node, false).len(); if text_length == 0 { return 0.0; } let mut link_length = 0.0; // XXX implement _reduceNodeList? let link_nodes = Util::get_elements_by_tag_name(node, "A"); for link_node in link_nodes { if let Some(href) = link_node.get_attribute("href") { let coefficient = if constants::HASH_URL.is_match(&href) { 0.3 } else { 1.0 }; link_length += Util::get_inner_text(&link_node, false).len() as f64 * coefficient; } } link_length / text_length as f64 } // Determine whether element has any children block level elements. pub fn has_child_block_element(node: &Node) -> bool { node.get_child_nodes().iter().any(|node| { constants::DIV_TO_P_ELEMS.contains(node.get_name().to_uppercase().as_str()) || Self::has_child_block_element(node) }) } pub fn get_node_ancestors(node: &Node, max_depth: Option) -> Vec { let mut ancestors = Vec::new(); let mut node = node.clone(); let max_depth = max_depth.unwrap_or(u64::MAX); for _ in 0..max_depth { let parent = node.get_parent(); match parent { Some(parent) => { ancestors.push(parent.clone()); node = parent; } None => return ancestors, } } ancestors } pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool { node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase()) .unwrap_or(false) } // Check if node is image, or if node contains exactly only one image // whether as a direct child or as its descendants. pub fn is_single_image(node: &Node) -> bool { if node.get_name().to_uppercase() == "IMG" { true } else if node.get_child_elements().len() != 1 || node.get_content().trim() != "" { false } else if let Some(first_child) = node.get_child_elements().first() { Self::is_single_image(first_child) } else { false } } // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. pub fn clean_conditionally(root: &mut Node, tag: &str) { // Gather counts for other typical elements embedded within. // Traverse backwards so we can remove nodes at the same time // without effecting the traversal. // // TODO: Consider taking into account original contentScore here. let nodes = Util::get_elements_by_tag_name(root, tag); for mut node in nodes.into_iter().rev() { if Self::should_remove(&node, tag) { node.unlink(); } } } fn should_remove(node: &Node, tag: &str) -> bool { // First check if this node IS data table, in which case don't remove it. let mut is_list = tag == "ul" || tag == "ol"; if !is_list { let mut list_length = 0.0; let ul_nodes = Self::get_elements_by_tag_name(node, "ul"); let ol_nodes = Self::get_elements_by_tag_name(node, "ol"); for list_node in ul_nodes { list_length += Util::get_inner_text(&list_node, false).len() as f64; } for list_node in ol_nodes { list_length += Util::get_inner_text(&list_node, false).len() as f64; } is_list = (list_length / Util::get_inner_text(node, false).len() as f64) > 0.9; } if tag == "table" && Self::is_data_table(node) { return false; } // Next check if we're inside a data table, in which case don't remove it as well. if Self::has_ancestor_tag(node, "table", Some(u64::MAX), Some(Self::is_data_table)) { return false; } if Self::has_ancestor_tag(node, "code", None, None:: bool>) { return false; } let weight = Self::get_class_weight(node); if weight < 0 { return true; } if Self::get_char_count(node, ',') < 10 { // If there are not very many commas, and the number of // non-paragraph elements is more than paragraphs or other // ominous signs, remove the element. let p = Self::get_elements_by_tag_name(node, "p").len(); let img = Self::get_elements_by_tag_name(node, "img").len(); let li = Self::get_elements_by_tag_name(node, "li").len() as i64 - 100; let input = Self::get_elements_by_tag_name(node, "input").len(); let heading_density = Self::get_text_density(node, &["h1", "h2", "h3", "h4", "h5", "h6"]); let mut embed_count = 0; let embed_tags = ["object", "embed", "iframe"]; for embed_tag in embed_tags { for embed_node in Self::get_elements_by_tag_name(node, embed_tag) { // If this embed has attribute that matches video regex, don't delete it. for (_name, value) in embed_node.get_attributes() { if constants::VIDEOS.is_match(&value) { return false; } } // For embed with tag, check inner HTML as well. // if embed_node.get_name().to_lowercase() == "object" && constants::VIDEOS.is_match(embed_node.innerHTML) { // return false; // } embed_count += 1; } } let link_density = Self::get_link_density(node); let content = Self::get_inner_text(node, false); let content_length = content.len(); let have_to_remove = (img > 1 && (p as f64 / img as f64) < 0.5 && !Self::has_ancestor_tag(node, "figure", None, None:: bool>)) || (!is_list && li > p as i64) || (input as f64 > f64::floor(p as f64 / 3.0)) || (!is_list && heading_density < 0.9 && content_length < 25 && (img == 0 || img > 2) && !Self::has_ancestor_tag(node, "figure", None, None:: bool>)) || (!is_list && weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5) || ((embed_count == 1 && content_length < 75) || embed_count > 1); // Allow simple lists of images to remain in pages if is_list && have_to_remove { for child in node.get_child_elements() { // Don't filter in lists with li's that contain more than one child if child.get_child_elements().len() > 1 { return have_to_remove; } } let li_count = Util::get_elements_by_tag_name(node, "li").len(); // Only allow the list to remain if every li contains an image if img == li_count { return false; } } have_to_remove } else { false } } pub fn get_class_weight(node: &Node) -> i64 { let mut weight = 0; // Look for a special classname if let Some(class_names) = node.get_property("class") { if constants::NEGATIVE.is_match(&class_names) { weight -= 25; } if constants::POSITIVE.is_match(&class_names) { weight += 25; } } // Look for a special ID if let Some(class_names) = node.get_property("id") { if constants::NEGATIVE.is_match(&class_names) { weight -= 25; } if constants::POSITIVE.is_match(&class_names) { weight += 25; } } weight } fn get_char_count(node: &Node, char: char) -> usize { Util::get_inner_text(node, false).split(char).count() - 1 } fn get_text_density(node: &Node, tags: &[&str]) -> f64 { let text_length = Util::get_inner_text(node, false).len(); if text_length == 0 { return 0.0; } let mut children_length = 0; for tag in tags { for child in Self::get_elements_by_tag_name(node, tag) { children_length += Util::get_inner_text(&child, false).len() } } children_length as f64 / text_length as f64 } fn is_data_table(node: &Node) -> bool { node.get_attribute(constants::DATA_TABLE_ATTR) .and_then(|is_data_table| is_data_table.parse::().ok()) .unwrap_or(false) } pub fn mark_data_tables(context: &Context) -> Result<(), FullTextParserError> { let nodes = Util::evaluate_xpath(context, "//table", false)?; for mut node in nodes { if node .get_attribute("role") .map(|role| role == "presentation") .unwrap_or(false) { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); continue; } if node .get_attribute("datatable") .map(|role| role == "0") .unwrap_or(false) { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); continue; } if node.get_attribute("summary").is_some() { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); continue; } if let Some(first_caption) = Self::get_elements_by_tag_name(&node, "caption").first() { if !first_caption.get_child_nodes().is_empty() { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); continue; } } // If the table has a descendant with any of these tags, consider a data table: let data_table_descendants = ["col", "colgroup", "tfoot", "thead", "th"]; for descendant in data_table_descendants { if !Self::get_elements_by_tag_name(&node, descendant).is_empty() { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); continue; } } // Nested tables indicate a layout table: if !Self::get_elements_by_tag_name(&node, "table").is_empty() { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); continue; } let (rows, columns) = Self::get_row_and_column_count(&node); if rows >= 10 || columns > 4 { let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); continue; } // Now just go by size entirely: let _ = node.set_attribute( constants::DATA_TABLE_ATTR, if rows * columns > 10 { "true" } else { "false" }, ); } Ok(()) } pub fn get_row_and_column_count(node: &Node) -> (usize, usize) { if node.get_name().to_uppercase() != "TABLE" { return (0, 0); } let mut rows = 0; let mut columns = 0; let trs = Self::get_elements_by_tag_name(node, "tr"); for tr in trs { let row_span = tr .get_attribute("rowspan") .and_then(|span| span.parse::().ok()) .unwrap_or(1); rows += row_span; // Now look for column-related info let mut columns_in_this_row = 0; let cells = Self::get_elements_by_tag_name(&tr, "td"); for cell in cells { let colspan = cell .get_attribute("colspan") .and_then(|span| span.parse::().ok()) .unwrap_or(1); columns_in_this_row += colspan; } columns = usize::max(columns, columns_in_this_row); } (rows, columns) } pub fn is_phrasing_content(node: &Node) -> bool { let tag_name = node.get_name().to_uppercase(); let is_text_node = node .get_type() .map(|t| t == NodeType::TextNode) .unwrap_or(false); is_text_node || constants::PHRASING_ELEMS.contains(&tag_name.as_str()) || ((tag_name == "A" || tag_name == "DEL" || tag_name == "INS") && node.get_child_nodes().iter().all(Self::is_phrasing_content)) } }