1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
article_scraper/src/util.rs
Jan Lukas Gernert 8d529a6d74 fmt
2023-03-12 13:39:29 +01:00

777 lines
26 KiB
Rust

use libxml::{
tree::{Node, NodeType},
xpath::Context,
};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
Response,
};
use tokio::fs::DirEntry;
use crate::{
constants,
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
};
/// Namespace-only struct: a grab bag of stateless helper functions used by
/// the full-text parser (xpath evaluation, DOM inspection, Readability-style
/// heuristics). Never instantiated; all functions are associated.
pub struct Util;
impl Util {
/// Returns `true` if the directory entry's file extension equals `extension`.
///
/// Entries without an extension, or with a non-UTF-8 extension, never match.
pub fn check_extension(path: &DirEntry, extension: &str) -> bool {
    path.path()
        .extension()
        .map_or(false, |ext| ext.to_str() == Some(extension))
}
/// Extracts the value following `identifier` on a config `line`, stripping an
/// optional trailing `#` comment and surrounding whitespace.
///
/// Fixes two defects of the previous version:
/// - a `line` shorter than `identifier` no longer panics (returns `""`);
/// - whitespace between the value and a `#` comment is trimmed, so
///   `"key: val # note"` yields `"val"` instead of `"val "`.
pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
    // Slice off the identifier prefix; empty if the line is too short or the
    // boundary is invalid.
    let value = line.get(identifier.len()..).unwrap_or("");
    let value = value.trim();
    match value.find('#') {
        // Drop the inline comment, then trim the whitespace that sat
        // before the '#'. The left side is already trimmed above.
        Some(pos) => value[..pos].trim_end(),
        None => value,
    }
}
/// Splits a `|`-separated config value into its trimmed parts.
pub fn split_values(values: &str) -> Vec<&str> {
    values.split('|').map(str::trim).collect()
}
/// Picks the applicable rule: the site-specific one wins, otherwise the
/// global fallback is used (which may also be absent).
pub fn select_rule<'a>(
    site_specific_rule: Option<&'a str>,
    global_rule: Option<&'a str>,
) -> Option<&'a str> {
    site_specific_rule.or(global_rule)
}
/// Builds the HTTP header map from the optional site-specific config entry
/// followed by the global one.
///
/// NOTE(review): global headers are inserted last, and `HeaderMap::insert`
/// replaces an existing value, so a global header with the same name
/// overrides a site-specific one — preserved from the original behavior.
///
/// # Errors
/// Returns [`FullTextParserError::Config`] when a configured header name or
/// value is not valid HTTP.
pub fn generate_headers(
    site_specific_rule: Option<&ConfigEntry>,
    global_rule: &ConfigEntry,
) -> Result<HeaderMap, FullTextParserError> {
    let mut headers = HeaderMap::new();
    // Shared parse-and-insert logic; previously duplicated for both sources.
    let mut insert_header = |name: &str, value: &str| -> Result<(), FullTextParserError> {
        let name = HeaderName::from_bytes(name.as_bytes())
            .map_err(|_| FullTextParserError::Config)?;
        let value = value
            .parse::<HeaderValue>()
            .map_err(|_| FullTextParserError::Config)?;
        headers.insert(name, value);
        Ok(())
    };
    if let Some(config) = site_specific_rule {
        for header in &config.header {
            insert_header(&header.name, &header.value)?;
        }
    }
    for header in &global_rule.header {
        insert_header(&header.name, &header.value)?;
    }
    Ok(headers)
}
/// Evaluates `xpath_page_link` and returns the first matched node that
/// yields a parseable URL, taken from its text content or — if the content
/// is blank — its `href` attribute.
pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
    let nodes = Self::evaluate_xpath(xpath_ctx, xpath_page_link, false).ok()?;
    nodes.into_iter().find_map(|node| {
        let content = node.get_content();
        let candidate = if content.trim().is_empty() && node.has_attribute("href") {
            node.get_attribute("href")
                .expect("already checked for href")
        } else {
            content
        };
        url::Url::parse(&candidate).ok()
    })
}
/// Evaluates `xpath` against the context and returns the matching nodes.
///
/// Fixes: the parameter was misspelled `thorw_if_empty` (Rust parameter
/// names are not part of the call API, so the rename is caller-compatible),
/// and the evaluation-failure log wrongly claimed "yielded no results".
///
/// # Errors
/// [`FullTextParserError::Xml`] if evaluation fails, or if it matches
/// nothing and `throw_if_empty` is set.
pub fn evaluate_xpath(
    xpath_ctx: &Context,
    xpath: &str,
    throw_if_empty: bool,
) -> Result<Vec<Node>, FullTextParserError> {
    let res = xpath_ctx.evaluate(xpath).map_err(|()| {
        log::debug!("Evaluation of xpath '{}' failed", xpath);
        FullTextParserError::Xml
    })?;
    let node_vec = res.get_nodes_as_vec();
    if node_vec.is_empty() {
        log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
        if throw_if_empty {
            return Err(FullTextParserError::Xml);
        }
    }
    Ok(node_vec)
}
/// Checks whether a successful response declares an HTML content type.
///
/// Returns `Ok(true)` for `text/html`, `Ok(false)` for any other declared
/// (or missing/unreadable) type, and `Err` for non-success status codes.
pub fn check_content_type(response: &Response) -> Result<bool, FullTextParserError> {
    // Without a success status we cannot trust the content type at all.
    if !response.status().is_success() {
        log::error!("Failed to determine content type");
        return Err(FullTextParserError::Http);
    }
    let is_html = response
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .map_or(false, |content_type| content_type.contains("text/html"));
    if !is_html {
        log::error!("Content type is not text/HTML");
    }
    Ok(is_html)
}
/// Returns the final response URL when the request was redirected: either a
/// 308 permanent redirect, or any response whose URL differs from the
/// original request URL.
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
    let permanent = response.status() == reqwest::StatusCode::PERMANENT_REDIRECT;
    if permanent {
        log::debug!("Article url redirects to '{}'", response.url().as_str());
    }
    if permanent || response.url() != original_url {
        Some(response.url().clone())
    } else {
        None
    }
}
/// Returns the value of `attribute` from the first node matching `xpath`
/// that carries it.
///
/// # Errors
/// [`FullTextParserError::Xml`] when no matching node has the attribute.
pub fn get_attribute(
    context: &Context,
    xpath: &str,
    attribute: &str,
) -> Result<String, FullTextParserError> {
    let nodes = Util::evaluate_xpath(context, xpath, false)?;
    for node in &nodes {
        if let Some(value) = node.get_attribute(attribute) {
            return Ok(value);
        }
    }
    Err(FullTextParserError::Xml)
}
/// Returns the text content of the first node matching `xpath`.
///
/// # Errors
/// [`FullTextParserError::Xml`] when nothing matches.
pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
    Util::evaluate_xpath(context, xpath, false)?
        .first()
        .map(|node| node.get_content())
        .ok_or(FullTextParserError::Xml)
}
/// Concatenates the whitespace-normalized text of every node matching
/// `xpath` into a single string.
///
/// NOTE(review): each node's words are joined by single spaces, but an extra
/// space is appended per node, so node boundaries end up double-spaced —
/// preserved from the original behavior.
///
/// # Errors
/// [`FullTextParserError::Xml`] when nothing matches (empty result throws).
pub fn extract_value_merge(
    context: &Context,
    xpath: &str,
) -> Result<String, FullTextParserError> {
    let nodes = Util::evaluate_xpath(context, xpath, true)?;
    let mut merged = String::new();
    for node in nodes {
        for word in node.get_content().split_whitespace() {
            merged.push_str(word);
            merged.push(' ');
        }
        merged.push(' ');
    }
    Ok(merged.trim().to_string())
}
/// Unlinks every node matching `xpath`, except embeds pointing at known
/// video hosts.
///
/// The query is restricted to matches that are not nested inside another
/// match, so unlinking the outermost node removes the whole subtree once.
pub fn strip_node(context: &Context, xpath: &str) -> Result<(), FullTextParserError> {
    let ancestor = xpath.strip_prefix("//").unwrap_or(xpath);
    let query = format!("{}[not(ancestor::{})]", xpath, ancestor);
    for mut node in Util::evaluate_xpath(context, &query, false)? {
        let tag_name = node.get_name();
        // Keep embed elements whose attributes reference a video URL.
        let is_video_embed = constants::EMBED_TAG_NAMES
            .contains(tag_name.to_uppercase().as_str())
            && node
                .get_attributes()
                .iter()
                .any(|(_name, value)| constants::VIDEOS.is_match(value));
        if !is_video_embed {
            node.unlink();
        }
    }
    Ok(())
}
/// Unlinks every element whose `class` or `id` contains `id_or_class`,
/// skipping elements nested inside an already-matching ancestor.
pub fn strip_id_or_class(
    context: &Context,
    id_or_class: &str,
) -> Result<(), FullTextParserError> {
    let xpath = format!(
        "//*[contains(@class, '{}') or contains(@id, '{}')]",
        id_or_class, id_or_class
    );
    // Restrict to outermost matches only.
    let ancestor = xpath.strip_prefix("//").unwrap_or(&xpath);
    let query = format!("{}[not(ancestor::{})]", xpath, ancestor);
    for mut node in Util::evaluate_xpath(context, &query, false)? {
        node.unlink();
    }
    Ok(())
}
/// Heuristic visibility check: a node is considered visible when it carries
/// neither `hidden` nor `aria-hidden="true"`, or when it is a known
/// "fallback-image" (which is kept regardless of the hidden flags).
pub fn is_probably_visible(node: &Node) -> bool {
    let hidden_attr = node.has_attribute("hidden");
    let aria_hidden = node
        .get_attribute("aria-hidden")
        .map_or(false, |attr| attr == "true");
    let fallback_image = node.get_class_names().contains("fallback-image");
    (!hidden_attr && !aria_hidden) || fallback_image
}
/// Returns `true` for nodes that only contribute whitespace to the layout:
/// text nodes with blank content, or `<br>` elements.
pub fn is_whitespace(node: &Node) -> bool {
    match node.get_type() {
        Some(NodeType::TextNode) => node.get_content().trim().is_empty(),
        Some(NodeType::ElementNode) => node.get_name().to_uppercase() == "BR",
        _ => false,
    }
}
/// Unlinks `node` from the tree and returns the next node of the
/// depth-first traversal (skipping the removed node's children).
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
    let successor = Self::next_node(node, true);
    node.unlink();
    successor
}
/// Returns the successor of `node` in a depth-first traversal: first child
/// (unless `ignore_self_and_kids`), then next sibling, then the nearest
/// ancestor's next sibling. Stops at the `<html>` element.
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    // Descend into children first, unless the caller skips the subtree.
    if !ignore_self_and_kids {
        if let Some(child) = node.get_first_child() {
            return Some(child);
        }
    }
    // Then try the immediate sibling...
    if let Some(sibling) = node.get_next_sibling() {
        return Some(sibling);
    }
    // ...and finally climb the parent chain looking for a sibling. Parents
    // themselves were already visited by the depth-first walk.
    let mut current = node.clone();
    while let Some(parent) = current.get_parent() {
        if parent.get_name().to_uppercase() == "HTML" {
            break;
        }
        if let Some(sibling) = parent.get_next_sibling() {
            return Some(sibling);
        }
        current = parent;
    }
    None
}
/// Returns the node's trimmed text content, optionally collapsing runs of
/// whitespace into single spaces.
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
    let text = node.get_content().trim().to_owned();
    if !normalize_spaces {
        return text;
    }
    constants::NORMALIZE.replace(&text, " ").into()
}
/// Rough text similarity in `[0, 1]`: the fraction of `b`'s token mass that
/// also appears in `a` (case-insensitive, tokenized by `constants::TOKENIZE`).
pub fn text_similarity(a: &str, b: &str) -> f64 {
    let lower_a = a.to_lowercase();
    let lower_b = b.to_lowercase();
    let tokens_a: Vec<&str> = constants::TOKENIZE.split(&lower_a).collect();
    let tokens_b: Vec<&str> = constants::TOKENIZE.split(&lower_b).collect();
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }
    let total_b_len = tokens_b.join(" ").len() as f64;
    // Tokens of `b` that never occur in `a`.
    let unique_b: Vec<&str> = tokens_b
        .iter()
        .copied()
        .filter(|token| !tokens_a.contains(token))
        .collect();
    let unique_b_len = unique_b.join(" ").len() as f64;
    1.0 - unique_b_len / total_b_len
}
/// Walks up at most `max_depth` ancestors (default 3) looking for an
/// element named `tag_name` that also satisfies the optional `filter`.
///
/// Fix: the ancestor's name is now uppercased before comparison, matching
/// every other tag-name comparison in this file. Previously the raw name
/// was compared against the uppercased `tag_name`, so lowercase element
/// names (as typically produced by the HTML parser — TODO confirm) could
/// never match.
pub fn has_ancestor_tag<F>(
    node: &Node,
    tag_name: &str,
    max_depth: Option<u64>,
    filter: Option<F>,
) -> bool
where
    F: Fn(&Node) -> bool,
{
    let max_depth = max_depth.unwrap_or(3);
    let tag_name = tag_name.to_uppercase();
    let mut depth = 0;
    let mut node = node.get_parent();
    loop {
        if depth > max_depth {
            return false;
        }
        let tmp_node = match node {
            Some(node) => node,
            None => return false,
        };
        // Filter is only invoked when the name matches (short-circuit).
        if tmp_node.get_name().to_uppercase() == tag_name
            && filter
                .as_ref()
                .map(|filter| filter(&tmp_node))
                .unwrap_or(true)
        {
            return true;
        }
        node = tmp_node.get_parent();
        depth += 1;
    }
}
/// Returns `true` when `node` has exactly one child element named `tag`
/// (caller passes the uppercase name) and no text children with real
/// content.
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
    let children = node.get_child_nodes();
    // There should be exactly one child, and it must carry the given tag.
    if children.len() != 1 {
        return false;
    }
    if children
        .first()
        .map(|n| n.get_name().to_uppercase() != tag)
        .unwrap_or(false)
    {
        return false;
    }
    // And no text node may contribute actual content.
    !children.iter().any(|n| {
        n.get_type()
            .map(|t| t == NodeType::TextNode)
            .unwrap_or(false)
            && constants::HAS_CONTENT.is_match(&n.get_content())
    })
}
/// Returns `true` for element nodes whose text is blank and whose children
/// (if any) are exclusively `<br>`/`<hr>` separators.
pub fn is_element_without_content(node: &Node) -> bool {
    if node.get_type() != Some(NodeType::ElementNode) {
        return false;
    }
    let child_count = node.get_child_nodes().len();
    // Short-circuit: only count separators when there are children at all.
    let only_separators = child_count == 0
        || child_count
            == Self::get_elements_by_tag_name(node, "br").len()
                + Self::get_elements_by_tag_name(node, "hr").len();
    only_separators && node.get_content().trim().is_empty()
}
/// Collects all descendant elements named `tag` (case-insensitive) in
/// pre-order; `"*"` matches every element.
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
    // Recursive pre-order walk: a matching child is pushed before its
    // own descendants are visited.
    fn collect(node: &Node, tag: &str, match_all: bool, out: &mut Vec<Node>) {
        for child in node.get_child_elements() {
            if match_all || child.get_name().to_uppercase() == tag {
                out.push(child.clone());
            }
            collect(&child, tag, match_all, out);
        }
    }
    let tag = tag.to_uppercase();
    let mut out = Vec::new();
    collect(node, &tag, tag == "*", &mut out);
    out
}
/// Fraction of the node's text that sits inside `<a>` links. Links whose
/// href is just a fragment ("#...") are weighted at 0.3.
pub fn get_link_density(node: &Node) -> f64 {
    let text_length = Util::get_inner_text(node, false).len();
    if text_length == 0 {
        return 0.0;
    }
    let mut link_length = 0.0;
    // XXX implement _reduceNodeList?
    for anchor in Util::get_elements_by_tag_name(node, "A") {
        if let Some(href) = anchor.get_attribute("href") {
            let coefficient = if constants::HASH_URL.is_match(&href) {
                0.3
            } else {
                1.0
            };
            link_length += coefficient * Util::get_inner_text(&anchor, false).len() as f64;
        }
    }
    link_length / text_length as f64
}
/// Determine whether the element has any block-level descendant, directly
/// or nested.
pub fn has_child_block_element(node: &Node) -> bool {
    for child in node.get_child_nodes() {
        let name = child.get_name().to_uppercase();
        if constants::DIV_TO_P_ELEMS.contains(name.as_str())
            || Self::has_child_block_element(&child)
        {
            return true;
        }
    }
    false
}
/// Collects the ancestors of `node` from nearest to furthest, up to
/// `max_depth` levels (unbounded when `None`).
pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> {
    let limit = max_depth.unwrap_or(u64::MAX);
    let mut ancestors = Vec::new();
    let mut current = node.clone();
    // One ancestor is pushed per iteration, so the vec length doubles as
    // the depth counter.
    while (ancestors.len() as u64) < limit {
        match current.get_parent() {
            Some(parent) => {
                ancestors.push(parent.clone());
                current = parent;
            }
            None => break,
        }
    }
    ancestors
}
/// Case-insensitive tag-name check; `None` never matches.
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
    match node {
        Some(n) => n.get_name().to_uppercase() == tag_name.to_uppercase(),
        None => false,
    }
}
/// Check if the node is an image, or wraps exactly one image (directly or
/// through a chain of single-child wrappers) with no surrounding text.
pub fn is_single_image(node: &Node) -> bool {
    if node.get_name().to_uppercase() == "IMG" {
        return true;
    }
    // A wrapper qualifies only with exactly one element child and no text.
    if node.get_child_elements().len() != 1 || !node.get_content().trim().is_empty() {
        return false;
    }
    node.get_child_elements()
        .first()
        .map(Self::is_single_image)
        .unwrap_or(false)
}
/// Clean an element of all tags of type `tag` if they look fishy.
/// "Fishy" is an algorithm based on content length, classnames, link
/// density, number of images & embeds, etc. — see [`Self::should_remove`].
pub fn clean_conditionally(root: &mut Node, tag: &str) {
    // Traverse backwards so unlinking a node cannot affect the positions
    // of the candidates still to be visited.
    //
    // TODO: Consider taking into account original contentScore here.
    let candidates = Util::get_elements_by_tag_name(root, tag);
    for mut candidate in candidates.into_iter().rev() {
        if Self::should_remove(&candidate, tag) {
            candidate.unlink();
        }
    }
}
/// Decides whether a `tag` element collected by `clean_conditionally` looks
/// "fishy" enough to be stripped. Port of Readability.js'
/// `_cleanConditionally` heuristics: verdict based on comma count, class
/// weight, link density, image/list/input ratios and embed counts.
/// Returns `true` when the node should be removed.
fn should_remove(node: &Node, tag: &str) -> bool {
// First check if this node IS data table, in which case don't remove it.
let mut is_list = tag == "ul" || tag == "ol";
if !is_list {
// Treat a non-list element as a list anyway when >90% of its text
// lives inside <ul>/<ol> descendants.
let mut list_length = 0.0;
let ul_nodes = Self::get_elements_by_tag_name(node, "ul");
let ol_nodes = Self::get_elements_by_tag_name(node, "ol");
for list_node in ul_nodes {
list_length += Util::get_inner_text(&list_node, false).len() as f64;
}
for list_node in ol_nodes {
list_length += Util::get_inner_text(&list_node, false).len() as f64;
}
// NOTE(review): a zero-length inner text makes this a NaN/inf
// division; NaN compares false, so the node is then not a list.
is_list = (list_length / Util::get_inner_text(node, false).len() as f64) > 0.9;
}
if tag == "table" && Self::is_data_table(node) {
return false;
}
// Next check if we're inside a data table, in which case don't remove it as well.
if Self::has_ancestor_tag(node, "table", Some(u64::MAX), Some(Self::is_data_table)) {
return false;
}
// Never strip content inside code blocks.
if Self::has_ancestor_tag(node, "code", None, None::<fn(&Node) -> bool>) {
return false;
}
// A strongly negative class/id name removes the node outright.
let weight = Self::get_class_weight(node);
if weight < 0 {
return true;
}
if Self::get_char_count(node, ',') < 10 {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
let p = Self::get_elements_by_tag_name(node, "p").len();
let img = Self::get_elements_by_tag_name(node, "img").len();
// The -100 mirrors Readability.js: it lets long lists pass the
// li-vs-p comparison further below.
let li = Self::get_elements_by_tag_name(node, "li").len() as i64 - 100;
let input = Self::get_elements_by_tag_name(node, "input").len();
let heading_density =
Self::get_text_density(node, &["h1", "h2", "h3", "h4", "h5", "h6"]);
let mut embed_count = 0;
let embed_tags = ["object", "embed", "iframe"];
for embed_tag in embed_tags {
for embed_node in Self::get_elements_by_tag_name(node, embed_tag) {
// If this embed has attribute that matches video regex, don't delete it.
for (_name, value) in embed_node.get_attributes() {
if constants::VIDEOS.is_match(&value) {
return false;
}
}
// For embed with <object> tag, check inner HTML as well.
// if embed_node.get_name().to_lowercase() == "object" && constants::VIDEOS.is_match(embed_node.innerHTML) {
// return false;
// }
embed_count += 1;
}
}
let link_density = Self::get_link_density(node);
let content = Self::get_inner_text(node, false);
let content_length = content.len();
// Any single heuristic below marks the node for removal.
let have_to_remove = (img > 1
&& (p as f64 / img as f64) < 0.5
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|| (!is_list && li > p as i64)
|| (input as f64 > f64::floor(p as f64 / 3.0))
|| (!is_list
&& heading_density < 0.9
&& content_length < 25
&& (img == 0 || img > 2)
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|| (!is_list && weight < 25 && link_density > 0.2)
|| (weight >= 25 && link_density > 0.5)
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
// Allow simple lists of images to remain in pages
if is_list && have_to_remove {
for child in node.get_child_elements() {
// Don't filter in lists with li's that contain more than one child
if child.get_child_elements().len() > 1 {
return have_to_remove;
}
}
let li_count = Util::get_elements_by_tag_name(node, "li").len();
// Only allow the list to remain if every li contains an image
if img == li_count {
return false;
}
}
have_to_remove
} else {
// Plenty of commas: reads like real prose, keep it.
false
}
}
/// Scores a node by its `class` and `id` attributes: -25 per attribute
/// matching the NEGATIVE pattern, +25 per match of the POSITIVE pattern.
pub fn get_class_weight(node: &Node) -> i64 {
    let mut weight = 0;
    // Both attributes are scored with the same special-name patterns.
    for property in ["class", "id"] {
        if let Some(value) = node.get_property(property) {
            if constants::NEGATIVE.is_match(&value) {
                weight -= 25;
            }
            if constants::POSITIVE.is_match(&value) {
                weight += 25;
            }
        }
    }
    weight
}
/// Counts occurrences of `char` in the node's inner text.
fn get_char_count(node: &Node, char: char) -> usize {
    Util::get_inner_text(node, false).matches(char).count()
}
/// Fraction of the node's text contained in descendants with any of the
/// given tag names (e.g. heading density for h1..h6).
fn get_text_density(node: &Node, tags: &[&str]) -> f64 {
    let total_len = Util::get_inner_text(node, false).len();
    if total_len == 0 {
        return 0.0;
    }
    let tagged_len: usize = tags
        .iter()
        .copied()
        .flat_map(|tag| Self::get_elements_by_tag_name(node, tag))
        .map(|child| Util::get_inner_text(&child, false).len())
        .sum();
    tagged_len as f64 / total_len as f64
}
/// Reads the marker attribute set by [`Self::mark_data_tables`]; only the
/// literal value "true" counts.
fn is_data_table(node: &Node) -> bool {
    node.get_attribute(constants::DATA_TABLE_ATTR)
        .map_or(false, |value| value == "true")
}
/// Tags every `<table>` with a marker attribute saying whether it holds
/// data (keep) or is a layout table (strippable), following the
/// Readability.js `_markDataTables` heuristics.
///
/// Fix: the descendant check (`col`/`colgroup`/`tfoot`/`thead`/`th`) used
/// `continue` inside its own `for` loop, which only advanced the inner
/// loop — so a table just marked "true" fell through and could be
/// re-marked "false" by the later nested-table/size checks. Rewritten with
/// `any()` so the `continue` targets the table loop.
pub fn mark_data_tables(context: &Context) -> Result<(), FullTextParserError> {
    let nodes = Util::evaluate_xpath(context, "//table", false)?;
    for mut node in nodes {
        // Explicit presentation role => layout table.
        if node
            .get_attribute("role")
            .map(|role| role == "presentation")
            .unwrap_or(false)
        {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
            continue;
        }
        // Explicit datatable="0" => layout table.
        if node
            .get_attribute("datatable")
            .map(|value| value == "0")
            .unwrap_or(false)
        {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
            continue;
        }
        // A summary attribute implies a data table.
        if node.get_attribute("summary").is_some() {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
            continue;
        }
        // A non-empty <caption> implies a data table.
        if let Some(first_caption) = Self::get_elements_by_tag_name(&node, "caption").first() {
            if !first_caption.get_child_nodes().is_empty() {
                let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
                continue;
            }
        }
        // If the table has a descendant with any of these tags, consider a data table:
        let data_table_descendants = ["col", "colgroup", "tfoot", "thead", "th"];
        if data_table_descendants
            .iter()
            .any(|tag| !Self::get_elements_by_tag_name(&node, tag).is_empty())
        {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
            continue;
        }
        // Nested tables indicate a layout table:
        if !Self::get_elements_by_tag_name(&node, "table").is_empty() {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
            continue;
        }
        let (rows, columns) = Self::get_row_and_column_count(&node);
        if rows >= 10 || columns > 4 {
            let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
            continue;
        }
        // Now just go by size entirely:
        let _ = node.set_attribute(
            constants::DATA_TABLE_ATTR,
            if rows * columns > 10 { "true" } else { "false" },
        );
    }
    Ok(())
}
/// Returns (row count, max column count) of a `<table>` node, honoring
/// `rowspan`/`colspan` attributes; `(0, 0)` for non-table nodes.
pub fn get_row_and_column_count(node: &Node) -> (usize, usize) {
    if node.get_name().to_uppercase() != "TABLE" {
        return (0, 0);
    }
    let mut row_count = 0;
    let mut column_count = 0;
    for tr in Self::get_elements_by_tag_name(node, "tr") {
        // Each <tr> contributes its rowspan (default 1) to the row total.
        row_count += tr
            .get_attribute("rowspan")
            .and_then(|span| span.parse::<usize>().ok())
            .unwrap_or(1);
        // The column count is the widest row, summing colspans per cell.
        let row_columns: usize = Self::get_elements_by_tag_name(&tr, "td")
            .iter()
            .map(|cell| {
                cell.get_attribute("colspan")
                    .and_then(|span| span.parse::<usize>().ok())
                    .unwrap_or(1)
            })
            .sum();
        column_count = usize::max(column_count, row_columns);
    }
    (row_count, column_count)
}
}