1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

unwrap noscript images

This commit is contained in:
Jan Lukas Gernert 2023-02-23 01:53:42 +01:00
parent 98c06e11f4
commit 7ae98904d4
6 changed files with 537 additions and 284 deletions

View file

@ -1,11 +1,17 @@
use libxml::{tree::Node, xpath::Context};
use libxml::{
tree::{Node, NodeType},
xpath::Context,
};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
Response,
};
use tokio::fs::DirEntry;
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
use crate::{
constants,
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
};
pub struct Util;
@ -219,4 +225,258 @@ impl Util {
}
Ok(())
}
pub fn is_probably_visible(node: &Node) -> bool {
let display_none = node
.get_attribute("display")
.map(|display| display == "none")
.unwrap_or(false);
let is_hidden = node.has_attribute("hidden");
let aria_hidden = node
.get_attribute("aria-hidden")
.map(|attr| attr == "true")
.unwrap_or(false);
let has_fallback_image = node.get_class_names().contains("fallback-image");
!display_none && !is_hidden && !aria_hidden || has_fallback_image
}
pub fn is_whitespace(node: &Node) -> bool {
let is_text_node = node
.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false);
let is_element_node = node
.get_type()
.map(|t| t == NodeType::ElementNode)
.unwrap_or(false);
(is_text_node && node.get_content().trim().is_empty())
|| (is_element_node && node.get_name().to_uppercase() == "BR")
}
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
let next_node = Self::next_node(node, true);
node.unlink();
next_node
}
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
let mut node = node.clone();
// First check for kids if those aren't being ignored
let first_child = node.get_first_child();
if !ignore_self_and_kids && first_child.is_some() {
return first_child;
}
// Then for siblings...
let next_sibling = node.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
loop {
let parent = node.get_parent();
if parent.is_none() {
break;
}
if let Some(parent) = parent {
let parent_name = parent.get_name().to_uppercase();
if parent_name == "HTML" {
break;
}
let next_sibling = parent.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
} else {
node = parent;
}
}
}
None
}
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
let content = node.get_content().trim().to_owned();
if normalize_spaces {
constants::NORMALIZE.replace(&content, " ").into()
} else {
content
}
}
pub fn text_similarity(a: &str, b: &str) -> f64 {
let a = a.to_lowercase();
let b = b.to_lowercase();
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
if tokens_a.is_empty() || tokens_b.is_empty() {
return 0.0;
}
let tokens_b_total = tokens_b.join(" ").len() as f64;
let uniq_tokens_b = tokens_b
.into_iter()
.filter(|token| !tokens_a.iter().any(|t| t == token))
.collect::<Vec<_>>();
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
let distance_b = uniq_tokens_b_total / tokens_b_total;
1.0 - distance_b
}
pub fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
let max_depth = max_depth.unwrap_or(3);
let tag_name = tag_name.to_uppercase();
let mut depth = 0;
let mut node = node.get_parent();
loop {
if depth > max_depth {
return false;
}
let tmp_node = match node {
Some(node) => node,
None => return false,
};
if tmp_node.get_name() == tag_name {
return true;
}
node = tmp_node.get_parent();
depth += 1;
}
}
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
// There should be exactly 1 element child with given tag
if node.get_child_nodes().len() == 1
|| node
.get_child_nodes()
.first()
.map(|n| n.get_name().to_uppercase() == tag)
.unwrap_or(false)
{
return false;
}
// And there should be no text nodes with real content
node.get_child_nodes().iter().any(|n| {
n.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false)
&& constants::HAS_CONTENT.is_match(&n.get_content())
})
}
pub fn is_element_without_content(node: &Node) -> bool {
if let Some(node_type) = node.get_type() {
let len = node.get_child_nodes().len();
return node_type == NodeType::ElementNode
&& node.get_content().trim().is_empty()
&& (len == 0
|| len
== Self::get_elements_by_tag_name(node, "br").len()
+ Self::get_elements_by_tag_name(node, "hr").len());
}
false
}
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
let tag = tag.to_uppercase();
let all_tags = tag == "*";
let mut vec = Vec::new();
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
for child in node.get_child_elements() {
if all_tags || child.get_name().to_uppercase() == tag {
vec.push(child.clone());
}
get_elems(&child, tag, vec, all_tags);
}
}
get_elems(node, &tag, &mut vec, all_tags);
vec
}
pub fn get_link_density(node: &Node) -> f64 {
let text_length = Util::get_inner_text(node, false).len();
if text_length == 0 {
return 0.0;
}
let mut link_length = 0.0;
// XXX implement _reduceNodeList?
let link_nodes = Util::get_elements_by_tag_name(node, "A");
for link_node in link_nodes {
if let Some(href) = link_node.get_attribute("href") {
let coefficient = if constants::HASH_URL.is_match(&href) {
0.3
} else {
1.0
};
link_length += Util::get_inner_text(&link_node, false).len() as f64 * coefficient;
}
}
link_length / text_length as f64
}
// Determine whether element has any children block level elements.
pub fn has_child_block_element(node: &Node) -> bool {
node.get_child_elements().iter().any(|node| {
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|| Self::has_child_block_element(node)
})
}
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
let mut ancestors = Vec::new();
let mut node = node.clone();
for _ in 0..=max_depth {
let parent = node.get_parent();
match parent {
Some(parent) => {
ancestors.push(parent.clone());
node = parent;
}
None => return ancestors,
}
}
ancestors
}
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
.unwrap_or(false)
}
// Check if node is image, or if node contains exactly only one image
// whether as a direct child or as its descendants.
pub fn is_single_image(node: &Node) -> bool {
if node.get_name().to_uppercase() == "IMG" {
true
} else if node.get_child_nodes().len() != 1 || node.get_content().trim() != "" {
false
} else if let Some(first_child) = node.get_child_nodes().first() {
Self::is_single_image(first_child)
} else {
false
}
}
}