1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

unwrap noscript images

This commit is contained in:
Jan Lukas Gernert 2023-02-23 01:53:42 +01:00
parent 98c06e11f4
commit 7ae98904d4
6 changed files with 537 additions and 284 deletions

View file

@ -18,7 +18,7 @@ impl Article {
pub fn get_content(&self) -> Option<String> {
// serialize content
let options = SaveOptions {
format: false,
format: true,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,

View file

@ -19,6 +19,7 @@ use libxml::parser::Parser;
use libxml::tree::{Document, Node};
use libxml::xpath::Context;
use log::{debug, error, info, warn};
use regex::Regex;
use reqwest::header::HeaderMap;
use reqwest::Client;
use std::path::Path;
@ -124,6 +125,10 @@ impl FullTextParser {
return Err(error);
}
if let Some(mut root) = document.get_root_element() {
Self::post_process_content(&mut root)?;
}
article.document = Some(document);
Ok(article)
@ -179,6 +184,7 @@ impl FullTextParser {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::unwrap_noscript_images(&xpath_ctx)?;
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body {
@ -195,6 +201,7 @@ impl FullTextParser {
document = Self::parse_html(&html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::strip_junk(&xpath_ctx, config, global_config, &url);
Self::unwrap_noscript_images(&xpath_ctx)?;
Self::extract_body(&xpath_ctx, root, config, global_config)?;
}
@ -609,6 +616,12 @@ impl FullTextParser {
// strip all comments
let _ = Util::strip_node(context, "//comment()");
// strip all scripts
let _ = Util::strip_node(context, "//script");
// strip all styles
let _ = Util::strip_node(context, "//style");
// strip all empty url-tags <a/>
let _ = Util::strip_node(context, "//a[not(node())]");
@ -616,6 +629,91 @@ impl FullTextParser {
let _ = Util::strip_node(context, "//*[@type='text/css']");
}
/**
* Find all <noscript> that are located after <img> nodes, and which contain only one
* <img> element. Replace the first image with the image from inside the <noscript> tag,
* and remove the <noscript> tag. This improves the quality of the images we use on
* some sites (e.g. Medium).
**/
fn unwrap_noscript_images(ctx: &Context) -> Result<(), FullTextParserError> {
// Find img without source or attributes that might contains image, and remove it.
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
let img_regex = Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).unwrap();
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
for mut img_node in img_nodes {
let attrs = img_node.get_attributes();
let keep = attrs.iter().any(|(name, value)| {
name == "src"
|| name == "srcset"
|| name == "data-src"
|| name == "data-srcset"
|| img_regex.is_match(&value)
});
if !keep {
img_node.unlink();
}
}
// Next find noscript and try to extract its image
let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?;
for mut noscript_node in noscript_nodes {
// Parse content of noscript and make sure it only contains image
if !Util::is_single_image(&noscript_node) {
continue;
}
// If noscript has previous sibling and it only contains image,
// replace it with noscript content. However we also keep old
// attributes that might contains image.
if let Some(prev) = noscript_node.get_prev_element_sibling() {
if Util::is_single_image(&prev) {
{
let mut prev_img = prev.clone();
if prev_img.get_name().to_uppercase() != "IMG" {
if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img").into_iter().next() {
prev_img = img_node;
}
}
let new_img = Util::get_elements_by_tag_name(&noscript_node, "img").into_iter().next();
if let Some(mut new_img) = new_img {
for (key, value) in prev_img.get_attributes() {
if value.is_empty() {
continue;
}
if key == "src" || key == "srcset" || img_regex.is_match(&value) {
if new_img.get_attribute(&key).as_deref() == Some(&value) {
continue;
}
let mut attr_name = key;
if new_img.has_attribute(&attr_name) {
attr_name = format!("data-old-{attr_name}");
}
new_img.set_attribute(&attr_name, &value).unwrap();
}
}
}
}
if let Some(mut parent) = noscript_node.get_parent() {
if let Some(first_child) = noscript_node.get_first_child() {
parent.replace_child_node(first_child, prev).unwrap();
noscript_node.unlink();
}
}
}
}
}
Ok(())
}
fn extract_body(
context: &Context,
root: &mut Node,
@ -726,4 +824,76 @@ impl FullTextParser {
Ok(())
}
/// Final cleanup pass over the extracted article tree:
/// normalize class attributes, then collapse pointless wrapper elements.
pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
    Self::clean_classes(root)?;
    Self::simplify_nested_elements(root)
}
/// Walk the whole subtree in document order, keeping only the "page" class
/// marker on each element and stripping the internal readability scoring
/// attribute.
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
    let mut cursor = Some(root.clone());

    while let Some(mut current) = cursor {
        let keeps_page_marker = current.get_class_names().contains("page");

        // Either reduce the class list to just "page" or drop it entirely.
        if keeps_page_marker {
            current.set_attribute("class", "page").map_err(|e| {
                log::error!("{e}");
                FullTextParserError::Xml
            })?;
        } else {
            current.remove_attribute("class").map_err(|e| {
                log::error!("{e}");
                FullTextParserError::Xml
            })?;
        }

        // "content_score" is a readability-internal bookkeeping attribute;
        // it must not leak into the final article HTML.
        current.remove_attribute("content_score").map_err(|e| {
            log::error!("{e}");
            FullTextParserError::Xml
        })?;

        cursor = Util::next_node(&current, false);
    }
    Ok(())
}
/// Collapse redundant DIV/SECTION wrappers in the article tree:
/// empty wrappers are removed outright, and a wrapper whose only payload is a
/// single DIV/SECTION child is replaced by that child (the wrapper's
/// attributes are copied onto the child first).
fn simplify_nested_elements(root: &mut Node) -> Result<(), FullTextParserError> {
    // Depth-first cursor over the whole subtree.
    let mut node_iter = Some(root.clone());

    while let Some(mut node) = node_iter {
        let tag_name = node.get_name().to_uppercase();

        // Only plain DIV/SECTION wrappers that are not the root (have a
        // parent) are candidates; ARTICLE is always kept.
        if tag_name != "ARTICLE"
            && node.get_parent().is_some()
            && (tag_name == "DIV" || tag_name == "SECTION")
        {
            if Util::is_element_without_content(&node) {
                // Empty wrapper: unlink it and jump to the next node outside
                // the removed subtree.
                node_iter = Util::remove_and_next(&mut node);
                continue;
            } else if Util::has_single_tag_inside_element(&node, "DIV")
                || Util::has_single_tag_inside_element(&node, "SECTION")
            {
                if let Some(mut parent) = node.get_parent() {
                    // NOTE(review): this takes the first child *node*, which
                    // may be a whitespace text node rather than the single
                    // DIV/SECTION element — confirm intended.
                    if let Some(mut child) = node.get_child_nodes().into_iter().next() {
                        // Preserve the wrapper's attributes on the promoted child.
                        for (k, v) in node.get_attributes().into_iter() {
                            child.set_attribute(&k, &v).map_err(|e| {
                                log::error!("{e}");
                                FullTextParserError::Xml
                            })?;
                        }

                        // Replace the wrapper with its child in the parent.
                        parent
                            .replace_child_node(child, node.clone())
                            .map_err(|e| {
                                log::error!("{e}");
                                FullTextParserError::Xml
                            })?;
                    }
                }
            }
        }

        node_iter = Util::next_node(&node, false);
    }
    Ok(())
}
}

View file

@ -9,7 +9,7 @@ use libxml::tree::{node, Document, Node, NodeType};
use self::state::State;
use super::error::FullTextParserError;
use crate::constants;
use crate::{constants, util::Util};
pub struct Readability;
@ -43,13 +43,13 @@ impl Readability {
None => match_string,
};
if !Self::is_probably_visible(node_ref) {
node = Self::remove_and_next(node_ref);
if !Util::is_probably_visible(node_ref) {
node = Util::remove_and_next(node_ref);
continue;
}
if Self::check_byline(node_ref, &match_string, &mut state) {
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -57,7 +57,7 @@ impl Readability {
&& Self::header_duplicates_title(node_ref, title)
{
state.should_remove_title_header = false;
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -65,18 +65,18 @@ impl Readability {
if state.strip_unlikely {
if constants::UNLIELY_CANDIDATES.is_match(&match_string)
&& !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
&& !Self::has_ancestor_tag(node_ref, "table", None)
&& !Self::has_ancestor_tag(node_ref, "code", None)
&& !Util::has_ancestor_tag(node_ref, "table", None)
&& !Util::has_ancestor_tag(node_ref, "code", None)
&& tag_name != "BODY"
&& tag_name != "A"
{
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
if let Some(role) = node_ref.get_attribute("role") {
if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
}
@ -92,9 +92,9 @@ impl Readability {
|| tag_name == "H4"
|| tag_name == "H5"
|| tag_name == "H6")
&& Self::is_element_without_content(node_ref)
&& Util::is_element_without_content(node_ref)
{
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -110,7 +110,7 @@ impl Readability {
if Self::is_phrasing_content(&child_node) {
if let Some(p) = p.as_mut() {
let _ = p.add_child(&mut child_node);
} else if !Self::is_whitespace(&child_node) {
} else if !Util::is_whitespace(&child_node) {
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
node_ref
@ -127,7 +127,7 @@ impl Readability {
}
} else if let Some(p) = p.as_mut() {
for mut r_node in p.get_child_nodes().into_iter().rev() {
if Self::is_whitespace(&r_node) {
if Util::is_whitespace(&r_node) {
r_node.unlink();
}
}
@ -138,8 +138,8 @@ impl Readability {
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if Self::has_single_tag_inside_element(node_ref, "P")
&& Self::get_link_density(node_ref) < 0.25
if Util::has_single_tag_inside_element(node_ref, "P")
&& Util::get_link_density(node_ref) < 0.25
{
if let Some(new_node) = node_ref.get_child_nodes().first() {
if let Some(mut parent) = node_ref.get_parent() {
@ -154,14 +154,14 @@ impl Readability {
continue;
}
}
} else if !Self::has_child_block_element(node_ref)
} else if !Util::has_child_block_element(node_ref)
&& node_ref.set_name("P").is_ok()
{
elements_to_score.push(node_ref.clone());
}
}
node = Self::next_node(node_ref, false);
node = Util::next_node(node_ref, false);
}
let mut candidates = Vec::new();
@ -173,7 +173,7 @@ impl Readability {
continue;
}
let inner_text = Self::get_inner_text(&element_to_score, true);
let inner_text = Util::get_inner_text(&element_to_score, true);
// If this paragraph is less than 25 characters, don't even count it.
if inner_text.len() < 25 {
@ -181,7 +181,7 @@ impl Readability {
}
// Exclude nodes with no ancestor.
let ancestors = Self::get_node_ancestors(&element_to_score, 5);
let ancestors = Util::get_node_ancestors(&element_to_score, 5);
if ancestors.is_empty() {
continue;
}
@ -234,7 +234,7 @@ impl Readability {
// should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation.
if let Some(content_score) = Self::get_content_score(candidate) {
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
Self::set_content_score(candidate, candidate_score)?;
}
}
@ -242,7 +242,7 @@ impl Readability {
candidates.sort_by(|a, b| {
if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
{
a.partial_cmp(&b).unwrap_or(Ordering::Equal)
b.partial_cmp(&a).unwrap_or(Ordering::Equal)
} else {
Ordering::Equal
}
@ -317,7 +317,7 @@ impl Readability {
// The scores shouldn't get too low.
let score_threshold = last_score / 3.0;
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
if parent_of_top_candidate
.as_ref()
.map(|n| Self::get_content_score(n).is_none())
@ -354,7 +354,7 @@ impl Readability {
// joining logic when adjacent content is actually located in parent's sibling node.
parent_of_top_candidate = top_candidate.get_parent();
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
&& parent_of_top_candidate
.as_ref()
.map(|n| n.get_child_elements().len() == 1)
@ -414,8 +414,8 @@ impl Readability {
{
append = true;
} else if sibling.get_name().to_uppercase() == "P" {
let link_density = Self::get_link_density(&sibling);
let node_content = Self::get_inner_text(&sibling, false);
let link_density = Util::get_link_density(&sibling);
let node_content = Util::get_inner_text(&sibling, false);
let node_length = node_content.len();
if node_length > 80
@ -432,7 +432,8 @@ impl Readability {
if append {
log::debug!("Appending node: {sibling:?}");
if !constants::ALTER_TO_DIV_EXCEPTIONS.contains(sibling.get_name().as_str())
if !constants::ALTER_TO_DIV_EXCEPTIONS
.contains(sibling.get_name().to_uppercase().as_str())
{
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
@ -503,7 +504,7 @@ impl Readability {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
let text = Self::get_inner_text(&article_content, true);
let text = Util::get_inner_text(&article_content, true);
let text_length = text.len();
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
@ -525,12 +526,14 @@ impl Readability {
attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
// But first check if we actually have something
if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
best_attempt.unlink();
root.add_child(&mut best_attempt).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
if let Some((best_attempt, _len, _document)) = attempts.pop() {
for mut child in best_attempt.get_child_nodes() {
child.unlink();
root.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
}
parse_successful = true;
}
@ -541,10 +544,13 @@ impl Readability {
.dup()
.map_err(|()| FullTextParserError::Readability)?;
} else {
root.add_child(&mut article_content).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
for mut child in article_content.get_child_nodes() {
child.unlink();
root.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
}
return Ok(parse_successful);
}
}
@ -563,83 +569,6 @@ impl Readability {
})
}
fn is_probably_visible(node: &Node) -> bool {
let display_none = node
.get_attribute("display")
.map(|display| display == "none")
.unwrap_or(false);
let is_hidden = node.has_attribute("hidden");
let aria_hidden = node
.get_attribute("aria-hidden")
.map(|attr| attr == "true")
.unwrap_or(false);
let has_fallback_image = node.get_class_names().contains("fallback-image");
!display_none && !is_hidden && !aria_hidden || has_fallback_image
}
fn is_whitespace(node: &Node) -> bool {
let is_text_node = node
.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false);
let is_element_node = node
.get_type()
.map(|t| t == NodeType::ElementNode)
.unwrap_or(false);
(is_text_node && node.get_content().trim().is_empty())
|| (is_element_node && node.get_name().to_uppercase() == "BR")
}
fn remove_and_next(node: &mut Node) -> Option<Node> {
let next_node = Self::next_node(node, true);
node.unlink();
next_node
}
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
let mut node = node.clone();
// First check for kids if those aren't being ignored
let first_child = node.get_first_child();
if !ignore_self_and_kids && first_child.is_some() {
return first_child;
}
// Then for siblings...
let next_sibling = node.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
loop {
let parent = node.get_parent();
if parent.is_none() {
break;
}
if let Some(parent) = parent {
let parent_name = parent.get_name().to_uppercase();
if parent_name == "HTML" {
break;
}
let next_sibling = parent.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
} else {
node = parent;
}
}
}
None
}
fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
if state.byline.is_some() {
return false;
@ -680,123 +609,15 @@ impl Readability {
if name != "h1" && name != "h2" {
return false;
}
let heading = Self::get_inner_text(node, false);
let heading = Util::get_inner_text(node, false);
if let Some(title) = title {
Self::text_similarity(&heading, title) > 0.75
Util::text_similarity(&heading, title) > 0.75
} else {
false
}
}
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
let content = node.get_content().trim().to_owned();
if normalize_spaces {
constants::NORMALIZE.replace(&content, " ").into()
} else {
content
}
}
fn text_similarity(a: &str, b: &str) -> f64 {
let a = a.to_lowercase();
let b = b.to_lowercase();
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
if tokens_a.is_empty() || tokens_b.is_empty() {
return 0.0;
}
let tokens_b_total = tokens_b.join(" ").len() as f64;
let uniq_tokens_b = tokens_b
.into_iter()
.filter(|token| !tokens_a.iter().any(|t| t == token))
.collect::<Vec<_>>();
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
let distance_b = uniq_tokens_b_total / tokens_b_total;
1.0 - distance_b
}
fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
let max_depth = max_depth.unwrap_or(3);
let tag_name = tag_name.to_uppercase();
let mut depth = 0;
let mut node = node.get_parent();
loop {
if depth > max_depth {
return false;
}
let tmp_node = match node {
Some(node) => node,
None => return false,
};
if tmp_node.get_name() == tag_name {
return true;
}
node = tmp_node.get_parent();
depth += 1;
}
}
fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
// There should be exactly 1 element child with given tag
if node.get_child_nodes().len() == 1
|| node
.get_child_nodes()
.first()
.map(|n| n.get_name().to_uppercase() == tag)
.unwrap_or(false)
{
return false;
}
// And there should be no text nodes with real content
node.get_child_nodes().iter().any(|n| {
n.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false)
&& constants::HAS_CONTENT.is_match(&n.get_content())
})
}
fn is_element_without_content(node: &Node) -> bool {
if let Some(node_type) = node.get_type() {
let len = node.get_child_nodes().len();
return node_type == NodeType::ElementNode
&& node.get_content().trim().is_empty()
&& (len == 0
|| len
== Self::get_elements_by_tag_name(node, "br").len()
+ Self::get_elements_by_tag_name(node, "hr").len());
}
false
}
fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
let tag = tag.to_uppercase();
let all_tags = tag == "*";
let mut vec = Vec::new();
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
for child in node.get_child_elements() {
if all_tags || child.get_name().to_uppercase() == tag {
vec.push(child.clone());
}
get_elems(&child, tag, vec, all_tags);
}
}
get_elems(node, &tag, &mut vec, all_tags);
vec
}
fn is_phrasing_content(node: &Node) -> bool {
let tag_name = node.get_name().to_uppercase();
let is_text_node = node
@ -814,56 +635,6 @@ impl Readability {
.all(|val| val)
}
fn get_link_density(node: &Node) -> f64 {
let text_length = Self::get_inner_text(node, false).len();
if text_length == 0 {
return 0.0;
}
let mut link_length = 0.0;
// XXX implement _reduceNodeList?
let link_nodes = Self::get_elements_by_tag_name(node, "A");
for link_node in link_nodes {
if let Some(href) = link_node.get_attribute("href") {
let coefficient = if constants::HASH_URL.is_match(&href) {
0.3
} else {
1.0
};
link_length += Self::get_inner_text(&link_node, false).len() as f64 * coefficient;
}
}
link_length / text_length as f64
}
// Determine whether element has any children block level elements.
fn has_child_block_element(node: &Node) -> bool {
node.get_child_elements().iter().any(|node| {
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|| Self::has_child_block_element(node)
})
}
fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
let mut ancestors = Vec::new();
let mut node = node.clone();
for _ in 0..=max_depth {
let parent = node.get_parent();
match parent {
Some(parent) => {
ancestors.push(parent.clone());
node = parent;
}
None => return ancestors,
}
}
ancestors
}
// Initialize a node with the readability object. Also checks the
// className/id for special names to add to its score.
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
@ -910,9 +681,4 @@ impl Readability {
weight
}
fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
.unwrap_or(false)
}
}

View file

@ -14,6 +14,7 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
let article = Article {
title: None,
author: None,
@ -34,9 +35,18 @@ async fn test_1() {
let url = Url::parse("http://google.com").unwrap();
let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
let mut article_document = Document::new().unwrap();
let mut root = Node::new("article", None, &document).unwrap();
article_document.set_root_element(&root);
metadata::extract(&xpath_ctx, None, None, &mut article);
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
if let Some(mut root) = article_document.get_root_element() {
crate::FullTextParser::post_process_content(&mut root).unwrap();
}
article.document = Some(article_document);
let html = article.get_content().unwrap();
std::fs::write("test.html", html).unwrap();
}

View file

@ -1,4 +1,5 @@
use super::FullTextParser;
use super::{FullTextParser, config::ConfigEntry};
use libxml::tree::SaveOptions;
use reqwest::Client;
use std::path::PathBuf;
@ -72,3 +73,49 @@ async fn encoding_windows_1252() {
.unwrap();
assert!(html.contains("Bund-Länder-Konferenz"));
}
#[tokio::test]
// End-to-end check for FullTextParser::unwrap_noscript_images: a lazy-load
// placeholder <img> followed by a <noscript> fallback image must be replaced
// by the fallback, with the placeholder's src preserved as `data-old-src`.
// NOTE(review): the expected string is whitespace-sensitive — it must match
// libxml's `format: true` serialization exactly.
async fn unwrap_noscript_images() {
let _ = env_logger::builder().is_test(true).try_init();
// Input: placeholder image plus <noscript> fallback (as e.g. Medium emits).
let html = r#"
<p>Lorem ipsum dolor sit amet,
<span class="lazyload">
<img src="foto-m0101.jpg" alt="image description">
<noscript><img src="foto-m0102.jpg" alt="image description"></noscript>
</span>
consectetur adipiscing elit.
</p>
"#;
// Expected output: the <noscript> is gone, the fallback image replaced the
// placeholder, and the old src survives as data-old-src.
let expected = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>
<p>Lorem ipsum dolor sit amet,
<span class="lazyload">
<img src="foto-m0102.jpg" alt="image description" data-old-src="foto-m0101.jpg">
</span>
consectetur adipiscing elit.
</p>
</body></html>
"#;
let empty_config = ConfigEntry::default();
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
// Serialize with the same options Article::get_content uses.
let options = SaveOptions {
format: true,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let res = document.to_string_with_options(options);
assert_eq!(res, expected);
}

View file

@ -1,11 +1,17 @@
use libxml::{tree::Node, xpath::Context};
use libxml::{
tree::{Node, NodeType},
xpath::Context,
};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
Response,
};
use tokio::fs::DirEntry;
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
use crate::{
constants,
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
};
pub struct Util;
@ -219,4 +225,258 @@ impl Util {
}
Ok(())
}
/// Heuristic visibility check, mirroring Readability.js `_isProbablyVisible`.
///
/// A node counts as hidden when it carries display:none, the `hidden`
/// attribute, or `aria-hidden="true"` — except that the "fallback-image"
/// class (used by lazy-load markup) overrides `aria-hidden` only.
pub fn is_probably_visible(node: &Node) -> bool {
    // NOTE(review): this reads an attribute literally named "display";
    // Readability.js inspects the inline `style` for `display: none` —
    // confirm whether the `style` attribute should be parsed here instead.
    let display_none = node
        .get_attribute("display")
        .map(|display| display == "none")
        .unwrap_or(false);
    let is_hidden = node.has_attribute("hidden");
    let aria_hidden = node
        .get_attribute("aria-hidden")
        .map(|attr| attr == "true")
        .unwrap_or(false);
    let has_fallback_image = node.get_class_names().contains("fallback-image");

    // FIX: previously `!display_none && !is_hidden && !aria_hidden || has_fallback_image`,
    // whose precedence let a "fallback-image" class resurrect nodes that are
    // display:none or `hidden`. Upstream scopes the override to aria-hidden.
    !display_none && !is_hidden && (!aria_hidden || has_fallback_image)
}
/// True for nodes that contribute no visible content: text nodes holding only
/// whitespace, and `<br>` elements.
pub fn is_whitespace(node: &Node) -> bool {
    match node.get_type() {
        Some(NodeType::TextNode) => node.get_content().trim().is_empty(),
        Some(NodeType::ElementNode) => node.get_name().to_uppercase() == "BR",
        _ => false,
    }
}
/// Unlink `node` from the tree and return the node a depth-first traversal
/// would visit next, skipping the removed subtree entirely.
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
    // Compute the successor *before* unlinking, while the node still has
    // siblings/parents to walk to.
    let successor = Self::next_node(node, true);
    node.unlink();
    successor
}
/// Return the next node in depth-first document order, or `None` when the
/// traversal is exhausted (reaching the `<html>` element's level ends it).
/// With `ignore_self_and_kids` set, the node's own children are skipped —
/// used when the current subtree has just been removed.
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    let mut current = node.clone();

    // Descend first, unless the caller wants the subtree skipped.
    if !ignore_self_and_kids {
        if let Some(child) = current.get_first_child() {
            return Some(child);
        }
    }

    // Otherwise try the next sibling at this level.
    if let Some(sibling) = current.get_next_sibling() {
        return Some(sibling);
    }

    // Finally climb the parent chain looking for an unvisited sibling.
    // (Depth-first order: the parents themselves were already visited.)
    while let Some(parent) = current.get_parent() {
        // Stop at the document level — don't walk past <html>.
        if parent.get_name().to_uppercase() == "HTML" {
            return None;
        }
        if let Some(sibling) = parent.get_next_sibling() {
            return Some(sibling);
        }
        current = parent;
    }
    None
}
/// Return the node's trimmed text content; with `normalize_spaces`, runs of
/// internal whitespace are additionally collapsed to single spaces.
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
    let text = node.get_content().trim().to_owned();
    if !normalize_spaces {
        return text;
    }
    constants::NORMALIZE.replace(&text, " ").into()
}
/// Rough similarity of two strings in [0, 1]: the fraction of `b`'s token
/// mass (by joined length) that also appears among `a`'s tokens. Returns 0.0
/// when either side tokenizes to nothing.
pub fn text_similarity(a: &str, b: &str) -> f64 {
    let lower_a = a.to_lowercase();
    let lower_b = b.to_lowercase();
    let tokens_a: Vec<_> = constants::TOKENIZE.split(&lower_a).collect();
    let tokens_b: Vec<_> = constants::TOKENIZE.split(&lower_b).collect();

    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    // Total length of b's tokens, joined with single spaces.
    let total_len = tokens_b.join(" ").len() as f64;

    // Tokens of b that never occur in a.
    let unique_to_b: Vec<_> = tokens_b
        .iter()
        .filter(|token| !tokens_a.contains(token))
        .copied()
        .collect();
    let unique_len = unique_to_b.join(" ").len() as f64;

    // Share of b that *is* covered by a.
    1.0 - unique_len / total_len
}
/// Check whether any ancestor of `node`, up to `max_depth` levels (default 3),
/// has the given tag name. The comparison is case-insensitive.
pub fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
    let max_depth = max_depth.unwrap_or(3);
    let tag_name = tag_name.to_uppercase();
    let mut depth = 0;
    let mut node = node.get_parent();

    loop {
        if depth > max_depth {
            return false;
        }

        let tmp_node = match node {
            Some(node) => node,
            None => return false,
        };

        // FIX: compare uppercased names on both sides. libxml reports parsed
        // HTML element names in lowercase, while `tag_name` was uppercased
        // above — the old raw comparison could never match (callers pass
        // "table"/"code").
        if tmp_node.get_name().to_uppercase() == tag_name {
            return true;
        }

        node = tmp_node.get_parent();
        depth += 1;
    }
}
/// Check whether `node` contains exactly one child element with tag name
/// `tag` and no text nodes with real content. Mirrors Readability.js
/// `_hasSingleTagInsideElement`. `tag` must be passed uppercased (e.g. "P").
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
    // There should be exactly 1 element child with the given tag.
    // FIX: both conditions were inverted (`== 1 ||` / `== tag`), making the
    // function bail out for precisely the single-child case it is supposed
    // to inspect. Element children are checked (as in Readability.js), so
    // surrounding whitespace text nodes don't disqualify the wrapper.
    let child_elements = node.get_child_elements();
    let single_match = child_elements.len() == 1
        && child_elements
            .first()
            .map(|n| n.get_name().to_uppercase() == tag)
            .unwrap_or(false);
    if !single_match {
        return false;
    }

    // And there should be no text nodes with real content.
    // FIX: the result was previously missing this negation.
    !node.get_child_nodes().iter().any(|n| {
        n.get_type()
            .map(|t| t == NodeType::TextNode)
            .unwrap_or(false)
            && constants::HAS_CONTENT.is_match(&n.get_content())
    })
}
/// True for element nodes that carry no visible content: empty text overall,
/// and either no children at all or nothing but `<br>`/`<hr>` descendants.
pub fn is_element_without_content(node: &Node) -> bool {
    match node.get_type() {
        Some(NodeType::ElementNode) => {
            let children = node.get_child_nodes();
            node.get_content().trim().is_empty()
                && (children.is_empty()
                    || children.len()
                        == Self::get_elements_by_tag_name(node, "br").len()
                            + Self::get_elements_by_tag_name(node, "hr").len())
        }
        // Non-element nodes (and nodes without a type) never qualify.
        _ => false,
    }
}
/// Collect all descendant elements with the given tag name (case-insensitive)
/// in document (preorder) order; `"*"` matches every element. The node
/// itself is not included.
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
    let wanted = tag.to_uppercase();
    let match_all = wanted == "*";
    let mut found = Vec::new();

    // Explicit stack instead of recursion; children are pushed in reverse so
    // popping yields preorder (first child first).
    let mut stack = node.get_child_elements();
    stack.reverse();

    while let Some(current) = stack.pop() {
        if match_all || current.get_name().to_uppercase() == wanted {
            found.push(current.clone());
        }
        let mut kids = current.get_child_elements();
        kids.reverse();
        stack.extend(kids);
    }

    found
}
/// Fraction of the node's text length that sits inside `<a>` descendants.
/// Pure fragment links (`#...`) are discounted to 30% of their length.
/// Returns 0.0 for nodes with no text at all.
pub fn get_link_density(node: &Node) -> f64 {
    let text_length = Util::get_inner_text(node, false).len();
    if text_length == 0 {
        return 0.0;
    }

    // XXX implement _reduceNodeList?
    let link_length: f64 = Util::get_elements_by_tag_name(node, "A")
        .into_iter()
        .filter_map(|link_node| {
            // Links without an href contribute nothing.
            link_node.get_attribute("href").map(|href| {
                let coefficient = if constants::HASH_URL.is_match(&href) {
                    0.3
                } else {
                    1.0
                };
                Util::get_inner_text(&link_node, false).len() as f64 * coefficient
            })
        })
        .sum();

    link_length / text_length as f64
}
/// Determine whether the element has any block-level elements anywhere in
/// its descendant tree (per the DIV_TO_P_ELEMS tag set).
pub fn has_child_block_element(node: &Node) -> bool {
    let children = node.get_child_elements();
    children.iter().any(|child| {
        constants::DIV_TO_P_ELEMS.contains(child.get_name().as_str())
            || Self::has_child_block_element(child)
    })
}
/// Collect up to `max_depth + 1` ancestors of `node`, nearest first,
/// stopping early at the tree root.
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
    let mut chain = Vec::new();
    let mut current = node.clone();

    // `0..=max_depth` intentionally yields max_depth + 1 steps, matching the
    // original traversal bound.
    for _ in 0..=max_depth {
        match current.get_parent() {
            Some(parent) => {
                chain.push(parent.clone());
                current = parent;
            }
            None => break,
        }
    }

    chain
}
/// Case-insensitive tag-name test; `None` nodes never match.
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
    match node {
        Some(n) => n.get_name().to_uppercase() == tag_name.to_uppercase(),
        None => false,
    }
}
// Check if node is an image, or if node contains exactly one image —
// whether as a direct child or among its descendants (wrapper chains with
// no interleaved text are followed downward).
pub fn is_single_image(node: &Node) -> bool {
    // An <img> itself trivially qualifies.
    if node.get_name().to_uppercase() == "IMG" {
        return true;
    }

    // Otherwise: exactly one child and no text content, then recurse.
    let children = node.get_child_nodes();
    if children.len() != 1 || !node.get_content().trim().is_empty() {
        return false;
    }

    children.first().map(Self::is_single_image).unwrap_or(false)
}
}