mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
unwrap noscript images
This commit is contained in:
parent
98c06e11f4
commit
7ae98904d4
6 changed files with 537 additions and 284 deletions
|
@ -18,7 +18,7 @@ impl Article {
|
|||
pub fn get_content(&self) -> Option<String> {
|
||||
// serialize content
|
||||
let options = SaveOptions {
|
||||
format: false,
|
||||
format: true,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
|
|
|
@ -19,6 +19,7 @@ use libxml::parser::Parser;
|
|||
use libxml::tree::{Document, Node};
|
||||
use libxml::xpath::Context;
|
||||
use log::{debug, error, info, warn};
|
||||
use regex::Regex;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
use std::path::Path;
|
||||
|
@ -124,6 +125,10 @@ impl FullTextParser {
|
|||
return Err(error);
|
||||
}
|
||||
|
||||
if let Some(mut root) = document.get_root_element() {
|
||||
Self::post_process_content(&mut root)?;
|
||||
}
|
||||
|
||||
article.document = Some(document);
|
||||
|
||||
Ok(article)
|
||||
|
@ -179,6 +184,7 @@ impl FullTextParser {
|
|||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
}
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
Self::unwrap_noscript_images(&xpath_ctx)?;
|
||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
if !found_body {
|
||||
|
@ -195,6 +201,7 @@ impl FullTextParser {
|
|||
document = Self::parse_html(&html, config, global_config)?;
|
||||
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, &url);
|
||||
Self::unwrap_noscript_images(&xpath_ctx)?;
|
||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
}
|
||||
|
||||
|
@ -609,6 +616,12 @@ impl FullTextParser {
|
|||
// strip all comments
|
||||
let _ = Util::strip_node(context, "//comment()");
|
||||
|
||||
// strip all scripts
|
||||
let _ = Util::strip_node(context, "//script");
|
||||
|
||||
// strip all styles
|
||||
let _ = Util::strip_node(context, "//style");
|
||||
|
||||
// strip all empty url-tags <a/>
|
||||
let _ = Util::strip_node(context, "//a[not(node())]");
|
||||
|
||||
|
@ -616,6 +629,91 @@ impl FullTextParser {
|
|||
let _ = Util::strip_node(context, "//*[@type='text/css']");
|
||||
}
|
||||
|
||||
/**
|
||||
* Find all <noscript> that are located after <img> nodes, and which contain only one
|
||||
* <img> element. Replace the first image with the image from inside the <noscript> tag,
|
||||
* and remove the <noscript> tag. This improves the quality of the images we use on
|
||||
* some sites (e.g. Medium).
|
||||
**/
|
||||
fn unwrap_noscript_images(ctx: &Context) -> Result<(), FullTextParserError> {
|
||||
// Find img without source or attributes that might contains image, and remove it.
|
||||
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
|
||||
let img_regex = Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).unwrap();
|
||||
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
|
||||
for mut img_node in img_nodes {
|
||||
let attrs = img_node.get_attributes();
|
||||
|
||||
let keep = attrs.iter().any(|(name, value)| {
|
||||
name == "src"
|
||||
|| name == "srcset"
|
||||
|| name == "data-src"
|
||||
|| name == "data-srcset"
|
||||
|| img_regex.is_match(&value)
|
||||
});
|
||||
if !keep {
|
||||
img_node.unlink();
|
||||
}
|
||||
}
|
||||
|
||||
// Next find noscript and try to extract its image
|
||||
let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?;
|
||||
for mut noscript_node in noscript_nodes {
|
||||
// Parse content of noscript and make sure it only contains image
|
||||
if !Util::is_single_image(&noscript_node) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If noscript has previous sibling and it only contains image,
|
||||
// replace it with noscript content. However we also keep old
|
||||
// attributes that might contains image.
|
||||
if let Some(prev) = noscript_node.get_prev_element_sibling() {
|
||||
if Util::is_single_image(&prev) {
|
||||
|
||||
{
|
||||
let mut prev_img = prev.clone();
|
||||
|
||||
if prev_img.get_name().to_uppercase() != "IMG" {
|
||||
if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img").into_iter().next() {
|
||||
prev_img = img_node;
|
||||
}
|
||||
}
|
||||
|
||||
let new_img = Util::get_elements_by_tag_name(&noscript_node, "img").into_iter().next();
|
||||
if let Some(mut new_img) = new_img {
|
||||
for (key, value) in prev_img.get_attributes() {
|
||||
if value.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if key == "src" || key == "srcset" || img_regex.is_match(&value) {
|
||||
if new_img.get_attribute(&key).as_deref() == Some(&value) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut attr_name = key;
|
||||
if new_img.has_attribute(&attr_name) {
|
||||
attr_name = format!("data-old-{attr_name}");
|
||||
}
|
||||
|
||||
new_img.set_attribute(&attr_name, &value).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(mut parent) = noscript_node.get_parent() {
|
||||
if let Some(first_child) = noscript_node.get_first_child() {
|
||||
parent.replace_child_node(first_child, prev).unwrap();
|
||||
noscript_node.unlink();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn extract_body(
|
||||
context: &Context,
|
||||
root: &mut Node,
|
||||
|
@ -726,4 +824,76 @@ impl FullTextParser {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
Self::clean_classes(root)?;
|
||||
Self::simplify_nested_elements(root)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
let classes = node.get_class_names();
|
||||
if classes.contains("page") {
|
||||
node.set_attribute("class", "page").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
} else {
|
||||
node.remove_attribute("class").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
|
||||
node.remove_attribute("content_score").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node_iter = Util::next_node(&node, false);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn simplify_nested_elements(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
if tag_name != "ARTICLE"
|
||||
&& node.get_parent().is_some()
|
||||
&& (tag_name == "DIV" || tag_name == "SECTION")
|
||||
{
|
||||
if Util::is_element_without_content(&node) {
|
||||
node_iter = Util::remove_and_next(&mut node);
|
||||
continue;
|
||||
} else if Util::has_single_tag_inside_element(&node, "DIV")
|
||||
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
||||
{
|
||||
if let Some(mut parent) = node.get_parent() {
|
||||
if let Some(mut child) = node.get_child_nodes().into_iter().next() {
|
||||
for (k, v) in node.get_attributes().into_iter() {
|
||||
child.set_attribute(&k, &v).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
parent
|
||||
.replace_child_node(child, node.clone())
|
||||
.map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node_iter = Util::next_node(&node, false);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ use libxml::tree::{node, Document, Node, NodeType};
|
|||
|
||||
use self::state::State;
|
||||
use super::error::FullTextParserError;
|
||||
use crate::constants;
|
||||
use crate::{constants, util::Util};
|
||||
|
||||
pub struct Readability;
|
||||
|
||||
|
@ -43,13 +43,13 @@ impl Readability {
|
|||
None => match_string,
|
||||
};
|
||||
|
||||
if !Self::is_probably_visible(node_ref) {
|
||||
node = Self::remove_and_next(node_ref);
|
||||
if !Util::is_probably_visible(node_ref) {
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
if Self::check_byline(node_ref, &match_string, &mut state) {
|
||||
node = Self::remove_and_next(node_ref);
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,7 @@ impl Readability {
|
|||
&& Self::header_duplicates_title(node_ref, title)
|
||||
{
|
||||
state.should_remove_title_header = false;
|
||||
node = Self::remove_and_next(node_ref);
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -65,18 +65,18 @@ impl Readability {
|
|||
if state.strip_unlikely {
|
||||
if constants::UNLIELY_CANDIDATES.is_match(&match_string)
|
||||
&& !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
|
||||
&& !Self::has_ancestor_tag(node_ref, "table", None)
|
||||
&& !Self::has_ancestor_tag(node_ref, "code", None)
|
||||
&& !Util::has_ancestor_tag(node_ref, "table", None)
|
||||
&& !Util::has_ancestor_tag(node_ref, "code", None)
|
||||
&& tag_name != "BODY"
|
||||
&& tag_name != "A"
|
||||
{
|
||||
node = Self::remove_and_next(node_ref);
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(role) = node_ref.get_attribute("role") {
|
||||
if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
|
||||
node = Self::remove_and_next(node_ref);
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -92,9 +92,9 @@ impl Readability {
|
|||
|| tag_name == "H4"
|
||||
|| tag_name == "H5"
|
||||
|| tag_name == "H6")
|
||||
&& Self::is_element_without_content(node_ref)
|
||||
&& Util::is_element_without_content(node_ref)
|
||||
{
|
||||
node = Self::remove_and_next(node_ref);
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -110,7 +110,7 @@ impl Readability {
|
|||
if Self::is_phrasing_content(&child_node) {
|
||||
if let Some(p) = p.as_mut() {
|
||||
let _ = p.add_child(&mut child_node);
|
||||
} else if !Self::is_whitespace(&child_node) {
|
||||
} else if !Util::is_whitespace(&child_node) {
|
||||
let mut new_node = Node::new("p", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
node_ref
|
||||
|
@ -127,7 +127,7 @@ impl Readability {
|
|||
}
|
||||
} else if let Some(p) = p.as_mut() {
|
||||
for mut r_node in p.get_child_nodes().into_iter().rev() {
|
||||
if Self::is_whitespace(&r_node) {
|
||||
if Util::is_whitespace(&r_node) {
|
||||
r_node.unlink();
|
||||
}
|
||||
}
|
||||
|
@ -138,8 +138,8 @@ impl Readability {
|
|||
// element. DIVs with only a P element inside and no text content can be
|
||||
// safely converted into plain P elements to avoid confusing the scoring
|
||||
// algorithm with DIVs with are, in practice, paragraphs.
|
||||
if Self::has_single_tag_inside_element(node_ref, "P")
|
||||
&& Self::get_link_density(node_ref) < 0.25
|
||||
if Util::has_single_tag_inside_element(node_ref, "P")
|
||||
&& Util::get_link_density(node_ref) < 0.25
|
||||
{
|
||||
if let Some(new_node) = node_ref.get_child_nodes().first() {
|
||||
if let Some(mut parent) = node_ref.get_parent() {
|
||||
|
@ -154,14 +154,14 @@ impl Readability {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
} else if !Self::has_child_block_element(node_ref)
|
||||
} else if !Util::has_child_block_element(node_ref)
|
||||
&& node_ref.set_name("P").is_ok()
|
||||
{
|
||||
elements_to_score.push(node_ref.clone());
|
||||
}
|
||||
}
|
||||
|
||||
node = Self::next_node(node_ref, false);
|
||||
node = Util::next_node(node_ref, false);
|
||||
}
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
@ -173,7 +173,7 @@ impl Readability {
|
|||
continue;
|
||||
}
|
||||
|
||||
let inner_text = Self::get_inner_text(&element_to_score, true);
|
||||
let inner_text = Util::get_inner_text(&element_to_score, true);
|
||||
|
||||
// If this paragraph is less than 25 characters, don't even count it.
|
||||
if inner_text.len() < 25 {
|
||||
|
@ -181,7 +181,7 @@ impl Readability {
|
|||
}
|
||||
|
||||
// Exclude nodes with no ancestor.
|
||||
let ancestors = Self::get_node_ancestors(&element_to_score, 5);
|
||||
let ancestors = Util::get_node_ancestors(&element_to_score, 5);
|
||||
if ancestors.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
@ -234,7 +234,7 @@ impl Readability {
|
|||
// should have a relatively small link density (5% or less) and be mostly
|
||||
// unaffected by this operation.
|
||||
if let Some(content_score) = Self::get_content_score(candidate) {
|
||||
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
|
||||
let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
|
||||
Self::set_content_score(candidate, candidate_score)?;
|
||||
}
|
||||
}
|
||||
|
@ -242,7 +242,7 @@ impl Readability {
|
|||
candidates.sort_by(|a, b| {
|
||||
if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
|
||||
{
|
||||
a.partial_cmp(&b).unwrap_or(Ordering::Equal)
|
||||
b.partial_cmp(&a).unwrap_or(Ordering::Equal)
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
|
@ -317,7 +317,7 @@ impl Readability {
|
|||
// The scores shouldn't get too low.
|
||||
let score_threshold = last_score / 3.0;
|
||||
|
||||
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
|
||||
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
|
||||
if parent_of_top_candidate
|
||||
.as_ref()
|
||||
.map(|n| Self::get_content_score(n).is_none())
|
||||
|
@ -354,7 +354,7 @@ impl Readability {
|
|||
// joining logic when adjacent content is actually located in parent's sibling node.
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
|
||||
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
|
||||
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
|
||||
&& parent_of_top_candidate
|
||||
.as_ref()
|
||||
.map(|n| n.get_child_elements().len() == 1)
|
||||
|
@ -414,8 +414,8 @@ impl Readability {
|
|||
{
|
||||
append = true;
|
||||
} else if sibling.get_name().to_uppercase() == "P" {
|
||||
let link_density = Self::get_link_density(&sibling);
|
||||
let node_content = Self::get_inner_text(&sibling, false);
|
||||
let link_density = Util::get_link_density(&sibling);
|
||||
let node_content = Util::get_inner_text(&sibling, false);
|
||||
let node_length = node_content.len();
|
||||
|
||||
if node_length > 80
|
||||
|
@ -432,7 +432,8 @@ impl Readability {
|
|||
if append {
|
||||
log::debug!("Appending node: {sibling:?}");
|
||||
|
||||
if !constants::ALTER_TO_DIV_EXCEPTIONS.contains(sibling.get_name().as_str())
|
||||
if !constants::ALTER_TO_DIV_EXCEPTIONS
|
||||
.contains(sibling.get_name().to_uppercase().as_str())
|
||||
{
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
|
@ -503,7 +504,7 @@ impl Readability {
|
|||
// grabArticle with different flags set. This gives us a higher likelihood of
|
||||
// finding the content, and the sieve approach gives us a higher likelihood of
|
||||
// finding the -right- content.
|
||||
let text = Self::get_inner_text(&article_content, true);
|
||||
let text = Util::get_inner_text(&article_content, true);
|
||||
let text_length = text.len();
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
|
@ -525,12 +526,14 @@ impl Readability {
|
|||
attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
|
||||
|
||||
// But first check if we actually have something
|
||||
if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
|
||||
best_attempt.unlink();
|
||||
root.add_child(&mut best_attempt).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
if let Some((best_attempt, _len, _document)) = attempts.pop() {
|
||||
for mut child in best_attempt.get_child_nodes() {
|
||||
child.unlink();
|
||||
root.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
parse_successful = true;
|
||||
}
|
||||
|
||||
|
@ -541,10 +544,13 @@ impl Readability {
|
|||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
} else {
|
||||
root.add_child(&mut article_content).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
child.unlink();
|
||||
root.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
return Ok(parse_successful);
|
||||
}
|
||||
}
|
||||
|
@ -563,83 +569,6 @@ impl Readability {
|
|||
})
|
||||
}
|
||||
|
||||
fn is_probably_visible(node: &Node) -> bool {
|
||||
let display_none = node
|
||||
.get_attribute("display")
|
||||
.map(|display| display == "none")
|
||||
.unwrap_or(false);
|
||||
let is_hidden = node.has_attribute("hidden");
|
||||
let aria_hidden = node
|
||||
.get_attribute("aria-hidden")
|
||||
.map(|attr| attr == "true")
|
||||
.unwrap_or(false);
|
||||
let has_fallback_image = node.get_class_names().contains("fallback-image");
|
||||
|
||||
!display_none && !is_hidden && !aria_hidden || has_fallback_image
|
||||
}
|
||||
|
||||
fn is_whitespace(node: &Node) -> bool {
|
||||
let is_text_node = node
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
.unwrap_or(false);
|
||||
let is_element_node = node
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::ElementNode)
|
||||
.unwrap_or(false);
|
||||
|
||||
(is_text_node && node.get_content().trim().is_empty())
|
||||
|| (is_element_node && node.get_name().to_uppercase() == "BR")
|
||||
}
|
||||
|
||||
fn remove_and_next(node: &mut Node) -> Option<Node> {
|
||||
let next_node = Self::next_node(node, true);
|
||||
node.unlink();
|
||||
next_node
|
||||
}
|
||||
|
||||
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
|
||||
let mut node = node.clone();
|
||||
|
||||
// First check for kids if those aren't being ignored
|
||||
let first_child = node.get_first_child();
|
||||
if !ignore_self_and_kids && first_child.is_some() {
|
||||
return first_child;
|
||||
}
|
||||
|
||||
// Then for siblings...
|
||||
let next_sibling = node.get_next_sibling();
|
||||
if next_sibling.is_some() {
|
||||
return next_sibling;
|
||||
}
|
||||
|
||||
// And finally, move up the parent chain *and* find a sibling
|
||||
// (because this is depth-first traversal, we will have already
|
||||
// seen the parent nodes themselves).
|
||||
loop {
|
||||
let parent = node.get_parent();
|
||||
if parent.is_none() {
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(parent) = parent {
|
||||
let parent_name = parent.get_name().to_uppercase();
|
||||
if parent_name == "HTML" {
|
||||
break;
|
||||
}
|
||||
|
||||
let next_sibling = parent.get_next_sibling();
|
||||
if next_sibling.is_some() {
|
||||
return next_sibling;
|
||||
} else {
|
||||
node = parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
|
||||
if state.byline.is_some() {
|
||||
return false;
|
||||
|
@ -680,123 +609,15 @@ impl Readability {
|
|||
if name != "h1" && name != "h2" {
|
||||
return false;
|
||||
}
|
||||
let heading = Self::get_inner_text(node, false);
|
||||
let heading = Util::get_inner_text(node, false);
|
||||
|
||||
if let Some(title) = title {
|
||||
Self::text_similarity(&heading, title) > 0.75
|
||||
Util::text_similarity(&heading, title) > 0.75
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
|
||||
let content = node.get_content().trim().to_owned();
|
||||
if normalize_spaces {
|
||||
constants::NORMALIZE.replace(&content, " ").into()
|
||||
} else {
|
||||
content
|
||||
}
|
||||
}
|
||||
|
||||
fn text_similarity(a: &str, b: &str) -> f64 {
|
||||
let a = a.to_lowercase();
|
||||
let b = b.to_lowercase();
|
||||
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
|
||||
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
|
||||
if tokens_a.is_empty() || tokens_b.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let tokens_b_total = tokens_b.join(" ").len() as f64;
|
||||
let uniq_tokens_b = tokens_b
|
||||
.into_iter()
|
||||
.filter(|token| !tokens_a.iter().any(|t| t == token))
|
||||
.collect::<Vec<_>>();
|
||||
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
|
||||
|
||||
let distance_b = uniq_tokens_b_total / tokens_b_total;
|
||||
1.0 - distance_b
|
||||
}
|
||||
|
||||
fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
|
||||
let max_depth = max_depth.unwrap_or(3);
|
||||
let tag_name = tag_name.to_uppercase();
|
||||
let mut depth = 0;
|
||||
let mut node = node.get_parent();
|
||||
|
||||
loop {
|
||||
if depth > max_depth {
|
||||
return false;
|
||||
}
|
||||
|
||||
let tmp_node = match node {
|
||||
Some(node) => node,
|
||||
None => return false,
|
||||
};
|
||||
|
||||
if tmp_node.get_name() == tag_name {
|
||||
return true;
|
||||
}
|
||||
|
||||
node = tmp_node.get_parent();
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
|
||||
// There should be exactly 1 element child with given tag
|
||||
if node.get_child_nodes().len() == 1
|
||||
|| node
|
||||
.get_child_nodes()
|
||||
.first()
|
||||
.map(|n| n.get_name().to_uppercase() == tag)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// And there should be no text nodes with real content
|
||||
node.get_child_nodes().iter().any(|n| {
|
||||
n.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
.unwrap_or(false)
|
||||
&& constants::HAS_CONTENT.is_match(&n.get_content())
|
||||
})
|
||||
}
|
||||
|
||||
fn is_element_without_content(node: &Node) -> bool {
|
||||
if let Some(node_type) = node.get_type() {
|
||||
let len = node.get_child_nodes().len();
|
||||
|
||||
return node_type == NodeType::ElementNode
|
||||
&& node.get_content().trim().is_empty()
|
||||
&& (len == 0
|
||||
|| len
|
||||
== Self::get_elements_by_tag_name(node, "br").len()
|
||||
+ Self::get_elements_by_tag_name(node, "hr").len());
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
|
||||
let tag = tag.to_uppercase();
|
||||
let all_tags = tag == "*";
|
||||
let mut vec = Vec::new();
|
||||
|
||||
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
|
||||
for child in node.get_child_elements() {
|
||||
if all_tags || child.get_name().to_uppercase() == tag {
|
||||
vec.push(child.clone());
|
||||
}
|
||||
get_elems(&child, tag, vec, all_tags);
|
||||
}
|
||||
}
|
||||
|
||||
get_elems(node, &tag, &mut vec, all_tags);
|
||||
vec
|
||||
}
|
||||
|
||||
fn is_phrasing_content(node: &Node) -> bool {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
let is_text_node = node
|
||||
|
@ -814,56 +635,6 @@ impl Readability {
|
|||
.all(|val| val)
|
||||
}
|
||||
|
||||
fn get_link_density(node: &Node) -> f64 {
|
||||
let text_length = Self::get_inner_text(node, false).len();
|
||||
if text_length == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut link_length = 0.0;
|
||||
|
||||
// XXX implement _reduceNodeList?
|
||||
let link_nodes = Self::get_elements_by_tag_name(node, "A");
|
||||
for link_node in link_nodes {
|
||||
if let Some(href) = link_node.get_attribute("href") {
|
||||
let coefficient = if constants::HASH_URL.is_match(&href) {
|
||||
0.3
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
link_length += Self::get_inner_text(&link_node, false).len() as f64 * coefficient;
|
||||
}
|
||||
}
|
||||
|
||||
link_length / text_length as f64
|
||||
}
|
||||
|
||||
// Determine whether element has any children block level elements.
|
||||
fn has_child_block_element(node: &Node) -> bool {
|
||||
node.get_child_elements().iter().any(|node| {
|
||||
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|
||||
|| Self::has_child_block_element(node)
|
||||
})
|
||||
}
|
||||
|
||||
fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
|
||||
let mut ancestors = Vec::new();
|
||||
let mut node = node.clone();
|
||||
|
||||
for _ in 0..=max_depth {
|
||||
let parent = node.get_parent();
|
||||
match parent {
|
||||
Some(parent) => {
|
||||
ancestors.push(parent.clone());
|
||||
node = parent;
|
||||
}
|
||||
None => return ancestors,
|
||||
}
|
||||
}
|
||||
|
||||
ancestors
|
||||
}
|
||||
|
||||
// Initialize a node with the readability object. Also checks the
|
||||
// className/id for special names to add to its score.
|
||||
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
|
||||
|
@ -910,9 +681,4 @@ impl Readability {
|
|||
|
||||
weight
|
||||
}
|
||||
|
||||
fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
|
||||
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
|
|||
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
|
||||
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
|
||||
let article = Article {
|
||||
title: None,
|
||||
author: None,
|
||||
|
@ -34,9 +35,18 @@ async fn test_1() {
|
|||
let url = Url::parse("http://google.com").unwrap();
|
||||
let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
|
||||
|
||||
let mut article_document = Document::new().unwrap();
|
||||
let mut root = Node::new("article", None, &document).unwrap();
|
||||
article_document.set_root_element(&root);
|
||||
|
||||
metadata::extract(&xpath_ctx, None, None, &mut article);
|
||||
|
||||
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
|
||||
|
||||
if let Some(mut root) = article_document.get_root_element() {
|
||||
crate::FullTextParser::post_process_content(&mut root).unwrap();
|
||||
}
|
||||
|
||||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
std::fs::write("test.html", html).unwrap();
|
||||
}
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use super::FullTextParser;
|
||||
use super::{FullTextParser, config::ConfigEntry};
|
||||
use libxml::tree::SaveOptions;
|
||||
use reqwest::Client;
|
||||
use std::path::PathBuf;
|
||||
|
||||
|
@ -72,3 +73,49 @@ async fn encoding_windows_1252() {
|
|||
.unwrap();
|
||||
assert!(html.contains("Bund-Länder-Konferenz"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn unwrap_noscript_images() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let html = r#"
|
||||
<p>Lorem ipsum dolor sit amet,
|
||||
<span class="lazyload">
|
||||
<img src="foto-m0101.jpg" alt="image description">
|
||||
<noscript><img src="foto-m0102.jpg" alt="image description"></noscript>
|
||||
</span>
|
||||
consectetur adipiscing elit.
|
||||
</p>
|
||||
"#;
|
||||
|
||||
let expected = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html><body>
|
||||
<p>Lorem ipsum dolor sit amet,
|
||||
<span class="lazyload">
|
||||
<img src="foto-m0102.jpg" alt="image description" data-old-src="foto-m0101.jpg">
|
||||
|
||||
</span>
|
||||
consectetur adipiscing elit.
|
||||
</p>
|
||||
</body></html>
|
||||
"#;
|
||||
|
||||
let empty_config = ConfigEntry::default();
|
||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
|
||||
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
|
||||
|
||||
let options = SaveOptions {
|
||||
format: true,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
xhtml: false,
|
||||
as_xml: false,
|
||||
as_html: true,
|
||||
non_significant_whitespace: false,
|
||||
};
|
||||
let res = document.to_string_with_options(options);
|
||||
assert_eq!(res, expected);
|
||||
}
|
||||
|
|
264
src/util.rs
264
src/util.rs
|
@ -1,11 +1,17 @@
|
|||
use libxml::{tree::Node, xpath::Context};
|
||||
use libxml::{
|
||||
tree::{Node, NodeType},
|
||||
xpath::Context,
|
||||
};
|
||||
use reqwest::{
|
||||
header::{HeaderMap, HeaderName, HeaderValue},
|
||||
Response,
|
||||
};
|
||||
use tokio::fs::DirEntry;
|
||||
|
||||
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
|
||||
use crate::{
|
||||
constants,
|
||||
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
||||
};
|
||||
|
||||
pub struct Util;
|
||||
|
||||
|
@ -219,4 +225,258 @@ impl Util {
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn is_probably_visible(node: &Node) -> bool {
|
||||
let display_none = node
|
||||
.get_attribute("display")
|
||||
.map(|display| display == "none")
|
||||
.unwrap_or(false);
|
||||
let is_hidden = node.has_attribute("hidden");
|
||||
let aria_hidden = node
|
||||
.get_attribute("aria-hidden")
|
||||
.map(|attr| attr == "true")
|
||||
.unwrap_or(false);
|
||||
let has_fallback_image = node.get_class_names().contains("fallback-image");
|
||||
|
||||
!display_none && !is_hidden && !aria_hidden || has_fallback_image
|
||||
}
|
||||
|
||||
pub fn is_whitespace(node: &Node) -> bool {
|
||||
let is_text_node = node
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
.unwrap_or(false);
|
||||
let is_element_node = node
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::ElementNode)
|
||||
.unwrap_or(false);
|
||||
|
||||
(is_text_node && node.get_content().trim().is_empty())
|
||||
|| (is_element_node && node.get_name().to_uppercase() == "BR")
|
||||
}
|
||||
|
||||
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
||||
let next_node = Self::next_node(node, true);
|
||||
node.unlink();
|
||||
next_node
|
||||
}
|
||||
|
||||
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
|
||||
let mut node = node.clone();
|
||||
|
||||
// First check for kids if those aren't being ignored
|
||||
let first_child = node.get_first_child();
|
||||
if !ignore_self_and_kids && first_child.is_some() {
|
||||
return first_child;
|
||||
}
|
||||
|
||||
// Then for siblings...
|
||||
let next_sibling = node.get_next_sibling();
|
||||
if next_sibling.is_some() {
|
||||
return next_sibling;
|
||||
}
|
||||
|
||||
// And finally, move up the parent chain *and* find a sibling
|
||||
// (because this is depth-first traversal, we will have already
|
||||
// seen the parent nodes themselves).
|
||||
loop {
|
||||
let parent = node.get_parent();
|
||||
if parent.is_none() {
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(parent) = parent {
|
||||
let parent_name = parent.get_name().to_uppercase();
|
||||
if parent_name == "HTML" {
|
||||
break;
|
||||
}
|
||||
|
||||
let next_sibling = parent.get_next_sibling();
|
||||
if next_sibling.is_some() {
|
||||
return next_sibling;
|
||||
} else {
|
||||
node = parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
|
||||
let content = node.get_content().trim().to_owned();
|
||||
if normalize_spaces {
|
||||
constants::NORMALIZE.replace(&content, " ").into()
|
||||
} else {
|
||||
content
|
||||
}
|
||||
}
|
||||
|
||||
pub fn text_similarity(a: &str, b: &str) -> f64 {
|
||||
let a = a.to_lowercase();
|
||||
let b = b.to_lowercase();
|
||||
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
|
||||
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
|
||||
if tokens_a.is_empty() || tokens_b.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let tokens_b_total = tokens_b.join(" ").len() as f64;
|
||||
let uniq_tokens_b = tokens_b
|
||||
.into_iter()
|
||||
.filter(|token| !tokens_a.iter().any(|t| t == token))
|
||||
.collect::<Vec<_>>();
|
||||
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
|
||||
|
||||
let distance_b = uniq_tokens_b_total / tokens_b_total;
|
||||
1.0 - distance_b
|
||||
}
|
||||
|
||||
pub fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
|
||||
let max_depth = max_depth.unwrap_or(3);
|
||||
let tag_name = tag_name.to_uppercase();
|
||||
let mut depth = 0;
|
||||
let mut node = node.get_parent();
|
||||
|
||||
loop {
|
||||
if depth > max_depth {
|
||||
return false;
|
||||
}
|
||||
|
||||
let tmp_node = match node {
|
||||
Some(node) => node,
|
||||
None => return false,
|
||||
};
|
||||
|
||||
if tmp_node.get_name() == tag_name {
|
||||
return true;
|
||||
}
|
||||
|
||||
node = tmp_node.get_parent();
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
|
||||
// There should be exactly 1 element child with given tag
|
||||
if node.get_child_nodes().len() == 1
|
||||
|| node
|
||||
.get_child_nodes()
|
||||
.first()
|
||||
.map(|n| n.get_name().to_uppercase() == tag)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// And there should be no text nodes with real content
|
||||
node.get_child_nodes().iter().any(|n| {
|
||||
n.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
.unwrap_or(false)
|
||||
&& constants::HAS_CONTENT.is_match(&n.get_content())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_element_without_content(node: &Node) -> bool {
|
||||
if let Some(node_type) = node.get_type() {
|
||||
let len = node.get_child_nodes().len();
|
||||
|
||||
return node_type == NodeType::ElementNode
|
||||
&& node.get_content().trim().is_empty()
|
||||
&& (len == 0
|
||||
|| len
|
||||
== Self::get_elements_by_tag_name(node, "br").len()
|
||||
+ Self::get_elements_by_tag_name(node, "hr").len());
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Collect every descendant element of `node` whose tag matches `tag`
/// (case-insensitive); `"*"` matches all elements.
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
    // Recursive pre-order walk over element children.
    fn collect(node: &Node, tag: &str, match_all: bool, out: &mut Vec<Node>) {
        for child in node.get_child_elements() {
            if match_all || child.get_name().to_uppercase() == tag {
                out.push(child.clone());
            }
            collect(&child, tag, match_all, out);
        }
    }

    let tag = tag.to_uppercase();
    let match_all = tag == "*";
    let mut result = Vec::new();
    collect(node, &tag, match_all, &mut result);
    result
}
|
||||
|
||||
pub fn get_link_density(node: &Node) -> f64 {
|
||||
let text_length = Util::get_inner_text(node, false).len();
|
||||
if text_length == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut link_length = 0.0;
|
||||
|
||||
// XXX implement _reduceNodeList?
|
||||
let link_nodes = Util::get_elements_by_tag_name(node, "A");
|
||||
for link_node in link_nodes {
|
||||
if let Some(href) = link_node.get_attribute("href") {
|
||||
let coefficient = if constants::HASH_URL.is_match(&href) {
|
||||
0.3
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
link_length += Util::get_inner_text(&link_node, false).len() as f64 * coefficient;
|
||||
}
|
||||
}
|
||||
|
||||
link_length / text_length as f64
|
||||
}
|
||||
|
||||
// Determine whether element has any children block level elements.
|
||||
pub fn has_child_block_element(node: &Node) -> bool {
|
||||
node.get_child_elements().iter().any(|node| {
|
||||
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|
||||
|| Self::has_child_block_element(node)
|
||||
})
|
||||
}
|
||||
|
||||
/// Walk up the parent chain and collect up to `max_depth + 1`
/// ancestors, nearest first.
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
    let mut ancestors = Vec::new();
    let mut current = node.clone();
    let mut level: u64 = 0;

    while level <= max_depth {
        if let Some(parent) = current.get_parent() {
            ancestors.push(parent.clone());
            current = parent;
        } else {
            // Reached the document root before hitting the depth cap.
            break;
        }
        level += 1;
    }

    ancestors
}
|
||||
|
||||
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
|
||||
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
// Check if node is image, or if node contains exactly only one image
|
||||
// whether as a direct child or as its descendants.
|
||||
pub fn is_single_image(node: &Node) -> bool {
|
||||
if node.get_name().to_uppercase() == "IMG" {
|
||||
true
|
||||
} else if node.get_child_nodes().len() != 1 || node.get_content().trim() != "" {
|
||||
false
|
||||
} else if let Some(first_child) = node.get_child_nodes().first() {
|
||||
Self::is_single_image(first_child)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue