mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix post processing
This commit is contained in:
parent
2528aa3e18
commit
f5b7ff198a
2 changed files with 32 additions and 11 deletions
|
@ -587,13 +587,14 @@ impl FullTextParser {
|
||||||
_ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
|
_ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
|
||||||
_ = Self::remove_attribute(context, None, "style");
|
_ = Self::remove_attribute(context, None, "style");
|
||||||
|
|
||||||
// strip all comments
|
// strip all input elements
|
||||||
|
_ = Util::strip_node(context, "//form");
|
||||||
_ = Util::strip_node(context, "//input");
|
_ = Util::strip_node(context, "//input");
|
||||||
_ = Util::strip_node(context, "//textarea");
|
_ = Util::strip_node(context, "//textarea");
|
||||||
_ = Util::strip_node(context, "//select");
|
_ = Util::strip_node(context, "//select");
|
||||||
_ = Util::strip_node(context, "//button");
|
_ = Util::strip_node(context, "//button");
|
||||||
|
|
||||||
// strip all input elements
|
// strip all comments
|
||||||
_ = Util::strip_node(context, "//comment()");
|
_ = Util::strip_node(context, "//comment()");
|
||||||
|
|
||||||
// strip all scripts
|
// strip all scripts
|
||||||
|
@ -859,11 +860,12 @@ impl FullTextParser {
|
||||||
Util::mark_data_tables(&context)?;
|
Util::mark_data_tables(&context)?;
|
||||||
|
|
||||||
if let Some(mut root) = document.get_root_element() {
|
if let Some(mut root) = document.get_root_element() {
|
||||||
Util::clean_conditionally(&mut root, "form")?;
|
Self::remove_extra_p_and_div(&mut root);
|
||||||
Util::clean_conditionally(&mut root, "fieldset")?;
|
|
||||||
Util::clean_conditionally(&mut root, "table")?;
|
Util::clean_conditionally(&mut root, "fieldset");
|
||||||
Util::clean_conditionally(&mut root, "ul")?;
|
Util::clean_conditionally(&mut root, "table");
|
||||||
Util::clean_conditionally(&mut root, "div")?;
|
Util::clean_conditionally(&mut root, "ul");
|
||||||
|
Util::clean_conditionally(&mut root, "div");
|
||||||
|
|
||||||
Self::clean_classes(&mut root)?;
|
Self::clean_classes(&mut root)?;
|
||||||
Self::simplify_nested_elements(&mut root)?;
|
Self::simplify_nested_elements(&mut root)?;
|
||||||
|
@ -872,6 +874,27 @@ impl FullTextParser {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn remove_extra_p_and_div(root: &mut Node) {
|
||||||
|
let mut node_iter = Some(root.clone());
|
||||||
|
|
||||||
|
while let Some(mut node) = node_iter {
|
||||||
|
let tag_name = node.get_name();
|
||||||
|
if tag_name == "P" || tag_name == "DIV" {
|
||||||
|
let img_count = Util::get_elements_by_tag_name(&node, "img").len();
|
||||||
|
let embed_count = Util::get_elements_by_tag_name(&node, "embed").len();
|
||||||
|
let object_count = Util::get_elements_by_tag_name(&node, "object").len();
|
||||||
|
let iframe_count = Util::get_elements_by_tag_name(&node, "iframe").len();
|
||||||
|
let total_count = img_count + embed_count + object_count + iframe_count;
|
||||||
|
|
||||||
|
if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() {
|
||||||
|
node.unlink();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
node_iter = Util::next_node(&node, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
|
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||||
let mut node_iter = Some(root.clone());
|
let mut node_iter = Some(root.clone());
|
||||||
|
|
||||||
|
|
|
@ -504,7 +504,7 @@ impl Util {
|
||||||
|
|
||||||
// Clean an element of all tags of type "tag" if they look fishy.
|
// Clean an element of all tags of type "tag" if they look fishy.
|
||||||
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||||
pub fn clean_conditionally(root: &mut Node, tag: &str) -> Result<(), FullTextParserError> {
|
pub fn clean_conditionally(root: &mut Node, tag: &str) {
|
||||||
// Gather counts for other typical elements embedded within.
|
// Gather counts for other typical elements embedded within.
|
||||||
// Traverse backwards so we can remove nodes at the same time
|
// Traverse backwards so we can remove nodes at the same time
|
||||||
// without affecting the traversal.
|
// without affecting the traversal.
|
||||||
|
@ -516,11 +516,9 @@ impl Util {
|
||||||
.filter(|node| Self::should_remove(node, tag))
|
.filter(|node| Self::should_remove(node, tag))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for mut node in nodes_to_remove {
|
for mut node in nodes_to_remove.into_iter().rev() {
|
||||||
node.unlink();
|
node.unlink();
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn should_remove(node: &Node, tag: &str) -> bool {
|
fn should_remove(node: &Node, tag: &str) -> bool {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue