1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

fixes, more sanitization & 1 more failing test

This commit is contained in:
Jan Lukas Gernert 2023-02-28 01:50:13 +01:00
parent 56c08c501a
commit 31a8033844
8 changed files with 1993 additions and 162 deletions

View file

@ -594,7 +594,6 @@ impl FullTextParser {
let _ = Self::fix_lazy_images(context, "lazyload", "data-src");
let _ = Self::fix_iframe_size(context, "youtube.com");
let _ = Self::remove_attribute(context, None, "style");
let _ = Self::remove_attribute(context, Some("a"), "onclick");
let _ = Self::remove_attribute(context, Some("img"), "srcset");
let _ = Self::remove_attribute(context, Some("img"), "sizes");
@ -610,6 +609,8 @@ impl FullTextParser {
// strip elements that contain style="display: none;"
let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]");
let _ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
let _ = Self::remove_attribute(context, None, "style");
// strip all comments
let _ = Util::strip_node(context, "//input");
@ -849,11 +850,6 @@ impl FullTextParser {
}
pub(crate) fn post_process_content(document: &Document) -> Result<(), FullTextParserError> {
if let Some(mut root) = document.get_root_element() {
Self::clean_classes(&mut root)?;
Self::simplify_nested_elements(&mut root)?;
}
let context = Context::new(document).map_err(|()| {
error!("Creating xpath context failed for article HTML");
FullTextParserError::Xml
@ -884,6 +880,19 @@ impl FullTextParser {
}
}
Util::mark_data_tables(&context)?;
if let Some(mut root) = document.get_root_element() {
Util::clean_conditionally(&mut root, "form")?;
Util::clean_conditionally(&mut root, "fieldset")?;
Util::clean_conditionally(&mut root, "table")?;
Util::clean_conditionally(&mut root, "ul")?;
Util::clean_conditionally(&mut root, "div")?;
Self::clean_classes(&mut root)?;
Self::simplify_nested_elements(&mut root)?;
}
Ok(())
}
@ -904,11 +913,17 @@ impl FullTextParser {
})?;
}
node.remove_attribute("content_score").map_err(|e| {
node.remove_attribute(constants::SCORE_ATTR).map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
})?;
node.remove_attribute(constants::DATA_TABLE_ATTR)
.map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
})?;
node_iter = Util::next_node(&node, false);
}
Ok(())