1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

4 more test & remove share elements

This commit is contained in:
Jan Lukas Gernert 2023-04-01 17:19:37 +02:00
parent be6e08bd6d
commit 0d6db710e8
13 changed files with 5151 additions and 1110 deletions

View file

@ -975,6 +975,7 @@ impl FullTextParser {
Util::clean_conditionally(node, "ul");
Util::clean_conditionally(node, "div");
Self::remove_share_elements(node);
Self::clean_attributes(node)?;
Self::remove_single_cell_tables(node);
Self::remove_extra_p_and_div(node);
@ -1042,6 +1043,26 @@ impl FullTextParser {
}
}
/// Removes small "share"/social widgets from the tree reachable from `root`.
///
/// Every visited node is tested by joining its `class` and `id` attributes
/// (missing attributes count as empty strings) with a single space and
/// matching the result against `constants::SHARE_ELEMENTS`. A matching node
/// is removed only if its text content is shorter than
/// `constants::DEFAULT_CHAR_THRESHOLD` characters, so that large containers
/// which merely carry a share-like class name are kept.
///
/// The walk starts at `root` itself and advances with `Util::next_node` /
/// `Util::remove_and_next` until one of them yields `None`.
// NOTE(review): the traversal order and whether it stays inside `root`'s
// subtree depend on `Util::next_node`, which is not visible here — confirm.
fn remove_share_elements(root: &mut Node) {
    let mut node_iter = Some(root.clone());

    while let Some(mut node) = node_iter {
        let match_string = format!(
            "{} {}",
            node.get_attribute("class").unwrap_or_default(),
            node.get_attribute("id").unwrap_or_default()
        );

        // Compare the number of characters, not the UTF-8 byte length:
        // `String::len` counts bytes, which would let short share widgets
        // written in multi-byte scripts slip past a character threshold.
        // The regex is checked first so the text content is only fetched
        // for nodes that already look like share elements.
        let is_share_element = constants::SHARE_ELEMENTS.is_match(&match_string)
            && node.get_content().chars().count() < constants::DEFAULT_CHAR_THRESHOLD;

        node_iter = if is_share_element {
            // Detach the node and continue with whatever follows it.
            Util::remove_and_next(&mut node)
        } else {
            Util::next_node(&node, false)
        };
    }
}
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
let mut node_iter = Some(root.clone());