mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
4 more test & remove share elements
This commit is contained in:
parent
be6e08bd6d
commit
0d6db710e8
13 changed files with 5151 additions and 1110 deletions
|
@ -975,6 +975,7 @@ impl FullTextParser {
|
|||
Util::clean_conditionally(node, "ul");
|
||||
Util::clean_conditionally(node, "div");
|
||||
|
||||
Self::remove_share_elements(node);
|
||||
Self::clean_attributes(node)?;
|
||||
Self::remove_single_cell_tables(node);
|
||||
Self::remove_extra_p_and_div(node);
|
||||
|
@ -1042,6 +1043,26 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
fn remove_share_elements(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
let match_string = format!(
|
||||
"{} {}",
|
||||
node.get_attribute("class").unwrap_or_default(),
|
||||
node.get_attribute("id").unwrap_or_default()
|
||||
);
|
||||
|
||||
if constants::SHARE_ELEMENTS.is_match(&match_string)
|
||||
&& node.get_content().len() < constants::DEFAULT_CHAR_THRESHOLD
|
||||
{
|
||||
node_iter = Util::remove_and_next(&mut node);
|
||||
} else {
|
||||
node_iter = Util::next_node(&node, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue