1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

make cleaning more obvious

This commit is contained in:
Jan Lukas Gernert 2023-03-19 23:09:06 +01:00
parent 11e08ae505
commit 280c516cbe
4 changed files with 27 additions and 23 deletions

View file

@@ -125,9 +125,7 @@ impl FullTextParser {
return Err(error);
}
if let Some(mut root) = document.get_root_element() {
Self::post_process_content(&mut root, false)?;
}
Self::post_process_document(&document)?;
article.document = Some(document);
@@ -784,7 +782,7 @@ impl FullTextParser {
return Err(FullTextParserError::Xml);
}
Self::post_process_content(&mut node, true)?;
Self::post_process_page(&mut node)?;
node.unlink();
if root.add_child(&mut node).is_ok() {
@@ -857,20 +855,29 @@ impl FullTextParser {
Ok(())
}
pub(crate) fn post_process_content(
node: &mut Node,
clean_conditionally: bool,
pub(crate) fn post_process_document(
document: &Document
) -> Result<(), FullTextParserError> {
if clean_conditionally {
Util::clean_conditionally(node, "fieldset");
Util::clean_conditionally(node, "table");
Util::clean_conditionally(node, "ul");
Util::clean_conditionally(node, "div");
if let Some(mut root) = document.get_root_element() {
Self::simplify_nested_elements(&mut root)?;
Self::clean_attributes(&mut root)?;
Self::remove_single_cell_tables(&mut root);
Self::remove_extra_p_and_div(&mut root);
}
Self::clean_attributes(node)?;
Self::simplify_nested_elements(node)?;
Ok(())
}
pub(crate) fn post_process_page(
node: &mut Node,
) -> Result<(), FullTextParserError> {
Util::clean_conditionally(node, "fieldset");
Util::clean_conditionally(node, "table");
Util::clean_conditionally(node, "ul");
Util::clean_conditionally(node, "div");
Self::clean_attributes(node)?;
Self::remove_single_cell_tables(node);
Self::remove_extra_p_and_div(node);