diff --git a/resources/tests/readability/hukumusume/expected.html b/resources/tests/readability/hukumusume/expected.html index 994ea5e..6ef604b 100644 --- a/resources/tests/readability/hukumusume/expected.html +++ b/resources/tests/readability/hukumusume/expected.html @@ -22,9 +22,9 @@
+
![]() ![]() |
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 40bfb68..d7e7bef 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -125,9 +125,7 @@ impl FullTextParser { return Err(error); } - if let Some(mut root) = document.get_root_element() { - Self::post_process_content(&mut root, false)?; - } + Self::post_process_document(&document)?; article.document = Some(document); @@ -784,7 +782,7 @@ impl FullTextParser { return Err(FullTextParserError::Xml); } - Self::post_process_content(&mut node, true)?; + Self::post_process_page(&mut node)?; node.unlink(); if root.add_child(&mut node).is_ok() { @@ -857,20 +855,29 @@ impl FullTextParser { Ok(()) } - pub(crate) fn post_process_content( - node: &mut Node, - clean_conditionally: bool, + pub(crate) fn post_process_document( + document: &Document ) -> Result<(), FullTextParserError> { - if clean_conditionally { - Util::clean_conditionally(node, "fieldset"); - Util::clean_conditionally(node, "table"); - Util::clean_conditionally(node, "ul"); - Util::clean_conditionally(node, "div"); + if let Some(mut root) = document.get_root_element() { + Self::simplify_nested_elements(&mut root)?; + + Self::clean_attributes(&mut root)?; + Self::remove_single_cell_tables(&mut root); + Self::remove_extra_p_and_div(&mut root); } - Self::clean_attributes(node)?; - Self::simplify_nested_elements(node)?; + Ok(()) + } + pub(crate) fn post_process_page( + node: &mut Node, + ) -> Result<(), FullTextParserError> { + Util::clean_conditionally(node, "fieldset"); + Util::clean_conditionally(node, "table"); + Util::clean_conditionally(node, "ul"); + Util::clean_conditionally(node, "div"); + + Self::clean_attributes(node)?; Self::remove_single_cell_tables(node); Self::remove_extra_p_and_div(node); diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs index dfe2a5b..2f1c625 100644 --- a/src/full_text_parser/readability/mod.rs +++ b/src/full_text_parser/readability/mod.rs @@ -497,10 +497,9 @@ impl Readability { } } - crate::FullTextParser::post_process_content( - &mut article_content, - state.clean_conditionally, - )?; + if state.clean_conditionally { + crate::FullTextParser::post_process_page(&mut article_content)?; + } if needed_to_create_top_candidate { // We already created a fake div thing, and there wouldn't have been any siblings left diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index fb02eb2..51da394 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -34,9 +34,7 @@ async fn run_test(name: &str) { metadata::extract(&xpath_ctx, None, None, &mut article); super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap(); - if let Some(mut root) = article_document.get_root_element() { - crate::FullTextParser::post_process_content(&mut root, false).unwrap(); - } + crate::FullTextParser::post_process_document(&article_document).unwrap(); article.document = Some(article_document); let html = article.get_content().unwrap(); |