diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
index f810e4f..10e4841 100644
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@@ -587,13 +587,14 @@ impl FullTextParser {
         _ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
         _ = Self::remove_attribute(context, None, "style");
 
-        // strip all comments
+        // strip all input elements
+        _ = Util::strip_node(context, "//form");
         _ = Util::strip_node(context, "//input");
         _ = Util::strip_node(context, "//textarea");
         _ = Util::strip_node(context, "//select");
         _ = Util::strip_node(context, "//button");
 
-        // strip all input elements
+        // strip all comments
         _ = Util::strip_node(context, "//comment()");
 
         // strip all scripts
@@ -859,11 +860,12 @@ impl FullTextParser {
         Util::mark_data_tables(&context)?;
 
         if let Some(mut root) = document.get_root_element() {
-            Util::clean_conditionally(&mut root, "form")?;
-            Util::clean_conditionally(&mut root, "fieldset")?;
-            Util::clean_conditionally(&mut root, "table")?;
-            Util::clean_conditionally(&mut root, "ul")?;
-            Util::clean_conditionally(&mut root, "div")?;
+            Self::remove_extra_p_and_div(&mut root);
+
+            Util::clean_conditionally(&mut root, "fieldset");
+            Util::clean_conditionally(&mut root, "table");
+            Util::clean_conditionally(&mut root, "ul");
+            Util::clean_conditionally(&mut root, "div");
             Self::clean_classes(&mut root)?;
             Self::simplify_nested_elements(&mut root)?;
 
@@ -872,6 +874,40 @@ impl FullTextParser {
         Ok(())
     }
 
+    /// Drops `<p>` and `<div>` elements that contain neither text nor embedded media.
+    ///
+    /// Walks the tree from `root` with `Util::next_node` and unlinks every paragraph or
+    /// div for which `Util::get_elements_by_tag_name` finds no `img`, `embed`, `object`
+    /// or `iframe` and whose inner text is empty once trimmed.
+    fn remove_extra_p_and_div(root: &mut Node) {
+        let mut node_iter = Some(root.clone());
+
+        while let Some(mut node) = node_iter {
+            // Compare case-insensitively so the check does not depend on how the
+            // parser cases element names.
+            let tag_name = node.get_name();
+            if tag_name.eq_ignore_ascii_case("p") || tag_name.eq_ignore_ascii_case("div") {
+                let img_count = Util::get_elements_by_tag_name(&node, "img").len();
+                let embed_count = Util::get_elements_by_tag_name(&node, "embed").len();
+                let object_count = Util::get_elements_by_tag_name(&node, "object").len();
+                let iframe_count = Util::get_elements_by_tag_name(&node, "iframe").len();
+                let total_count = img_count + embed_count + object_count + iframe_count;
+
+                if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() {
+                    // Pick the successor while the node is still attached: once it is
+                    // unlinked it has no parent or siblings left to continue from.
+                    // NOTE(review): `true` is assumed to skip this node's own subtree
+                    // (Readability's `ignoreSelfAndKids`) - confirm in `Util::next_node`.
+                    node_iter = Util::next_node(&node, true);
+                    node.unlink();
+                    continue;
+                }
+            }
+
+            node_iter = Util::next_node(&node, false);
+        }
+    }
+
     fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
         let mut node_iter = Some(root.clone());
 
diff --git a/src/util.rs b/src/util.rs
index 94bc48f..b95157e 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -504,7 +504,7 @@ impl Util {
 
     // Clean an element of all tags of type "tag" if they look fishy.
     // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
-    pub fn clean_conditionally(root: &mut Node, tag: &str) -> Result<(), FullTextParserError> {
+    pub fn clean_conditionally(root: &mut Node, tag: &str) {
         // Gather counts for other typical elements embedded within.
         // Traverse backwards so we can remove nodes at the same time
         // without effecting the traversal.
@@ -516,11 +516,9 @@ impl Util {
             .filter(|node| Self::should_remove(node, tag))
             .collect::<Vec<_>>();
 
-        for mut node in nodes_to_remove {
+        for mut node in nodes_to_remove.into_iter().rev() {
             node.unlink();
         }
-
-        Ok(())
     }
 
     fn should_remove(node: &Node, tag: &str) -> bool {