diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 4ac6090..14cc208 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -35,7 +35,7 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result, ) { // replace H1 with H2 as H1 should be only title that is displayed separately if let Ok(h1_nodes) = Util::evaluate_xpath(context, "//h1", false) { @@ -781,6 +804,14 @@ impl FullTextParser { } } + if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) { + for mut h2_node in h2_nodes { + if Util::header_duplicates_title(&h2_node, title) { + h2_node.unlink(); + } + } + } + // rename all font nodes to span if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) { for mut font_node in font_nodes { diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs index 6bfbe50..8627a4a 100644 --- a/article_scraper/src/full_text_parser/readability/mod.rs +++ b/article_scraper/src/full_text_parser/readability/mod.rs @@ -53,7 +53,7 @@ impl Readability { let document = crate::FullTextParser::parse_html(html, None, &empty_config)?; let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?; - crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document); + crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None); let mut article = crate::article::Article { title: None, author: None, @@ -127,7 +127,7 @@ impl Readability { } if state.should_remove_title_header - && Self::header_duplicates_title(node_ref, title) + && Util::header_duplicates_title(node_ref, title) { state.should_remove_title_header = false; node = Util::remove_and_next(node_ref); @@ -742,22 +742,6 @@ impl Readability { len > 0 && len < 100 } - // Check if this node is an H1 or H2 element whose content is mostly - // the same as the article title. - fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool { - let name = node.get_name().to_lowercase(); - if name != "h1" && name != "h2" { - return false; - } - let heading = Util::get_inner_text(node, false); - - if let Some(title) = title { - Util::text_similarity(title, &heading) > 0.75 - } else { - false - } - } - // Initialize a node with the readability object. Also checks the // className/id for special names to add to its score. fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> { diff --git a/article_scraper/src/full_text_parser/readability/tests.rs b/article_scraper/src/full_text_parser/readability/tests.rs index bcc95a6..18cf8b2 100644 --- a/article_scraper/src/full_text_parser/readability/tests.rs +++ b/article_scraper/src/full_text_parser/readability/tests.rs @@ -22,7 +22,7 @@ async fn run_test(name: &str) { let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap(); let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap(); - crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document); + crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None); let mut article = Article { title: None, author: None, diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 76f162a..3417dac 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -361,6 +361,22 @@ impl Util { 1.0 - distance_b } + // Check if this node is an H1 or H2 element whose content is mostly + // the same as the article title. + pub fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool { + let name = node.get_name().to_lowercase(); + if name != "h1" && name != "h2" { + return false; + } + let heading = Util::get_inner_text(node, false); + + if let Some(title) = title { + Util::text_similarity(title, &heading) > 0.75 + } else { + false + } + } + pub fn has_any_descendent_tag(node: &Node, tag_names: &HashSet<&str>) -> bool { let children = node.get_child_elements(); let is_direct_child = children