1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

Remove <h1>/<h2> headings that duplicate the article title

This commit is contained in:
Jan Lukas Gernert 2023-04-30 09:24:00 +02:00
parent eb4b3603f5
commit d8ceee1403
5 changed files with 54 additions and 23 deletions

View file

@ -35,7 +35,7 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
let empty_config = FtrConfigEntry::default();
let document = FullTextParser::parse_html(html, None, &empty_config)?;
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
if let Some(mut root) = document.get_root_element() {
FullTextParser::post_process_page(&mut root)?;
}

View file

@ -97,6 +97,7 @@ impl FullTextParser {
global_config,
&article.url,
&old_document,
article.title.as_deref(),
);
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
if !found_body {
@ -258,7 +259,14 @@ impl FullTextParser {
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
Self::prep_content(
&xpath_ctx,
config,
global_config,
&article.url,
&document,
article.title.as_deref(),
);
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body {
@ -281,7 +289,14 @@ impl FullTextParser {
{
next_page_url.replace(url);
}
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
Self::prep_content(
&xpath_ctx,
config,
global_config,
&article.url,
&document,
article.title.as_deref(),
);
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body {
@ -345,7 +360,14 @@ impl FullTextParser {
let xpath_ctx = Self::get_xpath_ctx(&document)?;
metadata::extract(&xpath_ctx, config, Some(global_config), article);
Self::check_for_thumbnail(&xpath_ctx, article);
Self::prep_content(&xpath_ctx, config, global_config, url, &document);
Self::prep_content(
&xpath_ctx,
config,
global_config,
url,
&document,
article.title.as_deref(),
);
Self::extract_body(&xpath_ctx, root, config, global_config)?;
Ok(())
@ -773,6 +795,7 @@ impl FullTextParser {
global_config: &ConfigEntry,
url: &Url,
document: &Document,
title: Option<&str>,
) {
// replace H1 with H2 as H1 should be only title that is displayed separately
if let Ok(h1_nodes) = Util::evaluate_xpath(context, "//h1", false) {
@ -781,6 +804,14 @@ impl FullTextParser {
}
}
if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) {
for mut h2_node in h2_nodes {
if Util::header_duplicates_title(&h2_node, title) {
h2_node.unlink();
}
}
}
// rename all font nodes to span
if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) {
for mut font_node in font_nodes {

View file

@ -53,7 +53,7 @@ impl Readability {
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
let mut article = crate::article::Article {
title: None,
author: None,
@ -127,7 +127,7 @@ impl Readability {
}
if state.should_remove_title_header
&& Self::header_duplicates_title(node_ref, title)
&& Util::header_duplicates_title(node_ref, title)
{
state.should_remove_title_header = false;
node = Util::remove_and_next(node_ref);
@ -742,22 +742,6 @@ impl Readability {
len > 0 && len < 100
}
// Check if this node is an H1 or H2 element whose content is mostly
// the same as the article title.
//
// Returns `false` when no title is given or when the node is not an
// `h1`/`h2` element. Otherwise returns whether `Util::text_similarity`
// between the title and the node's inner text is greater than 0.75.
fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
    // Without a title nothing can duplicate it, so bail out before
    // touching the node at all (no tag lookup, no text extraction).
    let title = match title {
        Some(title) => title,
        None => return false,
    };

    // Tag names are plain ASCII, so an ASCII case-insensitive comparison
    // is enough and avoids allocating a lowercased copy of the name.
    let name = node.get_name();
    if !name.eq_ignore_ascii_case("h1") && !name.eq_ignore_ascii_case("h2") {
        return false;
    }

    let heading = Util::get_inner_text(node, false);
    Util::text_similarity(title, &heading) > 0.75
}
// Initialize a node with the readability object. Also checks the
// className/id for special names to add to its score.
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {

View file

@ -22,7 +22,7 @@ async fn run_test(name: &str) {
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
let mut article = Article {
title: None,
author: None,

View file

@ -361,6 +361,22 @@ impl Util {
1.0 - distance_b
}
// Check if this node is an H1 or H2 element whose content is mostly
// the same as the article title.
//
// Returns `false` when no title is given or when the node is not an
// `h1`/`h2` element. Otherwise returns whether `Util::text_similarity`
// between the title and the node's inner text is greater than 0.75.
pub fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
    // Without a title nothing can duplicate it, so bail out before
    // touching the node at all (no tag lookup, no text extraction).
    let title = match title {
        Some(title) => title,
        None => return false,
    };

    // Tag names are plain ASCII, so an ASCII case-insensitive comparison
    // is enough and avoids allocating a lowercased copy of the name.
    let name = node.get_name();
    if !name.eq_ignore_ascii_case("h1") && !name.eq_ignore_ascii_case("h2") {
        return false;
    }

    let heading = Util::get_inner_text(node, false);
    Util::text_similarity(title, &heading) > 0.75
}
pub fn has_any_descendent_tag(node: &Node, tag_names: &HashSet<&str>) -> bool {
let children = node.get_child_elements();
let is_direct_child = children