mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
Remove <h1>/<h2> headings that duplicate the article title
This commit is contained in:
parent
eb4b3603f5
commit
d8ceee1403
5 changed files with 54 additions and 23 deletions
|
@ -35,7 +35,7 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
|
||||||
let empty_config = FtrConfigEntry::default();
|
let empty_config = FtrConfigEntry::default();
|
||||||
let document = FullTextParser::parse_html(html, None, &empty_config)?;
|
let document = FullTextParser::parse_html(html, None, &empty_config)?;
|
||||||
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
||||||
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
|
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
|
||||||
if let Some(mut root) = document.get_root_element() {
|
if let Some(mut root) = document.get_root_element() {
|
||||||
FullTextParser::post_process_page(&mut root)?;
|
FullTextParser::post_process_page(&mut root)?;
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,6 +97,7 @@ impl FullTextParser {
|
||||||
global_config,
|
global_config,
|
||||||
&article.url,
|
&article.url,
|
||||||
&old_document,
|
&old_document,
|
||||||
|
article.title.as_deref(),
|
||||||
);
|
);
|
||||||
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
||||||
if !found_body {
|
if !found_body {
|
||||||
|
@ -258,7 +259,14 @@ impl FullTextParser {
|
||||||
if article.thumbnail_url.is_none() {
|
if article.thumbnail_url.is_none() {
|
||||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
}
|
}
|
||||||
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
Self::prep_content(
|
||||||
|
&xpath_ctx,
|
||||||
|
config,
|
||||||
|
global_config,
|
||||||
|
&article.url,
|
||||||
|
&document,
|
||||||
|
article.title.as_deref(),
|
||||||
|
);
|
||||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
if !found_body {
|
if !found_body {
|
||||||
|
@ -281,7 +289,14 @@ impl FullTextParser {
|
||||||
{
|
{
|
||||||
next_page_url.replace(url);
|
next_page_url.replace(url);
|
||||||
}
|
}
|
||||||
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
Self::prep_content(
|
||||||
|
&xpath_ctx,
|
||||||
|
config,
|
||||||
|
global_config,
|
||||||
|
&article.url,
|
||||||
|
&document,
|
||||||
|
article.title.as_deref(),
|
||||||
|
);
|
||||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
if !found_body {
|
if !found_body {
|
||||||
|
@ -345,7 +360,14 @@ impl FullTextParser {
|
||||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
||||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
Self::prep_content(&xpath_ctx, config, global_config, url, &document);
|
Self::prep_content(
|
||||||
|
&xpath_ctx,
|
||||||
|
config,
|
||||||
|
global_config,
|
||||||
|
url,
|
||||||
|
&document,
|
||||||
|
article.title.as_deref(),
|
||||||
|
);
|
||||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -773,6 +795,7 @@ impl FullTextParser {
|
||||||
global_config: &ConfigEntry,
|
global_config: &ConfigEntry,
|
||||||
url: &Url,
|
url: &Url,
|
||||||
document: &Document,
|
document: &Document,
|
||||||
|
title: Option<&str>,
|
||||||
) {
|
) {
|
||||||
// replace H1 with H2 as H1 should be only title that is displayed separately
|
// replace H1 with H2 as H1 should be only title that is displayed separately
|
||||||
if let Ok(h1_nodes) = Util::evaluate_xpath(context, "//h1", false) {
|
if let Ok(h1_nodes) = Util::evaluate_xpath(context, "//h1", false) {
|
||||||
|
@ -781,6 +804,14 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) {
|
||||||
|
for mut h2_node in h2_nodes {
|
||||||
|
if Util::header_duplicates_title(&h2_node, title) {
|
||||||
|
h2_node.unlink();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// rename all font nodes to span
|
// rename all font nodes to span
|
||||||
if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) {
|
if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) {
|
||||||
for mut font_node in font_nodes {
|
for mut font_node in font_nodes {
|
||||||
|
|
|
@ -53,7 +53,7 @@ impl Readability {
|
||||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
|
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
|
||||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
|
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
|
||||||
|
|
||||||
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
|
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
|
||||||
let mut article = crate::article::Article {
|
let mut article = crate::article::Article {
|
||||||
title: None,
|
title: None,
|
||||||
author: None,
|
author: None,
|
||||||
|
@ -127,7 +127,7 @@ impl Readability {
|
||||||
}
|
}
|
||||||
|
|
||||||
if state.should_remove_title_header
|
if state.should_remove_title_header
|
||||||
&& Self::header_duplicates_title(node_ref, title)
|
&& Util::header_duplicates_title(node_ref, title)
|
||||||
{
|
{
|
||||||
state.should_remove_title_header = false;
|
state.should_remove_title_header = false;
|
||||||
node = Util::remove_and_next(node_ref);
|
node = Util::remove_and_next(node_ref);
|
||||||
|
@ -742,22 +742,6 @@ impl Readability {
|
||||||
len > 0 && len < 100
|
len > 0 && len < 100
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if this node is an H1 or H2 element whose content is mostly
|
|
||||||
// the same as the article title.
|
|
||||||
fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
|
|
||||||
let name = node.get_name().to_lowercase();
|
|
||||||
if name != "h1" && name != "h2" {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
let heading = Util::get_inner_text(node, false);
|
|
||||||
|
|
||||||
if let Some(title) = title {
|
|
||||||
Util::text_similarity(title, &heading) > 0.75
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize a node with the readability object. Also checks the
|
// Initialize a node with the readability object. Also checks the
|
||||||
// className/id for special names to add to its score.
|
// className/id for special names to add to its score.
|
||||||
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
|
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
|
||||||
|
|
|
@ -22,7 +22,7 @@ async fn run_test(name: &str) {
|
||||||
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
|
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
|
||||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||||
|
|
||||||
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
|
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
|
||||||
let mut article = Article {
|
let mut article = Article {
|
||||||
title: None,
|
title: None,
|
||||||
author: None,
|
author: None,
|
||||||
|
|
|
@ -361,6 +361,22 @@ impl Util {
|
||||||
1.0 - distance_b
|
1.0 - distance_b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if this node is an H1 or H2 element whose content is mostly
|
||||||
|
// the same as the article title.
|
||||||
|
pub fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
|
||||||
|
let name = node.get_name().to_lowercase();
|
||||||
|
if name != "h1" && name != "h2" {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
let heading = Util::get_inner_text(node, false);
|
||||||
|
|
||||||
|
if let Some(title) = title {
|
||||||
|
Util::text_similarity(title, &heading) > 0.75
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn has_any_descendent_tag(node: &Node, tag_names: &HashSet<&str>) -> bool {
|
pub fn has_any_descendent_tag(node: &Node, tag_names: &HashSet<&str>) -> bool {
|
||||||
let children = node.get_child_elements();
|
let children = node.get_child_elements();
|
||||||
let is_direct_child = children
|
let is_direct_child = children
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue