1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

improve title extraction

This commit is contained in:
Jan Lukas Gernert 2023-02-20 02:32:58 +01:00
parent cce912c354
commit 98c06e11f4
7 changed files with 107 additions and 54 deletions

View file

@@ -174,7 +174,7 @@ impl FullTextParser {
}
}
-metadata::extract(&xpath_ctx, config, global_config, article);
+metadata::extract(&xpath_ctx, config, Some(global_config), article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
}
@@ -182,7 +182,8 @@ impl FullTextParser {
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body {
-if let Err(error) = Readability::extract_body(document, root) {
+if let Err(error) = Readability::extract_body(document, root, article.title.as_deref())
+{
log::error!("Both ftr and readability failed to find content: {}", error);
return Err(error);
}
@@ -246,7 +247,7 @@ impl FullTextParser {
let html = Self::download(url, client, headers).await?;
let document = Self::parse_html(&html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
-metadata::extract(&xpath_ctx, config, global_config, article);
+metadata::extract(&xpath_ctx, config, Some(global_config), article);
Self::check_for_thumbnail(&xpath_ctx, article);
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?;