1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

extract thumbnail url

This commit is contained in:
Jan Lukas Gernert 2022-12-11 16:18:03 +01:00
parent 0c8aba4f4a
commit 22e98fdab7
3 changed files with 29 additions and 0 deletions

View file

@ -10,6 +10,7 @@ pub struct Article {
pub url: Url, pub url: Url,
pub date: Option<DateTime<Utc>>, pub date: Option<DateTime<Utc>>,
pub html: Option<String>, pub html: Option<String>,
pub thumbnail_url: Option<String>,
} }
impl Article { impl Article {

View file

@ -76,6 +76,7 @@ impl FullTextParser {
url: url.clone(), url: url.clone(),
date: None, date: None,
html: None, html: None,
thumbnail_url: None,
}; };
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@ -170,6 +171,9 @@ impl FullTextParser {
} }
Self::extract_metadata(&xpath_ctx, config, global_config, article); Self::extract_metadata(&xpath_ctx, config, global_config, article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::strip_junk(&xpath_ctx, config, global_config, url); Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?; Self::extract_body(&xpath_ctx, root, config, global_config)?;
@ -232,6 +236,7 @@ impl FullTextParser {
let document = Self::parse_html(html, config, global_config)?; let document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?; let xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::extract_metadata(&xpath_ctx, config, global_config, article); Self::extract_metadata(&xpath_ctx, config, global_config, article);
Self::check_for_thumbnail(&xpath_ctx, article);
Self::strip_junk(&xpath_ctx, config, global_config, url); Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?; Self::extract_body(&xpath_ctx, root, config, global_config)?;
@ -350,6 +355,23 @@ impl FullTextParser {
conf conf
} }
/// Looks for a thumbnail image advertised in the document's metadata and, if
/// one is found, stores its URL in `article.thumbnail_url`.
///
/// The candidates are queried in priority order and the first lookup that
/// succeeds wins:
///
/// 1. `<meta>` whose `name` contains `twitter:image` (reads `content`)
/// 2. `<meta>` whose `name` contains `og:image`, or whose `property` is
///    exactly `og:image` (reads `content`)
/// 3. `<link>` whose `rel` contains `image_src` (reads `href`)
///
/// A failed lookup is not treated as an error: it simply moves on to the next
/// candidate. When every lookup fails, `article.thumbnail_url` is left
/// untouched. When one succeeds, any previous value is overwritten.
fn check_for_thumbnail(context: &Context, article: &mut Article) {
    // (xpath, attribute holding the image URL), highest priority first.
    const CANDIDATES: [(&str, &str); 3] = [
        ("//meta[contains(@name, 'twitter:image')]", "content"),
        // The Open Graph protocol declares its tags via the `property`
        // attribute, so matching on `name` alone misses spec-compliant pages.
        // `property` is matched exactly so that structured tags such as
        // `og:image:width` are not mistaken for the image URL.
        (
            "//meta[contains(@name, 'og:image') or @property = 'og:image']",
            "content",
        ),
        ("//link[contains(@rel, 'image_src')]", "href"),
    ];

    // `find_map` is lazy: later XPath queries only run if earlier ones failed.
    let thumbnail = CANDIDATES
        .iter()
        .find_map(|&(xpath, attribute)| Self::get_attribute(context, xpath, attribute).ok());

    if let Some(thumb) = thumbnail {
        article.thumbnail_url = Some(thumb);
    }
}
fn fix_lazy_images( fn fix_lazy_images(
context: &Context, context: &Context,
class: &str, class: &str,

View file

@ -17,6 +17,12 @@ async fn golem() {
"HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben" "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
)) ))
); );
assert_eq!(
article.thumbnail_url,
Some(String::from(
"https://www.golem.de/1708/129460-144318-i_rc.jpg"
))
);
assert_eq!(article.author, Some(String::from("Hauke Gierow"))); assert_eq!(article.author, Some(String::from("Hauke Gierow")));
} }