From 22e98fdab7da3ed7fc44ca8a686f95d821baa946 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 11 Dec 2022 16:18:03 +0100 Subject: [PATCH] extract thumbnail url --- src/article.rs | 1 + src/full_text_parser/mod.rs | 22 ++++++++++++++++++++++ src/full_text_parser/tests.rs | 6 ++++++ 3 files changed, 29 insertions(+) diff --git a/src/article.rs b/src/article.rs index dbe2738..3d56213 100644 --- a/src/article.rs +++ b/src/article.rs @@ -10,6 +10,7 @@ pub struct Article { pub url: Url, pub date: Option>, pub html: Option, + pub thumbnail_url: Option, } impl Article { diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 5067d30..c5ae3ef 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -76,6 +76,7 @@ impl FullTextParser { url: url.clone(), date: None, html: None, + thumbnail_url: None, }; let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; @@ -170,6 +171,9 @@ impl FullTextParser { } Self::extract_metadata(&xpath_ctx, config, global_config, article); + if article.thumbnail_url.is_none() { + Self::check_for_thumbnail(&xpath_ctx, article); + } Self::strip_junk(&xpath_ctx, config, global_config, url); Self::extract_body(&xpath_ctx, root, config, global_config)?; @@ -232,6 +236,7 @@ impl FullTextParser { let document = Self::parse_html(html, config, global_config)?; let xpath_ctx = Self::get_xpath_ctx(&document)?; Self::extract_metadata(&xpath_ctx, config, global_config, article); + Self::check_for_thumbnail(&xpath_ctx, article); Self::strip_junk(&xpath_ctx, config, global_config, url); Self::extract_body(&xpath_ctx, root, config, global_config)?; @@ -350,6 +355,23 @@ impl FullTextParser { conf } + fn check_for_thumbnail(context: &Context, article: &mut Article) { + if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'twitter:image')]", "content").ok() { + article.thumbnail_url = Some(thumb); + return + } + + if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content").ok() { + article.thumbnail_url = Some(thumb); + return + } + + if let Some(thumb) = Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href").ok() { + article.thumbnail_url = Some(thumb); + return + } + } + fn fix_lazy_images( context: &Context, class: &str, diff --git a/src/full_text_parser/tests.rs b/src/full_text_parser/tests.rs index f720312..896fa55 100644 --- a/src/full_text_parser/tests.rs +++ b/src/full_text_parser/tests.rs @@ -17,6 +17,12 @@ async fn golem() { "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben" )) ); + assert_eq!( + article.thumbnail_url, + Some(String::from( + "https://www.golem.de/1708/129460-144318-i_rc.jpg" + )) + ); assert_eq!(article.author, Some(String::from("Hauke Gierow"))); }