mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
extract thumbnail url
This commit is contained in:
parent
0c8aba4f4a
commit
22e98fdab7
3 changed files with 29 additions and 0 deletions
|
@ -10,6 +10,7 @@ pub struct Article {
|
|||
pub url: Url,
|
||||
pub date: Option<DateTime<Utc>>,
|
||||
pub html: Option<String>,
|
||||
pub thumbnail_url: Option<String>,
|
||||
}
|
||||
|
||||
impl Article {
|
||||
|
|
|
@ -76,6 +76,7 @@ impl FullTextParser {
|
|||
url: url.clone(),
|
||||
date: None,
|
||||
html: None,
|
||||
thumbnail_url: None,
|
||||
};
|
||||
|
||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
|
@ -170,6 +171,9 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
||||
if article.thumbnail_url.is_none() {
|
||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
}
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
|
@ -232,6 +236,7 @@ impl FullTextParser {
|
|||
let document = Self::parse_html(html, config, global_config)?;
|
||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
|
@ -350,6 +355,23 @@ impl FullTextParser {
|
|||
conf
|
||||
}
|
||||
|
||||
fn check_for_thumbnail(context: &Context, article: &mut Article) {
|
||||
if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'twitter:image')]", "content").ok() {
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return
|
||||
}
|
||||
|
||||
if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content").ok() {
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return
|
||||
}
|
||||
|
||||
if let Some(thumb) = Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href").ok() {
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
fn fix_lazy_images(
|
||||
context: &Context,
|
||||
class: &str,
|
||||
|
|
|
@ -17,6 +17,12 @@ async fn golem() {
|
|||
"HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
|
||||
))
|
||||
);
|
||||
assert_eq!(
|
||||
article.thumbnail_url,
|
||||
Some(String::from(
|
||||
"https://www.golem.de/1708/129460-144318-i_rc.jpg"
|
||||
))
|
||||
);
|
||||
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue