mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
extract thumbnail url
This commit is contained in:
parent
0c8aba4f4a
commit
22e98fdab7
3 changed files with 29 additions and 0 deletions
|
@ -10,6 +10,7 @@ pub struct Article {
|
||||||
pub url: Url,
|
pub url: Url,
|
||||||
pub date: Option<DateTime<Utc>>,
|
pub date: Option<DateTime<Utc>>,
|
||||||
pub html: Option<String>,
|
pub html: Option<String>,
|
||||||
|
pub thumbnail_url: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Article {
|
impl Article {
|
||||||
|
|
|
@ -76,6 +76,7 @@ impl FullTextParser {
|
||||||
url: url.clone(),
|
url: url.clone(),
|
||||||
date: None,
|
date: None,
|
||||||
html: None,
|
html: None,
|
||||||
|
thumbnail_url: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
|
@ -170,6 +171,9 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
||||||
|
if article.thumbnail_url.is_none() {
|
||||||
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
|
}
|
||||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
|
@ -232,6 +236,7 @@ impl FullTextParser {
|
||||||
let document = Self::parse_html(html, config, global_config)?;
|
let document = Self::parse_html(html, config, global_config)?;
|
||||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
||||||
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
|
@ -350,6 +355,23 @@ impl FullTextParser {
|
||||||
conf
|
conf
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn check_for_thumbnail(context: &Context, article: &mut Article) {
|
||||||
|
if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'twitter:image')]", "content").ok() {
|
||||||
|
article.thumbnail_url = Some(thumb);
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(thumb) = Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content").ok() {
|
||||||
|
article.thumbnail_url = Some(thumb);
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(thumb) = Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href").ok() {
|
||||||
|
article.thumbnail_url = Some(thumb);
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn fix_lazy_images(
|
fn fix_lazy_images(
|
||||||
context: &Context,
|
context: &Context,
|
||||||
class: &str,
|
class: &str,
|
||||||
|
|
|
@ -17,6 +17,12 @@ async fn golem() {
|
||||||
"HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
|
"HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
|
||||||
))
|
))
|
||||||
);
|
);
|
||||||
|
assert_eq!(
|
||||||
|
article.thumbnail_url,
|
||||||
|
Some(String::from(
|
||||||
|
"https://www.golem.de/1708/129460-144318-i_rc.jpg"
|
||||||
|
))
|
||||||
|
);
|
||||||
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
|
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue