1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

clippy fixes

This commit is contained in:
Jan Lukas Gernert 2022-10-07 09:20:10 +02:00
parent 8c2af14871
commit 69659da983
5 changed files with 51 additions and 54 deletions

View file

@ -18,7 +18,7 @@ impl Article {
if let Some(ref html) = self.html { if let Some(ref html) = self.html {
if let Ok(()) = std::fs::create_dir_all(&path) { if let Ok(()) = std::fs::create_dir_all(&path) {
let mut file_name = match self.title.clone() { let mut file_name = match self.title.clone() {
Some(file_name) => file_name.replace("/", "_"), Some(file_name) => file_name.replace('/', "_"),
None => "Unknown Title".to_owned(), None => "Unknown Title".to_owned(),
}; };
file_name.push_str(".html"); file_name.push_str(".html");

View file

@ -120,7 +120,7 @@ impl ConfigEntry {
continue; continue;
} }
if let Some(to_replace) = value.get(0) { if let Some(to_replace) = value.first() {
if let Some(replace_with) = value.get(1) { if let Some(replace_with) = value.get(1) {
replace_vec.push(Replace { replace_vec.push(Replace {
to_replace: (*to_replace).to_string(), to_replace: (*to_replace).to_string(),
@ -139,7 +139,7 @@ impl ConfigEntry {
continue; continue;
} }
if let Some(name) = value.get(0) { if let Some(name) = value.first() {
if let Some(value) = value.get(1) { if let Some(value) = value.get(1) {
header_vec.push(Header { header_vec.push(Header {
name: (*name).to_string(), name: (*name).to_string(),

View file

@ -154,10 +154,7 @@ impl ImageDownloader {
} }
let small_image_base64 = base64::encode(&small_image); let small_image_base64 = base64::encode(&small_image);
let big_image_base64 = match big_image { let big_image_base64 = big_image.map(base64::encode);
Some(big_image) => Some(base64::encode(&big_image)),
None => None,
};
let small_image_string = let small_image_string =
format!("data:{};base64,{}", content_type_small, small_image_base64); format!("data:{};base64,{}", content_type_small, small_image_base64);
let big_image_string = match big_image_base64 { let big_image_string = match big_image_base64 {

View file

@ -49,17 +49,17 @@ impl ArticleScraper {
// custom youtube handling, but prefer config if exists // custom youtube handling, but prefer config if exists
if !self.config_files.contains_config("youtube.com.txt") { if !self.config_files.contains_config("youtube.com.txt") {
if let Some(article) = youtube::Youtube::handle(&url) { if let Some(article) = youtube::Youtube::handle(url) {
return Ok(article); return Ok(article);
} }
} }
// check if we have a config for the url // check if we have a config for the url
let config = self.get_grabber_config(&url); let config = self.get_grabber_config(url);
let global_config = self let global_config = self
.config_files .config_files
.get("global.txt") .get("global.txt")
.ok_or_else(|| ScraperErrorKind::Config)?; .ok_or(ScraperErrorKind::Config)?;
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
@ -75,7 +75,7 @@ impl ArticleScraper {
.context(ScraperErrorKind::Http)?; .context(ScraperErrorKind::Http)?;
// check if url redirects and we need to pick up the new url // check if url redirects and we need to pick up the new url
let url = if let Some(new_url) = Util::check_redirect(&response, &url) { let url = if let Some(new_url) = Util::check_redirect(&response, url) {
debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str()); debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str());
new_url new_url
} else { } else {
@ -153,7 +153,7 @@ impl ArticleScraper {
client: &Client, client: &Client,
) -> Result<(), ScraperError> { ) -> Result<(), ScraperError> {
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = ArticleScraper::download(&url, client, headers).await?; let html = ArticleScraper::download(url, client, headers).await?;
let mut document = Self::parse_html(html, config, global_config)?; let mut document = Self::parse_html(html, config, global_config)?;
let mut xpath_ctx = Self::get_xpath_ctx(&document)?; let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
@ -168,7 +168,7 @@ impl ArticleScraper {
xpath_single_page_link xpath_single_page_link
); );
if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, &xpath_single_page_link) if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, xpath_single_page_link)
{ {
// parse again with single page url // parse again with single page url
debug!("Single page link found '{}'", single_page_url); debug!("Single page link found '{}'", single_page_url);
@ -187,7 +187,7 @@ impl ArticleScraper {
} }
ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article); ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url); ArticleScraper::strip_junk(&xpath_ctx, config, global_config, url);
ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?; ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
@ -229,7 +229,7 @@ impl ArticleScraper {
} }
fn get_xpath_ctx(doc: &Document) -> Result<Context, ScraperError> { fn get_xpath_ctx(doc: &Document) -> Result<Context, ScraperError> {
Ok(Context::new(&doc).map_err(|()| { Ok(Context::new(doc).map_err(|()| {
error!("Creating xpath context failed for downloaded HTML"); error!("Creating xpath context failed for downloaded HTML");
ScraperErrorKind::Xml ScraperErrorKind::Xml
})?) })?)
@ -245,11 +245,11 @@ impl ArticleScraper {
client: &Client, client: &Client,
) -> Result<(), ScraperError> { ) -> Result<(), ScraperError> {
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = ArticleScraper::download(&url, client, headers).await?; let html = ArticleScraper::download(url, client, headers).await?;
let document = Self::parse_html(html, config, global_config)?; let document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?; let xpath_ctx = Self::get_xpath_ctx(&document)?;
ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article); ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url); ArticleScraper::strip_junk(&xpath_ctx, config, global_config, url);
ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?; ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
Ok(()) Ok(())
@ -521,30 +521,30 @@ impl ArticleScraper {
// strip specified xpath // strip specified xpath
if let Some(config) = config { if let Some(config) = config {
for xpath_strip in &config.xpath_strip { for xpath_strip in &config.xpath_strip {
let _ = Util::strip_node(&context, xpath_strip); let _ = Util::strip_node(context, xpath_strip);
} }
} }
for xpath_strip in &global_config.xpath_strip { for xpath_strip in &global_config.xpath_strip {
let _ = Util::strip_node(&context, xpath_strip); let _ = Util::strip_node(context, xpath_strip);
} }
// strip everything with specified 'id' or 'class' // strip everything with specified 'id' or 'class'
if let Some(config) = config { if let Some(config) = config {
for xpaht_strip_class in &config.strip_id_or_class { for xpaht_strip_class in &config.strip_id_or_class {
let _ = Util::strip_id_or_class(&context, xpaht_strip_class); let _ = Util::strip_id_or_class(context, xpaht_strip_class);
} }
} }
for xpaht_strip_class in &global_config.strip_id_or_class { for xpaht_strip_class in &global_config.strip_id_or_class {
let _ = Util::strip_id_or_class(&context, xpaht_strip_class); let _ = Util::strip_id_or_class(context, xpaht_strip_class);
} }
// strip any <img> element where @src attribute contains this substring // strip any <img> element where @src attribute contains this substring
if let Some(config) = config { if let Some(config) = config {
for xpath_strip_img_src in &config.strip_image_src { for xpath_strip_img_src in &config.strip_image_src {
let _ = Util::strip_node( let _ = Util::strip_node(
&context, context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src), &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
); );
} }
@ -552,45 +552,45 @@ impl ArticleScraper {
for xpath_strip_img_src in &global_config.strip_image_src { for xpath_strip_img_src in &global_config.strip_image_src {
let _ = Util::strip_node( let _ = Util::strip_node(
&context, context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src), &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
); );
} }
let _ = ArticleScraper::fix_lazy_images(&context, "lazyload", "data-src"); let _ = ArticleScraper::fix_lazy_images(context, "lazyload", "data-src");
let _ = ArticleScraper::fix_iframe_size(&context, "youtube.com"); let _ = ArticleScraper::fix_iframe_size(context, "youtube.com");
let _ = ArticleScraper::remove_attribute(&context, None, "style"); let _ = ArticleScraper::remove_attribute(context, None, "style");
let _ = ArticleScraper::remove_attribute(&context, Some("a"), "onclick"); let _ = ArticleScraper::remove_attribute(context, Some("a"), "onclick");
let _ = ArticleScraper::remove_attribute(&context, Some("img"), "srcset"); let _ = ArticleScraper::remove_attribute(context, Some("img"), "srcset");
let _ = ArticleScraper::remove_attribute(&context, Some("img"), "sizes"); let _ = ArticleScraper::remove_attribute(context, Some("img"), "sizes");
let _ = ArticleScraper::add_attribute(&context, Some("a"), "target", "_blank"); let _ = ArticleScraper::add_attribute(context, Some("a"), "target", "_blank");
let _ = ArticleScraper::repair_urls(&context, "//img", "src", &url); let _ = ArticleScraper::repair_urls(context, "//img", "src", url);
let _ = ArticleScraper::repair_urls(&context, "//a", "src", &url); let _ = ArticleScraper::repair_urls(context, "//a", "src", url);
let _ = ArticleScraper::repair_urls(&context, "//a", "href", &url); let _ = ArticleScraper::repair_urls(context, "//a", "href", url);
let _ = ArticleScraper::repair_urls(&context, "//object", "data", &url); let _ = ArticleScraper::repair_urls(context, "//object", "data", url);
let _ = ArticleScraper::repair_urls(&context, "//iframe", "src", &url); let _ = ArticleScraper::repair_urls(context, "//iframe", "src", url);
// strip elements using Readability.com and Instapaper.com ignore class names // strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore // .entry-unrelated and .instapaper_ignore
// See http://blog.instapaper.com/post/730281947 // See http://blog.instapaper.com/post/730281947
let _ = Util::strip_node(&context, &String::from( let _ = Util::strip_node(context,
"//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]")); "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]");
// strip elements that contain style="display: none;" // strip elements that contain style="display: none;"
let _ = Util::strip_node( let _ = Util::strip_node(
&context, context,
&String::from("//*[contains(@style,'display:none')]"), "//*[contains(@style,'display:none')]",
); );
// strip all comments // strip all comments
let _ = Util::strip_node(&context, &String::from("//comment()")); let _ = Util::strip_node(context, "//comment()");
// strip all empty url-tags <a/> // strip all empty url-tags <a/>
let _ = Util::strip_node(&context, &String::from("//a[not(node())]")); let _ = Util::strip_node(context, "//a[not(node())]");
// strip all external css and fonts // strip all external css and fonts
let _ = Util::strip_node(&context, &String::from("//*[@type='text/css']")); let _ = Util::strip_node(context, "//*[@type='text/css']");
} }
fn extract_metadata( fn extract_metadata(
@ -602,7 +602,7 @@ impl ArticleScraper {
// try to get title // try to get title
if let Some(config) = config { if let Some(config) = config {
for xpath_title in &config.xpath_title { for xpath_title in &config.xpath_title {
if let Ok(title) = Util::extract_value_merge(&context, xpath_title) { if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title); debug!("Article title: '{}'", title);
article.title = Some(title); article.title = Some(title);
break; break;
@ -612,7 +612,7 @@ impl ArticleScraper {
if article.title.is_none() { if article.title.is_none() {
for xpath_title in &global_config.xpath_title { for xpath_title in &global_config.xpath_title {
if let Ok(title) = Util::extract_value_merge(&context, xpath_title) { if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title); debug!("Article title: '{}'", title);
article.title = Some(title); article.title = Some(title);
break; break;
@ -623,7 +623,7 @@ impl ArticleScraper {
// try to get the author // try to get the author
if let Some(config) = config { if let Some(config) = config {
for xpath_author in &config.xpath_author { for xpath_author in &config.xpath_author {
if let Ok(author) = Util::extract_value(&context, xpath_author) { if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author); debug!("Article author: '{}'", author);
article.author = Some(author); article.author = Some(author);
break; break;
@ -633,7 +633,7 @@ impl ArticleScraper {
if article.author.is_none() { if article.author.is_none() {
for xpath_author in &global_config.xpath_author { for xpath_author in &global_config.xpath_author {
if let Ok(author) = Util::extract_value(&context, xpath_author) { if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author); debug!("Article author: '{}'", author);
article.author = Some(author); article.author = Some(author);
break; break;
@ -644,7 +644,7 @@ impl ArticleScraper {
// try to get the date // try to get the date
if let Some(config) = config { if let Some(config) = config {
for xpath_date in &config.xpath_date { for xpath_date in &config.xpath_date {
if let Ok(date_string) = Util::extract_value(&context, xpath_date) { if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string); debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) { if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date); article.date = Some(date);
@ -658,7 +658,7 @@ impl ArticleScraper {
if article.date.is_none() { if article.date.is_none() {
for xpath_date in &global_config.xpath_date { for xpath_date in &global_config.xpath_date {
if let Ok(date_string) = Util::extract_value(&context, xpath_date) { if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string); debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) { if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date); article.date = Some(date);
@ -681,13 +681,13 @@ impl ArticleScraper {
if let Some(config) = config { if let Some(config) = config {
for xpath_body in &config.xpath_body { for xpath_body in &config.xpath_body {
found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?; found_something = ArticleScraper::extract_body_single(context, root, xpath_body)?;
} }
} }
if !found_something { if !found_something {
for xpath_body in &global_config.xpath_body { for xpath_body in &global_config.xpath_body {
found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?; found_something = ArticleScraper::extract_body_single(context, root, xpath_body)?;
} }
} }
@ -734,7 +734,7 @@ impl ArticleScraper {
if let Some(config) = config { if let Some(config) = config {
if let Some(next_page_xpath) = config.next_page_link.as_deref() { if let Some(next_page_xpath) = config.next_page_link.as_deref() {
if let Ok(next_page_string) = if let Ok(next_page_string) =
ArticleScraper::get_attribute(&context, next_page_xpath, "href") ArticleScraper::get_attribute(context, next_page_xpath, "href")
{ {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) { if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url); return Some(next_page_url);
@ -743,7 +743,7 @@ impl ArticleScraper {
} }
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() { } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) = if let Ok(next_page_string) =
ArticleScraper::get_attribute(&context, next_page_xpath, "href") ArticleScraper::get_attribute(context, next_page_xpath, "href")
{ {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) { if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url); return Some(next_page_url);

View file

@ -78,7 +78,7 @@ impl Util {
} }
pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> { pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
let res = Self::evaluate_xpath(&xpath_ctx, &xpath_page_link, false).ok()?; let res = Self::evaluate_xpath(xpath_ctx, xpath_page_link, false).ok()?;
let mut url = None; let mut url = None;
for node in res { for node in res {
@ -168,7 +168,7 @@ impl Util {
.map(|s| format!("{} ", s)) .map(|s| format!("{} ", s))
.collect::<String>(); .collect::<String>();
val.push_str(&part); val.push_str(&part);
val.push_str(" "); val.push(' ');
} }
Ok(val.trim().to_string()) Ok(val.trim().to_string())