From 0c3946dd5bdc7d87aa5b167e2b7cf052585000c9 Mon Sep 17 00:00:00 2001
From: Felix Buehler
Date: Fri, 29 May 2020 18:55:00 +0200
Subject: [PATCH] fix fmt+lint

---
 src/article.rs      |  3 +-
 src/config/error.rs |  2 +-
 src/config/mod.rs   | 28 +++++++--------
 src/error.rs        |  2 +-
 src/images/error.rs |  2 +-
 src/images/mod.rs   | 24 ++++++-------
 src/lib.rs          | 88 +++++++++++++++++++++------------------
 7 files changed, 68 insertions(+), 81 deletions(-)

diff --git a/src/article.rs b/src/article.rs
index 5d860fd..2ee61a2 100644
--- a/src/article.rs
+++ b/src/article.rs
@@ -1,7 +1,6 @@
 use crate::error::{ScraperError, ScraperErrorKind};
 use chrono::{DateTime, Utc};
 use failure::ResultExt;
-use std;
 use std::io::Write;
 use std::path::PathBuf;
 use url::Url;
@@ -32,6 +31,6 @@ impl Article {
             }
         }
 
-        Err(ScraperErrorKind::Unknown)?
+        Err(ScraperErrorKind::Unknown.into())
     }
 }
diff --git a/src/config/error.rs b/src/config/error.rs
index 71c2982..f2ae18c 100644
--- a/src/config/error.rs
+++ b/src/config/error.rs
@@ -48,7 +48,7 @@ impl From<ConfigErrorKind> for ConfigError {
 
 impl From<Context<ConfigErrorKind>> for ConfigError {
     fn from(inner: Context<ConfigErrorKind>) -> ConfigError {
-        ConfigError { inner: inner }
+        ConfigError { inner }
     }
 }
 
diff --git a/src/config/mod.rs b/src/config/mod.rs
index d7347d7..9ea32ee 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -104,7 +104,7 @@ impl GrabberConfig {
         let mut iterator = buffer.lines().peekable();
         while let Some(Ok(line)) = iterator.next() {
             let line = line.trim();
-            if line.starts_with("#")
+            if line.starts_with('#')
                 || line.starts_with(tidy)
                 || line.starts_with(prune)
                 || line.starts_with(test_url)
@@ -136,8 +136,8 @@ impl GrabberConfig {
             if let Some(to_replace) = value.get(0) {
                 if let Some(replace_with) = value.get(1) {
                     replace_vec.push(Replace {
-                        to_replace: to_replace.to_string(),
-                        replace_with: replace_with.to_string(),
+                        to_replace: (*to_replace).to_string(),
+                        replace_with: (*replace_with).to_string(),
                     });
                 }
             }
@@ -162,22 +162,22 @@ impl GrabberConfig {
             }
         }
 
-        if xpath_body.len() == 0 {
+        if xpath_body.is_empty() {
             warn!("No body xpath found for {}", config_path.display());
-            Err(ConfigErrorKind::BadConfig)?
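Reviewer note: the recurring change in this patch replaces `Err(kind)?` with an explicit `Err(kind.into())` (or `return Err(kind.into());`), which is what clippy's try_err lint asks for. Both spellings go through the same `From` conversions the error modules define. A minimal sketch of why they are equivalent, using hypothetical stand-in types rather than this crate's real error types:

// Hypothetical stand-in types, not the crate's.
#[derive(Debug)]
struct MyError;

enum MyErrorKind {
    BadConfig,
}

impl From<MyErrorKind> for MyError {
    fn from(_kind: MyErrorKind) -> MyError {
        MyError
    }
}

// Before: `?` applied to an Err value converts the error via From and
// returns early, but hides that this is a plain early return.
fn before() -> Result<(), MyError> {
    Err(MyErrorKind::BadConfig)?
}

// After: the conversion is spelled out; the behavior is identical.
fn after() -> Result<(), MyError> {
    Err(MyErrorKind::BadConfig.into())
}

fn main() {
    assert!(before().is_err());
    assert!(after().is_err());
}

Same conversion path in both cases; the lint-clean form just avoids suggesting control flow that is not there.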
+            return Err(ConfigErrorKind::BadConfig.into());
         }
 
         let config = GrabberConfig {
-            xpath_title: xpath_title,
-            xpath_author: xpath_author,
-            xpath_date: xpath_date,
-            xpath_body: xpath_body,
-            xpath_strip: xpath_strip,
-            strip_id_or_class: strip_id_or_class,
-            strip_image_src: strip_image_src,
+            xpath_title,
+            xpath_author,
+            xpath_date,
+            xpath_body,
+            xpath_strip,
+            strip_id_or_class,
+            strip_image_src,
             replace: replace_vec,
-            single_page_link: single_page_link,
-            next_page_link: next_page_link,
+            single_page_link,
+            next_page_link,
         };
 
         Ok(config)
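Reviewer note: the `xpath_title,`-style hunks above and the `ConfigError { inner }` hunks below are clippy's redundant_field_names fix, and the `.is_err()` hunks in src/images/mod.rs are its redundant_pattern_matching fix. A compact sketch of both, again with made-up types:

struct ConfigLikeError {
    inner: u32,
}

fn from(inner: u32) -> ConfigLikeError {
    // redundant_field_names: `inner: inner` repeats the binding name,
    // so the shorthand initializer is preferred.
    ConfigLikeError { inner }
}

fn set_property(ok: bool) -> Result<(), ()> {
    if ok { Ok(()) } else { Err(()) }
}

fn apply(ok: bool) -> Result<(), String> {
    // redundant_pattern_matching: `if let Err(_) = ...` only tests the
    // variant, which `.is_err()` expresses without a pattern match.
    if set_property(ok).is_err() {
        return Err("HtmlParse".to_string());
    }
    Ok(())
}

fn main() {
    assert_eq!(from(1).inner, 1);
    assert!(apply(true).is_ok());
    assert!(apply(false).is_err());
}

The dropped `use std;`, `use base64;`, `use image;`, `use regex;` and `use url;` lines are 2018-edition cleanup: extern-crate-style imports are unnecessary when the paths are written fully qualified.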
diff --git a/src/error.rs b/src/error.rs
index 71a287c..82473c1 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -58,7 +58,7 @@ impl From<ScraperErrorKind> for ScraperError {
 
 impl From<Context<ScraperErrorKind>> for ScraperError {
     fn from(inner: Context<ScraperErrorKind>) -> ScraperError {
-        ScraperError { inner: inner }
+        ScraperError { inner }
     }
 }
 
diff --git a/src/images/error.rs b/src/images/error.rs
index 49b664e..34f3480 100644
--- a/src/images/error.rs
+++ b/src/images/error.rs
@@ -63,7 +63,7 @@ impl From<ImageDownloadErrorKind> for ImageDownloadError {
 
 impl From<Context<ImageDownloadErrorKind>> for ImageDownloadError {
     fn from(inner: Context<ImageDownloadErrorKind>) -> ImageDownloadError {
-        ImageDownloadError { inner: inner }
+        ImageDownloadError { inner }
     }
 }
 
diff --git a/src/images/mod.rs b/src/images/mod.rs
index 964cfd1..39b8498 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -1,14 +1,11 @@
 use self::error::{ImageDownloadError, ImageDownloadErrorKind};
 use crate::ArticleScraper;
-use base64;
 use failure::ResultExt;
-use image;
 use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
 use reqwest::{Client, Response};
-use url;
 
 mod error;
 
@@ -73,12 +70,12 @@ impl ImageDownloader {
         if let Ok((small_image, big_image)) =
             self.save_image(&url, &parent_url, client).await
         {
-            if let Err(_) = node.set_property("src", &small_image) {
-                return Err(ImageDownloadErrorKind::HtmlParse)?;
+            if node.set_property("src", &small_image).is_err() {
+                return Err(ImageDownloadErrorKind::HtmlParse.into());
             }
             if let Some(big_image) = big_image {
-                if let Err(_) = node.set_property("big-src", &big_image) {
-                    return Err(ImageDownloadErrorKind::HtmlParse)?;
+                if node.set_property("big-src", &big_image).is_err() {
+                    return Err(ImageDownloadErrorKind::HtmlParse.into());
                 }
             }
         }
@@ -195,10 +192,10 @@ impl ImageDownloader {
             }
 
             error!("{} is not an image", response.url());
-            return Err(ImageDownloadErrorKind::ContentType)?;
+            return Err(ImageDownloadErrorKind::ContentType.into());
         }
 
-        Err(ImageDownloadErrorKind::Http)?
+        Err(ImageDownloadErrorKind::Http.into())
     }
 
     fn scale_image(
@@ -297,7 +294,7 @@ impl ImageDownloader {
         }
 
         debug!("Image parent element not relevant");
-        Err(ImageDownloadErrorKind::ParentDownload)?
+        Err(ImageDownloadErrorKind::ParentDownload.into())
     }
 
     fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
@@ -310,7 +307,7 @@ impl ImageDownloader {
                 }
             }
         }
-        Err(ImageDownloadErrorKind::ContentLenght)?
+        Err(ImageDownloadErrorKind::ContentLenght.into())
     }
 }
 
@@ -330,9 +327,8 @@ mod tests {
             .download_images_from_string(&hdyleaflet, &Client::new())
            .await
            .expect("Failed to downalod images");
-        let mut file =
-            fs::File::create(r"./test_output/fedora31_images_downloaded.html")
-                .expect("Failed to create output file");
+        let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")
+            .expect("Failed to create output file");
         file.write_all(result.as_bytes())
             .expect("Failed to write result to file");
     }
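Reviewer note: the pagination rewrite in the next file is clippy's while_let_loop fix: a `loop` whose body is only `if let ... else break` is a `while let` in disguise. A runnable sketch with a stand-in queue in place of check_for_next_page():

fn drain(mut pages: Vec<u32>) -> u32 {
    let mut sum = 0;

    // Before:
    //
    // loop {
    //     if let Some(page) = pages.pop() {
    //         sum += page;
    //     } else {
    //         break;
    //     }
    // }

    // After: identical control flow, one level less nesting.
    while let Some(page) = pages.pop() {
        sum += page;
    }

    sum
}

fn main() {
    assert_eq!(drain(vec![1, 2, 3]), 6);
}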
diff --git a/src/lib.rs b/src/lib.rs
index c16fa49..73ebbe3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,14 +14,12 @@ use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
-use regex;
 use reqwest::{Client, Response};
 use std::collections;
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::{Arc, RwLock};
 use std::thread;
-use url;
 
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
@@ -79,7 +77,7 @@ impl ArticleScraper {
 
         // check if we are dealing with text/html
         if !ArticleScraper::check_content_type(&response)? {
-            return Err(ScraperErrorKind::ContentType)?;
+            return Err(ScraperErrorKind::ContentType.into());
         }
 
         // check if we have a config for the url
@@ -181,16 +179,12 @@ impl ArticleScraper {
         ArticleScraper::strip_junk(&xpath_ctx, config, &url);
         ArticleScraper::extract_body(&xpath_ctx, root, config)?;
 
-        loop {
-            if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, client).await?;
-                document = Self::parse_html(html, config)?;
-                xpath_ctx = Self::get_xpath_ctx(&document)?;
-                ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-                ArticleScraper::extract_body(&xpath_ctx, root, config)?;
-            } else {
-                break;
-            }
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
+            let html = ArticleScraper::download(&url, client).await?;
+            document = Self::parse_html(html, config)?;
+            xpath_ctx = Self::get_xpath_ctx(&document)?;
+            ArticleScraper::strip_junk(&xpath_ctx, config, &url);
+            ArticleScraper::extract_body(&xpath_ctx, root, config)?;
         }
 
         Ok(())
@@ -231,10 +225,10 @@ impl ArticleScraper {
 
         let node_vec = res.get_nodes_as_vec();
 
-        if node_vec.len() == 0 {
+        if node_vec.is_empty() {
             debug!("Evaluation of xpath '{}' yielded no results", xpath);
             if thorw_if_empty {
-                return Err(ScraperErrorKind::Xml)?;
+                return Err(ScraperErrorKind::Xml.into());
             }
         }
 
@@ -297,7 +291,7 @@ impl ArticleScraper {
             return Ok(text);
         }
 
-        Err(ScraperErrorKind::Http)?
+        Err(ScraperErrorKind::Http.into())
     }
 
     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
@@ -349,7 +343,7 @@ impl ArticleScraper {
             }
             None => {
                 error!("Getting config failed due to bad Url");
-                return Err(ScraperErrorKind::Config)?;
+                return Err(ScraperErrorKind::Config.into());
             }
         };
 
@@ -357,15 +351,15 @@ impl ArticleScraper {
 
         if let Some(config_files) = &*self.config_files.read().unwrap() {
             match config_files.get(&config_name) {
-                Some(config) => return Ok(config.clone()),
+                Some(config) => Ok(config.clone()),
                 None => {
                     error!("No config file of the name '{}' fount", config_name);
-                    Err(ScraperErrorKind::Config)?
+                    Err(ScraperErrorKind::Config.into())
                 }
             }
         } else {
             error!("Config files have not been parsed yet.");
-            return Err(ScraperErrorKind::Config)?;
+            Err(ScraperErrorKind::Config.into())
         }
     }
 
@@ -384,7 +378,7 @@ impl ArticleScraper {
         }
 
         error!("Failed to determine content type");
-        Err(ScraperErrorKind::Http)?
+        Err(ScraperErrorKind::Http.into())
     }
 
     fn check_redirect(response: &Response) -> Option<url::Url> {
@@ -402,7 +396,7 @@ impl ArticleScraper {
             return Ok(val.get_content());
         }
 
-        Err(ScraperErrorKind::Xml)?
+        Err(ScraperErrorKind::Xml.into())
     }
 
     fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
@@ -412,11 +406,11 @@ impl ArticleScraper {
             val.push_str(&node.get_content());
         }
 
-        return Ok(val.trim().to_string());
+        Ok(val.trim().to_string())
    }
 
-    fn strip_node(context: &Context, xpath: &String) -> Result<(), ScraperError> {
-        let mut ancestor = xpath.clone();
+    fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
+        let mut ancestor = xpath.to_string();
         if ancestor.starts_with("//") {
             ancestor = ancestor.chars().skip(2).collect();
         }
@@ -429,7 +423,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    fn strip_id_or_class(context: &Context, id_or_class: &String) -> Result<(), ScraperError> {
+    fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
         let xpath = &format!(
             "//*[contains(@class, '{}') or contains(@id, '{}')]",
             id_or_class, id_or_class
@@ -457,8 +451,8 @@ impl ArticleScraper {
         let node_vec = Self::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Some(correct_url) = node.get_property(property_url) {
-                if let Err(_) = node.set_property("src", &correct_url) {
-                    return Err(ScraperErrorKind::Xml)?;
+                if node.set_property("src", &correct_url).is_err() {
+                    return Err(ScraperErrorKind::Xml.into());
                 }
             }
         }
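Reviewer note: strip_node and strip_id_or_class changing `&String` to `&str` is clippy's ptr_arg fix. A sketch with a hypothetical helper (not this crate's) showing why `&str` is the more general signature; deref coercion lets callers pass `&String`, literals, or slices alike:

// was: fn strip_prefix(xpath: &String) -> String { xpath.clone() }
fn strip_prefix(xpath: &str) -> String {
    // Same body works for both, but this version no longer forces the
    // caller to own a heap-allocated String just to make the call.
    xpath.to_string()
}

fn main() {
    let owned = String::from("//article/div");
    assert_eq!(strip_prefix(&owned), strip_prefix("//article/div"));
}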
as parent of iframe"); - return Err(ScraperErrorKind::Xml)?; + return Err(ScraperErrorKind::Xml.into()); } error!("Failed to get parent of iframe"); - return Err(ScraperErrorKind::Xml)?; + return Err(ScraperErrorKind::Xml.into()); } Ok(()) } @@ -507,8 +501,8 @@ impl ArticleScraper { let xpath = &format!("//{}[@{}]", xpath_tag, attribute); let node_vec = Self::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - if let Err(_) = node.remove_property(attribute) { - return Err(ScraperErrorKind::Xml)?; + if node.remove_property(attribute).is_err() { + return Err(ScraperErrorKind::Xml.into()); } } Ok(()) @@ -528,8 +522,8 @@ impl ArticleScraper { let xpath = &format!("//{}", xpath_tag); let node_vec = Self::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - if let Err(_) = node.set_attribute(attribute, value) { - return Err(ScraperErrorKind::Xml)?; + if node.set_attribute(attribute, value).is_err() { + return Err(ScraperErrorKind::Xml.into()); } } Ok(()) @@ -547,7 +541,7 @@ impl ArticleScraper { } } - Err(ScraperErrorKind::Xml)? + Err(ScraperErrorKind::Xml.into()) } fn repair_urls( @@ -561,8 +555,8 @@ impl ArticleScraper { if let Some(val) = node.get_attribute(attribute) { if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) { if let Ok(fixed_url) = ArticleScraper::complete_url(article_url, &val) { - if let Err(_) = node.set_attribute(attribute, fixed_url.as_str()) { - return Err(ScraperErrorKind::Xml)?; + if node.set_attribute(attribute, fixed_url.as_str()).is_err() { + return Err(ScraperErrorKind::Scrape.into()); } } } @@ -584,7 +578,7 @@ impl ArticleScraper { completed_url.push_str("//"); completed_url.push_str(host); } - _ => return Err(ScraperErrorKind::Url)?, + _ => return Err(ScraperErrorKind::Scrape.into()), }; } @@ -593,7 +587,7 @@ impl ArticleScraper { } completed_url.push_str(incomplete_url); let url = url::Url::parse(&completed_url).context(ScraperErrorKind::Url)?; - return Ok(url); + Ok(url) } fn strip_junk(context: &Context, config: &GrabberConfig, url: &url::Url) { @@ -698,7 +692,7 @@ impl ArticleScraper { } if !found_something { - return Err(ScraperErrorKind::Scrape)?; + return Err(ScraperErrorKind::Scrape.into()); } Ok(()) @@ -713,18 +707,16 @@ impl ArticleScraper { { let node_vec = Self::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - if node.get_property("style").is_some() { - if let Err(_) = node.remove_property("style") { - return Err(ScraperErrorKind::Xml)?; - } + if node.get_property("style").is_some() && node.remove_property("style").is_err() { + return Err(ScraperErrorKind::Xml.into()); } node.unlink(); - if let Ok(_) = root.add_child(&mut node) { + if root.add_child(&mut node).is_ok() { found_something = true; } else { error!("Failed to add body to prepared document"); - return Err(ScraperErrorKind::Xml)?; + return Err(ScraperErrorKind::Xml.into()); } } } @@ -751,14 +743,14 @@ impl ArticleScraper { if let Ok(mut head_node) = Node::new("head", None, document) { if let Ok(()) = root.add_prev_sibling(&mut head_node) { if let Ok(mut meta) = head_node.new_child(None, "meta") { - if let Ok(_) = meta.set_property("charset", "utf-8") { + if meta.set_property("charset", "utf-8").is_ok() { return Ok(()); } } } } - Err(ScraperErrorKind::Xml)? + Err(ScraperErrorKind::Xml.into()) } fn prevent_self_closing_tags(context: &Context) -> Result<(), ScraperError> {