diff --git a/src/article.rs b/src/article.rs
index d4204a0..36abbc2 100644
--- a/src/article.rs
+++ b/src/article.rs
@@ -1,13 +1,10 @@
-use std;
-use url::Url;
-use std::path::PathBuf;
+use crate::error::{ScraperError, ScraperErrorKind};
 use chrono::NaiveDateTime;
-use crate::error::{
-    ScraperError,
-    ScraperErrorKind,
-};
-use std::io::Write;
 use failure::ResultExt;
+use std;
+use std::io::Write;
+use std::path::PathBuf;
+use url::Url;
 
 pub struct Article {
     pub title: Option<String>,
@@ -19,7 +16,6 @@ pub struct Article {
 
 impl Article {
     pub fn save_html(&self, path: &PathBuf) -> Result<(), ScraperError> {
-
         if let Some(ref html) = self.html {
             if let Ok(()) = std::fs::create_dir_all(&path) {
                 let mut file_name = match self.title.clone() {
@@ -29,12 +25,13 @@ impl Article {
                 file_name.push_str(".html");
                 let path = path.join(file_name);
                 let mut html_file = std::fs::File::create(&path).context(ScraperErrorKind::IO)?;
-                html_file.write_all(html.as_bytes()).context(ScraperErrorKind::IO)?;
-                return Ok(())
+                html_file
+                    .write_all(html.as_bytes())
+                    .context(ScraperErrorKind::IO)?;
+                return Ok(());
             }
         }
 
         Err(ScraperErrorKind::Unknown)?
     }
-
-}
\ No newline at end of file
+}
diff --git a/src/config/error.rs b/src/config/error.rs
index 4f992d2..71c2982 100644
--- a/src/config/error.rs
+++ b/src/config/error.rs
@@ -1,4 +1,4 @@
-use failure::{Context, Fail, Backtrace, Error};
+use failure::{Backtrace, Context, Error, Fail};
 use std::fmt;
 
 #[derive(Debug)]
@@ -40,7 +40,9 @@ impl fmt::Display for ConfigError {
 
 impl From<ConfigErrorKind> for ConfigError {
     fn from(kind: ConfigErrorKind) -> ConfigError {
-        ConfigError { inner: Context::new(kind) }
+        ConfigError {
+            inner: Context::new(kind),
+        }
     }
 }
 
@@ -52,6 +54,8 @@ impl From<Context<ConfigErrorKind>> for ConfigError {
 
 impl From<Error> for ConfigError {
     fn from(_: Error) -> ConfigError {
-        ConfigError { inner: Context::new(ConfigErrorKind::Unknown) }
+        ConfigError {
+            inner: Context::new(ConfigErrorKind::Unknown),
+        }
     }
-}
\ No newline at end of file
+}
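Both error modules follow the same failure-crate pattern: an opaque error type wrapping a Context<ErrorKind>, with From impls so that `?` can convert. A minimal sketch of the intended call-site usage (the function and path below are illustrative, not part of this patch):

    use failure::ResultExt;

    // Any std::io::Error is wrapped with ScraperErrorKind::IO as context;
    // the From<Context<ScraperErrorKind>> impl then yields a ScraperError.
    fn read_file(path: &std::path::Path) -> Result<String, ScraperError> {
        let html = std::fs::read_to_string(path).context(ScraperErrorKind::IO)?;
        Ok(html)
    }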
diff --git a/src/config/macros.rs b/src/config/macros.rs
index f111bb2..808a5e6 100644
--- a/src/config/macros.rs
+++ b/src/config/macros.rs
@@ -1,43 +1,43 @@
 macro_rules! extract_vec_multi {
-    (
+    (
         $line: ident,
         $identifier: ident,
         $vector: ident
     ) => {
-        if $line.starts_with($identifier) {
-            let value = GrabberConfig::extract_value($identifier, $line);
-            let value = GrabberConfig::split_values(value);
-            let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
-            $vector.extend(value);
-            continue;
-        }
-    };
+        if $line.starts_with($identifier) {
+            let value = GrabberConfig::extract_value($identifier, $line);
+            let value = GrabberConfig::split_values(value);
+            let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
+            $vector.extend(value);
+            continue;
+        }
+    };
 }
 
 macro_rules! extract_vec_single {
-    (
+    (
         $line: ident,
         $identifier: ident,
         $vector: ident
     ) => {
-        if $line.starts_with($identifier) {
-            let value = GrabberConfig::extract_value($identifier, $line);
-            $vector.push(value.to_string());
-            continue;
-        }
-    };
+        if $line.starts_with($identifier) {
+            let value = GrabberConfig::extract_value($identifier, $line);
+            $vector.push(value.to_string());
+            continue;
+        }
+    };
 }
 
 macro_rules! extract_option_single {
-    (
+    (
         $line: ident,
         $identifier: ident,
         $option: ident
     ) => {
-        if $line.starts_with($identifier) {
-            let value = GrabberConfig::extract_value($identifier, $line);
-            $option = Some(value.to_string());
-            continue;
-        }
-    };
-}
\ No newline at end of file
+        if $line.starts_with($identifier) {
+            let value = GrabberConfig::extract_value($identifier, $line);
+            $option = Some(value.to_string());
+            continue;
+        }
+    };
+}
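Each macro consumes one candidate prefix per config line; `extract_vec_single!(line, strip, xpath_strip)`, for example, expands to roughly the following (hand-expanded sketch):

    if line.starts_with(strip) {
        let value = GrabberConfig::extract_value(strip, line);
        xpath_strip.push(value.to_string());
        continue;
    }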
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 3c3abc6..d7347d7 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -1,11 +1,11 @@
+use self::error::{ConfigError, ConfigErrorKind};
+use failure::ResultExt;
+use log::warn;
 use std::collections;
-use std::path::{PathBuf};
 use std::fs;
 use std::io;
 use std::io::BufRead;
-use failure::ResultExt;
-use log::error;
-use self::error::{ConfigError, ConfigErrorKind};
+use std::path::PathBuf;
 
 #[macro_use]
 mod macros;
@@ -13,11 +13,13 @@ mod error;
 
 pub type ConfigCollection = collections::HashMap<String, GrabberConfig>;
 
+#[derive(Clone)]
 pub struct Replace {
     pub to_replace: String,
     pub replace_with: String,
 }
 
+#[derive(Clone)]
 pub struct GrabberConfig {
     pub xpath_title: Vec<String>,
     pub xpath_author: Vec<String>,
@@ -32,33 +34,36 @@ pub struct GrabberConfig {
 }
 
 impl GrabberConfig {
-
     pub fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
         // create data dir if it doesn't already exist
         std::fs::DirBuilder::new()
             .recursive(true)
             .create(&directory)
             .context(ConfigErrorKind::IO)?;
-
+
         let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
-        let mut collection: collections::HashMap<String, GrabberConfig> = collections::HashMap::new();
+        let mut collection: collections::HashMap<String, GrabberConfig> =
+            collections::HashMap::new();
 
         for path in paths {
            if let Ok(path) = path {
                 if let Some(extension) = path.path().extension() {
-                    if let Some(extension) = extension.to_str() {
-                        if extension == "txt" {
+                    if let Some(extension) = extension.to_str() {
+                        if extension == "txt" {
                             if let Ok(config) = GrabberConfig::new(path.path()) {
-                                collection.insert(path.file_name().to_string_lossy().into_owned(), config);
+                                collection.insert(
+                                    path.file_name().to_string_lossy().into_owned(),
+                                    config,
+                                );
                             }
-                        }
-                    }
+                        }
+                    }
                 }
             }
         }
 
-        Ok(collection)
+        Ok(collection)
     }
 
     fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
@@ -99,47 +104,45 @@ impl GrabberConfig {
         let mut iterator = buffer.lines().peekable();
         while let Some(Ok(line)) = iterator.next() {
             let line = line.trim();
-            if line.starts_with("#")
-                || line.starts_with(tidy)
-                || line.starts_with(prune)
-                || line.starts_with(test_url)
-                || line.starts_with(autodetect)
-                || line.is_empty() {
+            if line.starts_with("#")
+                || line.starts_with(tidy)
+                || line.starts_with(prune)
+                || line.starts_with(test_url)
+                || line.starts_with(autodetect)
+                || line.is_empty()
+            {
                 continue;
             }
+            extract_vec_multi!(line, title, xpath_title);
+            extract_vec_multi!(line, body, xpath_body);
+            extract_vec_multi!(line, date, xpath_date);
+            extract_vec_multi!(line, author, xpath_author);
 
-            extract_vec_multi!(line, title, xpath_title);
-            extract_vec_multi!(line, body, xpath_body);
-            extract_vec_multi!(line, date, xpath_date);
-            extract_vec_multi!(line, author, xpath_author);
+            extract_vec_single!(line, strip, xpath_strip);
+            extract_vec_single!(line, strip_id, strip_id_or_class);
+            extract_vec_single!(line, strip_img, strip_image_src);
 
-            extract_vec_single!(line, strip, xpath_strip);
-            extract_vec_single!(line, strip_id, strip_id_or_class);
-            extract_vec_single!(line, strip_img, strip_image_src);
-
-            extract_option_single!(line, single_page, single_page_link);
-            extract_option_single!(line, next_page, next_page_link);
+            extract_option_single!(line, single_page, single_page_link);
+            extract_option_single!(line, next_page, next_page_link);
 
             if line.starts_with(replace_single) {
                 let value = GrabberConfig::extract_value(replace_single, line);
                 let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
-                if value.len() != 2{
+                if value.len() != 2 {
                     continue;
                 }
 
                 if let Some(to_replace) = value.get(0) {
                     if let Some(replace_with) = value.get(1) {
-                        replace_vec.push(
-                            Replace {
-                                to_replace: to_replace.to_string(),
-                                replace_with: replace_with.to_string(),
-                            }
-                        );
+                        replace_vec.push(Replace {
+                            to_replace: to_replace.to_string(),
+                            replace_with: replace_with.to_string(),
+                        });
                     }
                 }
 
-                continue;
+                continue;
             }
 
             if line.starts_with(find) {
@@ -155,12 +158,12 @@ impl GrabberConfig {
                 replace_vec.push(r);
             }
 
-            continue;
+            continue;
         }
     }
 
     if xpath_body.len() == 0 {
-        error!("No body xpath found for {}", config_path.display());
+        warn!("No body xpath found for {}", config_path.display());
         Err(ConfigErrorKind::BadConfig)?
     }
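`parse_directory` keys the resulting ConfigCollection by config file name. A hypothetical caller (directory and file name are illustrative):

    use std::path::PathBuf;

    fn load_configs() -> Result<(), ConfigError> {
        let configs = GrabberConfig::parse_directory(&PathBuf::from("./ftr-site-config"))?;
        if let Some(config) = configs.get("example.com.txt") {
            println!("{} body xpath rule(s)", config.xpath_body.len());
        }
        Ok(())
    }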
diff --git a/src/error.rs b/src/error.rs
index 29bf4ca..71a287c 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -1,4 +1,4 @@
-use failure::{Context, Fail, Backtrace, Error};
+use failure::{Backtrace, Context, Error, Fail};
 use std::fmt;
 
 #[derive(Debug)]
@@ -50,7 +50,9 @@ impl ScraperError {
 
 impl From<ScraperErrorKind> for ScraperError {
     fn from(kind: ScraperErrorKind) -> ScraperError {
-        ScraperError { inner: Context::new(kind) }
+        ScraperError {
+            inner: Context::new(kind),
+        }
     }
 }
 
@@ -62,6 +64,8 @@ impl From<Context<ScraperErrorKind>> for ScraperError {
 
 impl From<Error> for ScraperError {
     fn from(_: Error) -> ScraperError {
-        ScraperError { inner: Context::new(ScraperErrorKind::Unknown) }
+        ScraperError {
+            inner: Context::new(ScraperErrorKind::Unknown),
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/images/error.rs b/src/images/error.rs
index e2f0115..49b664e 100644
--- a/src/images/error.rs
+++ b/src/images/error.rs
@@ -1,6 +1,6 @@
-use failure::{Context, Fail, Backtrace, Error};
-use std::fmt;
 use super::super::ScraperErrorKind;
+use failure::{Backtrace, Context, Error, Fail};
+use std::fmt;
 
 #[derive(Debug)]
 pub struct ImageDownloadError {
@@ -55,7 +55,9 @@ impl ImageDownloadError {
 
 impl From<ImageDownloadErrorKind> for ImageDownloadError {
     fn from(kind: ImageDownloadErrorKind) -> ImageDownloadError {
-        ImageDownloadError { inner: Context::new(kind) }
+        ImageDownloadError {
+            inner: Context::new(kind),
+        }
     }
 }
 
@@ -72,12 +74,16 @@ impl From<ScraperErrorKind> for ImageDownloadError {
             _ => ImageDownloadErrorKind::Unknown,
         };
 
-        ImageDownloadError { inner: Context::new(kind) }
+        ImageDownloadError {
+            inner: Context::new(kind),
+        }
     }
 }
 
 impl From<Error> for ImageDownloadError {
     fn from(_: Error) -> ImageDownloadError {
-        ImageDownloadError { inner: Context::new(ImageDownloadErrorKind::Unknown) }
+        ImageDownloadError {
+            inner: Context::new(ImageDownloadErrorKind::Unknown),
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/images/mod.rs b/src/images/mod.rs
index 33f2dc9..c3326af 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -1,22 +1,16 @@
-use reqwest;
-use log::{
-    error,
-    debug,
-};
-use libxml::parser::Parser;
-use libxml::xpath::Context;
-use libxml::tree::{
-    Node,
-    SaveOptions,
-};
-use url;
-use failure::ResultExt;
-use std::error::Error;
 use self::error::{ImageDownloadError, ImageDownloadErrorKind};
-use base64;
-use std;
-use image;
 use crate::ArticleScraper;
+use base64;
+use failure::ResultExt;
+use image;
+use libxml::parser::Parser;
+use libxml::tree::{Node, SaveOptions};
+use libxml::xpath::Context;
+use log::{debug, error};
+use reqwest;
+use std;
+use std::error::Error;
+use url;
 
 mod error;
 
@@ -26,7 +20,6 @@ pub struct ImageDownloader {
 }
 
 impl ImageDownloader {
-
     pub fn new(max_size: (u32, u32)) -> ImageDownloader {
         ImageDownloader {
             client: reqwest::Client::new(),
@@ -34,7 +27,10 @@ impl ImageDownloader {
         }
     }
 
-    pub async fn download_images_from_string(&self, html: &str) -> Result<String, ImageDownloadError> {
+    pub async fn download_images_from_string(
+        &self,
+        html: &str,
+    ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
             error!("Failed to parse HTML string");
@@ -61,7 +57,10 @@ impl ImageDownloader {
         Ok(doc.to_string_with_options(options))
     }
 
-    pub async fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {
+    pub async fn download_images_from_context(
+        &self,
+        context: &Context,
+    ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
             .context(ImageDownloadErrorKind::HtmlParse)?;
@@ -74,7 +73,9 @@ impl ImageDownloader {
                     Err(_) => None,
                 };
 
-                if let Ok((small_image, big_image)) = self.save_image(&url, &parent_url).await {
+                if let Ok((small_image, big_image)) =
+                    self.save_image(&url, &parent_url).await
+                {
                     if let Err(_) = node.set_property("src", &small_image) {
                         return Err(ImageDownloadErrorKind::HtmlParse)?;
                     }
@@ -92,17 +93,27 @@ impl ImageDownloader {
         Ok(())
     }
 
-    async fn save_image(&self, image_url: &url::Url, parent_url: &Option<url::Url>) -> Result<(String, Option<String>), ImageDownloadError> {
-
-        let response = self.client.get(image_url.clone()).send().await.map_err(|err| {
-            error!("GET {} failed - {}", image_url.as_str(), err.description());
-            err
-        }).context(ImageDownloadErrorKind::Http)?;
+    async fn save_image(
+        &self,
+        image_url: &url::Url,
+        parent_url: &Option<url::Url>,
+    ) -> Result<(String, Option<String>), ImageDownloadError> {
+        let response = self
+            .client
+            .get(image_url.clone())
+            .send()
+            .await
+            .map_err(|err| {
+                error!("GET {} failed - {}", image_url.as_str(), err.description());
+                err
+            })
+            .context(ImageDownloadErrorKind::Http)?;
 
         let content_type_small = ImageDownloader::check_image_content_type(&response)?;
-        let content_type_small = content_type_small.to_str()
+        let content_type_small = content_type_small
+            .to_str()
             .context(ImageDownloadErrorKind::ContentType)?;
-        let mut content_type_big : Option<String> = None;
+        let mut content_type_big: Option<String> = None;
 
         let mut small_image = response
             .bytes()
@@ -110,21 +121,29 @@ impl ImageDownloader {
             .context(ImageDownloadErrorKind::IO)?
             .as_ref()
             .to_vec();
-
-        let mut big_image : Option<Vec<u8>> = None;
+
+        let mut big_image: Option<Vec<u8>> = None;
 
         if let Some(parent_url) = parent_url {
-            let response_big = self.client.get(parent_url.clone()).send().await
-                .context(ImageDownloadErrorKind::Http)?;
-            content_type_big = Some(ImageDownloader::check_image_content_type(&response_big)?
-                .to_str()
-                .context(ImageDownloadErrorKind::ContentType)?
-                .to_owned());
-            big_image = Some(response_big
-                .bytes()
+            let response_big = self
+                .client
+                .get(parent_url.clone())
+                .send()
                 .await
-                .context(ImageDownloadErrorKind::IO)?
-                .to_vec());
+                .context(ImageDownloadErrorKind::Http)?;
+            content_type_big = Some(
+                ImageDownloader::check_image_content_type(&response_big)?
+                    .to_str()
+                    .context(ImageDownloadErrorKind::ContentType)?
+                    .to_owned(),
+            );
+            big_image = Some(
+                response_big
+                    .bytes()
+                    .await
+                    .context(ImageDownloadErrorKind::IO)?
+                    .to_vec(),
+                );
         }
 
         if content_type_small != "image/svg+xml" && content_type_small != "image/gif" {
@@ -135,74 +154,94 @@ impl ImageDownloader {
                     big_image = Some(original_image);
                     content_type_big = Some(content_type_small.to_owned());
                 }
-            }
-            else {
+            } else {
                 small_image = original_image;
             }
         }
-
+
         let small_image_base64 = base64::encode(&small_image);
         let big_image_base64 = match big_image {
             Some(big_image) => Some(base64::encode(&big_image)),
             None => None,
         };
-        let small_image_string = format!("data:{};base64,{}", content_type_small, small_image_base64);
+        let small_image_string =
+            format!("data:{};base64,{}", content_type_small, small_image_base64);
         let big_image_string = match big_image_base64 {
             Some(big_image_base64) => {
-                let content_type_big = content_type_big.ok_or(ImageDownloadErrorKind::ParentDownload)
+                let content_type_big = content_type_big
+                    .ok_or(ImageDownloadErrorKind::ParentDownload)
                    .map_err(|err| {
                         debug!("content_type_big should not be None when a big image exists");
                         err
                     })?;
-                Some(format!("data:{};base64,{}", content_type_big, big_image_base64))
-            },
+                Some(format!(
+                    "data:{};base64,{}",
+                    content_type_big, big_image_base64
+                ))
+            }
             None => None,
         };
 
         Ok((small_image_string, big_image_string))
     }
 
-    fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
-
+    fn check_image_content_type(
+        response: &reqwest::Response,
+    ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
-                if content_type.to_str().context(ImageDownloadErrorKind::ContentType)?.contains("image") {
-                    return Ok(content_type.clone())
+                if content_type
+                    .to_str()
+                    .context(ImageDownloadErrorKind::ContentType)?
+                    .contains("image")
+                {
+                    return Ok(content_type.clone());
                 }
             }
 
             error!("{} is not an image", response.url());
-            return Err(ImageDownloadErrorKind::ContentType)?
+            return Err(ImageDownloadErrorKind::ContentType)?;
        }
 
         Err(ImageDownloadErrorKind::Http)?
     }
 
-    fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Result<(Vec<u8>, Option<Vec<u8>>), ImageDownloadError> {
-        let mut original_image : Vec<u8> = Vec::new();
-        let mut resized_image : Option<Vec<u8>> = None;
+    fn scale_image(
+        image_buffer: &[u8],
+        max_dimensions: (u32, u32),
+    ) -> Result<(Vec<u8>, Option<Vec<u8>>), ImageDownloadError> {
+        let mut original_image: Vec<u8> = Vec::new();
+        let mut resized_image: Option<Vec<u8>> = None;
 
         let mut image = image::load_from_memory(image_buffer)
             .map_err(|err| {
                 error!("Failed to open image to resize");
                 err
-            }).context(ImageDownloadErrorKind::ImageScale)?;
-
-        image.write_to(&mut original_image, image::ImageOutputFormat::PNG)
+            })
+            .context(ImageDownloadErrorKind::ImageScale)?;
+
+        image
+            .write_to(&mut original_image, image::ImageOutputFormat::PNG)
             .map_err(|err| {
                 error!("Failed to save resized image to resize");
                 err
-            }).context(ImageDownloadErrorKind::ImageScale)?;
+            })
+            .context(ImageDownloadErrorKind::ImageScale)?;
 
         let dimensions = Self::get_image_dimensions(&image);
-        if dimensions.0 > max_dimensions.0
-            || dimensions.1 > max_dimensions.1 {
-            image = image.resize(max_dimensions.0, max_dimensions.1, image::FilterType::Lanczos3);
-            let mut resized_buf : Vec<u8> = Vec::new();
-            image.write_to(&mut resized_buf, image::ImageOutputFormat::PNG)
+        if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
+            image = image.resize(
+                max_dimensions.0,
+                max_dimensions.1,
+                image::FilterType::Lanczos3,
+            );
+            let mut resized_buf: Vec<u8> = Vec::new();
+            image
+                .write_to(&mut resized_buf, image::ImageOutputFormat::PNG)
                .map_err(|err| {
                     error!("Failed to save resized image to resize");
                     err
-                }).context(ImageDownloadErrorKind::ImageScale)?;
+                })
+                .context(ImageDownloadErrorKind::ImageScale)?;
             resized_image = Some(resized_buf);
         }
@@ -220,22 +259,40 @@ impl ImageDownloader {
         }
     }
 
-    async fn check_image_parent(&self, node: &Node, child_url: &url::Url) -> Result<url::Url, ImageDownloadError> {
+    async fn check_image_parent(
+        &self,
+        node: &Node,
+        child_url: &url::Url,
+    ) -> Result<url::Url, ImageDownloadError> {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
-                    let parent_url = url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self.client.head(parent_url.clone()).send().await.context(ImageDownloadErrorKind::ParentDownload)?;
-                    let _ = ImageDownloader::check_image_content_type(&parent_response).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self.client.get(child_url.clone()).send().await.context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_length = Self::get_content_lenght(&parent_response).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_length = Self::get_content_lenght(&child_response).context(ImageDownloadErrorKind::ParentDownload)?;
+                    let parent_url =
+                        url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
+                    let parent_response = self
+                        .client
+                        .head(parent_url.clone())
+                        .send()
+                        .await
+                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                    let _ = ImageDownloader::check_image_content_type(&parent_response)
+                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                    let child_response = self
+                        .client
+                        .get(child_url.clone())
+                        .send()
+                        .await
+                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                    let parent_length = Self::get_content_lenght(&parent_response)
+                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                    let child_length = Self::get_content_lenght(&child_response)
+                        .context(ImageDownloadErrorKind::ParentDownload)?;
 
                     if parent_length > child_length {
-                        return Ok(parent_url)
+                        return Ok(parent_url);
                     }
 
-                    return Ok(child_url.clone())
+                    return Ok(child_url.clone());
                 }
             }
         }
@@ -249,7 +306,7 @@ impl ImageDownloader {
         if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
             if let Ok(content_length) = content_length.to_str() {
                 if let Ok(content_length) = content_length.parse::<u64>() {
-                    return Ok(content_length)
+                    return Ok(content_length);
                 }
             }
         }
@@ -258,7 +315,6 @@ impl ImageDownloader {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -270,11 +326,14 @@ mod tests {
         let image_dowloader = ImageDownloader::new((2048, 2048));
         let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
-        let result = image_dowloader.download_images_from_string(&hdyleaflet)
+        let result = image_dowloader
+            .download_images_from_string(&hdyleaflet)
             .await
             .expect("Failed to downalod images");
-        let mut file = fs::File::create(r"./resources/tests/planetGnome/fedora31_images_downloaded.html")
-            .expect("Failed to create output file");
-        file.write_all(result.as_bytes()).expect("Failed to write result to file");
+        let mut file =
+            fs::File::create(r"./resources/tests/planetGnome/fedora31_images_downloaded.html")
+                .expect("Failed to create output file");
+        file.write_all(result.as_bytes())
+            .expect("Failed to write result to file");
     }
-}
\ No newline at end of file
+}
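Since `download_images_from_string` is now async, callers need an executor; a minimal sketch of driving it from a binary (the tokio runtime and file names are assumptions, not part of this patch):

    #[tokio::main]
    async fn main() {
        let downloader = ImageDownloader::new((2048, 2048));
        let html = std::fs::read_to_string("input.html").expect("Failed to read input");
        let embedded = downloader
            .download_images_from_string(&html)
            .await
            .expect("Failed to download images");
        std::fs::write("output.html", embedded).expect("Failed to write output");
    }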
diff --git a/src/lib.rs b/src/lib.rs
index e3c7c22..48c67b8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,69 +1,78 @@
+mod article;
 mod config;
 mod error;
-mod article;
 pub mod images;
 
-use reqwest;
-use url;
-use regex;
-use log::{
-    error,
-    debug,
-    info,
-    warn,
-};
+use self::error::{ScraperError, ScraperErrorKind};
 use crate::article::Article;
-use libxml::parser::Parser;
-use libxml::xpath::Context;
-use libxml::tree::{
-    Document,
-    Node,
-    SaveOptions,
-};
-use std::path::PathBuf;
-use std::ops::Index;
-use failure::ResultExt;
-use std::error::Error;
-use crate::config::{
-    GrabberConfig,
-    ConfigCollection
-};
-use encoding_rs::{
-    Encoding,
-};
-use chrono::NaiveDateTime;
-use std::str::FromStr;
+use crate::config::{ConfigCollection, GrabberConfig};
 use crate::images::ImageDownloader;
-use self::error::{
-    ScraperError,
-    ScraperErrorKind
-};
-
+use chrono::NaiveDateTime;
+use encoding_rs::Encoding;
+use failure::ResultExt;
+use libxml::parser::Parser;
+use libxml::tree::{Document, Node, SaveOptions};
+use libxml::xpath::Context;
+use log::{debug, error, info, warn};
+use regex;
+use reqwest;
+use std::collections;
+use std::error::Error;
+use std::path::PathBuf;
+use std::str::FromStr;
+use std::sync::{Arc, RwLock};
+use std::thread;
+use url;
 
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
-    config_files: ConfigCollection,
+    config_files: Arc<RwLock<Option<ConfigCollection>>>,
     client: reqwest::Client,
 }
 
 impl ArticleScraper {
     pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+        let config_files = Arc::new(RwLock::new(None));
 
-        let config_files = GrabberConfig::parse_directory(&config_path).context(ScraperErrorKind::Config)?;
+        let locked_config_files = config_files.clone();
+        thread::spawn(move || {
+            if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
+                locked_config_files
+                    .write()
+                    .expect("Failed to lock config file cache")
+                    .replace(config_files);
+            } else {
+                locked_config_files
+                    .write()
+                    .expect("Failed to lock config file cache")
+                    .replace(collections::HashMap::new());
+            }
+        });
 
         Ok(ArticleScraper {
             image_downloader: ImageDownloader::new((2048, 2048)),
-            config_files: config_files,
+            config_files,
             client: reqwest::Client::new(),
         })
     }
 
-    pub async fn parse(&self, url: url::Url, download_images: bool) -> Result<Article, ScraperError> {
-
+    pub async fn parse(
+        &self,
+        url: url::Url,
+        download_images: bool,
+    ) -> Result<Article, ScraperError> {
         info!("Scraping article: {}", url.as_str());
-        let response = self.client.head(url.clone()).send().await
+        let response = self
+            .client
+            .head(url.clone())
+            .send()
+            .await
             .map_err(|err| {
-                error!("Failed head request to: {} - {}", url.as_str(), err.description());
+                error!(
+                    "Failed head request to: {} - {}",
+                    url.as_str(),
+                    err.description()
+                );
                 err
             })
             .context(ScraperErrorKind::Http)?;
@@ -77,7 +86,7 @@ impl ArticleScraper {
 
         // check if we are dealing with text/html
         if !ArticleScraper::check_content_type(&response)? {
-            return Err(ScraperErrorKind::ContentType)?
+            return Err(ScraperErrorKind::ContentType)?;
         }
 
         // check if we have a config for the url
@@ -91,19 +100,16 @@ impl ArticleScraper {
             html: None,
         };
 
-        let mut document = Document::new().map_err(|()| {
-            ScraperErrorKind::Xml
-        })?;
+        let mut document = Document::new().map_err(|()| ScraperErrorKind::Xml)?;
 
-        let mut root = Node::new("article", None, &document).map_err(|()| {
-            ScraperErrorKind::Xml
-        })?;
+        let mut root = Node::new("article", None, &document).map_err(|()| ScraperErrorKind::Xml)?;
 
         document.set_root_element(&root);
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, config).await?;
+        self.parse_pages(&mut article, &url, &mut root, &config)
+            .await?;
 
         let context = Context::new(&document).map_err(|()| {
             error!("Failed to create xpath context for extracted article");
@@ -112,16 +118,20 @@ impl ArticleScraper {
 
         if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) {
             error!("Preventing self closing tags failed - {}", error);
-            return Err(error)
+            return Err(error);
         }
 
         if let Err(error) = ArticleScraper::eliminate_noscrip_tag(&context) {
             error!("Eliminating