From d2960d853924a3f28ede8547c99200b77d6b0d3c Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Mon, 10 Feb 2020 18:01:35 +0100
Subject: [PATCH] require client for parsing

---
 src/images/mod.rs | 40 +++++++++++++++------------------------
 src/lib.rs        | 39 ++++++++++++++++-----------------------
 2 files changed, 31 insertions(+), 48 deletions(-)

diff --git a/src/images/mod.rs b/src/images/mod.rs
index 42d1a03..b60b1b0 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -15,29 +15,18 @@ use url;
 mod error;
 
 pub struct ImageDownloader {
-    client: Client,
     max_size: (u32, u32),
 }
 
 impl ImageDownloader {
     pub fn new(max_size: (u32, u32)) -> Self {
-        Self::new_with_client(max_size, Client::new())
-    }
-
-    pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
-        ImageDownloader {
-            client,
-            max_size,
-        }
-    }
-
-    pub fn set_client(&mut self, client: Client) {
-        self.client = client;
+        ImageDownloader { max_size }
     }
 
     pub async fn download_images_from_string(
         &self,
         html: &str,
+        client: &Client,
     ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
@@ -50,7 +39,8 @@ impl ImageDownloader {
             ImageDownloadErrorKind::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx).await?;
+        self.download_images_from_context(&xpath_ctx, client)
+            .await?;
 
         let options = SaveOptions {
             format: false,
@@ -68,6 +58,7 @@ impl ImageDownloader {
     pub async fn download_images_from_context(
         &self,
         context: &Context,
+        client: &Client,
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
@@ -76,13 +67,13 @@ impl ImageDownloader {
             if let Some(url) = node.get_property("src") {
                 if !url.starts_with("data:") {
                     if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url).await {
+                        let parent_url = match self.check_image_parent(&node, &url, client).await {
                             Ok(url) => Some(url),
                             Err(_) => None,
                         };
 
                         if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url).await
+                            self.save_image(&url, &parent_url, client).await
                         {
                             if let Err(_) = node.set_property("src", &small_image) {
                                 return Err(ImageDownloadErrorKind::HtmlParse)?;
@@ -105,9 +96,9 @@ impl ImageDownloader {
         &self,
         image_url: &url::Url,
         parent_url: &Option<url::Url>,
+        client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = self
-            .client
+        let response = client
             .get(image_url.clone())
             .send()
             .await
@@ -133,8 +124,7 @@ impl ImageDownloader {
         let mut big_image: Option<Vec<u8>> = None;
 
         if let Some(parent_url) = parent_url {
-            let response_big = self
-                .client
+            let response_big = client
                 .get(parent_url.clone())
                 .send()
                 .await
@@ -271,22 +261,21 @@ impl ImageDownloader {
         &self,
         node: &Node,
         child_url: &url::Url,
+        client: &Client,
     ) -> Result<url::Url, ImageDownloadError> {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
                     let parent_url = url::Url::parse(&url)
                         .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self
-                        .client
+                    let parent_response = client
                         .head(parent_url.clone())
                         .send()
                         .await
                         .context(ImageDownloadErrorKind::ParentDownload)?;
                     let _ = ImageDownloader::check_image_content_type(&parent_response)
                         .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self
-                        .client
+                    let child_response = client
                         .get(child_url.clone())
                         .send()
                         .await
@@ -326,6 +315,7 @@ impl ImageDownloader {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
     use std::fs;
     use std::io::Write;
 
@@ -335,7 +325,7 @@ mod tests {
         let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
         let result = image_dowloader
-            .download_images_from_string(&hdyleaflet)
+            .download_images_from_string(&hdyleaflet, &Client::new())
             .await
             .expect("Failed to downalod images");
         let mut file =
diff --git a/src/lib.rs b/src/lib.rs
index 2043bda..9f4155a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,15 +27,10 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: Client,
 }
 
 impl ArticleScraper {
     pub fn new(config_path: PathBuf) -> Self {
-        Self::new_with_client(config_path, Client::new())
-    }
-
-    pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
         let config_files = Arc::new(RwLock::new(None));
 
         let locked_config_files = config_files.clone();
@@ -54,25 +49,19 @@ impl ArticleScraper {
         });
 
         ArticleScraper {
-            image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
+            image_downloader: ImageDownloader::new((2048, 2048)),
             config_files,
-            client,
         }
     }
 
-    pub fn set_client(&mut self, client: Client) {
-        self.client = client.clone();
-        self.image_downloader.set_client(client);
-    }
-
     pub async fn parse(
         &self,
         url: url::Url,
         download_images: bool,
+        client: &Client,
    ) -> Result<Article, ScraperError> {
         info!("Scraping article: '{}'", url.as_str());
-        let response = self
-            .client
+        let response = client
             .head(url.clone())
             .send()
             .await
@@ -117,7 +106,7 @@ impl ArticleScraper {
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, &config)
+        self.parse_pages(&mut article, &url, &mut root, &config, client)
             .await?;
 
         let context = Context::new(&document).map_err(|()| {
@@ -138,7 +127,7 @@ impl ArticleScraper {
         if download_images {
             if let Err(error) = self
                 .image_downloader
-                .download_images_from_context(&context)
+                .download_images_from_context(&context, client)
                 .await
             {
                 error!("Downloading images failed: '{}'", error);
@@ -168,8 +157,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let mut document = Self::parse_html(html, config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -183,9 +173,10 @@ impl ArticleScraper {
             if !result.trim().is_empty() {
                 // parse again with single page url
                 debug!("Single page link found '{}'", result);
-                let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
+                let single_page_url =
+                    url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                 return self
-                    .parse_single_page(article, &single_page_url, root, config)
+                    .parse_single_page(article, &single_page_url, root, config, client)
                     .await;
             }
         }
@@ -197,7 +188,7 @@ impl ArticleScraper {
 
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, &self.client).await?;
+                let html = ArticleScraper::download(&url, client).await?;
                 document = Self::parse_html(html, config)?;
                 xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
@@ -261,8 +252,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let document = Self::parse_html(html, config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
@@ -811,6 +803,7 @@ impl ArticleScraper {
 #[cfg(test)]
 mod tests {
     use crate::*;
+    use reqwest::Client;
 
     #[tokio::test(basic_scheduler)]
     async fn golem() {
@@ -819,7 +812,7 @@ mod tests {
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
         let grabber = ArticleScraper::new(config_path);
-        let article = grabber.parse(url, true).await.unwrap();
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(
@@ -841,7 +834,7 @@ mod tests {
             .unwrap();
 
         let grabber = ArticleScraper::new(config_path);
-        let article = grabber.parse(url, true).await.unwrap();
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(