Merge branch 'master' of gitlab.com:news-flash/article_scraper

2025-07-07 16:15:32 +02:00 · 2020-04-28 02:34:24 +02:00 · 2020-04-28 02:34:24 +02:00 · 1fbce6413d
commit 1fbce6413d
parent f6d021b67b d2960d8539
3 changed files with 64 additions and 66 deletions
--- a/src/article.rs
+++ b/src/article.rs
@ -19,7 +19,7 @@ impl Article {
        if let Some(ref html) = self.html {
            if let Ok(()) = std::fs::create_dir_all(&path) {
                let mut file_name = match self.title.clone() {
-                    Some(file_name) => file_name,
+                    Some(file_name) => file_name.replace("/", "_"),
                    None => "Unknown Title".to_owned(),
                };
                file_name.push_str(".html");
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@ -7,7 +7,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
-use reqwest;
+use reqwest::{Client, Response};
 use std;
 use std::error::Error;
 use url;
@ -15,21 +15,18 @@ use url;
 mod error;

 pub struct ImageDownloader {
-    client: reqwest::Client,
    max_size: (u32, u32),
 }

 impl ImageDownloader {
-    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
-        ImageDownloader {
-            client: reqwest::Client::new(),
-            max_size: max_size,
-        }
+    pub fn new(max_size: (u32, u32)) -> Self {
+        ImageDownloader { max_size }
    }

    pub async fn download_images_from_string(
        &self,
        html: &str,
+        client: &Client,
    ) -> Result<String, ImageDownloadError> {
        let parser = Parser::default_html();
        let doc = parser.parse_string(html).map_err(|_| {
@ -42,7 +39,8 @@ impl ImageDownloader {
            ImageDownloadErrorKind::HtmlParse
        })?;

-        self.download_images_from_context(&xpath_ctx).await?;
+        self.download_images_from_context(&xpath_ctx, client)
+            .await?;

        let options = SaveOptions {
            format: false,
@ -60,6 +58,7 @@ impl ImageDownloader {
    pub async fn download_images_from_context(
        &self,
        context: &Context,
+        client: &Client,
    ) -> Result<(), ImageDownloadError> {
        let xpath = "//img";
        let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
@ -68,13 +67,13 @@ impl ImageDownloader {
            if let Some(url) = node.get_property("src") {
                if !url.starts_with("data:") {
                    if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url).await {
+                        let parent_url = match self.check_image_parent(&node, &url, client).await {
                            Ok(url) => Some(url),
                            Err(_) => None,
                        };

                        if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url).await
+                            self.save_image(&url, &parent_url, client).await
                        {
                            if let Err(_) = node.set_property("src", &small_image) {
                                return Err(ImageDownloadErrorKind::HtmlParse)?;
@ -97,9 +96,9 @@ impl ImageDownloader {
        &self,
        image_url: &url::Url,
        parent_url: &Option<url::Url>,
+        client: &Client,
    ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = self
-            .client
+        let response = client
            .get(image_url.clone())
            .send()
            .await
@ -125,8 +124,7 @@ impl ImageDownloader {
        let mut big_image: Option<Vec<u8>> = None;

        if let Some(parent_url) = parent_url {
-            let response_big = self
-                .client
+            let response_big = client
                .get(parent_url.clone())
                .send()
                .await
@ -185,7 +183,7 @@ impl ImageDownloader {
    }

    fn check_image_content_type(
-        response: &reqwest::Response,
+        response: &Response,
    ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
        if response.status().is_success() {
            if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@ -263,22 +261,21 @@ impl ImageDownloader {
        &self,
        node: &Node,
        child_url: &url::Url,
+        client: &Client,
    ) -> Result<url::Url, ImageDownloadError> {
        if let Some(parent) = node.get_parent() {
            if parent.get_name() == "a" {
                if let Some(url) = parent.get_property("href") {
                    let parent_url =
                        url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self
-                        .client
+                    let parent_response = client
                        .head(parent_url.clone())
                        .send()
                        .await
                        .context(ImageDownloadErrorKind::ParentDownload)?;
                    let _ = ImageDownloader::check_image_content_type(&parent_response)
                        .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self
-                        .client
+                    let child_response = client
                        .get(child_url.clone())
                        .send()
                        .await
@ -301,7 +298,7 @@ impl ImageDownloader {
        Err(ImageDownloadErrorKind::ParentDownload)?
    }

-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
        if response.status().is_success() {
            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
                if let Ok(content_length) = content_length.to_str() {
@ -318,6 +315,7 @@ impl ImageDownloader {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use reqwest::Client;
    use std::fs;
    use std::io::Write;

@ -327,7 +325,7 @@ mod tests {
        let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
            .expect("Failed to read HTML");
        let result = image_dowloader
-            .download_images_from_string(&hdyleaflet)
+            .download_images_from_string(&hdyleaflet, &Client::new())
            .await
            .expect("Failed to downalod images");
        let mut file =
--- a/src/lib.rs
+++ b/src/lib.rs
@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use regex;
-use reqwest;
+use reqwest::{Client, Response};
 use std::collections;
 use std::error::Error;
 use std::path::PathBuf;
@ -27,11 +27,10 @@ use url;
 pub struct ArticleScraper {
    pub image_downloader: ImageDownloader,
    config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: reqwest::Client,
 }

 impl ArticleScraper {
-    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf) -> Self {
        let config_files = Arc::new(RwLock::new(None));

        let locked_config_files = config_files.clone();
@ -49,21 +48,20 @@ impl ArticleScraper {
            }
        });

-        Ok(ArticleScraper {
+        ArticleScraper {
            image_downloader: ImageDownloader::new((2048, 2048)),
            config_files,
-            client: reqwest::Client::new(),
-        })
+        }
    }

    pub async fn parse(
        &self,
        url: url::Url,
        download_images: bool,
+        client: &Client,
    ) -> Result<Article, ScraperError> {
        info!("Scraping article: '{}'", url.as_str());
-        let response = self
-            .client
+        let response = client
            .head(url.clone())
            .send()
            .await
@ -108,7 +106,7 @@ impl ArticleScraper {

        ArticleScraper::generate_head(&mut root, &document)?;

-        self.parse_pages(&mut article, &url, &mut root, &config)
+        self.parse_pages(&mut article, &url, &mut root, &config, client)
            .await?;

        let context = Context::new(&document).map_err(|()| {
@ -121,15 +119,15 @@ impl ArticleScraper {
            return Err(error);
        }

-        if let Err(error) = ArticleScraper::eliminate_noscrip_tag(&context) {
-            error!("Eliminating <noscript> tag failed - '{}'", error);
-            return Err(error);
-        }
+        // if let Err(error) = ArticleScraper::eliminate_noscript_tag(&context) {
+        //     error!("Eliminating <noscript> tag failed - {}", error);
+        //     return Err(error)
+        // }

        if download_images {
            if let Err(error) = self
                .image_downloader
-                .download_images_from_context(&context)
+                .download_images_from_context(&context, client)
                .await
            {
                error!("Downloading images failed: '{}'", error);
@ -159,8 +157,9 @@ impl ArticleScraper {
        url: &url::Url,
        root: &mut Node,
        config: &GrabberConfig,
+        client: &Client,
    ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
        let mut document = Self::parse_html(html, config)?;
        let mut xpath_ctx = Self::get_xpath_ctx(&document)?;

@ -174,9 +173,10 @@ impl ArticleScraper {
                if !result.trim().is_empty() {
                    // parse again with single page url
                    debug!("Single page link found '{}'", result);
-                    let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
+                    let single_page_url =
+                        url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                    return self
-                        .parse_single_page(article, &single_page_url, root, config)
+                        .parse_single_page(article, &single_page_url, root, config, client)
                        .await;
                }
            }
@ -188,7 +188,7 @@ impl ArticleScraper {

        loop {
            if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, &self.client).await?;
+                let html = ArticleScraper::download(&url, client).await?;
                document = Self::parse_html(html, config)?;
                xpath_ctx = Self::get_xpath_ctx(&document)?;
                ArticleScraper::strip_junk(&xpath_ctx, config, &url);
@ -252,8 +252,9 @@ impl ArticleScraper {
        url: &url::Url,
        root: &mut Node,
        config: &GrabberConfig,
+        client: &Client,
    ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
        let document = Self::parse_html(html, config)?;
        let xpath_ctx = Self::get_xpath_ctx(&document)?;
        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
@ -263,7 +264,7 @@ impl ArticleScraper {
        Ok(())
    }

-    async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
        let response = client
            .get(url.as_str())
            .send()
@ -373,7 +374,7 @@ impl ArticleScraper {
        }
    }

-    fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
+    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
        if response.status().is_success() {
            if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                if let Ok(content_type) = content_type.to_str() {
@ -391,7 +392,7 @@ impl ArticleScraper {
        Err(ScraperErrorKind::Http)?
    }

-    fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
+    fn check_redirect(response: &Response) -> Option<url::Url> {
        if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
            debug!("Article url redirects to '{}'", response.url().as_str());
            return Some(response.url().clone());
@ -646,7 +647,7 @@ impl ArticleScraper {
        );

        // strip all scripts
-        let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
+        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));

        // strip all comments
        let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@ -782,28 +783,27 @@ impl ArticleScraper {
        Ok(())
    }

-    fn eliminate_noscrip_tag(context: &Context) -> Result<(), ScraperError> {
-        let xpath = "//noscript";
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
-
-        for mut node in node_vec {
-            if let Some(mut parent) = node.get_parent() {
-                node.unlink();
-                let children = node.get_child_nodes();
-                for mut child in children {
-                    child.unlink();
-                    let _ = parent.add_child(&mut child);
-                }
-            }
-        }
-
-        Ok(())
-    }
+    // fn eliminate_noscript_tag(context: &Context) -> Result<(), ScraperError> {
+    //     let xpath = "//noscript";
+    //     let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+    //     for mut node in node_vec {
+    //         if let Some(mut parent) = node.get_parent() {
+    //             node.unlink();
+    //             let children = node.get_child_nodes();
+    //             for mut child in children {
+    //                 child.unlink();
+    //                 let _ = parent.add_child(&mut child);
+    //             }
+    //         }
+    //     }
+    //     Ok(())
+    // }
 }

 #[cfg(test)]
 mod tests {
    use crate::*;
+    use reqwest::Client;

    #[tokio::test(basic_scheduler)]
    async fn golem() {
@ -811,8 +811,8 @@ mod tests {
        let out_path = PathBuf::from(r"./test_output");
        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();

-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
        article.save_html(&out_path).unwrap();

        assert_eq!(
@ -833,8 +833,8 @@ mod tests {
        )
        .unwrap();

-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
        article.save_html(&out_path).unwrap();

        assert_eq!(