mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)

commit c1ae011fcd (parent: 3a6a70ee64)

    use global rules

9 changed files with 209 additions and 150 deletions
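The change in brief: the per-site grabber config becomes optional (Option<&ConfigEntry>) and a mandatory global.txt config is consulted as a fallback throughout src/lib.rs; rule precedence is centralized in a new Util::select_rule helper; the golem.de and phoronix.com site configs (and the golem test) are dropped; and the "config must contain a body xpath" validation (BadConfig) is removed from config parsing, the missing-body check now happening at extraction time instead.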
@@ -1 +1 @@
-Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
+Subproject commit a6beb80d445b8d99542d8a2f9157cec69ea8b767
@@ -1,42 +0,0 @@
-# Author: zinnober
-# Rewrite of original template which fetched the printer-version without pictures
-
-tidy: no
-prune: no
-
-# Set full title
-title: //h1/span
-
-date: //time
-author: //a[@rel='author']
-
-# Content is here
-body: //article
-
-# Fetch full multipage articles
-next_page_link: //a[@id='atoc_next']
-
-# Remove tracking and ads
-strip_id_or_class: iqadtile4
-
-# General Cleanup
-strip_id_or_class: list-jtoc
-strip_id_or_class: table-jtoc
-strip_id_or_class: implied
-strip_id_or_class: social-
-strip_id_or_class: comments
-strip_id_or_class: footer
-strip_id_or_class: job-market
-strip_id_or_class: tags
-
-# Tidy up galleries (could still be improved, though)
-strip: //img[@src='']
-strip: //li[not(*)]
-strip: //div[contains(@style,'margin')]
-strip: //figure[contains(@id,'gvideo')]
-
-
-# Try yourself
-test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
-test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
-
@@ -1,9 +0,0 @@
-# based on the grabber rules of picofeed
-
-title: //article/header
-body: //div[@class="content"]
-test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1
-
-# replace_string(<h5>): <h2>
-
-next_page_link: //a[@title='Go To Next Page']
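(Judging by their test_url entries, these two deleted files are the golem.de and phoronix.com site configs; with this commit, sites without a specific config are handled by the shared rules in global.txt instead of failing outright.)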
@@ -15,17 +15,15 @@ pub struct ConfigCollection {

 impl ConfigCollection {
     pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
         let mut user_entries = HashMap::new();
         let mut embedded_entries = HashMap::new();

         for (file_name, entry) in EmbededConfigFiles::iter()
             .filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
         {
-            if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
+            let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
             let file_name: &str = file_name.borrow();
             embedded_entries.insert(file_name.to_owned(), entry);
-            }
         }

         if let Some(directory) = directory {
@@ -2,7 +2,6 @@ use crate::util::Util;

 use super::error::{ConfigError, ConfigErrorKind};
 use failure::ResultExt;
-use log::warn;
 use std::borrow::Cow;
 use std::io::Cursor;
 use std::path::Path;

@@ -140,11 +139,6 @@ impl ConfigEntry {
         }
     }

-    if xpath_body.is_empty() {
-        warn!("No body xpath found for");
-        return Err(ConfigErrorKind::BadConfig.into());
-    }
-
     let config = ConfigEntry {
         xpath_title,
         xpath_author,
@@ -10,8 +10,6 @@ pub struct ConfigError {
 pub enum ConfigErrorKind {
     #[fail(display = "IO Error")]
     IO,
-    #[fail(display = "Config does not contain body xpath")]
-    BadConfig,
     #[fail(display = "Unknown Error")]
     Unknown,
 }
src/lib.rs (243 changed lines)
@@ -22,6 +22,7 @@ use log::{debug, error, info, warn};
 use reqwest::{Client, Response};
 use std::path::Path;
 use std::str::FromStr;
+use util::Util;

 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
@@ -76,7 +77,11 @@ impl ArticleScraper {
         }

         // check if we have a config for the url
-        let config = self.get_grabber_config(&url)?;
+        let config = self.get_grabber_config(&url);
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or_else(|| ScraperErrorKind::Config)?;

         let mut article = Article {
             title: None,
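Note the error-handling change here: a missing per-site config no longer aborts the parse with `?`; only a missing global.txt is treated as a hard ScraperErrorKind::Config error, making global.txt the one config file the scraper cannot run without.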
@@ -94,7 +99,7 @@ impl ArticleScraper {

         ArticleScraper::generate_head(&mut root, &document)?;

-        self.parse_pages(&mut article, &url, &mut root, &config, client)
+        self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
             .await?;

         let context = Context::new(&document).map_err(|()| {
@@ -139,15 +144,20 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let mut document = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;

         // check for single page link
-        if let Some(xpath_single_page_link) = config.single_page_link.clone() {
+        let rule = Util::select_rule(
+            config.and_then(|c| c.single_page_link.as_deref()),
+            global_config.single_page_link.as_deref(),
+        );
+        if let Some(xpath_single_page_link) = rule {
             debug!(
                 "Single page link xpath specified in config '{}'",
                 xpath_single_page_link
@@ -159,32 +169,49 @@ impl ArticleScraper {
             let single_page_url =
                 url::Url::parse(&result).context(ScraperErrorKind::Url)?;
             return self
-                .parse_single_page(article, &single_page_url, root, config, client)
+                .parse_single_page(
+                    article,
+                    &single_page_url,
+                    root,
+                    config,
+                    global_config,
+                    client,
+                )
                 .await;
             }
         }
     }

-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;

-        while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let html = ArticleScraper::download(&url, client).await?;
-            document = Self::parse_html(html, config)?;
+            document = Self::parse_html(html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
-            ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-            ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+            ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+            ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
         }

         Ok(())
     }

-    fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
+    fn parse_html(
+        html: String,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Result<Document, ScraperError> {
         // replace matches in raw html

         let mut html = html;
-        for replace in &config.replace {
+        if let Some(config) = config {
+            for replace in &config.replace {
+                html = html.replace(&replace.to_replace, &replace.replace_with);
+            }
+        }
+
+        for replace in &global_config.replace {
             html = html.replace(&replace.to_replace, &replace.replace_with);
         }
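Since the site config is now optional, parse_html applies both rule sets to the raw HTML: the site-specific replace rules first (when a config exists), then the global ones. A standalone illustration of that chaining; the rules here are invented, loosely echoing the commented-out replace_string(<h5>): <h2> rule from the deleted phoronix config:

fn main() {
    let mut html = String::from("<h5>Title</h5><div class=\"ad\"></div>");
    // Invented rules for illustration; real rules come from the config files.
    let site_rules = [("<h5>", "<h2>"), ("</h5>", "</h2>")];
    let global_rules = [("<div class=\"ad\"></div>", "")];
    // Site rules run first, then global rules, mirroring parse_html.
    for &(from, to) in site_rules.iter().chain(global_rules.iter()) {
        html = html.replace(from, to);
    }
    assert_eq!(html, "<h2>Title</h2>");
}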
@@ -230,15 +257,16 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let document = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;

         Ok(())
     }
@@ -340,16 +368,17 @@ impl ArticleScraper {
         }
     }

-    fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
-        let config_name = Self::get_host_name(url)? + ".txt";
-
-        match self.config_files.get(&config_name) {
-            Some(config) => Ok(config.clone()),
-            None => {
-                error!("No config file of the name '{}' found", config_name);
-                Err(ScraperErrorKind::Config.into())
-            }
-        }
+    fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> {
+        let conf = Self::get_host_name(url)
+            .ok()
+            .map(|url| url + ".txt")
+            .and_then(|name| self.config_files.get(&name));
+
+        if conf.is_none() {
+            log::warn!("No config found for url '{}'", url);
+        }
+
+        conf
     }

     fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
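For reference, the lookup keys config files by host name. A minimal sketch of the name derivation, assuming get_host_name essentially returns the URL's host (its exact implementation is not part of this diff); it uses the url crate the code already depends on:

fn config_name(url: &url::Url) -> Option<String> {
    // Assumption: get_host_name boils down to the URL's host string.
    url.host_str().map(|host| format!("{}.txt", host))
}

fn main() {
    let url = url::Url::parse("http://www.phoronix.com/scan.php?page=article").unwrap();
    assert_eq!(config_name(&url).as_deref(), Some("www.phoronix.com.txt"));
}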
@@ -575,19 +604,45 @@ impl ArticleScraper {
         Ok(url)
     }

-    fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
+    fn strip_junk(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        url: &url::Url,
+    ) {
         // strip specified xpath
-        for xpath_strip in &config.xpath_strip {
+        if let Some(config) = config {
+            for xpath_strip in &config.xpath_strip {
+                let _ = ArticleScraper::strip_node(&context, xpath_strip);
+            }
+        }
+
+        for xpath_strip in &global_config.xpath_strip {
             let _ = ArticleScraper::strip_node(&context, xpath_strip);
         }

         // strip everything with specified 'id' or 'class'
-        for xpaht_strip_class in &config.strip_id_or_class {
+        if let Some(config) = config {
+            for xpaht_strip_class in &config.strip_id_or_class {
+                let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+            }
+        }
+
+        for xpaht_strip_class in &global_config.strip_id_or_class {
             let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
         }

         // strip any <img> element where @src attribute contains this substring
-        for xpath_strip_img_src in &config.strip_image_src {
+        if let Some(config) = config {
+            for xpath_strip_img_src in &config.strip_image_src {
+                let _ = ArticleScraper::strip_node(
+                    &context,
+                    &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+                );
+            }
+        }
+
+        for xpath_strip_img_src in &global_config.strip_image_src {
             let _ = ArticleScraper::strip_node(
                 &context,
                 &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
@@ -620,9 +675,6 @@ impl ArticleScraper {
             &String::from("//*[contains(@style,'display:none')]"),
         );

-        // strip all scripts
-        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
-
         // strip all comments
         let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@@ -633,34 +685,79 @@ impl ArticleScraper {
         let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
     }

-    fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
+    fn extract_metadata(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        article: &mut Article,
+    ) {
         // try to get title
-        for xpath_title in &config.xpath_title {
-            if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
-                debug!("Article title: '{}'", title);
-                article.title = Some(title);
-                break;
+        if let Some(config) = config {
+            for xpath_title in &config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_title in &global_config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
             }
         }

         // try to get the author
-        for xpath_author in &config.xpath_author {
-            if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
-                debug!("Article author: '{}'", author);
-                article.author = Some(author);
-                break;
+        if let Some(config) = config {
+            for xpath_author in &config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_author in &global_config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
             }
         }

         // try to get the date
-        for xpath_date in &config.xpath_date {
-            if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
-                debug!("Article date: '{}'", date_string);
-                if let Ok(date) = DateTime::from_str(&date_string) {
-                    article.date = Some(date);
-                    break;
-                } else {
-                    warn!("Parsing the date string '{}' failed", date_string);
+        if let Some(config) = config {
+            for xpath_date in &config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_date in &global_config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
                 }
             }
         }
     }
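One thing stands out in the new fallback blocks above: all three are guarded by article.title.is_none(), including the author and date blocks. Field-specific guards look like the intended condition; a minimal model of that (the Article struct here is a stand-in, not the real type):

#[derive(Default)]
struct Article {
    title: Option<String>,
    author: Option<String>,
    date: Option<String>,
}

// Presumably intended guards: fall back per field, not based on the title.
fn apply_global_fallbacks(article: &mut Article, author: Option<String>, date: Option<String>) {
    if article.author.is_none() {
        article.author = author;
    }
    if article.date.is_none() {
        article.date = date;
    }
}

fn main() {
    let mut article = Article {
        author: Some("site author".into()),
        ..Default::default()
    };
    apply_global_fallbacks(&mut article, Some("global author".into()), Some("2019-01-01".into()));
    assert_eq!(article.author.as_deref(), Some("site author"));
    assert_eq!(article.date.as_deref(), Some("2019-01-01"));
    let _ = article.title; // silence the dead-code warning for the unused field
}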
@@ -669,14 +766,25 @@ impl ArticleScraper {
     fn extract_body(
         context: &Context,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
     ) -> Result<(), ScraperError> {
         let mut found_something = false;
-        for xpath_body in &config.xpath_body {
-            found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+        if let Some(config) = config {
+            for xpath_body in &config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
         }

         if !found_something {
+            for xpath_body in &global_config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
+        }
+
+        if !found_something {
+            log::error!("no body found");
             return Err(ScraperErrorKind::Scrape.into());
         }
@@ -709,10 +817,25 @@ impl ArticleScraper {
         Ok(found_something)
     }

-    fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
-        if let Some(next_page_xpath) = config.next_page_link.clone() {
+    fn check_for_next_page(
+        &self,
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Option<url::Url> {
+        if let Some(config) = config {
+            if let Some(next_page_xpath) = config.next_page_link.as_deref() {
+                if let Ok(next_page_string) =
+                    ArticleScraper::get_attribute(&context, next_page_xpath, "href")
+                {
+                    if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
+                        return Some(next_page_url);
+                    }
+                }
+            }
+        } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
             if let Ok(next_page_string) =
-                ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
+                ArticleScraper::get_attribute(&context, next_page_xpath, "href")
             {
                 if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                     return Some(next_page_url);
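A subtlety in this hunk: the global next_page_link is only consulted in the else branch, i.e. when no site config exists at all. A site config that merely lacks the rule does not fall back to the global one, unlike the Util::select_rule handling of single_page_link earlier. A self-contained model of the difference, with a made-up Cfg type:

struct Cfg {
    next_page_link: Option<String>,
}

// Per-rule fallback, as select_rule does for single_page_link.
fn merged<'a>(site: Option<&'a Cfg>, global: &'a Cfg) -> Option<&'a str> {
    site.and_then(|c| c.next_page_link.as_deref())
        .or(global.next_page_link.as_deref())
}

// The shape used by check_for_next_page: global applies only when no site config exists.
fn committed<'a>(site: Option<&'a Cfg>, global: &'a Cfg) -> Option<&'a str> {
    match site {
        Some(c) => c.next_page_link.as_deref(),
        None => global.next_page_link.as_deref(),
    }
}

fn main() {
    let site = Cfg { next_page_link: None };
    let global = Cfg { next_page_link: Some("//a[@class='next']".into()) };
    assert_eq!(merged(Some(&site), &global), Some("//a[@class='next']"));
    assert_eq!(committed(Some(&site), &global), None);
}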
src/tests.rs (36 changed lines)
@@ -1,35 +1,21 @@
 use crate::*;
-use std::path::PathBuf;
 use reqwest::Client;
+use std::path::PathBuf;

-#[tokio::test(flavor = "current_thread")]
-async fn golem() {
-    let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-
-    let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
-    article.save_html(&out_path).unwrap();
-
-    assert_eq!(
-        article.title,
-        Some(String::from(
-            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
-        ))
-    );
-    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-}
-
 #[tokio::test(flavor = "current_thread")]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse(
-        "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
-    )
-    .unwrap();
+    let url =
+        url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
+            .unwrap();

     let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    let start = chrono::Utc::now();
+    let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+    let end = chrono::Utc::now();
+    let duration = end - start;
+    println!("duration: {}ms", duration.num_milliseconds());
     article.save_html(&out_path).unwrap();

     assert_eq!(
@@ -51,4 +37,4 @@ async fn youtube() {
         article.html,
         Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
     );
 }
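Since the phoronix test now measures and prints its parse duration, run it as cargo test phoronix -- --nocapture to see the output; the test harness suppresses println! by default.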
src/util.rs (11 changed lines)
@@ -23,4 +23,15 @@ impl Util {
     pub fn split_values(values: &str) -> Vec<&str> {
         values.split('|').map(|s| s.trim()).collect()
     }
+
+    pub fn select_rule<'a>(
+        site_specific_rule: Option<&'a str>,
+        global_rule: Option<&'a str>,
+    ) -> Option<&'a str> {
+        if site_specific_rule.is_some() {
+            site_specific_rule
+        } else {
+            global_rule
+        }
+    }
 }
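select_rule simply prefers the site-specific rule over the global one; it is equivalent to site_specific_rule.or(global_rule). A standalone usage sketch (the xpath strings are invented):

fn select_rule<'a>(
    site_specific_rule: Option<&'a str>,
    global_rule: Option<&'a str>,
) -> Option<&'a str> {
    // Same body as Util::select_rule.
    if site_specific_rule.is_some() {
        site_specific_rule
    } else {
        global_rule
    }
}

fn main() {
    assert_eq!(select_rule(Some("//article"), Some("//main")), Some("//article"));
    assert_eq!(select_rule(None, Some("//main")), Some("//main"));
    assert_eq!(select_rule(None, None), None);
}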