Mirror of https://gitlab.com/news-flash/article_scraper.git

use global rules

Jan Lukas Gernert 2022-10-06 11:50:09 +02:00
parent 3a6a70ee64
commit c1ae011fcd
9 changed files with 209 additions and 150 deletions

@@ -1 +1 @@
Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
Subproject commit a6beb80d445b8d99542d8a2f9157cec69ea8b767


@@ -1,42 +0,0 @@
# Author: zinnober
# Rewrite of the original template, which fetched the printer version without pictures
tidy: no
prune: no
# Set full title
title: //h1/span
date: //time
author: //a[@rel='author']
# Content is here
body: //article
# Fetch full multipage articles
next_page_link: //a[@id='atoc_next']
# Remove tracking and ads
strip_id_or_class: iqadtile4
# General Cleanup
strip_id_or_class: list-jtoc
strip_id_or_class: table-jtoc
strip_id_or_class: implied
strip_id_or_class: social-
strip_id_or_class: comments
strip_id_or_class: footer
strip_id_or_class: job-market
strip_id_or_class: tags
# Tidy up galleries (could still be improved, though)
strip: //img[@src='']
strip: //li[not(*)]
strip: //div[contains(@style,'margin')]
strip: //figure[contains(@id,'gvideo')]
# Try it yourself
test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
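
These grabber files use the line-oriented `key: value` grammar of the fivefilters ftr-site-config format: `#` starts a comment, repeated keys accumulate (note the many `strip_id_or_class` lines), and values are xpath expressions or plain strings. A minimal parsing sketch for that grammar, with an illustrative rule struct rather than the crate's actual `ConfigEntry`:

#[derive(Default, Debug)]
struct SiteRules {
    title: Vec<String>,
    body: Vec<String>,
    strip_id_or_class: Vec<String>,
    next_page_link: Option<String>,
}

fn parse_rules(input: &str) -> SiteRules {
    let mut rules = SiteRules::default();
    for line in input.lines() {
        let line = line.trim();
        // skip blank lines and comments
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        // every rule is "key: value"; repeated keys append
        if let Some((key, value)) = line.split_once(':') {
            let value = value.trim().to_owned();
            match key.trim() {
                "title" => rules.title.push(value),
                "body" => rules.body.push(value),
                "strip_id_or_class" => rules.strip_id_or_class.push(value),
                "next_page_link" => rules.next_page_link = Some(value),
                _ => {} // keys not modeled in this sketch are ignored
            }
        }
    }
    rules
}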


@@ -1,9 +0,0 @@
# based on the grabber rules of picofeed
title: //article/header
body: //div[@class="content"]
test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1
# replace_string(<h5>): <h2>
next_page_link: //a[@title='Go To Next Page']


@@ -15,18 +15,16 @@ pub struct ConfigCollection {
impl ConfigCollection {
pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
let mut user_entries = HashMap::new();
let mut embedded_entries = HashMap::new();
for (file_name, entry) in EmbededConfigFiles::iter()
.filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
{
if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
let file_name: &str = file_name.borrow();
embedded_entries.insert(file_name.to_owned(), entry);
}
}
if let Some(directory) = directory {
// create data dir if it doesn't already exist
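
`EmbededConfigFiles::iter()` and `::get()` follow the `rust-embed` pattern of compiling a directory of files into the binary at build time. A sketch of that derive for context; the folder path and the iteration helper are assumptions, not the crate's actual definitions:

use rust_embed::RustEmbed;

#[derive(RustEmbed)]
#[folder = "ftr-site-config/"] // assumed location of the bundled site configs
struct EmbededConfigFiles;

fn embedded_file_names() -> Vec<String> {
    // iter() yields the relative path of every file under the folder
    EmbededConfigFiles::iter().map(|path| path.to_string()).collect()
}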


@@ -2,7 +2,6 @@ use crate::util::Util;
use super::error::{ConfigError, ConfigErrorKind};
use failure::ResultExt;
use log::warn;
use std::borrow::Cow;
use std::io::Cursor;
use std::path::Path;
@@ -140,11 +139,6 @@ impl ConfigEntry {
}
}
if xpath_body.is_empty() {
warn!("No body xpath found for");
return Err(ConfigErrorKind::BadConfig.into());
}
let config = ConfigEntry {
xpath_title,
xpath_author,


@@ -10,8 +10,6 @@ pub struct ConfigError {
pub enum ConfigErrorKind {
#[fail(display = "IO Error")]
IO,
#[fail(display = "Config does not contain body xpath")]
BadConfig,
#[fail(display = "Unknown Error")]
Unknown,
}


@@ -22,6 +22,7 @@ use log::{debug, error, info, warn};
use reqwest::{Client, Response};
use std::path::Path;
use std::str::FromStr;
use util::Util;
pub struct ArticleScraper {
pub image_downloader: ImageDownloader,
@@ -76,7 +77,11 @@ impl ArticleScraper {
}
// check if we have a config for the url
let config = self.get_grabber_config(&url)?;
let config = self.get_grabber_config(&url);
let global_config = self
.config_files
.get("global.txt")
.ok_or_else(|| ScraperErrorKind::Config)?;
let mut article = Article {
title: None,
@@ -94,7 +99,7 @@ impl ArticleScraper {
ArticleScraper::generate_head(&mut root, &document)?;
self.parse_pages(&mut article, &url, &mut root, &config, client)
self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
.await?;
let context = Context::new(&document).map_err(|()| {
@@ -139,15 +144,20 @@ impl ArticleScraper {
article: &mut Article,
url: &url::Url,
root: &mut Node,
config: &ConfigEntry,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
client: &Client,
) -> Result<(), ScraperError> {
let html = ArticleScraper::download(&url, client).await?;
let mut document = Self::parse_html(html, config)?;
let mut document = Self::parse_html(html, config, global_config)?;
let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
// check for single page link
if let Some(xpath_single_page_link) = config.single_page_link.clone() {
let rule = Util::select_rule(
config.and_then(|c| c.single_page_link.as_deref()),
global_config.single_page_link.as_deref(),
);
if let Some(xpath_single_page_link) = rule {
debug!(
"Single page link xpath specified in config '{}'",
xpath_single_page_link
@@ -159,34 +169,51 @@ impl ArticleScraper {
let single_page_url =
url::Url::parse(&result).context(ScraperErrorKind::Url)?;
return self
.parse_single_page(article, &single_page_url, root, config, client)
.parse_single_page(
article,
&single_page_url,
root,
config,
global_config,
client,
)
.await;
}
}
}
ArticleScraper::extract_metadata(&xpath_ctx, config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config)?;
ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
let html = ArticleScraper::download(&url, client).await?;
document = Self::parse_html(html, config)?;
document = Self::parse_html(html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?;
ArticleScraper::strip_junk(&xpath_ctx, config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config)?;
ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
}
Ok(())
}
fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
fn parse_html(
html: String,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
) -> Result<Document, ScraperError> {
// replace matches in raw html
let mut html = html;
if let Some(config) = config {
for replace in &config.replace {
html = html.replace(&replace.to_replace, &replace.replace_with);
}
}
for replace in &global_config.replace {
html = html.replace(&replace.to_replace, &replace.replace_with);
}
// parse html
let parser = Parser::default_html();
@@ -230,15 +257,16 @@ impl ArticleScraper {
article: &mut Article,
url: &url::Url,
root: &mut Node,
config: &ConfigEntry,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
client: &Client,
) -> Result<(), ScraperError> {
let html = ArticleScraper::download(&url, client).await?;
let document = Self::parse_html(html, config)?;
let document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
ArticleScraper::extract_metadata(&xpath_ctx, config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config)?;
ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
Ok(())
}
@@ -340,16 +368,17 @@ impl ArticleScraper {
}
}
fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
let config_name = Self::get_host_name(url)? + ".txt";
fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> {
let conf = Self::get_host_name(url)
.ok()
.map(|url| url + ".txt")
.and_then(|name| self.config_files.get(&name));
match self.config_files.get(&config_name) {
Some(config) => Ok(config.clone()),
None => {
error!("No config file of the name '{}' found", config_name);
Err(ScraperErrorKind::Config.into())
}
if conf.is_none() {
log::warn!("No config found for url '{}'", url);
}
conf
}
fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
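
The config lookup above keys on the host name plus a `.txt` suffix. Assuming `get_host_name` returns the bare host, the mapping amounts to something like this (helper name invented):

fn config_file_name(url: &url::Url) -> Option<String> {
    // e.g. "https://www.golem.de/news/..." -> "www.golem.de.txt"
    url.host_str().map(|host| format!("{}.txt", host))
}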
@@ -575,24 +604,50 @@ impl ArticleScraper {
Ok(url)
}
fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
fn strip_junk(
context: &Context,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
url: &url::Url,
) {
// strip specified xpath
if let Some(config) = config {
for xpath_strip in &config.xpath_strip {
let _ = ArticleScraper::strip_node(&context, xpath_strip);
}
}
for xpath_strip in &global_config.xpath_strip {
let _ = ArticleScraper::strip_node(&context, xpath_strip);
}
// strip everything with specified 'id' or 'class'
if let Some(config) = config {
for xpath_strip_class in &config.strip_id_or_class {
let _ = ArticleScraper::strip_id_or_class(&context, xpath_strip_class);
}
}
for xpath_strip_class in &global_config.strip_id_or_class {
let _ = ArticleScraper::strip_id_or_class(&context, xpath_strip_class);
}
// strip any <img> element where @src attribute contains this substring
if let Some(config) = config {
for xpath_strip_img_src in &config.strip_image_src {
let _ = ArticleScraper::strip_node(
&context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
);
}
}
for xpath_strip_img_src in &global_config.strip_image_src {
let _ = ArticleScraper::strip_node(
&context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
);
}
let _ = ArticleScraper::fix_lazy_images(&context, "lazyload", "data-src");
let _ = ArticleScraper::fix_iframe_size(&context, "youtube.com");
@@ -620,9 +675,6 @@ impl ArticleScraper {
&String::from("//*[contains(@style,'display:none')]"),
);
// strip all scripts
//let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
// strip all comments
let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
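
`strip_node` presumably evaluates the given xpath and unlinks every match from the tree. A sketch against the `libxml` crate's xpath API; the method names are recalled from memory and should be treated as an assumption:

use libxml::xpath::Context;

fn strip_nodes(context: &Context, xpath: &str) -> Result<(), ()> {
    let matches = context.evaluate(xpath)?;
    for mut node in matches.get_nodes_as_vec() {
        // detach the node, and its whole subtree, from the document
        node.unlink();
    }
    Ok(())
}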
@@ -633,8 +685,14 @@ impl ArticleScraper {
let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
}
fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
fn extract_metadata(
context: &Context,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
article: &mut Article,
) {
// try to get title
if let Some(config) = config {
for xpath_title in &config.xpath_title {
if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
debug!("Article title: '{}'", title);
@@ -642,8 +700,20 @@ impl ArticleScraper {
break;
}
}
}
if article.title.is_none() {
for xpath_title in &global_config.xpath_title {
if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
debug!("Article title: '{}'", title);
article.title = Some(title);
break;
}
}
}
// try to get the author
if let Some(config) = config {
for xpath_author in &config.xpath_author {
if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
debug!("Article author: '{}'", author);
@@ -651,8 +721,20 @@ impl ArticleScraper {
break;
}
}
}
if article.author.is_none() {
for xpath_author in &global_config.xpath_author {
if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
debug!("Article author: '{}'", author);
article.author = Some(author);
break;
}
}
}
// try to get the date
if let Some(config) = config {
for xpath_date in &config.xpath_date {
if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
debug!("Article date: '{}'", date_string);
@@ -666,17 +748,43 @@ impl ArticleScraper {
}
}
if article.date.is_none() {
for xpath_date in &global_config.xpath_date {
if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date);
break;
} else {
warn!("Parsing the date string '{}' failed", date_string);
}
}
}
}
}
fn extract_body(
context: &Context,
root: &mut Node,
config: &ConfigEntry,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
) -> Result<(), ScraperError> {
let mut found_something = false;
if let Some(config) = config {
for xpath_body in &config.xpath_body {
found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
}
}
if !found_something {
for xpath_body in &global_config.xpath_body {
found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
}
}
if !found_something {
log::error!("no body found");
return Err(ScraperErrorKind::Scrape.into());
}
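
Title, author, date, and body all follow the same cascade: every site-specific xpath is tried first, and the global list is only consulted when nothing matched. That pattern could be factored into a generic helper along these lines (name and signature invented, not part of the crate):

fn first_match<'a, T>(
    site: Option<&'a [String]>,
    global: &'a [String],
    mut eval: impl FnMut(&str) -> Option<T>,
) -> Option<T> {
    // site-specific xpaths come first, so a site hit always wins
    site.into_iter()
        .flatten()
        .chain(global)
        .find_map(|xpath| eval(xpath.as_str()))
}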
@@ -709,10 +817,25 @@ impl ArticleScraper {
Ok(found_something)
}
fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
if let Some(next_page_xpath) = config.next_page_link.clone() {
fn check_for_next_page(
&self,
context: &Context,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
) -> Option<url::Url> {
if let Some(config) = config {
if let Some(next_page_xpath) = config.next_page_link.as_deref() {
if let Ok(next_page_string) =
ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
ArticleScraper::get_attribute(&context, next_page_xpath, "href")
{
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
}
}
}
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) =
ArticleScraper::get_attribute(&context, next_page_xpath, "href")
{
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);


@@ -1,35 +1,21 @@
use crate::*;
use std::path::PathBuf;
use reqwest::Client;
#[tokio::test(flavor = "current_thread")]
async fn golem() {
let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
let grabber = ArticleScraper::new(None).await;
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
article.save_html(&out_path).unwrap();
assert_eq!(
article.title,
Some(String::from(
"HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
))
);
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
}
use std::path::PathBuf;
#[tokio::test(flavor = "current_thread")]
async fn phoronix() {
let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse(
"http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
)
let url =
url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
.unwrap();
let grabber = ArticleScraper::new(None).await;
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
let start = chrono::Utc::now();
let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
let end = chrono::Utc::now();
let duration = end - start;
println!("duration: {}ms", duration.num_milliseconds());
article.save_html(&out_path).unwrap();
assert_eq!(


@@ -23,4 +23,15 @@ impl Util {
pub fn split_values(values: &str) -> Vec<&str> {
values.split('|').map(|s| s.trim()).collect()
}
pub fn select_rule<'a>(
site_specific_rule: Option<&'a str>,
global_rule: Option<&'a str>,
) -> Option<&'a str> {
if site_specific_rule.is_some() {
site_specific_rule
} else {
global_rule
}
}
}
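
Since `select_rule` simply prefers the first argument when it is present, it behaves exactly like `site_specific_rule.or(global_rule)`. A usage sketch with invented xpaths:

#[test]
fn site_rule_wins_over_global() {
    let site = Some("//a[@id='atoc_next']");
    let global = Some("//a[contains(@class,'next')]");
    assert_eq!(Util::select_rule(site, global), site);
    assert_eq!(Util::select_rule(None, global), global);
}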