diff --git a/ftr-site-config b/ftr-site-config
index 70a3a3a..a6beb80 160000
--- a/ftr-site-config
+++ b/ftr-site-config
@@ -1 +1 @@
-Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
+Subproject commit a6beb80d445b8d99542d8a2f9157cec69ea8b767
diff --git a/resources/tests/golem/golem.de.txt b/resources/tests/golem/golem.de.txt
deleted file mode 100644
index bf3f418..0000000
--- a/resources/tests/golem/golem.de.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# Author: zinnober
-# Rewrite of original template which fetched the printer-version without pictures
-
-tidy: no
-prune: no
-
-# Set full title
-title: //h1/span
-
-date: //time
-author: //a[@rel='author']
-
-# Content is here
-body: //article
-
-# Fetch full multipage articles
-next_page_link: //a[@id='atoc_next']
-
-# Remove tracking and ads
-strip_id_or_class: iqadtile4
-
-# General Cleanup
-strip_id_or_class: list-jtoc
-strip_id_or_class: table-jtoc
-strip_id_or_class: implied
-strip_id_or_class: social-
-strip_id_or_class: comments
-strip_id_or_class: footer
-strip_id_or_class: job-market
-strip_id_or_class: tags
-
-# Tidy up galleries (could still be improved, though)
-strip: //img[@src='']
-strip: //li[not(*)]
-strip: //div[contains(@style,'margin')]
-strip: //figure[contains(@id,'gvideo')]
-
-
-# Try yourself
-test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
-test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
-
diff --git a/resources/tests/phoronix/phoronix.com.txt b/resources/tests/phoronix/phoronix.com.txt
deleted file mode 100644
index 1fa9e4b..0000000
--- a/resources/tests/phoronix/phoronix.com.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# based on the grabber rules of picofeed
-
-title: //article/header
-body: //div[@class="content"]
-test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1
-
-# replace_string(&#13;): &#13;
-
-next_page_link: //a[@title='Go To Next Page']
diff --git a/src/config/config_collection.rs b/src/config/config_collection.rs
index e5257e1..ec72cb7 100644
--- a/src/config/config_collection.rs
+++ b/src/config/config_collection.rs
@@ -15,17 +15,15 @@ pub struct ConfigCollection {
 impl ConfigCollection {
     pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
-        let mut user_entries = HashMap::new();
         let mut embedded_entries = HashMap::new();
 
         for (file_name, entry) in EmbededConfigFiles::iter()
             .filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
         {
-            if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
-                let file_name: &str = file_name.borrow();
-                embedded_entries.insert(file_name.to_owned(), entry);
-            }
+            let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
+            let file_name: &str = file_name.borrow();
+            embedded_entries.insert(file_name.to_owned(), entry);
         }
 
         if let Some(directory) = directory {
diff --git a/src/config/config_entry.rs b/src/config/config_entry.rs
index 450abc2..f1de7ea 100644
--- a/src/config/config_entry.rs
+++ b/src/config/config_entry.rs
@@ -2,7 +2,6 @@ use crate::util::Util;
 
 use super::error::{ConfigError, ConfigErrorKind};
 use failure::ResultExt;
-use log::warn;
 use std::borrow::Cow;
 use std::io::Cursor;
 use std::path::Path;
@@ -140,11 +139,6 @@ impl ConfigEntry {
             }
         }
 
-        if xpath_body.is_empty() {
-            warn!("No body xpath found for");
-            return Err(ConfigErrorKind::BadConfig.into());
-        }
-
         let config = ConfigEntry {
             xpath_title,
             xpath_author,
diff --git a/src/config/error.rs b/src/config/error.rs
index f2ae18c..a93587a 100644
--- a/src/config/error.rs
+++ b/src/config/error.rs
@@ -10,8 +10,6 @@ pub struct ConfigError {
 pub enum ConfigErrorKind {
     #[fail(display = "IO Error")]
     IO,
-    #[fail(display = "Config does not contain body xpath")]
-    BadConfig,
     #[fail(display = "Unknown Error")]
     Unknown,
 }
diff --git a/src/lib.rs b/src/lib.rs
index 02bb0c0..813f9a7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,6 +22,7 @@ use log::{debug, error, info, warn};
 use reqwest::{Client, Response};
 use std::path::Path;
 use std::str::FromStr;
+use util::Util;
 
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
@@ -76,7 +77,11 @@ impl ArticleScraper {
         }
 
         // check if we have a config for the url
-        let config = self.get_grabber_config(&url)?;
+        let config = self.get_grabber_config(&url);
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or_else(|| ScraperErrorKind::Config)?;
 
         let mut article = Article {
             title: None,
@@ -94,7 +99,7 @@ impl ArticleScraper {
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, &config, client)
+        self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
             .await?;
 
         let context = Context::new(&document).map_err(|()| {
@@ -139,15 +144,20 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let mut document = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
         // check for single page link
-        if let Some(xpath_single_page_link) = config.single_page_link.clone() {
+        let rule = Util::select_rule(
+            config.and_then(|c| c.single_page_link.as_deref()),
+            global_config.single_page_link.as_deref(),
+        );
+        if let Some(xpath_single_page_link) = rule {
             debug!(
                 "Single page link xpath specified in config '{}'",
                 xpath_single_page_link
@@ -159,32 +169,49 @@ impl ArticleScraper {
                     let single_page_url =
                         url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                     return self
-                        .parse_single_page(article, &single_page_url, root, config, client)
+                        .parse_single_page(
+                            article,
+                            &single_page_url,
+                            root,
+                            config,
+                            global_config,
+                            client,
+                        )
                         .await;
                 }
             }
         }
 
-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
 
-        while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let html = ArticleScraper::download(&url, client).await?;
-            document = Self::parse_html(html, config)?;
+            document = Self::parse_html(html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
-            ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-            ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+            ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+            ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
         }
 
         Ok(())
     }
 
-    fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
+    fn parse_html(
+        html: String,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Result<Document, ScraperError> {
         // replace matches in raw html
         let mut html = html;
-        for replace in &config.replace {
+        if let Some(config) = config {
+            for replace in &config.replace {
+                html = html.replace(&replace.to_replace, &replace.replace_with);
+            }
+        }
+
+        for replace in &global_config.replace {
             html = html.replace(&replace.to_replace, &replace.replace_with);
         }
@@ -230,15 +257,16 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let document = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
 
-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
 
         Ok(())
     }
@@ -340,16 +368,17 @@ impl ArticleScraper {
         }
     }
 
-    fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
-        let config_name = Self::get_host_name(url)? + ".txt";
+    fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> {
+        let conf = Self::get_host_name(url)
+            .ok()
+            .map(|url| url + ".txt")
+            .and_then(|name| self.config_files.get(&name));
 
-        match self.config_files.get(&config_name) {
-            Some(config) => Ok(config.clone()),
-            None => {
-                error!("No config file of the name '{}' found", config_name);
-                Err(ScraperErrorKind::Config.into())
-            }
+        if conf.is_none() {
+            log::warn!("No config found for url '{}'", url);
         }
+
+        conf
     }
 
     fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
@@ -575,19 +604,45 @@ impl ArticleScraper {
         Ok(url)
     }
 
-    fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
+    fn strip_junk(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        url: &url::Url,
+    ) {
         // strip specified xpath
-        for xpath_strip in &config.xpath_strip {
+        if let Some(config) = config {
+            for xpath_strip in &config.xpath_strip {
+                let _ = ArticleScraper::strip_node(&context, xpath_strip);
+            }
+        }
+
+        for xpath_strip in &global_config.xpath_strip {
             let _ = ArticleScraper::strip_node(&context, xpath_strip);
         }
 
         // strip everything with specified 'id' or 'class'
-        for xpaht_strip_class in &config.strip_id_or_class {
+        if let Some(config) = config {
+            for xpaht_strip_class in &config.strip_id_or_class {
+                let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+            }
+        }
+
+        for xpaht_strip_class in &global_config.strip_id_or_class {
             let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
         }
 
         // strip any element where @src attribute contains this substring
-        for xpath_strip_img_src in &config.strip_image_src {
+        if let Some(config) = config {
+            for xpath_strip_img_src in &config.strip_image_src {
+                let _ = ArticleScraper::strip_node(
+                    &context,
+                    &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+                );
+            }
+        }
+
+        for xpath_strip_img_src in &global_config.strip_image_src {
             let _ = ArticleScraper::strip_node(
                 &context,
                 &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
@@ -620,9 +675,6 @@ impl ArticleScraper {
             &String::from("//*[contains(@style,'display:none')]"),
         );
 
-        // strip all scripts
-        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
-
         // strip all comments
         let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
 
@@ -633,34 +685,79 @@ impl ArticleScraper {
         let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
     }
 
-    fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
+    fn extract_metadata(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        article: &mut Article,
+    ) {
         // try to get title
-        for xpath_title in &config.xpath_title {
-            if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
-                debug!("Article title: '{}'", title);
-                article.title = Some(title);
-                break;
+        if let Some(config) = config {
+            for xpath_title in &config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_title in &global_config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
             }
         }
 
         // try to get the author
-        for xpath_author in &config.xpath_author {
-            if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
-                debug!("Article author: '{}'", author);
-                article.author = Some(author);
-                break;
+        if let Some(config) = config {
+            for xpath_author in &config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
+            }
+        }
+
+        if article.author.is_none() {
+            for xpath_author in &global_config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
             }
         }
 
         // try to get the date
-        for xpath_date in &config.xpath_date {
-            if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
-                debug!("Article date: '{}'", date_string);
-                if let Ok(date) = DateTime::from_str(&date_string) {
-                    article.date = Some(date);
-                    break;
-                } else {
-                    warn!("Parsing the date string '{}' failed", date_string);
+        if let Some(config) = config {
+            for xpath_date in &config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
+                }
+            }
+        }
+
+        if article.date.is_none() {
+            for xpath_date in &global_config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
                 }
             }
         }
@@ -669,14 +766,25 @@ impl ArticleScraper {
     fn extract_body(
         context: &Context,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
     ) -> Result<(), ScraperError> {
         let mut found_something = false;
-        for xpath_body in &config.xpath_body {
-            found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+
+        if let Some(config) = config {
+            for xpath_body in &config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
         }
 
         if !found_something {
+            for xpath_body in &global_config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
+        }
+
+        if !found_something {
+            log::error!("no body found");
             return Err(ScraperErrorKind::Scrape.into());
         }
@@ -709,10 +817,25 @@ impl ArticleScraper {
         Ok(found_something)
     }
 
-    fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
-        if let Some(next_page_xpath) = config.next_page_link.clone() {
+    fn check_for_next_page(
+        &self,
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Option<url::Url> {
+        if let Some(config) = config {
+            if let Some(next_page_xpath) = config.next_page_link.as_deref() {
+                if let Ok(next_page_string) =
+                    ArticleScraper::get_attribute(&context, next_page_xpath, "href")
+                {
+                    if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
+                        return Some(next_page_url);
+                    }
+                }
+            }
+        } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
             if let Ok(next_page_string) =
-                ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
+                ArticleScraper::get_attribute(&context, next_page_xpath, "href")
             {
                 if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                     return Some(next_page_url);
diff --git a/src/tests.rs b/src/tests.rs
index f4114cc..598a893 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -1,35 +1,21 @@
 use crate::*;
-use std::path::PathBuf;
 use reqwest::Client;
-
-#[tokio::test(flavor = "current_thread")]
-async fn golem() {
-    let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-
-    let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
-    article.save_html(&out_path).unwrap();
-
-    assert_eq!(
-        article.title,
-        Some(String::from(
-            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
-        ))
-    );
-    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-}
+use std::path::PathBuf;
 
 #[tokio::test(flavor = "current_thread")]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse(
-        "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
-    )
-    .unwrap();
+    let url =
+        url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
+            .unwrap();
 
     let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+
+    let start = chrono::Utc::now();
+    let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+    let end = chrono::Utc::now();
+    let duration = end - start;
+    println!("duration: {}ms", duration.num_milliseconds());
     article.save_html(&out_path).unwrap();
 
     assert_eq!(
@@ -51,4 +37,4 @@ async fn youtube() {
         article.html,
         Some("".into())
     );
-}
\ No newline at end of file
+}
diff --git a/src/util.rs b/src/util.rs
index e3670ba..f6519a7 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -23,4 +23,15 @@ impl Util {
     pub fn split_values(values: &str) -> Vec<&str> {
         values.split('|').map(|s| s.trim()).collect()
     }
+
+    pub fn select_rule<'a>(
+        site_specific_rule: Option<&'a str>,
+        global_rule: Option<&'a str>,
+    ) -> Option<&'a str> {
+        if site_specific_rule.is_some() {
+            site_specific_rule
+        } else {
+            global_rule
+        }
+    }
 }
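
For reference, the precedence behaviour introduced above can be exercised in isolation. The sketch below is not part of the diff: it re-declares a local copy of `select_rule` so it compiles standalone, and the XPath strings used for the site-specific and global rules are made up purely for illustration. It shows that a site-specific rule always wins and the rule from global.txt is only consulted when no site-specific rule exists.

// Standalone sketch of the fallback rule implemented by Util::select_rule.
// The XPath strings below are illustrative assumptions, not real config values.
fn select_rule<'a>(
    site_specific_rule: Option<&'a str>,
    global_rule: Option<&'a str>,
) -> Option<&'a str> {
    if site_specific_rule.is_some() {
        site_specific_rule
    } else {
        global_rule
    }
}

fn main() {
    let site = Some("//a[@id='atoc_next']");
    let global = Some("//a[@rel='next']");

    // A site-specific rule takes precedence over the global one.
    assert_eq!(select_rule(site, global), site);
    // Without a site-specific rule, the global rule is used.
    assert_eq!(select_rule(None, global), global);
    // If neither is configured, there is no rule at all.
    assert_eq!(select_rule(None, None), None);
}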