diff --git a/ftr-site-config b/ftr-site-config
index 70a3a3a..a6beb80 160000
--- a/ftr-site-config
+++ b/ftr-site-config
@@ -1 +1 @@
-Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
+Subproject commit a6beb80d445b8d99542d8a2f9157cec69ea8b767
diff --git a/resources/tests/golem/golem.de.txt b/resources/tests/golem/golem.de.txt
deleted file mode 100644
index bf3f418..0000000
--- a/resources/tests/golem/golem.de.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# Author: zinnober
-# Rewrite of original template which fetched the printer-version without pictures
-
-tidy: no
-prune: no
-
-# Set full title
-title: //h1/span
-
-date: //time
-author: //a[@rel='author']
-
-# Content is here
-body: //article
-
-# Fetch full multipage articles
-next_page_link: //a[@id='atoc_next']
-
-# Remove tracking and ads
-strip_id_or_class: iqadtile4
-
-# General Cleanup
-strip_id_or_class: list-jtoc
-strip_id_or_class: table-jtoc
-strip_id_or_class: implied
-strip_id_or_class: social-
-strip_id_or_class: comments
-strip_id_or_class: footer
-strip_id_or_class: job-market
-strip_id_or_class: tags
-
-# Tidy up galleries (could still be improved, though)
-strip: //img[@src='']
-strip: //li[not(*)]
-strip: //div[contains(@style,'margin')]
-strip: //figure[contains(@id,'gvideo')]
-
-
-# Try yourself
-test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
-test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
-
diff --git a/resources/tests/phoronix/phoronix.com.txt b/resources/tests/phoronix/phoronix.com.txt
deleted file mode 100644
index 1fa9e4b..0000000
--- a/resources/tests/phoronix/phoronix.com.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# based on the grabber rules of picofeed
-
-title: //article/header
-body: //div[@class="content"]
-test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1
-
-# replace_string(): 
-
-next_page_link: //a[@title='Go To Next Page']
diff --git a/src/config/config_collection.rs b/src/config/config_collection.rs
index e5257e1..ec72cb7 100644
--- a/src/config/config_collection.rs
+++ b/src/config/config_collection.rs
@@ -15,17 +15,15 @@ pub struct ConfigCollection {
impl ConfigCollection {
pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
-
let mut user_entries = HashMap::new();
let mut embedded_entries = HashMap::new();
for (file_name, entry) in EmbededConfigFiles::iter()
.filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
{
- if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
- let file_name: &str = file_name.borrow();
- embedded_entries.insert(file_name.to_owned(), entry);
- }
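+            // embedded configs ship with the binary, so a parse failure here is treated as a bug (panics)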
+ let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
+ let file_name: &str = file_name.borrow();
+ embedded_entries.insert(file_name.to_owned(), entry);
}
if let Some(directory) = directory {
diff --git a/src/config/config_entry.rs b/src/config/config_entry.rs
index 450abc2..f1de7ea 100644
--- a/src/config/config_entry.rs
+++ b/src/config/config_entry.rs
@@ -2,7 +2,6 @@ use crate::util::Util;
use super::error::{ConfigError, ConfigErrorKind};
use failure::ResultExt;
-use log::warn;
use std::borrow::Cow;
use std::io::Cursor;
use std::path::Path;
@@ -140,11 +139,6 @@ impl ConfigEntry {
}
}
- if xpath_body.is_empty() {
- warn!("No body xpath found for");
- return Err(ConfigErrorKind::BadConfig.into());
- }
-
let config = ConfigEntry {
xpath_title,
xpath_author,
diff --git a/src/config/error.rs b/src/config/error.rs
index f2ae18c..a93587a 100644
--- a/src/config/error.rs
+++ b/src/config/error.rs
@@ -10,8 +10,6 @@ pub struct ConfigError {
pub enum ConfigErrorKind {
#[fail(display = "IO Error")]
IO,
- #[fail(display = "Config does not contain body xpath")]
- BadConfig,
#[fail(display = "Unknown Error")]
Unknown,
}
diff --git a/src/lib.rs b/src/lib.rs
index 02bb0c0..813f9a7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,6 +22,7 @@ use log::{debug, error, info, warn};
use reqwest::{Client, Response};
use std::path::Path;
use std::str::FromStr;
+use util::Util;
pub struct ArticleScraper {
pub image_downloader: ImageDownloader,
@@ -76,7 +77,11 @@ impl ArticleScraper {
}
// check if we have a config for the url
- let config = self.get_grabber_config(&url)?;
+ let config = self.get_grabber_config(&url);
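+    // the site-specific config is now optional; "global.txt" provides the fallback rules and must be present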
+ let global_config = self
+ .config_files
+ .get("global.txt")
+ .ok_or_else(|| ScraperErrorKind::Config)?;
let mut article = Article {
title: None,
@@ -94,7 +99,7 @@ impl ArticleScraper {
ArticleScraper::generate_head(&mut root, &document)?;
- self.parse_pages(&mut article, &url, &mut root, &config, client)
+ self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
.await?;
let context = Context::new(&document).map_err(|()| {
@@ -139,15 +144,20 @@ impl ArticleScraper {
article: &mut Article,
url: &url::Url,
root: &mut Node,
- config: &ConfigEntry,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
client: &Client,
) -> Result<(), ScraperError> {
let html = ArticleScraper::download(&url, client).await?;
- let mut document = Self::parse_html(html, config)?;
+ let mut document = Self::parse_html(html, config, global_config)?;
let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
// check for single page link
- if let Some(xpath_single_page_link) = config.single_page_link.clone() {
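+        // prefer the site-specific single-page-link rule, fall back to the global one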
+ let rule = Util::select_rule(
+ config.and_then(|c| c.single_page_link.as_deref()),
+ global_config.single_page_link.as_deref(),
+ );
+ if let Some(xpath_single_page_link) = rule {
debug!(
"Single page link xpath specified in config '{}'",
xpath_single_page_link
@@ -159,32 +169,49 @@ impl ArticleScraper {
let single_page_url =
url::Url::parse(&result).context(ScraperErrorKind::Url)?;
return self
- .parse_single_page(article, &single_page_url, root, config, client)
+ .parse_single_page(
+ article,
+ &single_page_url,
+ root,
+ config,
+ global_config,
+ client,
+ )
.await;
}
}
}
- ArticleScraper::extract_metadata(&xpath_ctx, config, article);
- ArticleScraper::strip_junk(&xpath_ctx, config, &url);
- ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+ ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+ ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+ ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
- while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
+ while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
let html = ArticleScraper::download(&url, client).await?;
- document = Self::parse_html(html, config)?;
+ document = Self::parse_html(html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?;
- ArticleScraper::strip_junk(&xpath_ctx, config, &url);
- ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+ ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+ ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
}
Ok(())
}
-    fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
+ fn parse_html(
+ html: String,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
+    ) -> Result<Document, ScraperError> {
// replace matches in raw html
let mut html = html;
- for replace in &config.replace {
+ if let Some(config) = config {
+ for replace in &config.replace {
+ html = html.replace(&replace.to_replace, &replace.replace_with);
+ }
+ }
+
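+    // global replace rules are applied on top of the site-specific ones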
+ for replace in &global_config.replace {
html = html.replace(&replace.to_replace, &replace.replace_with);
}
@@ -230,15 +257,16 @@ impl ArticleScraper {
article: &mut Article,
url: &url::Url,
root: &mut Node,
- config: &ConfigEntry,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
client: &Client,
) -> Result<(), ScraperError> {
let html = ArticleScraper::download(&url, client).await?;
- let document = Self::parse_html(html, config)?;
+ let document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
- ArticleScraper::extract_metadata(&xpath_ctx, config, article);
- ArticleScraper::strip_junk(&xpath_ctx, config, &url);
- ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+ ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+ ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+ ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
Ok(())
}
@@ -340,16 +368,17 @@ impl ArticleScraper {
}
}
-    fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
- let config_name = Self::get_host_name(url)? + ".txt";
+ fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> {
+ let conf = Self::get_host_name(url)
+ .ok()
+ .map(|url| url + ".txt")
+ .and_then(|name| self.config_files.get(&name));
- match self.config_files.get(&config_name) {
- Some(config) => Ok(config.clone()),
- None => {
- error!("No config file of the name '{}' found", config_name);
- Err(ScraperErrorKind::Config.into())
- }
+ if conf.is_none() {
+ log::warn!("No config found for url '{}'", url);
}
+
+ conf
}
fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
@@ -575,19 +604,45 @@ impl ArticleScraper {
Ok(url)
}
- fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
+ fn strip_junk(
+ context: &Context,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
+ url: &url::Url,
+ ) {
// strip specified xpath
- for xpath_strip in &config.xpath_strip {
+ if let Some(config) = config {
+ for xpath_strip in &config.xpath_strip {
+ let _ = ArticleScraper::strip_node(&context, xpath_strip);
+ }
+ }
+
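+    // the global strip rules are applied in addition to the site-specific ones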
+ for xpath_strip in &global_config.xpath_strip {
let _ = ArticleScraper::strip_node(&context, xpath_strip);
}
// strip everything with specified 'id' or 'class'
- for xpaht_strip_class in &config.strip_id_or_class {
+ if let Some(config) = config {
+ for xpaht_strip_class in &config.strip_id_or_class {
+ let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+ }
+ }
+
+ for xpaht_strip_class in &global_config.strip_id_or_class {
let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
}
// strip any <img> element where @src attribute contains this substring
- for xpath_strip_img_src in &config.strip_image_src {
+ if let Some(config) = config {
+ for xpath_strip_img_src in &config.strip_image_src {
+ let _ = ArticleScraper::strip_node(
+ &context,
+ &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+ );
+ }
+ }
+
+ for xpath_strip_img_src in &global_config.strip_image_src {
let _ = ArticleScraper::strip_node(
&context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
@@ -620,9 +675,6 @@ impl ArticleScraper {
&String::from("//*[contains(@style,'display:none')]"),
);
- // strip all scripts
- //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
-
// strip all comments
let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@@ -633,34 +685,79 @@ impl ArticleScraper {
let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
}
- fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
+ fn extract_metadata(
+ context: &Context,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
+ article: &mut Article,
+ ) {
// try to get title
- for xpath_title in &config.xpath_title {
- if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
- debug!("Article title: '{}'", title);
- article.title = Some(title);
- break;
+ if let Some(config) = config {
+ for xpath_title in &config.xpath_title {
+ if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+ debug!("Article title: '{}'", title);
+ article.title = Some(title);
+ break;
+ }
+ }
+ }
+
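+    // fall back to the global rules if no title was found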
+ if article.title.is_none() {
+ for xpath_title in &global_config.xpath_title {
+ if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+ debug!("Article title: '{}'", title);
+ article.title = Some(title);
+ break;
+ }
}
}
// try to get the author
- for xpath_author in &config.xpath_author {
- if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
- debug!("Article author: '{}'", author);
- article.author = Some(author);
- break;
+ if let Some(config) = config {
+ for xpath_author in &config.xpath_author {
+ if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+ debug!("Article author: '{}'", author);
+ article.author = Some(author);
+ break;
+ }
+ }
+ }
+
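+    // fall back to the global rules if no author was found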
+    if article.author.is_none() {
+ for xpath_author in &global_config.xpath_author {
+ if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+ debug!("Article author: '{}'", author);
+ article.author = Some(author);
+ break;
+ }
}
}
// try to get the date
- for xpath_date in &config.xpath_date {
- if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
- debug!("Article date: '{}'", date_string);
- if let Ok(date) = DateTime::from_str(&date_string) {
- article.date = Some(date);
- break;
- } else {
- warn!("Parsing the date string '{}' failed", date_string);
+ if let Some(config) = config {
+ for xpath_date in &config.xpath_date {
+ if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+ debug!("Article date: '{}'", date_string);
+ if let Ok(date) = DateTime::from_str(&date_string) {
+ article.date = Some(date);
+ break;
+ } else {
+ warn!("Parsing the date string '{}' failed", date_string);
+ }
+ }
+ }
+ }
+
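+    // fall back to the global rules if no date was found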
+    if article.date.is_none() {
+ for xpath_date in &global_config.xpath_date {
+ if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+ debug!("Article date: '{}'", date_string);
+ if let Ok(date) = DateTime::from_str(&date_string) {
+ article.date = Some(date);
+ break;
+ } else {
+ warn!("Parsing the date string '{}' failed", date_string);
+ }
}
}
}
@@ -669,14 +766,25 @@ impl ArticleScraper {
fn extract_body(
context: &Context,
root: &mut Node,
- config: &ConfigEntry,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
) -> Result<(), ScraperError> {
let mut found_something = false;
- for xpath_body in &config.xpath_body {
- found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+
+ if let Some(config) = config {
+ for xpath_body in &config.xpath_body {
+ found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+ }
}
if !found_something {
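+            // no site-specific body rule matched anything, try the global rules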
+ for xpath_body in &global_config.xpath_body {
+ found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+ }
+ }
+
+ if !found_something {
+ log::error!("no body found");
return Err(ScraperErrorKind::Scrape.into());
}
@@ -709,10 +817,25 @@ impl ArticleScraper {
Ok(found_something)
}
-    fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
- if let Some(next_page_xpath) = config.next_page_link.clone() {
+ fn check_for_next_page(
+ &self,
+ context: &Context,
+ config: Option<&ConfigEntry>,
+ global_config: &ConfigEntry,
+    ) -> Option<url::Url> {
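+        // a site-specific next-page rule takes precedence; the global rule is only used when no site config exists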
+ if let Some(config) = config {
+ if let Some(next_page_xpath) = config.next_page_link.as_deref() {
+ if let Ok(next_page_string) =
+ ArticleScraper::get_attribute(&context, next_page_xpath, "href")
+ {
+ if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
+ return Some(next_page_url);
+ }
+ }
+ }
+ } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) =
- ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
+ ArticleScraper::get_attribute(&context, next_page_xpath, "href")
{
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
diff --git a/src/tests.rs b/src/tests.rs
index f4114cc..598a893 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -1,35 +1,21 @@
use crate::*;
-use std::path::PathBuf;
use reqwest::Client;
-
-#[tokio::test(flavor = "current_thread")]
-async fn golem() {
- let out_path = PathBuf::from(r"./test_output");
- let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-
- let grabber = ArticleScraper::new(None).await;
- let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
- article.save_html(&out_path).unwrap();
-
- assert_eq!(
- article.title,
- Some(String::from(
- "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
- ))
- );
- assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-}
+use std::path::PathBuf;
#[tokio::test(flavor = "current_thread")]
async fn phoronix() {
let out_path = PathBuf::from(r"./test_output");
- let url = url::Url::parse(
- "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
- )
- .unwrap();
+ let url =
+ url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
+ .unwrap();
let grabber = ArticleScraper::new(None).await;
- let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+
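+    // rough timing of the full parse, printed for manual inspection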
+ let start = chrono::Utc::now();
+ let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+ let end = chrono::Utc::now();
+ let duration = end - start;
+ println!("duration: {}ms", duration.num_milliseconds());
article.save_html(&out_path).unwrap();
assert_eq!(
@@ -51,4 +37,4 @@ async fn youtube() {
article.html,
Some("".into())
);
-}
\ No newline at end of file
+}
diff --git a/src/util.rs b/src/util.rs
index e3670ba..f6519a7 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -23,4 +23,15 @@ impl Util {
pub fn split_values(values: &str) -> Vec<&str> {
values.split('|').map(|s| s.trim()).collect()
}
+
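+    /// Returns the site-specific rule if it is set, otherwise falls back to the global rule.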
+ pub fn select_rule<'a>(
+ site_specific_rule: Option<&'a str>,
+ global_rule: Option<&'a str>,
+ ) -> Option<&'a str> {
+ if site_specific_rule.is_some() {
+ site_specific_rule
+ } else {
+ global_rule
+ }
+ }
}