mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)

commit c1ae011fcd (parent: 3a6a70ee64)

    use global rules

9 changed files with 209 additions and 150 deletions
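The change in brief: the per-site grabber config becomes optional (Option<&ConfigEntry>) and a mandatory global.txt config is consulted as a fallback throughout src/lib.rs; rule precedence is centralized in a new Util::select_rule helper; the golem.de and phoronix.com site configs (and the golem test) are dropped; and the "config must contain a body xpath" validation (BadConfig) is removed from config parsing, the missing-body check now happening at extraction time instead.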
@@ -1 +1 @@
-Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
+Subproject commit a6beb80d445b8d99542d8a2f9157cec69ea8b767
@@ -1,42 +0,0 @@
-# Author: zinnober
-# Rewrite of original template which fetched the printer-version without pictures
-
-tidy: no
-prune: no
-
-# Set full title
-title: //h1/span
-
-date: //time
-author: //a[@rel='author']
-
-# Content is here
-body: //article
-
-# Fetch full multipage articles
-next_page_link: //a[@id='atoc_next']
-
-# Remove tracking and ads
-strip_id_or_class: iqadtile4
-
-# General Cleanup
-strip_id_or_class: list-jtoc
-strip_id_or_class: table-jtoc
-strip_id_or_class: implied
-strip_id_or_class: social-
-strip_id_or_class: comments
-strip_id_or_class: footer
-strip_id_or_class: job-market
-strip_id_or_class: tags
-
-# Tidy up galleries (could still be improved, though)
-strip: //img[@src='']
-strip: //li[not(*)]
-strip: //div[contains(@style,'margin')]
-strip: //figure[contains(@id,'gvideo')]
-
-
-# Try yourself
-test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
-test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
-
@@ -1,9 +0,0 @@
-# based on the grabber rules of picofeed
-
-title: //article/header
-body: //div[@class="content"]
-test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1
-
-# replace_string(<h5>): <h2>
-
-next_page_link: //a[@title='Go To Next Page']
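(Judging by their test_url entries, these two deleted files are the golem.de and phoronix.com site configs; with this commit, sites without a specific config are handled by the shared rules in global.txt instead of failing outright.)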
@@ -15,17 +15,15 @@ pub struct ConfigCollection {

 impl ConfigCollection {
     pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
         let mut user_entries = HashMap::new();
         let mut embedded_entries = HashMap::new();

         for (file_name, entry) in EmbededConfigFiles::iter()
             .filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
         {
-            if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
+            let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
             let file_name: &str = file_name.borrow();
             embedded_entries.insert(file_name.to_owned(), entry);
-            }
         }

         if let Some(directory) = directory {
@@ -2,7 +2,6 @@ use crate::util::Util;

 use super::error::{ConfigError, ConfigErrorKind};
 use failure::ResultExt;
-use log::warn;
 use std::borrow::Cow;
 use std::io::Cursor;
 use std::path::Path;

@@ -140,11 +139,6 @@ impl ConfigEntry {
         }
     }

-    if xpath_body.is_empty() {
-        warn!("No body xpath found for");
-        return Err(ConfigErrorKind::BadConfig.into());
-    }
-
     let config = ConfigEntry {
         xpath_title,
         xpath_author,
@@ -10,8 +10,6 @@ pub struct ConfigError {
 pub enum ConfigErrorKind {
     #[fail(display = "IO Error")]
     IO,
-    #[fail(display = "Config does not contain body xpath")]
-    BadConfig,
     #[fail(display = "Unknown Error")]
     Unknown,
 }
src/lib.rs (243 changed lines)
@@ -22,6 +22,7 @@ use log::{debug, error, info, warn};
 use reqwest::{Client, Response};
 use std::path::Path;
 use std::str::FromStr;
+use util::Util;

 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
@@ -76,7 +77,11 @@ impl ArticleScraper {
         }

         // check if we have a config for the url
-        let config = self.get_grabber_config(&url)?;
+        let config = self.get_grabber_config(&url);
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or_else(|| ScraperErrorKind::Config)?;

         let mut article = Article {
             title: None,
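Note the error-handling change here: a missing per-site config no longer aborts the parse with `?`; only a missing global.txt is treated as a hard ScraperErrorKind::Config error, making global.txt the one config file the scraper cannot run without.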
@@ -94,7 +99,7 @@ impl ArticleScraper {

         ArticleScraper::generate_head(&mut root, &document)?;

-        self.parse_pages(&mut article, &url, &mut root, &config, client)
+        self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
             .await?;

         let context = Context::new(&document).map_err(|()| {
@@ -139,15 +144,20 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let mut document = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;

         // check for single page link
-        if let Some(xpath_single_page_link) = config.single_page_link.clone() {
+        let rule = Util::select_rule(
+            config.and_then(|c| c.single_page_link.as_deref()),
+            global_config.single_page_link.as_deref(),
+        );
+        if let Some(xpath_single_page_link) = rule {
             debug!(
                 "Single page link xpath specified in config '{}'",
                 xpath_single_page_link
@@ -159,32 +169,49 @@ impl ArticleScraper {
             let single_page_url =
                 url::Url::parse(&result).context(ScraperErrorKind::Url)?;
             return self
-                .parse_single_page(article, &single_page_url, root, config, client)
+                .parse_single_page(
+                    article,
+                    &single_page_url,
+                    root,
+                    config,
+                    global_config,
+                    client,
+                )
                 .await;
             }
         }
     }

-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;

-        while let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let html = ArticleScraper::download(&url, client).await?;
-            document = Self::parse_html(html, config)?;
+            document = Self::parse_html(html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
-            ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-            ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+            ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+            ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
         }

         Ok(())
     }

-    fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
+    fn parse_html(
+        html: String,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Result<Document, ScraperError> {
         // replace matches in raw html

         let mut html = html;
-        for replace in &config.replace {
+        if let Some(config) = config {
+            for replace in &config.replace {
+                html = html.replace(&replace.to_replace, &replace.replace_with);
+            }
+        }
+
+        for replace in &global_config.replace {
             html = html.replace(&replace.to_replace, &replace.replace_with);
         }
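Since the site config is now optional, parse_html applies both rule sets to the raw HTML: the site-specific replace rules first (when a config exists), then the global ones. A standalone illustration of that chaining; the rules here are invented, loosely echoing the commented-out replace_string(<h5>): <h2> rule from the deleted phoronix config:

fn main() {
    let mut html = String::from("<h5>Title</h5><div class=\"ad\"></div>");
    // Invented rules for illustration; real rules come from the config files.
    let site_rules = [("<h5>", "<h2>"), ("</h5>", "</h2>")];
    let global_rules = [("<div class=\"ad\"></div>", "")];
    // Site rules run first, then global rules, mirroring parse_html.
    for &(from, to) in site_rules.iter().chain(global_rules.iter()) {
        html = html.replace(from, to);
    }
    assert_eq!(html, "<h2>Title</h2>");
}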
@@ -230,15 +257,16 @@ impl ArticleScraper {
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
-        let document = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
-        ArticleScraper::extract_metadata(&xpath_ctx, config, article);
-        ArticleScraper::strip_junk(&xpath_ctx, config, &url);
-        ArticleScraper::extract_body(&xpath_ctx, root, config)?;
+        ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
+        ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
+        ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;

         Ok(())
     }
@@ -340,16 +368,17 @@ impl ArticleScraper {
         }
     }

-    fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
-        let config_name = Self::get_host_name(url)? + ".txt";
-
-        match self.config_files.get(&config_name) {
-            Some(config) => Ok(config.clone()),
-            None => {
-                error!("No config file of the name '{}' found", config_name);
-                Err(ScraperErrorKind::Config.into())
-            }
-        }
+    fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> {
+        let conf = Self::get_host_name(url)
+            .ok()
+            .map(|url| url + ".txt")
+            .and_then(|name| self.config_files.get(&name));
+
+        if conf.is_none() {
+            log::warn!("No config found for url '{}'", url);
+        }
+
+        conf
     }

     fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
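For reference, the lookup keys config files by host name. A minimal sketch of the name derivation, assuming get_host_name essentially returns the URL's host (its exact implementation is not part of this diff); it uses the url crate the code already depends on:

fn config_name(url: &url::Url) -> Option<String> {
    // Assumption: get_host_name boils down to the URL's host string.
    url.host_str().map(|host| format!("{}.txt", host))
}

fn main() {
    let url = url::Url::parse("http://www.phoronix.com/scan.php?page=article").unwrap();
    assert_eq!(config_name(&url).as_deref(), Some("www.phoronix.com.txt"));
}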
@@ -575,19 +604,45 @@ impl ArticleScraper {
         Ok(url)
     }

-    fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
+    fn strip_junk(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        url: &url::Url,
+    ) {
         // strip specified xpath
-        for xpath_strip in &config.xpath_strip {
+        if let Some(config) = config {
+            for xpath_strip in &config.xpath_strip {
+                let _ = ArticleScraper::strip_node(&context, xpath_strip);
+            }
+        }
+
+        for xpath_strip in &global_config.xpath_strip {
             let _ = ArticleScraper::strip_node(&context, xpath_strip);
         }

         // strip everything with specified 'id' or 'class'
-        for xpaht_strip_class in &config.strip_id_or_class {
+        if let Some(config) = config {
+            for xpaht_strip_class in &config.strip_id_or_class {
+                let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+            }
+        }
+
+        for xpaht_strip_class in &global_config.strip_id_or_class {
             let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
         }

         // strip any <img> element where @src attribute contains this substring
-        for xpath_strip_img_src in &config.strip_image_src {
+        if let Some(config) = config {
+            for xpath_strip_img_src in &config.strip_image_src {
+                let _ = ArticleScraper::strip_node(
+                    &context,
+                    &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
+                );
+            }
+        }
+
+        for xpath_strip_img_src in &global_config.strip_image_src {
             let _ = ArticleScraper::strip_node(
                 &context,
                 &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
@@ -620,9 +675,6 @@ impl ArticleScraper {
             &String::from("//*[contains(@style,'display:none')]"),
         );

-        // strip all scripts
-        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
-
         // strip all comments
         let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@@ -633,34 +685,79 @@ impl ArticleScraper {
         let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
     }

-    fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
+    fn extract_metadata(
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        article: &mut Article,
+    ) {
         // try to get title
-        for xpath_title in &config.xpath_title {
-            if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
-                debug!("Article title: '{}'", title);
-                article.title = Some(title);
-                break;
+        if let Some(config) = config {
+            for xpath_title in &config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_title in &global_config.xpath_title {
+                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                    debug!("Article title: '{}'", title);
+                    article.title = Some(title);
+                    break;
+                }
             }
         }

         // try to get the author
-        for xpath_author in &config.xpath_author {
-            if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
-                debug!("Article author: '{}'", author);
-                article.author = Some(author);
-                break;
+        if let Some(config) = config {
+            for xpath_author in &config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_author in &global_config.xpath_author {
+                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                    debug!("Article author: '{}'", author);
+                    article.author = Some(author);
+                    break;
+                }
             }
         }

         // try to get the date
-        for xpath_date in &config.xpath_date {
-            if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
-                debug!("Article date: '{}'", date_string);
-                if let Ok(date) = DateTime::from_str(&date_string) {
-                    article.date = Some(date);
-                    break;
-                } else {
-                    warn!("Parsing the date string '{}' failed", date_string);
+        if let Some(config) = config {
+            for xpath_date in &config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
+                }
+            }
+        }
+
+        if article.title.is_none() {
+            for xpath_date in &global_config.xpath_date {
+                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                    debug!("Article date: '{}'", date_string);
+                    if let Ok(date) = DateTime::from_str(&date_string) {
+                        article.date = Some(date);
+                        break;
+                    } else {
+                        warn!("Parsing the date string '{}' failed", date_string);
+                    }
                 }
             }
         }
     }
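One thing stands out in the new fallback blocks above: all three are guarded by article.title.is_none(), including the author and date blocks. Field-specific guards look like the intended condition; a minimal model of that (the Article struct here is a stand-in, not the real type):

#[derive(Default)]
struct Article {
    title: Option<String>,
    author: Option<String>,
    date: Option<String>,
}

// Presumably intended guards: fall back per field, not based on the title.
fn apply_global_fallbacks(article: &mut Article, author: Option<String>, date: Option<String>) {
    if article.author.is_none() {
        article.author = author;
    }
    if article.date.is_none() {
        article.date = date;
    }
}

fn main() {
    let mut article = Article {
        author: Some("site author".into()),
        ..Default::default()
    };
    apply_global_fallbacks(&mut article, Some("global author".into()), Some("2019-01-01".into()));
    assert_eq!(article.author.as_deref(), Some("site author"));
    assert_eq!(article.date.as_deref(), Some("2019-01-01"));
    let _ = article.title; // silence the dead-code warning for the unused field
}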
@@ -669,14 +766,25 @@ impl ArticleScraper {
     fn extract_body(
         context: &Context,
         root: &mut Node,
-        config: &ConfigEntry,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
     ) -> Result<(), ScraperError> {
         let mut found_something = false;
-        for xpath_body in &config.xpath_body {
-            found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+        if let Some(config) = config {
+            for xpath_body in &config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
         }

         if !found_something {
+            for xpath_body in &global_config.xpath_body {
+                found_something = ArticleScraper::extract_body_single(&context, root, xpath_body)?;
+            }
+        }
+
+        if !found_something {
+            log::error!("no body found");
             return Err(ScraperErrorKind::Scrape.into());
         }
@@ -709,10 +817,25 @@ impl ArticleScraper {
         Ok(found_something)
     }

-    fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
-        if let Some(next_page_xpath) = config.next_page_link.clone() {
+    fn check_for_next_page(
+        &self,
+        context: &Context,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Option<url::Url> {
+        if let Some(config) = config {
+            if let Some(next_page_xpath) = config.next_page_link.as_deref() {
+                if let Ok(next_page_string) =
+                    ArticleScraper::get_attribute(&context, next_page_xpath, "href")
+                {
+                    if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
+                        return Some(next_page_url);
+                    }
+                }
+            }
+        } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
             if let Ok(next_page_string) =
-                ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
+                ArticleScraper::get_attribute(&context, next_page_xpath, "href")
             {
                 if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                     return Some(next_page_url);
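A subtlety in this hunk: the global next_page_link is only consulted in the else branch, i.e. when no site config exists at all. A site config that merely lacks the rule does not fall back to the global one, unlike the Util::select_rule handling of single_page_link earlier. A self-contained model of the difference, with a made-up Cfg type:

struct Cfg {
    next_page_link: Option<String>,
}

// Per-rule fallback, as select_rule does for single_page_link.
fn merged<'a>(site: Option<&'a Cfg>, global: &'a Cfg) -> Option<&'a str> {
    site.and_then(|c| c.next_page_link.as_deref())
        .or(global.next_page_link.as_deref())
}

// The shape used by check_for_next_page: global applies only when no site config exists.
fn committed<'a>(site: Option<&'a Cfg>, global: &'a Cfg) -> Option<&'a str> {
    match site {
        Some(c) => c.next_page_link.as_deref(),
        None => global.next_page_link.as_deref(),
    }
}

fn main() {
    let site = Cfg { next_page_link: None };
    let global = Cfg { next_page_link: Some("//a[@class='next']".into()) };
    assert_eq!(merged(Some(&site), &global), Some("//a[@class='next']"));
    assert_eq!(committed(Some(&site), &global), None);
}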
src/tests.rs (36 changed lines)
@@ -1,35 +1,21 @@
 use crate::*;
-use std::path::PathBuf;
 use reqwest::Client;
+use std::path::PathBuf;

-#[tokio::test(flavor = "current_thread")]
-async fn golem() {
-    let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-
-    let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
-    article.save_html(&out_path).unwrap();
-
-    assert_eq!(
-        article.title,
-        Some(String::from(
-            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
-        ))
-    );
-    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-}
-
 #[tokio::test(flavor = "current_thread")]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
-    let url = url::Url::parse(
-        "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
-    )
-    .unwrap();
+    let url =
+        url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
+            .unwrap();

     let grabber = ArticleScraper::new(None).await;
-    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    let start = chrono::Utc::now();
+    let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+    let end = chrono::Utc::now();
+    let duration = end - start;
+    println!("duration: {}ms", duration.num_milliseconds());
     article.save_html(&out_path).unwrap();

     assert_eq!(
@@ -51,4 +37,4 @@ async fn youtube() {
         article.html,
         Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
     );
 }
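Since the phoronix test now measures and prints its parse duration, run it as cargo test phoronix -- --nocapture to see the output; the test harness suppresses println! by default.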
src/util.rs (11 changed lines)
@@ -23,4 +23,15 @@ impl Util {
     pub fn split_values(values: &str) -> Vec<&str> {
         values.split('|').map(|s| s.trim()).collect()
     }
+
+    pub fn select_rule<'a>(
+        site_specific_rule: Option<&'a str>,
+        global_rule: Option<&'a str>,
+    ) -> Option<&'a str> {
+        if site_specific_rule.is_some() {
+            site_specific_rule
+        } else {
+            global_rule
+        }
+    }
 }
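select_rule simply prefers the site-specific rule over the global one; it is equivalent to site_specific_rule.or(global_rule). A standalone usage sketch (the xpath strings are invented):

fn select_rule<'a>(
    site_specific_rule: Option<&'a str>,
    global_rule: Option<&'a str>,
) -> Option<&'a str> {
    // Same body as Util::select_rule.
    if site_specific_rule.is_some() {
        site_specific_rule
    } else {
        global_rule
    }
}

fn main() {
    assert_eq!(select_rule(Some("//article"), Some("//main")), Some("//article"));
    assert_eq!(select_rule(None, Some("//main")), Some("//main"));
    assert_eq!(select_rule(None, None), None);
}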