mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-07 16:15:32 +02:00

Commit 3a6a70ee64 (parent aa09666f4c): embedded config files

10 changed files with 335 additions and 286 deletions
.gitmodules | 3 (vendored, new file)

@@ -0,0 +1,3 @@
+[submodule "ftr-site-config"]
+	path = ftr-site-config
+	url = https://github.com/fivefilters/ftr-site-config.git
Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "article_scraper"
-version = "1.1.8-alpha.0"
+version = "2.0.0-alpha.0"
 authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
 edition = "2018"
 license = "GPL-3.0-or-later"
@@ -19,4 +19,4 @@ chrono = "0.4"
 base64 = "0.13"
 image = "0.24"
 log = "0.4"
-parking_lot = "0.12"
+rust-embed="6.4"
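
The dependency swap above is the heart of the commit: parking_lot goes away (the config collection is no longer kept behind an RwLock, see src/lib.rs below) and rust-embed comes in to compile the ftr-site-config submodule's files directly into the binary, which means the submodule must be checked out at build time. A minimal sketch of the rust-embed pattern used here; the struct and function names are illustrative, while iter(), get() and the EmbeddedFile data field are the crate's 6.x API:

    use rust_embed::RustEmbed;

    #[derive(RustEmbed)]
    #[folder = "ftr-site-config"] // resolved relative to Cargo.toml at build time
    struct Assets;

    fn list_embedded() {
        // iter() yields each embedded file's relative path;
        // get() returns an EmbeddedFile whose `data` is a Cow<'static, [u8]>.
        for path in Assets::iter() {
            if let Some(file) = Assets::get(&path) {
                println!("{}: {} bytes", path, file.data.len());
            }
        }
    }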
ftr-site-config | 1 (new submodule)

@@ -0,0 +1 @@
+Subproject commit 70a3a3ac28a5db57e47f25cd4573e604cfc90f80
src/config/config_collection.rs | 75 (new file)

@@ -0,0 +1,75 @@
+use rust_embed::RustEmbed;
+use std::{borrow::Borrow, collections::HashMap, path::Path};
+
+use super::ConfigEntry;
+use crate::util::Util;
+
+#[derive(RustEmbed)]
+#[folder = "ftr-site-config"]
+struct EmbededConfigFiles;
+
+pub struct ConfigCollection {
+    embedded_entries: HashMap<String, ConfigEntry>,
+    user_entries: HashMap<String, ConfigEntry>,
+}
+
+impl ConfigCollection {
+    pub async fn parse(directory: Option<&Path>) -> ConfigCollection {
+        let mut user_entries = HashMap::new();
+        let mut embedded_entries = HashMap::new();
+
+        for (file_name, entry) in EmbededConfigFiles::iter()
+            .filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
+        {
+            if let Ok(entry) = ConfigEntry::parse_data(entry.data).await {
+                let file_name: &str = file_name.borrow();
+                embedded_entries.insert(file_name.to_owned(), entry);
+            }
+        }
+
+        if let Some(directory) = directory {
+            // create data dir if it doesn't already exist
+            if let Err(error) = std::fs::DirBuilder::new()
+                .recursive(true)
+                .create(&directory)
+            {
+                log::warn!(
+                    "Failed to create user config directory {:?}: {}",
+                    directory,
+                    error
+                );
+            }
+
+            if let Ok(mut dir) = tokio::fs::read_dir(directory).await {
+                while let Ok(entry) = dir.next_entry().await {
+                    if let Some(entry) = entry {
+                        if Util::check_extension(&entry, "txt") {
+                            if let Ok(config) = ConfigEntry::parse_path(&entry.path()).await {
+                                let file_name = entry.file_name().to_string_lossy().into_owned();
+                                user_entries.insert(file_name, config);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Self {
+            embedded_entries,
+            user_entries,
+        }
+    }
+
+    pub fn get(&self, key: &str) -> Option<&ConfigEntry> {
+        if let Some(user_entry) = self.user_entries.get(key) {
+            Some(user_entry)
+        } else {
+            self.embedded_entries.get(key)
+        }
+    }
+
+    pub fn contains_config(&self, key: &str) -> bool {
+        self.user_entries.contains_key(key) || self.embedded_entries.contains_key(key)
+    }
+}
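
A short usage sketch of the lookup order this new type establishes: get() and contains_config() consult user_entries before embedded_entries, so a file dropped into the user directory shadows the embedded config of the same name. The helper name and directory path below are illustrative, not part of the commit:

    async fn demo() {
        // No user directory: only the entries embedded from ftr-site-config.
        let embedded_only = ConfigCollection::parse(None).await;
        if embedded_only.contains_config("youtube.com.txt") {
            println!("youtube.com.txt ships embedded");
        }

        // With a directory, every *.txt file in it is parsed into user_entries,
        // and get() prefers those over embedded entries of the same name.
        let dir = std::path::Path::new("./user-configs"); // illustrative path
        let collection = ConfigCollection::parse(Some(dir)).await;
        let _entry = collection.get("example.com.txt");
    }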
src/config/config_entry.rs | 163 (new file)

@@ -0,0 +1,163 @@
+use crate::util::Util;
+
+use super::error::{ConfigError, ConfigErrorKind};
+use failure::ResultExt;
+use log::warn;
+use std::borrow::Cow;
+use std::io::Cursor;
+use std::path::Path;
+use tokio::fs;
+use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
+
+#[derive(Clone)]
+pub struct Replace {
+    pub to_replace: String,
+    pub replace_with: String,
+}
+
+#[derive(Clone)]
+pub struct ConfigEntry {
+    pub xpath_title: Vec<String>,
+    pub xpath_author: Vec<String>,
+    pub xpath_date: Vec<String>,
+    pub xpath_body: Vec<String>,
+    pub xpath_strip: Vec<String>,
+    pub strip_id_or_class: Vec<String>,
+    pub strip_image_src: Vec<String>,
+    pub replace: Vec<Replace>,
+    pub single_page_link: Option<String>,
+    pub next_page_link: Option<String>,
+}
+
+impl ConfigEntry {
+    pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
+        let mut file = fs::File::open(&config_path)
+            .await
+            .context(ConfigErrorKind::IO)?;
+        let buffer = BufReader::new(&mut file);
+
+        Self::parse(buffer).await
+    }
+
+    pub async fn parse_data(data: Cow<'static, [u8]>) -> Result<ConfigEntry, ConfigError> {
+        let data = data.as_ref();
+        let mut cursor = Cursor::new(data);
+        let buffer = BufReader::new(&mut cursor);
+
+        Self::parse(buffer).await
+    }
+
+    async fn parse<R: AsyncRead + Unpin>(buffer: BufReader<R>) -> Result<ConfigEntry, ConfigError> {
+        let mut xpath_title: Vec<String> = Vec::new();
+        let mut xpath_author: Vec<String> = Vec::new();
+        let mut xpath_date: Vec<String> = Vec::new();
+        let mut xpath_body: Vec<String> = Vec::new();
+        let mut xpath_strip: Vec<String> = Vec::new();
+        let mut strip_id_or_class: Vec<String> = Vec::new();
+        let mut strip_image_src: Vec<String> = Vec::new();
+        let mut replace_vec: Vec<Replace> = Vec::new();
+        let mut next_page_link: Option<String> = None;
+        let mut single_page_link: Option<String> = None;
+
+        // ignore: tidy, prune, autodetect_on_failure and test_url
+        let title = "title:";
+        let body = "body:";
+        let date = "date:";
+        let author = "author:";
+        let strip = "strip:";
+        let strip_id = "strip_id_or_class:";
+        let strip_img = "strip_image_src:";
+        let single_page = "single_page_link:";
+        let next_page = "next_page_link:";
+        let find = "find_string:";
+        let replace = "replace_string:";
+        let replace_single = "replace_string(";
+
+        // ignore these
+        let tidy = "tidy:";
+        let prune = "prune:";
+        let test_url = "test_url:";
+        let autodetect = "autodetect_on_failure:";
+
+        let mut lines = buffer.lines();
+
+        while let Ok(Some(line)) = lines.next_line().await {
+            let line = line.trim();
+            if line.starts_with('#')
+                || line.starts_with(tidy)
+                || line.starts_with(prune)
+                || line.starts_with(test_url)
+                || line.starts_with(autodetect)
+                || line.is_empty()
+            {
+                continue;
+            }
+
+            extract_vec_multi!(line, title, xpath_title);
+            extract_vec_multi!(line, body, xpath_body);
+            extract_vec_multi!(line, date, xpath_date);
+            extract_vec_multi!(line, author, xpath_author);
+
+            extract_vec_single!(line, strip, xpath_strip);
+            extract_vec_single!(line, strip_id, strip_id_or_class);
+            extract_vec_single!(line, strip_img, strip_image_src);
+
+            extract_option_single!(line, single_page, single_page_link);
+            extract_option_single!(line, next_page, next_page_link);
+
+            if line.starts_with(replace_single) {
+                let value = Util::extract_value(replace_single, line);
+                let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
+                if value.len() != 2 {
+                    continue;
+                }
+
+                if let Some(to_replace) = value.get(0) {
+                    if let Some(replace_with) = value.get(1) {
+                        replace_vec.push(Replace {
+                            to_replace: (*to_replace).to_string(),
+                            replace_with: (*replace_with).to_string(),
+                        });
+                    }
+                }
+
+                continue;
+            }
+
+            if line.starts_with(find) {
+                let to_replace = Util::extract_value(find, line).into();
+
+                if let Ok(Some(next_line)) = lines.next_line().await {
+                    let replace_with = Util::extract_value(replace, &next_line).into();
+
+                    replace_vec.push(Replace {
+                        to_replace,
+                        replace_with,
+                    });
+                }
+
+                continue;
+            }
+        }
+
+        if xpath_body.is_empty() {
+            warn!("No body xpath found for");
+            return Err(ConfigErrorKind::BadConfig.into());
+        }
+
+        let config = ConfigEntry {
+            xpath_title,
+            xpath_author,
+            xpath_date,
+            xpath_body,
+            xpath_strip,
+            strip_id_or_class,
+            strip_image_src,
+            replace: replace_vec,
+            single_page_link,
+            next_page_link,
+        };
+
+        Ok(config)
+    }
+}
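
For illustration, a hedged sketch of feeding an in-memory, FiveFilters-style site config through the new parse_data entry point. The directive names are the prefixes matched above; the xpath values are made up, and the comment about '|' assumes extract_vec_multi! splits on it via Util::split_values:

    use std::borrow::Cow;

    const RAW: &[u8] = b"# comments and tidy:/prune:/test_url:/autodetect_on_failure: lines are skipped
    title: //h1[@class='headline']
    body: //article | //div[@id='content']
    strip_id_or_class: sidebar
    find_string: <br/><br/>
    replace_string: </p><p>
    replace_string(<noscript>): <div>
    next_page_link: //a[@rel='next']
    ";

    async fn demo() {
        match ConfigEntry::parse_data(Cow::Borrowed(RAW)).await {
            // assuming '|' separates alternative xpaths, xpath_body holds two here
            Ok(entry) => println!("parsed {} body xpath(s)", entry.xpath_body.len()),
            // parse() rejects any config without at least one body: xpath
            Err(_) => eprintln!("rejected: no body xpath"),
        }
    }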
src/config/mod.rs

@@ -1,181 +1,8 @@
-use crate::util::Util;
-
-use self::error::{ConfigError, ConfigErrorKind};
-use failure::ResultExt;
-use log::warn;
-use std::collections::HashMap;
-use tokio::fs;
-use tokio::io;
-use tokio::io::AsyncBufReadExt;
-use std::path::PathBuf;
-
 #[macro_use]
 mod macros;
+mod config_collection;
+mod config_entry;
 mod error;
 
-pub type ConfigCollection = HashMap<String, GrabberConfig>;
+pub use config_collection::ConfigCollection;
+pub use config_entry::ConfigEntry;
 
-#[derive(Clone)]
-pub struct Replace {
-    pub to_replace: String,
-    pub replace_with: String,
-}
-
-#[derive(Clone)]
-pub struct GrabberConfig {
-    pub xpath_title: Vec<String>,
-    pub xpath_author: Vec<String>,
-    pub xpath_date: Vec<String>,
-    pub xpath_body: Vec<String>,
-    pub xpath_strip: Vec<String>,
-    pub strip_id_or_class: Vec<String>,
-    pub strip_image_src: Vec<String>,
-    pub replace: Vec<Replace>,
-    pub single_page_link: Option<String>,
-    pub next_page_link: Option<String>,
-}
-
-impl GrabberConfig {
-    pub async fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
-        // create data dir if it doesn't already exist
-        std::fs::DirBuilder::new()
-            .recursive(true)
-            .create(&directory)
-            .context(ConfigErrorKind::IO)?;
-
-        let mut dir = tokio::fs::read_dir(directory).await.context(ConfigErrorKind::IO)?;
-        let mut collection = HashMap::new();
-
-        while let Ok(entry) = dir.next_entry().await {
-            if let Some(entry) = entry {
-                if Util::check_extension(&entry, "txt") {
-                    if let Ok(config) = GrabberConfig::new(entry.path()).await {
-                        let file_name = entry.file_name().to_string_lossy().into_owned();
-                        collection.insert(file_name, config);
-                    }
-                }
-            }
-        }
-
-        Ok(collection)
-    }
-
-    async fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
-        let mut file = fs::File::open(&config_path).await.context(ConfigErrorKind::IO)?;
-        let buffer = io::BufReader::new(&mut file);
-
-        let mut xpath_title: Vec<String> = Vec::new();
-        let mut xpath_author: Vec<String> = Vec::new();
-        let mut xpath_date: Vec<String> = Vec::new();
-        let mut xpath_body: Vec<String> = Vec::new();
-        let mut xpath_strip: Vec<String> = Vec::new();
-        let mut strip_id_or_class: Vec<String> = Vec::new();
-        let mut strip_image_src: Vec<String> = Vec::new();
-        let mut replace_vec: Vec<Replace> = Vec::new();
-        let mut next_page_link: Option<String> = None;
-        let mut single_page_link: Option<String> = None;
-
-        // ignore: tidy, prune, autodetect_on_failure and test_url
-        let title = "title:";
-        let body = "body:";
-        let date = "date:";
-        let author = "author:";
-        let strip = "strip:";
-        let strip_id = "strip_id_or_class:";
-        let strip_img = "strip_image_src:";
-        let single_page = "single_page_link:";
-        let next_page = "next_page_link:";
-        let find = "find_string:";
-        let replace = "replace_string:";
-        let replace_single = "replace_string(";
-
-        // ignore these
-        let tidy = "tidy:";
-        let prune = "prune:";
-        let test_url = "test_url:";
-        let autodetect = "autodetect_on_failure:";
-
-        let mut lines = buffer.lines();
-
-        while let Ok(Some(line)) = lines.next_line().await {
-            let line = line.trim();
-            if line.starts_with('#')
-                || line.starts_with(tidy)
-                || line.starts_with(prune)
-                || line.starts_with(test_url)
-                || line.starts_with(autodetect)
-                || line.is_empty()
-            {
-                continue;
-            }
-
-            extract_vec_multi!(line, title, xpath_title);
-            extract_vec_multi!(line, body, xpath_body);
-            extract_vec_multi!(line, date, xpath_date);
-            extract_vec_multi!(line, author, xpath_author);
-
-            extract_vec_single!(line, strip, xpath_strip);
-            extract_vec_single!(line, strip_id, strip_id_or_class);
-            extract_vec_single!(line, strip_img, strip_image_src);
-
-            extract_option_single!(line, single_page, single_page_link);
-            extract_option_single!(line, next_page, next_page_link);
-
-            if line.starts_with(replace_single) {
-                let value = Util::extract_value(replace_single, line);
-                let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
-                if value.len() != 2 {
-                    continue;
-                }
-
-                if let Some(to_replace) = value.get(0) {
-                    if let Some(replace_with) = value.get(1) {
-                        replace_vec.push(Replace {
-                            to_replace: (*to_replace).to_string(),
-                            replace_with: (*replace_with).to_string(),
-                        });
-                    }
-                }
-
-                continue;
-            }
-
-            if line.starts_with(find) {
-                let to_replace = Util::extract_value(find, line).into();
-
-                if let Ok(Some(ref next_line)) = lines.next_line().await {
-                    let replace_with = Util::extract_value(replace, &next_line).into();
-
-                    replace_vec.push(Replace {
-                        to_replace,
-                        replace_with,
-                    });
-                }
-
-                continue;
-            }
-        }
-
-        if xpath_body.is_empty() {
-            warn!("No body xpath found for {}", config_path.display());
-            return Err(ConfigErrorKind::BadConfig.into());
-        }
-
-        let config = GrabberConfig {
-            xpath_title,
-            xpath_author,
-            xpath_date,
-            xpath_body,
-            xpath_strip,
-            strip_id_or_class,
-            strip_image_src,
-            replace: replace_vec,
-            single_page_link,
-            next_page_link,
-        };
-
-        Ok(config)
-    }
-}
src/images/mod.rs

@@ -1,4 +1,3 @@
-use std::io::Cursor;
 use self::error::{ImageDownloadError, ImageDownloadErrorKind};
 use crate::ArticleScraper;
 use failure::ResultExt;
@@ -7,6 +6,7 @@ use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
 use reqwest::{Client, Response};
+use std::io::Cursor;
 
 mod error;
 
@@ -214,7 +214,10 @@ impl ImageDownloader {
             .context(ImageDownloadErrorKind::ImageScale)?;
 
         image
-            .write_to(&mut Cursor::new(&mut original_image), image::ImageOutputFormat::Png)
+            .write_to(
+                &mut Cursor::new(&mut original_image),
+                image::ImageOutputFormat::Png,
+            )
             .map_err(|err| {
                 error!("Failed to save resized image to resize");
                 err
@@ -230,7 +233,10 @@ impl ImageDownloader {
         );
         let mut resized_buf: Vec<u8> = Vec::new();
         image
-            .write_to(&mut Cursor::new(&mut resized_buf), image::ImageOutputFormat::Png)
+            .write_to(
+                &mut Cursor::new(&mut resized_buf),
+                image::ImageOutputFormat::Png,
+            )
            .map_err(|err| {
                 error!("Failed to save resized image to resize");
                 err
src/lib.rs | 126

@@ -2,12 +2,15 @@ mod article;
 mod config;
 mod error;
 pub mod images;
-mod youtube;
 mod util;
+mod youtube;
+
+#[cfg(test)]
+mod tests;
 
 use self::error::{ScraperError, ScraperErrorKind};
 use crate::article::Article;
-use crate::config::{ConfigCollection, GrabberConfig};
+use crate::config::{ConfigCollection, ConfigEntry};
 use crate::images::ImageDownloader;
 use chrono::DateTime;
 use encoding_rs::Encoding;
@@ -16,27 +19,18 @@ use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
-use parking_lot::RwLock;
 use reqwest::{Client, Response};
-use std::collections::HashMap;
-use std::path::PathBuf;
+use std::path::Path;
 use std::str::FromStr;
-use std::sync::Arc;
 
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
-    config_files: Arc<RwLock<Option<ConfigCollection>>>,
+    config_files: ConfigCollection,
 }
 
 impl ArticleScraper {
-    pub async fn new(config_path: PathBuf) -> Self {
-        let config_files = Arc::new(RwLock::new(None));
-
-        if let Ok(loaded_config_files) = GrabberConfig::parse_directory(&config_path).await {
-            config_files.write().replace(loaded_config_files);
-        } else {
-            config_files.write().replace(HashMap::new());
-        }
+    pub async fn new(config_path: Option<&Path>) -> Self {
+        let config_files = ConfigCollection::parse(config_path).await;
+
         ArticleScraper {
             image_downloader: ImageDownloader::new((2048, 2048)),
             config_files,
@@ -52,7 +46,7 @@
         info!("Scraping article: '{}'", url.as_str());
 
         // custom youtube handling, but prefer config if exists
-        if !self.grabber_config_exists("youtube.com")? {
+        if !self.config_files.contains_config("youtube.com.txt") {
             if let Some(article) = youtube::Youtube::handle(&url) {
                 return Ok(article);
             }
@@ -145,7 +139,7 @@
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &GrabberConfig,
+        config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
@@ -186,7 +180,7 @@
         Ok(())
     }
 
-    fn parse_html(html: String, config: &GrabberConfig) -> Result<Document, ScraperError> {
+    fn parse_html(html: String, config: &ConfigEntry) -> Result<Document, ScraperError> {
         // replace matches in raw html
 
         let mut html = html;
@@ -236,7 +230,7 @@
         article: &mut Article,
         url: &url::Url,
         root: &mut Node,
-        config: &GrabberConfig,
+        config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, client).await?;
@@ -346,29 +340,15 @@
         }
     }
 
-    fn get_grabber_config(&self, url: &url::Url) -> Result<GrabberConfig, ScraperError> {
+    fn get_grabber_config(&self, url: &url::Url) -> Result<ConfigEntry, ScraperError> {
         let config_name = Self::get_host_name(url)? + ".txt";
 
-        if let Some(config_files) = self.config_files.read().as_ref() {
-            match config_files.get(&config_name) {
-                Some(config) => Ok(config.clone()),
-                None => {
-                    error!("No config file of the name '{}' found", config_name);
-                    Err(ScraperErrorKind::Config.into())
-                }
-            }
-        } else {
-            error!("Config files have not been parsed yet.");
-            Err(ScraperErrorKind::Config.into())
-        }
-    }
-
-    fn grabber_config_exists(&self, host: &str) -> Result<bool, ScraperError> {
-        if let Some(config_files) = self.config_files.read().as_ref() {
-            Ok(config_files.contains_key(&(host.to_owned() + ".txt")))
-        } else {
-            error!("Config files have not been parsed yet.");
-            Err(ScraperErrorKind::Config.into())
-        }
+        match self.config_files.get(&config_name) {
+            Some(config) => Ok(config.clone()),
+            None => {
+                error!("No config file of the name '{}' found", config_name);
+                Err(ScraperErrorKind::Config.into())
+            }
+        }
     }
@@ -595,7 +575,7 @@
         Ok(url)
     }
 
-    fn strip_junk(context: &Context, config: &GrabberConfig, url: &url::Url) {
+    fn strip_junk(context: &Context, config: &ConfigEntry, url: &url::Url) {
         // strip specified xpath
         for xpath_strip in &config.xpath_strip {
             let _ = ArticleScraper::strip_node(&context, xpath_strip);
@@ -653,7 +633,7 @@
         let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
     }
 
-    fn extract_metadata(context: &Context, config: &GrabberConfig, article: &mut Article) {
+    fn extract_metadata(context: &Context, config: &ConfigEntry, article: &mut Article) {
         // try to get title
         for xpath_title in &config.xpath_title {
             if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
@@ -689,7 +669,7 @@
     fn extract_body(
         context: &Context,
         root: &mut Node,
-        config: &GrabberConfig,
+        config: &ConfigEntry,
     ) -> Result<(), ScraperError> {
         let mut found_something = false;
         for xpath_body in &config.xpath_body {
@@ -729,7 +709,7 @@
         Ok(found_something)
     }
 
-    fn check_for_next_page(&self, context: &Context, config: &GrabberConfig) -> Option<url::Url> {
+    fn check_for_next_page(&self, context: &Context, config: &ConfigEntry) -> Option<url::Url> {
         if let Some(next_page_xpath) = config.next_page_link.clone() {
             if let Ok(next_page_string) =
                 ArticleScraper::get_attribute(&context, &next_page_xpath, "href")
@@ -775,63 +755,3 @@
         Ok(())
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use crate::*;
-    use reqwest::Client;
-
-    #[tokio::test(flavor = "current_thread")]
-    async fn golem() {
-        let config_path = PathBuf::from(r"./resources/tests/golem");
-        let out_path = PathBuf::from(r"./test_output");
-        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-
-        let grabber = ArticleScraper::new(config_path).await;
-        let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
-        article.save_html(&out_path).unwrap();
-
-        assert_eq!(
-            article.title,
-            Some(String::from(
-                "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
-            ))
-        );
-        assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-    }
-
-    #[tokio::test(flavor = "current_thread")]
-    async fn phoronix() {
-        let config_path = PathBuf::from(r"./resources/tests/phoronix");
-        let out_path = PathBuf::from(r"./test_output");
-        let url = url::Url::parse(
-            "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
-        )
-        .unwrap();
-
-        let grabber = ArticleScraper::new(config_path).await;
-        let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
-        article.save_html(&out_path).unwrap();
-
-        assert_eq!(
-            article.title,
-            Some(String::from(
-                "Amazon EC2 Cloud Benchmarks Against Bare Metal Systems"
-            ))
-        );
-    }
-
-    #[tokio::test(flavor = "current_thread")]
-    async fn youtube() {
-        let config_path = PathBuf::from(r"./resources/tests/");
-        let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
-
-        let grabber = ArticleScraper::new(config_path).await;
-        let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
-
-        assert_eq!(
-            article.html,
-            Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
-        );
-    }
-}
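
To summarize the constructor change above: ArticleScraper::new now takes Option<&Path> instead of PathBuf, can no longer be observed in an unparsed state, and always has the embedded configs as a fallback. A minimal sketch of the new call sites; the helper name and directory path are illustrative:

    use article_scraper::ArticleScraper;
    use std::path::Path;

    async fn build_scrapers() {
        // Embedded ftr-site-config entries only:
        let _scraper = ArticleScraper::new(None).await;

        // Embedded entries plus user overrides read from a directory:
        let _with_user = ArticleScraper::new(Some(Path::new("./user-configs"))).await;
    }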
src/tests.rs | 54 (new file)

@@ -0,0 +1,54 @@
+use crate::*;
+use std::path::PathBuf;
+use reqwest::Client;
+
+#[tokio::test(flavor = "current_thread")]
+async fn golem() {
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+
+    let grabber = ArticleScraper::new(None).await;
+    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
+
+    assert_eq!(
+        article.title,
+        Some(String::from(
+            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
+        ))
+    );
+    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn phoronix() {
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse(
+        "http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1",
+    )
+    .unwrap();
+
+    let grabber = ArticleScraper::new(None).await;
+    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
+
+    assert_eq!(
+        article.title,
+        Some(String::from(
+            "Amazon EC2 Cloud Benchmarks Against Bare Metal Systems"
+        ))
+    );
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn youtube() {
+    let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
+
+    let grabber = ArticleScraper::new(None).await;
+    let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+
+    assert_eq!(
+        article.html,
+        Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
+    );
+}
src/util.rs

@@ -23,4 +23,4 @@ impl Util {
     pub fn split_values(values: &str) -> Vec<&str> {
         values.split('|').map(|s| s.trim()).collect()
     }
 }