1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

initial commit

This commit is contained in:
Jan Lukas Gernert 2018-07-31 16:10:09 +02:00
commit 4b2e6a24eb
17 changed files with 2423 additions and 0 deletions

188
src/config/mod.rs Normal file
View file

@ -0,0 +1,188 @@
use std::collections;
use std::path::{PathBuf};
use std::fs;
use std::io;
use std::io::BufRead;
use failure::ResultExt;
use self::error::{ConfigError, ConfigErrorKind};
#[macro_use]
mod macros;
mod error;
pub type ConfigCollection = collections::HashMap<String, GrabberConfig>;
pub struct Replace {
pub to_replace: String,
pub replace_with: String,
}
pub struct GrabberConfig {
pub xpath_title: Vec<String>,
pub xpath_author: Vec<String>,
pub xpath_date: Vec<String>,
pub xpath_body: Vec<String>,
pub xpath_strip: Vec<String>,
pub strip_id_or_class: Vec<String>,
pub strip_image_src: Vec<String>,
pub replace: Vec<Replace>,
pub single_page_link: Option<String>,
pub next_page_link: Option<String>,
}
impl GrabberConfig {
pub fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
let mut collection: collections::HashMap<String, GrabberConfig> = collections::HashMap::new();
for path in paths {
if let Ok(path) = path {
if let Some(extension) = path.path().extension() {
if let Some(extension) = extension.to_str() {
if extension == "txt" {
if let Ok(config) = GrabberConfig::new(path.path()) {
collection.insert(path.file_name().to_string_lossy().into_owned(), config);
}
}
}
}
}
}
Ok(collection)
}
fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
let file = fs::File::open(&config_path).context(ConfigErrorKind::IO)?;
let buffer = io::BufReader::new(&file);
let mut xpath_title: Vec<String> = Vec::new();
let mut xpath_author: Vec<String> = Vec::new();
let mut xpath_date: Vec<String> = Vec::new();
let mut xpath_body: Vec<String> = Vec::new();
let mut xpath_strip: Vec<String> = Vec::new();
let mut strip_id_or_class: Vec<String> = Vec::new();
let mut strip_image_src: Vec<String> = Vec::new();
let mut replace_vec: Vec<Replace> = Vec::new();
let mut next_page_link: Option<String> = None;
let mut single_page_link: Option<String> = None;
// ignore: tidy, prune, autodetect_on_failure and test_url
let title = "title:";
let body = "body:";
let date = "date:";
let author = "author:";
let strip = "strip:";
let strip_id = "strip_id_or_class:";
let strip_img = "strip_image_src:";
let single_page = "single_page_link:";
let next_page = "next_page_link:";
let find = "find_string:";
let replace = "replace_string:";
let replace_single = "replace_string(";
// ignore these
let tidy = "tidy:";
let prune = "prune:";
let test_url = "test_url:";
let autodetect = "autodetect_on_failure:";
let mut iterator = buffer.lines().peekable();
while let Some(Ok(line)) = iterator.next() {
let line = line.trim();
if line.starts_with("#")
|| line.starts_with(tidy)
|| line.starts_with(prune)
|| line.starts_with(test_url)
|| line.starts_with(autodetect)
|| line.is_empty() {
continue;
}
extract_vec_multi!(line, title, xpath_title);
extract_vec_multi!(line, body, xpath_body);
extract_vec_multi!(line, date, xpath_date);
extract_vec_multi!(line, author, xpath_author);
extract_vec_single!(line, strip, xpath_strip);
extract_vec_single!(line, strip_id, strip_id_or_class);
extract_vec_single!(line, strip_img, strip_image_src);
extract_option_single!(line, single_page, single_page_link);
extract_option_single!(line, next_page, next_page_link);
if line.starts_with(replace_single) {
let value = GrabberConfig::extract_value(replace_single, line);
let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
if value.len() != 2{
continue;
}
if let Some(to_replace) = value.get(0) {
if let Some(replace_with) = value.get(1) {
replace_vec.push(
Replace {
to_replace: to_replace.to_string(),
replace_with: replace_with.to_string(),
}
);
}
}
continue;
}
if line.starts_with(find) {
let value1 = GrabberConfig::extract_value(find, line);
if let Some(&Ok(ref next_line)) = iterator.peek() {
let value2 = GrabberConfig::extract_value(replace, &next_line);
let r = Replace {
to_replace: value1.to_string(),
replace_with: value2.to_string(),
};
replace_vec.push(r);
}
continue;
}
}
if xpath_body.len() == 0 {
error!("No body xpath found for {}", config_path.display());
Err(ConfigErrorKind::BadConfig)?
}
let config = GrabberConfig {
xpath_title: xpath_title,
xpath_author: xpath_author,
xpath_date: xpath_date,
xpath_body: xpath_body,
xpath_strip: xpath_strip,
strip_id_or_class: strip_id_or_class,
strip_image_src: strip_image_src,
replace: replace_vec,
single_page_link: single_page_link,
next_page_link: next_page_link,
};
Ok(config)
}
fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
let value = &line[identifier.len()..];
let value = value.trim();
match value.find('#') {
Some(pos) => &value[..pos],
None => value,
}
}
fn split_values<'a>(values: &'a str) -> Vec<&'a str> {
values.split('|').map(|s| s.trim()).collect()
}
}