1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
article_scraper/src/full_text_parser/config/config_entry.rs
2022-12-01 09:22:08 +01:00

183 lines
5.9 KiB
Rust

use crate::util::Util;
use super::error::ConfigError;
use std::borrow::Cow;
use std::io::Cursor;
use std::path::Path;
use tokio::fs;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
/// A literal search-and-replace rule read from a site config.
///
/// The parser in this file creates one of these from either a
/// `find_string:` / `replace_string:` pair of lines or from a single
/// `replace_string(<to_replace>): <replace_with>` line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Replace {
    /// The text to look for.
    pub to_replace: String,
    /// The text to put in its place.
    pub replace_with: String,
}
/// A name/value pair read from an `http_header(<name>): <value>` config line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Header {
    /// The part between the parentheses of the directive.
    pub name: String,
    /// The part after the `): ` separator.
    pub value: String,
}
/// The rules collected from one site config file.
///
/// Each field holds the values of one directive of the line-based config format
/// parsed by [`ConfigEntry::parse_path`] and [`ConfigEntry::parse_data`].
/// The [`Default`] value is an entry without any rules, which is also what parsing
/// an empty config produces.
#[derive(Clone, Default)]
pub struct ConfigEntry {
    /// Values read from `title:` lines.
    pub xpath_title: Vec<String>,
    /// Values read from `author:` lines.
    pub xpath_author: Vec<String>,
    /// Values read from `date:` lines.
    pub xpath_date: Vec<String>,
    /// Values read from `body:` lines.
    pub xpath_body: Vec<String>,
    /// Values read from `strip:` lines.
    pub xpath_strip: Vec<String>,
    /// Values read from `strip_id_or_class:` lines.
    pub strip_id_or_class: Vec<String>,
    /// Values read from `strip_image_src:` lines.
    pub strip_image_src: Vec<String>,
    /// Search-and-replace rules read from `find_string:` / `replace_string:` pairs
    /// and from `replace_string(..): ..` lines.
    pub replace: Vec<Replace>,
    /// Name/value pairs read from `http_header(..): ..` lines.
    pub header: Vec<Header>,
    /// Value of the `single_page_link:` directive; `None` when the config has none.
    pub single_page_link: Option<String>,
    /// Value of the `next_page_link:` directive; `None` when the config has none.
    pub next_page_link: Option<String>,
}
impl ConfigEntry {
    /// Opens the file at `config_path` and parses it into a [`ConfigEntry`].
    ///
    /// # Errors
    ///
    /// Returns a [`ConfigError`] if the file cannot be opened or if reading a line
    /// from it fails.
    pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
        let file = fs::File::open(config_path).await?;
        Self::parse(BufReader::new(file)).await
    }

    /// Parses an in-memory config (the raw bytes of a config file) into a
    /// [`ConfigEntry`].
    ///
    /// # Errors
    ///
    /// Returns a [`ConfigError`] if reading a line from `data` fails.
    pub async fn parse_data(data: Cow<'static, [u8]>) -> Result<ConfigEntry, ConfigError> {
        let bytes: &[u8] = data.as_ref();
        Self::parse(BufReader::new(Cursor::new(bytes))).await
    }

    /// Line-by-line parser shared by [`Self::parse_path`] and [`Self::parse_data`].
    ///
    /// Every line is trimmed first. Blank lines, lines starting with `#` and the
    /// directives `tidy:`, `prune:`, `test_url:` and `autodetect_on_failure:` are
    /// skipped. Lines that match none of the known directives are ignored.
    ///
    /// A `find_string:` line is paired with the next non-skipped line if that line
    /// is a `replace_string:` directive. A `find_string:` that is followed by any
    /// other directive is dropped, and that other directive is parsed normally.
    /// A `replace_string:` line without a preceding `find_string:` is ignored.
    ///
    /// # Errors
    ///
    /// Returns a [`ConfigError`] as soon as reading a line fails, instead of
    /// silently returning whatever had been parsed up to that point.
    async fn parse<R: AsyncRead + Unpin>(buffer: BufReader<R>) -> Result<ConfigEntry, ConfigError> {
        let mut xpath_title: Vec<String> = Vec::new();
        let mut xpath_author: Vec<String> = Vec::new();
        let mut xpath_date: Vec<String> = Vec::new();
        let mut xpath_body: Vec<String> = Vec::new();
        let mut xpath_strip: Vec<String> = Vec::new();
        let mut strip_id_or_class: Vec<String> = Vec::new();
        let mut strip_image_src: Vec<String> = Vec::new();
        let mut replace_vec: Vec<Replace> = Vec::new();
        let mut header_vec: Vec<Header> = Vec::new();
        let mut next_page_link: Option<String> = None;
        let mut single_page_link: Option<String> = None;

        // Directive prefixes that are parsed.
        let title = "title:";
        let body = "body:";
        let date = "date:";
        let author = "author:";
        let strip = "strip:";
        let strip_id = "strip_id_or_class:";
        let strip_img = "strip_image_src:";
        let single_page = "single_page_link:";
        let next_page = "next_page_link:";
        let find = "find_string:";
        let replace = "replace_string:";
        let replace_single = "replace_string(";
        let http_header = "http_header(";

        // Directive prefixes that are recognised but deliberately ignored.
        let tidy = "tidy:";
        let prune = "prune:";
        let test_url = "test_url:";
        let autodetect = "autodetect_on_failure:";

        // Value of the most recent `find_string:` line that is still waiting for
        // its `replace_string:` partner.
        let mut pending_find: Option<String> = None;

        let mut lines = buffer.lines();
        while let Some(line) = lines.next_line().await? {
            let line = line.trim();

            if line.is_empty()
                || line.starts_with('#')
                || line.starts_with(tidy)
                || line.starts_with(prune)
                || line.starts_with(test_url)
                || line.starts_with(autodetect)
            {
                continue;
            }

            // Complete an open `find_string:` only with a real `replace_string:`
            // line. Anything else discards the dangling find and is then handled
            // as the directive it actually is.
            if let Some(to_replace) = pending_find.take() {
                if line.starts_with(replace) {
                    replace_vec.push(Replace {
                        to_replace,
                        replace_with: Util::str_extract_value(replace, line).into(),
                    });
                    continue;
                }
            }

            // NOTE(review): these macros are defined outside this file; the code
            // relies on them moving on to the next line once they have handled a
            // match — confirm against their definition.
            extract_vec_multi!(line, title, xpath_title);
            extract_vec_multi!(line, body, xpath_body);
            extract_vec_multi!(line, date, xpath_date);
            extract_vec_multi!(line, author, xpath_author);
            extract_vec_single!(line, strip, xpath_strip);
            extract_vec_single!(line, strip_id, strip_id_or_class);
            extract_vec_single!(line, strip_img, strip_image_src);
            extract_option_single!(line, single_page, single_page_link);
            extract_option_single!(line, next_page, next_page_link);

            if line.starts_with(replace_single) {
                if let Some((to_replace, replace_with)) =
                    Self::parse_parenthesized(replace_single, line)
                {
                    replace_vec.push(Replace {
                        to_replace,
                        replace_with,
                    });
                }
                continue;
            }

            if line.starts_with(http_header) {
                if let Some((name, value)) = Self::parse_parenthesized(http_header, line) {
                    header_vec.push(Header { name, value });
                }
                continue;
            }

            if line.starts_with(find) {
                pending_find = Some(Util::str_extract_value(find, line).into());
                continue;
            }
        }

        Ok(ConfigEntry {
            xpath_title,
            xpath_author,
            xpath_date,
            xpath_body,
            xpath_strip,
            strip_id_or_class,
            strip_image_src,
            replace: replace_vec,
            header: header_vec,
            single_page_link,
            next_page_link,
        })
    }

    /// Splits the value of a `<identifier><first>): <second>` line into its two
    /// trimmed halves.
    ///
    /// Returns `None` unless splitting on `"): "` yields exactly two parts.
    fn parse_parenthesized(identifier: &str, line: &str) -> Option<(String, String)> {
        let value = Util::str_extract_value(identifier, line);
        let mut parts = value.split("): ").map(str::trim);
        match (parts.next(), parts.next(), parts.next()) {
            (Some(first), Some(second), None) => Some((first.to_string(), second.to_string())),
            _ => None,
        }
    }
}