1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

async config loading

This commit is contained in:
Jan Lukas Gernert 2022-10-05 06:53:05 +02:00
parent 9fb772bfa8
commit aa09666f4c
5 changed files with 73 additions and 67 deletions

View file

@ -9,9 +9,9 @@ repository = "https://gitlab.com/news-flash/article_scraper"
[dependencies]
failure = "0.1"
libxml = "0.2"
libxml = "0.3"
reqwest = { version = "0.11", features = ["json", "native-tls"] }
tokio = { version = "1.21", features = ["macros"] }
tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
url = "2.2"
regex = "1.4"
encoding_rs = "0.8"

View file

@ -5,8 +5,8 @@ macro_rules! extract_vec_multi {
$vector: ident
) => {
if $line.starts_with($identifier) {
let value = GrabberConfig::extract_value($identifier, $line);
let value = GrabberConfig::split_values(value);
let value = Util::extract_value($identifier, $line);
let value = Util::split_values(value);
let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
$vector.extend(value);
continue;
@ -21,7 +21,7 @@ macro_rules! extract_vec_single {
$vector: ident
) => {
if $line.starts_with($identifier) {
let value = GrabberConfig::extract_value($identifier, $line);
let value = Util::extract_value($identifier, $line);
$vector.push(value.to_string());
continue;
}
@ -35,7 +35,7 @@ macro_rules! extract_option_single {
$option: ident
) => {
if $line.starts_with($identifier) {
let value = GrabberConfig::extract_value($identifier, $line);
let value = Util::extract_value($identifier, $line);
$option = Some(value.to_string());
continue;
}

View file

@ -1,10 +1,12 @@
use crate::util::Util;
use self::error::{ConfigError, ConfigErrorKind};
use failure::ResultExt;
use log::warn;
use std::collections::HashMap;
use std::fs;
use std::io;
use std::io::BufRead;
use tokio::fs;
use tokio::io;
use tokio::io::AsyncBufReadExt;
use std::path::PathBuf;
#[macro_use]
@ -34,29 +36,22 @@ pub struct GrabberConfig {
}
impl GrabberConfig {
pub fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
pub async fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
// create data dir if it doesn't already exist
std::fs::DirBuilder::new()
.recursive(true)
.create(&directory)
.context(ConfigErrorKind::IO)?;
let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
let mut dir = tokio::fs::read_dir(directory).await.context(ConfigErrorKind::IO)?;
let mut collection = HashMap::new();
let mut collection: HashMap<String, GrabberConfig> = HashMap::new();
for path in paths {
if let Ok(path) = path {
if let Some(extension) = path.path().extension() {
if let Some(extension) = extension.to_str() {
if extension == "txt" {
if let Ok(config) = GrabberConfig::new(path.path()) {
collection.insert(
path.file_name().to_string_lossy().into_owned(),
config,
);
}
}
while let Ok(entry) = dir.next_entry().await {
if let Some(entry) = entry {
if Util::check_extension(&entry, "txt") {
if let Ok(config) = GrabberConfig::new(entry.path()).await {
let file_name = entry.file_name().to_string_lossy().into_owned();
collection.insert(file_name, config);
}
}
}
@ -65,9 +60,11 @@ impl GrabberConfig {
Ok(collection)
}
fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
let file = fs::File::open(&config_path).context(ConfigErrorKind::IO)?;
let buffer = io::BufReader::new(&file);
async fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
let mut file = fs::File::open(&config_path).await.context(ConfigErrorKind::IO)?;
let buffer = io::BufReader::new(&mut file);
let mut xpath_title: Vec<String> = Vec::new();
let mut xpath_author: Vec<String> = Vec::new();
@ -100,8 +97,9 @@ impl GrabberConfig {
let test_url = "test_url:";
let autodetect = "autodetect_on_failure:";
let mut iterator = buffer.lines().peekable();
while let Some(Ok(line)) = iterator.next() {
let mut lines = buffer.lines();
while let Ok(Some(line)) = lines.next_line().await {
let line = line.trim();
if line.starts_with('#')
|| line.starts_with(tidy)
@ -126,7 +124,7 @@ impl GrabberConfig {
extract_option_single!(line, next_page, next_page_link);
if line.starts_with(replace_single) {
let value = GrabberConfig::extract_value(replace_single, line);
let value = Util::extract_value(replace_single, line);
let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
if value.len() != 2 {
continue;
@ -145,18 +143,17 @@ impl GrabberConfig {
}
if line.starts_with(find) {
let value1 = GrabberConfig::extract_value(find, line);
let to_replace = Util::extract_value(find, line).into();
if let Some(&Ok(ref next_line)) = iterator.peek() {
let value2 = GrabberConfig::extract_value(replace, &next_line);
if let Ok(Some(ref next_line)) = lines.next_line().await {
let replace_with = Util::extract_value(replace, &next_line).into();
let r = Replace {
to_replace: value1.to_string(),
replace_with: value2.to_string(),
};
replace_vec.push(r);
replace_vec.push(Replace {
to_replace,
replace_with,
});
}
continue;
}
}
@ -181,17 +178,4 @@ impl GrabberConfig {
Ok(config)
}
fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
let value = &line[identifier.len()..];
let value = value.trim();
match value.find('#') {
Some(pos) => &value[..pos],
None => value,
}
}
fn split_values(values: &str) -> Vec<&str> {
values.split('|').map(|s| s.trim()).collect()
}
}

View file

@ -3,6 +3,7 @@ mod config;
mod error;
pub mod images;
mod youtube;
mod util;
use self::error::{ScraperError, ScraperErrorKind};
use crate::article::Article;
@ -21,7 +22,6 @@ use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::thread;
pub struct ArticleScraper {
pub image_downloader: ImageDownloader,
@ -29,18 +29,14 @@ pub struct ArticleScraper {
}
impl ArticleScraper {
pub fn new(config_path: PathBuf) -> Self {
pub async fn new(config_path: PathBuf) -> Self {
let config_files = Arc::new(RwLock::new(None));
let locked_config_files = config_files.clone();
thread::spawn(move || {
if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
locked_config_files.write().replace(config_files);
} else {
locked_config_files.write().replace(HashMap::new());
}
});
if let Ok(loaded_config_files) = GrabberConfig::parse_directory(&config_path).await {
config_files.write().replace(loaded_config_files);
} else {
config_files.write().replace(HashMap::new());
}
ArticleScraper {
image_downloader: ImageDownloader::new((2048, 2048)),
config_files,
@ -791,7 +787,7 @@ mod tests {
let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
let grabber = ArticleScraper::new(config_path);
let grabber = ArticleScraper::new(config_path).await;
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
article.save_html(&out_path).unwrap();
@ -813,7 +809,7 @@ mod tests {
)
.unwrap();
let grabber = ArticleScraper::new(config_path);
let grabber = ArticleScraper::new(config_path).await;
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
article.save_html(&out_path).unwrap();
@ -830,7 +826,7 @@ mod tests {
let config_path = PathBuf::from(r"./resources/tests/");
let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
let grabber = ArticleScraper::new(config_path);
let grabber = ArticleScraper::new(config_path).await;
let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
assert_eq!(

26
src/util.rs Normal file
View file

@ -0,0 +1,26 @@
use tokio::fs::DirEntry;
pub struct Util;
impl Util {
/// Reports whether the directory entry's path carries exactly the given
/// file extension.
///
/// `extension` is compared without a leading dot. An entry with no
/// extension, or whose extension is not valid UTF-8, yields `false`.
pub fn check_extension(path: &DirEntry, extension: &str) -> bool {
    let entry_path = path.path();
    entry_path
        .extension()
        .and_then(|found| found.to_str())
        .map_or(false, |found| found == extension)
}
/// Extracts the value that follows `identifier` on a config `line`.
///
/// The first `identifier.len()` bytes of `line` are skipped (callers are
/// expected to have checked the prefix already), surrounding whitespace is
/// trimmed, and everything from the first `#` onwards is discarded as an
/// inline comment.
///
/// Returns an empty string when `line` is too short to contain a value, so
/// this function never panics on malformed input.
pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
    // `get` instead of direct slicing: a line shorter than the identifier, or
    // an offset that lands inside a multi-byte character, must not panic.
    let value = line.get(identifier.len()..).unwrap_or("").trim();
    match value.find('#') {
        // Drop the comment and the whitespace that separated it from the value.
        Some(pos) => value[..pos].trim_end(),
        None => value,
    }
}
/// Breaks a `|`-separated list into its parts, trimming whitespace around
/// each part. Empty parts are kept.
pub fn split_values(values: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    for piece in values.split('|') {
        parts.push(piece.trim());
    }
    parts
}
}