From aa09666f4c9bcc379df91bd6a2a5d12a81afa0f5 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 5 Oct 2022 06:53:05 +0200 Subject: [PATCH] async config loading --- Cargo.toml | 4 +-- src/config/macros.rs | 8 ++--- src/config/mod.rs | 78 ++++++++++++++++++-------------------------- src/lib.rs | 24 ++++++-------- src/util.rs | 26 +++++++++++++++ 5 files changed, 73 insertions(+), 67 deletions(-) create mode 100644 src/util.rs diff --git a/Cargo.toml b/Cargo.toml index a1d4634..086ee6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,9 +9,9 @@ repository = "https://gitlab.com/news-flash/article_scraper" [dependencies] failure = "0.1" -libxml = "0.2" +libxml = "0.3" reqwest = { version = "0.11", features = ["json", "native-tls"] } -tokio = { version = "1.21", features = ["macros"] } +tokio = { version = "1.21", features = ["macros", "fs", "io-util"] } url = "2.2" regex = "1.4" encoding_rs = "0.8" diff --git a/src/config/macros.rs b/src/config/macros.rs index 808a5e6..1fe309a 100644 --- a/src/config/macros.rs +++ b/src/config/macros.rs @@ -5,8 +5,8 @@ macro_rules! extract_vec_multi { $vector: ident ) => { if $line.starts_with($identifier) { - let value = GrabberConfig::extract_value($identifier, $line); - let value = GrabberConfig::split_values(value); + let value = Util::extract_value($identifier, $line); + let value = Util::split_values(value); let value: Vec = value.iter().map(|s| s.trim().to_string()).collect(); $vector.extend(value); continue; @@ -21,7 +21,7 @@ macro_rules! extract_vec_single { $vector: ident ) => { if $line.starts_with($identifier) { - let value = GrabberConfig::extract_value($identifier, $line); + let value = Util::extract_value($identifier, $line); $vector.push(value.to_string()); continue; } @@ -35,7 +35,7 @@ macro_rules! extract_option_single { $option: ident ) => { if $line.starts_with($identifier) { - let value = GrabberConfig::extract_value($identifier, $line); + let value = Util::extract_value($identifier, $line); $option = Some(value.to_string()); continue; } diff --git a/src/config/mod.rs b/src/config/mod.rs index 9323b6c..fdf5811 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1,10 +1,12 @@ +use crate::util::Util; + use self::error::{ConfigError, ConfigErrorKind}; use failure::ResultExt; use log::warn; use std::collections::HashMap; -use std::fs; -use std::io; -use std::io::BufRead; +use tokio::fs; +use tokio::io; +use tokio::io::AsyncBufReadExt; use std::path::PathBuf; #[macro_use] @@ -34,29 +36,22 @@ pub struct GrabberConfig { } impl GrabberConfig { - pub fn parse_directory(directory: &PathBuf) -> Result { + pub async fn parse_directory(directory: &PathBuf) -> Result { // create data dir if it doesn't already exist std::fs::DirBuilder::new() .recursive(true) .create(&directory) .context(ConfigErrorKind::IO)?; - let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?; + let mut dir = tokio::fs::read_dir(directory).await.context(ConfigErrorKind::IO)?; + let mut collection = HashMap::new(); - let mut collection: HashMap = HashMap::new(); - - for path in paths { - if let Ok(path) = path { - if let Some(extension) = path.path().extension() { - if let Some(extension) = extension.to_str() { - if extension == "txt" { - if let Ok(config) = GrabberConfig::new(path.path()) { - collection.insert( - path.file_name().to_string_lossy().into_owned(), - config, - ); - } - } + while let Ok(entry) = dir.next_entry().await { + if let Some(entry) = entry { + if Util::check_extension(&entry, "txt") { + if let Ok(config) = GrabberConfig::new(entry.path()).await { + let file_name = entry.file_name().to_string_lossy().into_owned(); + collection.insert(file_name, config); } } } @@ -65,9 +60,11 @@ impl GrabberConfig { Ok(collection) } - fn new(config_path: PathBuf) -> Result { - let file = fs::File::open(&config_path).context(ConfigErrorKind::IO)?; - let buffer = io::BufReader::new(&file); + + + async fn new(config_path: PathBuf) -> Result { + let mut file = fs::File::open(&config_path).await.context(ConfigErrorKind::IO)?; + let buffer = io::BufReader::new(&mut file); let mut xpath_title: Vec = Vec::new(); let mut xpath_author: Vec = Vec::new(); @@ -100,8 +97,9 @@ impl GrabberConfig { let test_url = "test_url:"; let autodetect = "autodetect_on_failure:"; - let mut iterator = buffer.lines().peekable(); - while let Some(Ok(line)) = iterator.next() { + let mut lines = buffer.lines(); + + while let Ok(Some(line)) = lines.next_line().await { let line = line.trim(); if line.starts_with('#') || line.starts_with(tidy) @@ -126,7 +124,7 @@ impl GrabberConfig { extract_option_single!(line, next_page, next_page_link); if line.starts_with(replace_single) { - let value = GrabberConfig::extract_value(replace_single, line); + let value = Util::extract_value(replace_single, line); let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect(); if value.len() != 2 { continue; @@ -145,18 +143,17 @@ impl GrabberConfig { } if line.starts_with(find) { - let value1 = GrabberConfig::extract_value(find, line); + let to_replace = Util::extract_value(find, line).into(); - if let Some(&Ok(ref next_line)) = iterator.peek() { - let value2 = GrabberConfig::extract_value(replace, &next_line); + if let Ok(Some(ref next_line)) = lines.next_line().await { + let replace_with = Util::extract_value(replace, &next_line).into(); - let r = Replace { - to_replace: value1.to_string(), - replace_with: value2.to_string(), - }; - - replace_vec.push(r); + replace_vec.push(Replace { + to_replace, + replace_with, + }); } + continue; } } @@ -181,17 +178,4 @@ impl GrabberConfig { Ok(config) } - - fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str { - let value = &line[identifier.len()..]; - let value = value.trim(); - match value.find('#') { - Some(pos) => &value[..pos], - None => value, - } - } - - fn split_values(values: &str) -> Vec<&str> { - values.split('|').map(|s| s.trim()).collect() - } } diff --git a/src/lib.rs b/src/lib.rs index f1057e2..e657393 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ mod config; mod error; pub mod images; mod youtube; +mod util; use self::error::{ScraperError, ScraperErrorKind}; use crate::article::Article; @@ -21,7 +22,6 @@ use std::collections::HashMap; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; -use std::thread; pub struct ArticleScraper { pub image_downloader: ImageDownloader, @@ -29,18 +29,14 @@ pub struct ArticleScraper { } impl ArticleScraper { - pub fn new(config_path: PathBuf) -> Self { + pub async fn new(config_path: PathBuf) -> Self { let config_files = Arc::new(RwLock::new(None)); - let locked_config_files = config_files.clone(); - thread::spawn(move || { - if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) { - locked_config_files.write().replace(config_files); - } else { - locked_config_files.write().replace(HashMap::new()); - } - }); - + if let Ok(loaded_config_files) = GrabberConfig::parse_directory(&config_path).await { + config_files.write().replace(loaded_config_files); + } else { + config_files.write().replace(HashMap::new()); + } ArticleScraper { image_downloader: ImageDownloader::new((2048, 2048)), config_files, @@ -791,7 +787,7 @@ mod tests { let out_path = PathBuf::from(r"./test_output"); let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap(); - let grabber = ArticleScraper::new(config_path); + let grabber = ArticleScraper::new(config_path).await; let article = grabber.parse(&url, true, &Client::new()).await.unwrap(); article.save_html(&out_path).unwrap(); @@ -813,7 +809,7 @@ mod tests { ) .unwrap(); - let grabber = ArticleScraper::new(config_path); + let grabber = ArticleScraper::new(config_path).await; let article = grabber.parse(&url, true, &Client::new()).await.unwrap(); article.save_html(&out_path).unwrap(); @@ -830,7 +826,7 @@ mod tests { let config_path = PathBuf::from(r"./resources/tests/"); let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap(); - let grabber = ArticleScraper::new(config_path); + let grabber = ArticleScraper::new(config_path).await; let article = grabber.parse(&url, false, &Client::new()).await.unwrap(); assert_eq!( diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..9f92137 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,26 @@ +use tokio::fs::DirEntry; + +pub struct Util; + +impl Util { + pub fn check_extension(path: &DirEntry, extension: &str) -> bool { + if let Some(ext) = path.path().extension() { + ext.to_str() == Some(extension) + } else { + false + } + } + + pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str { + let value = &line[identifier.len()..]; + let value = value.trim(); + match value.find('#') { + Some(pos) => &value[..pos], + None => value, + } + } + + pub fn split_values(values: &str) -> Vec<&str> { + values.split('|').map(|s| s.trim()).collect() + } +} \ No newline at end of file