mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
async config loading
This commit is contained in:
parent
9fb772bfa8
commit
aa09666f4c
5 changed files with 73 additions and 67 deletions
|
@ -9,9 +9,9 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
|||
|
||||
[dependencies]
|
||||
failure = "0.1"
|
||||
libxml = "0.2"
|
||||
libxml = "0.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls"] }
|
||||
tokio = { version = "1.21", features = ["macros"] }
|
||||
tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
|
||||
url = "2.2"
|
||||
regex = "1.4"
|
||||
encoding_rs = "0.8"
|
||||
|
|
|
@ -5,8 +5,8 @@ macro_rules! extract_vec_multi {
|
|||
$vector: ident
|
||||
) => {
|
||||
if $line.starts_with($identifier) {
|
||||
let value = GrabberConfig::extract_value($identifier, $line);
|
||||
let value = GrabberConfig::split_values(value);
|
||||
let value = Util::extract_value($identifier, $line);
|
||||
let value = Util::split_values(value);
|
||||
let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
|
||||
$vector.extend(value);
|
||||
continue;
|
||||
|
@ -21,7 +21,7 @@ macro_rules! extract_vec_single {
|
|||
$vector: ident
|
||||
) => {
|
||||
if $line.starts_with($identifier) {
|
||||
let value = GrabberConfig::extract_value($identifier, $line);
|
||||
let value = Util::extract_value($identifier, $line);
|
||||
$vector.push(value.to_string());
|
||||
continue;
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ macro_rules! extract_option_single {
|
|||
$option: ident
|
||||
) => {
|
||||
if $line.starts_with($identifier) {
|
||||
let value = GrabberConfig::extract_value($identifier, $line);
|
||||
let value = Util::extract_value($identifier, $line);
|
||||
$option = Some(value.to_string());
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
use crate::util::Util;
|
||||
|
||||
use self::error::{ConfigError, ConfigErrorKind};
|
||||
use failure::ResultExt;
|
||||
use log::warn;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use tokio::fs;
|
||||
use tokio::io;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[macro_use]
|
||||
|
@ -34,29 +36,22 @@ pub struct GrabberConfig {
|
|||
}
|
||||
|
||||
impl GrabberConfig {
|
||||
pub fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
|
||||
pub async fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
|
||||
// create data dir if it doesn't already exist
|
||||
std::fs::DirBuilder::new()
|
||||
.recursive(true)
|
||||
.create(&directory)
|
||||
.context(ConfigErrorKind::IO)?;
|
||||
|
||||
let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
|
||||
let mut dir = tokio::fs::read_dir(directory).await.context(ConfigErrorKind::IO)?;
|
||||
let mut collection = HashMap::new();
|
||||
|
||||
let mut collection: HashMap<String, GrabberConfig> = HashMap::new();
|
||||
|
||||
for path in paths {
|
||||
if let Ok(path) = path {
|
||||
if let Some(extension) = path.path().extension() {
|
||||
if let Some(extension) = extension.to_str() {
|
||||
if extension == "txt" {
|
||||
if let Ok(config) = GrabberConfig::new(path.path()) {
|
||||
collection.insert(
|
||||
path.file_name().to_string_lossy().into_owned(),
|
||||
config,
|
||||
);
|
||||
}
|
||||
}
|
||||
while let Ok(entry) = dir.next_entry().await {
|
||||
if let Some(entry) = entry {
|
||||
if Util::check_extension(&entry, "txt") {
|
||||
if let Ok(config) = GrabberConfig::new(entry.path()).await {
|
||||
let file_name = entry.file_name().to_string_lossy().into_owned();
|
||||
collection.insert(file_name, config);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -65,9 +60,11 @@ impl GrabberConfig {
|
|||
Ok(collection)
|
||||
}
|
||||
|
||||
fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
|
||||
let file = fs::File::open(&config_path).context(ConfigErrorKind::IO)?;
|
||||
let buffer = io::BufReader::new(&file);
|
||||
|
||||
|
||||
async fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
|
||||
let mut file = fs::File::open(&config_path).await.context(ConfigErrorKind::IO)?;
|
||||
let buffer = io::BufReader::new(&mut file);
|
||||
|
||||
let mut xpath_title: Vec<String> = Vec::new();
|
||||
let mut xpath_author: Vec<String> = Vec::new();
|
||||
|
@ -100,8 +97,9 @@ impl GrabberConfig {
|
|||
let test_url = "test_url:";
|
||||
let autodetect = "autodetect_on_failure:";
|
||||
|
||||
let mut iterator = buffer.lines().peekable();
|
||||
while let Some(Ok(line)) = iterator.next() {
|
||||
let mut lines = buffer.lines();
|
||||
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
let line = line.trim();
|
||||
if line.starts_with('#')
|
||||
|| line.starts_with(tidy)
|
||||
|
@ -126,7 +124,7 @@ impl GrabberConfig {
|
|||
extract_option_single!(line, next_page, next_page_link);
|
||||
|
||||
if line.starts_with(replace_single) {
|
||||
let value = GrabberConfig::extract_value(replace_single, line);
|
||||
let value = Util::extract_value(replace_single, line);
|
||||
let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
|
||||
if value.len() != 2 {
|
||||
continue;
|
||||
|
@ -145,18 +143,17 @@ impl GrabberConfig {
|
|||
}
|
||||
|
||||
if line.starts_with(find) {
|
||||
let value1 = GrabberConfig::extract_value(find, line);
|
||||
let to_replace = Util::extract_value(find, line).into();
|
||||
|
||||
if let Some(&Ok(ref next_line)) = iterator.peek() {
|
||||
let value2 = GrabberConfig::extract_value(replace, &next_line);
|
||||
if let Ok(Some(ref next_line)) = lines.next_line().await {
|
||||
let replace_with = Util::extract_value(replace, &next_line).into();
|
||||
|
||||
let r = Replace {
|
||||
to_replace: value1.to_string(),
|
||||
replace_with: value2.to_string(),
|
||||
};
|
||||
|
||||
replace_vec.push(r);
|
||||
replace_vec.push(Replace {
|
||||
to_replace,
|
||||
replace_with,
|
||||
});
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -181,17 +178,4 @@ impl GrabberConfig {
|
|||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
|
||||
let value = &line[identifier.len()..];
|
||||
let value = value.trim();
|
||||
match value.find('#') {
|
||||
Some(pos) => &value[..pos],
|
||||
None => value,
|
||||
}
|
||||
}
|
||||
|
||||
fn split_values(values: &str) -> Vec<&str> {
|
||||
values.split('|').map(|s| s.trim()).collect()
|
||||
}
|
||||
}
|
||||
|
|
20
src/lib.rs
20
src/lib.rs
|
@ -3,6 +3,7 @@ mod config;
|
|||
mod error;
|
||||
pub mod images;
|
||||
mod youtube;
|
||||
mod util;
|
||||
|
||||
use self::error::{ScraperError, ScraperErrorKind};
|
||||
use crate::article::Article;
|
||||
|
@ -21,7 +22,6 @@ use std::collections::HashMap;
|
|||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
pub struct ArticleScraper {
|
||||
pub image_downloader: ImageDownloader,
|
||||
|
@ -29,18 +29,14 @@ pub struct ArticleScraper {
|
|||
}
|
||||
|
||||
impl ArticleScraper {
|
||||
pub fn new(config_path: PathBuf) -> Self {
|
||||
pub async fn new(config_path: PathBuf) -> Self {
|
||||
let config_files = Arc::new(RwLock::new(None));
|
||||
|
||||
let locked_config_files = config_files.clone();
|
||||
thread::spawn(move || {
|
||||
if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
|
||||
locked_config_files.write().replace(config_files);
|
||||
if let Ok(loaded_config_files) = GrabberConfig::parse_directory(&config_path).await {
|
||||
config_files.write().replace(loaded_config_files);
|
||||
} else {
|
||||
locked_config_files.write().replace(HashMap::new());
|
||||
config_files.write().replace(HashMap::new());
|
||||
}
|
||||
});
|
||||
|
||||
ArticleScraper {
|
||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||
config_files,
|
||||
|
@ -791,7 +787,7 @@ mod tests {
|
|||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
||||
|
||||
let grabber = ArticleScraper::new(config_path);
|
||||
let grabber = ArticleScraper::new(config_path).await;
|
||||
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
||||
article.save_html(&out_path).unwrap();
|
||||
|
||||
|
@ -813,7 +809,7 @@ mod tests {
|
|||
)
|
||||
.unwrap();
|
||||
|
||||
let grabber = ArticleScraper::new(config_path);
|
||||
let grabber = ArticleScraper::new(config_path).await;
|
||||
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
||||
article.save_html(&out_path).unwrap();
|
||||
|
||||
|
@ -830,7 +826,7 @@ mod tests {
|
|||
let config_path = PathBuf::from(r"./resources/tests/");
|
||||
let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
|
||||
|
||||
let grabber = ArticleScraper::new(config_path);
|
||||
let grabber = ArticleScraper::new(config_path).await;
|
||||
let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
|
|
26
src/util.rs
Normal file
26
src/util.rs
Normal file
|
@ -0,0 +1,26 @@
|
|||
use tokio::fs::DirEntry;
|
||||
|
||||
/// Unit struct that groups stateless helper functions as associated functions.
pub struct Util;
|
||||
|
||||
impl Util {
|
||||
pub fn check_extension(path: &DirEntry, extension: &str) -> bool {
|
||||
if let Some(ext) = path.path().extension() {
|
||||
ext.to_str() == Some(extension)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the value part of a config `line` that begins with `identifier`.
///
/// The first `identifier.len()` bytes of `line` are skipped, everything from
/// the first `#` onwards is dropped as a trailing comment, and the remainder
/// is returned with surrounding whitespace removed.
///
/// Returns an empty string instead of panicking when `line` is shorter than
/// `identifier` or when the cut would not land on a character boundary.
pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
    // `get` instead of direct slicing: an out-of-range or mid-character index yields "".
    let value = line.get(identifier.len()..).unwrap_or("");
    let value = match value.find('#') {
        Some(pos) => &value[..pos],
        None => value,
    };
    // Trim only after the comment is removed so whitespace in front of `#` is not kept.
    value.trim()
}
|
||||
|
||||
/// Splits `values` at every `|` and returns the pieces with leading and
/// trailing whitespace removed.
///
/// Empty pieces are kept, so an empty input produces a single empty string.
pub fn split_values(values: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    for piece in values.split('|') {
        parts.push(piece.trim());
    }
    parts
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue