mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
async config loading
This commit is contained in:
parent
9fb772bfa8
commit
aa09666f4c
5 changed files with 73 additions and 67 deletions
|
@ -9,9 +9,9 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
failure = "0.1"
|
failure = "0.1"
|
||||||
libxml = "0.2"
|
libxml = "0.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls"] }
|
||||||
tokio = { version = "1.21", features = ["macros"] }
|
tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
|
||||||
url = "2.2"
|
url = "2.2"
|
||||||
regex = "1.4"
|
regex = "1.4"
|
||||||
encoding_rs = "0.8"
|
encoding_rs = "0.8"
|
||||||
|
|
|
@ -5,8 +5,8 @@ macro_rules! extract_vec_multi {
|
||||||
$vector: ident
|
$vector: ident
|
||||||
) => {
|
) => {
|
||||||
if $line.starts_with($identifier) {
|
if $line.starts_with($identifier) {
|
||||||
let value = GrabberConfig::extract_value($identifier, $line);
|
let value = Util::extract_value($identifier, $line);
|
||||||
let value = GrabberConfig::split_values(value);
|
let value = Util::split_values(value);
|
||||||
let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
|
let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
|
||||||
$vector.extend(value);
|
$vector.extend(value);
|
||||||
continue;
|
continue;
|
||||||
|
@ -21,7 +21,7 @@ macro_rules! extract_vec_single {
|
||||||
$vector: ident
|
$vector: ident
|
||||||
) => {
|
) => {
|
||||||
if $line.starts_with($identifier) {
|
if $line.starts_with($identifier) {
|
||||||
let value = GrabberConfig::extract_value($identifier, $line);
|
let value = Util::extract_value($identifier, $line);
|
||||||
$vector.push(value.to_string());
|
$vector.push(value.to_string());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -35,7 +35,7 @@ macro_rules! extract_option_single {
|
||||||
$option: ident
|
$option: ident
|
||||||
) => {
|
) => {
|
||||||
if $line.starts_with($identifier) {
|
if $line.starts_with($identifier) {
|
||||||
let value = GrabberConfig::extract_value($identifier, $line);
|
let value = Util::extract_value($identifier, $line);
|
||||||
$option = Some(value.to_string());
|
$option = Some(value.to_string());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
|
use crate::util::Util;
|
||||||
|
|
||||||
use self::error::{ConfigError, ConfigErrorKind};
|
use self::error::{ConfigError, ConfigErrorKind};
|
||||||
use failure::ResultExt;
|
use failure::ResultExt;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs;
|
use tokio::fs;
|
||||||
use std::io;
|
use tokio::io;
|
||||||
use std::io::BufRead;
|
use tokio::io::AsyncBufReadExt;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
|
@ -34,29 +36,22 @@ pub struct GrabberConfig {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GrabberConfig {
|
impl GrabberConfig {
|
||||||
pub fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
|
pub async fn parse_directory(directory: &PathBuf) -> Result<ConfigCollection, ConfigError> {
|
||||||
// create data dir if it doesn't already exist
|
// create data dir if it doesn't already exist
|
||||||
std::fs::DirBuilder::new()
|
std::fs::DirBuilder::new()
|
||||||
.recursive(true)
|
.recursive(true)
|
||||||
.create(&directory)
|
.create(&directory)
|
||||||
.context(ConfigErrorKind::IO)?;
|
.context(ConfigErrorKind::IO)?;
|
||||||
|
|
||||||
let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
|
let mut dir = tokio::fs::read_dir(directory).await.context(ConfigErrorKind::IO)?;
|
||||||
|
let mut collection = HashMap::new();
|
||||||
|
|
||||||
let mut collection: HashMap<String, GrabberConfig> = HashMap::new();
|
while let Ok(entry) = dir.next_entry().await {
|
||||||
|
if let Some(entry) = entry {
|
||||||
for path in paths {
|
if Util::check_extension(&entry, "txt") {
|
||||||
if let Ok(path) = path {
|
if let Ok(config) = GrabberConfig::new(entry.path()).await {
|
||||||
if let Some(extension) = path.path().extension() {
|
let file_name = entry.file_name().to_string_lossy().into_owned();
|
||||||
if let Some(extension) = extension.to_str() {
|
collection.insert(file_name, config);
|
||||||
if extension == "txt" {
|
|
||||||
if let Ok(config) = GrabberConfig::new(path.path()) {
|
|
||||||
collection.insert(
|
|
||||||
path.file_name().to_string_lossy().into_owned(),
|
|
||||||
config,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -65,9 +60,11 @@ impl GrabberConfig {
|
||||||
Ok(collection)
|
Ok(collection)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
|
|
||||||
let file = fs::File::open(&config_path).context(ConfigErrorKind::IO)?;
|
|
||||||
let buffer = io::BufReader::new(&file);
|
async fn new(config_path: PathBuf) -> Result<GrabberConfig, ConfigError> {
|
||||||
|
let mut file = fs::File::open(&config_path).await.context(ConfigErrorKind::IO)?;
|
||||||
|
let buffer = io::BufReader::new(&mut file);
|
||||||
|
|
||||||
let mut xpath_title: Vec<String> = Vec::new();
|
let mut xpath_title: Vec<String> = Vec::new();
|
||||||
let mut xpath_author: Vec<String> = Vec::new();
|
let mut xpath_author: Vec<String> = Vec::new();
|
||||||
|
@ -100,8 +97,9 @@ impl GrabberConfig {
|
||||||
let test_url = "test_url:";
|
let test_url = "test_url:";
|
||||||
let autodetect = "autodetect_on_failure:";
|
let autodetect = "autodetect_on_failure:";
|
||||||
|
|
||||||
let mut iterator = buffer.lines().peekable();
|
let mut lines = buffer.lines();
|
||||||
while let Some(Ok(line)) = iterator.next() {
|
|
||||||
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
let line = line.trim();
|
let line = line.trim();
|
||||||
if line.starts_with('#')
|
if line.starts_with('#')
|
||||||
|| line.starts_with(tidy)
|
|| line.starts_with(tidy)
|
||||||
|
@ -126,7 +124,7 @@ impl GrabberConfig {
|
||||||
extract_option_single!(line, next_page, next_page_link);
|
extract_option_single!(line, next_page, next_page_link);
|
||||||
|
|
||||||
if line.starts_with(replace_single) {
|
if line.starts_with(replace_single) {
|
||||||
let value = GrabberConfig::extract_value(replace_single, line);
|
let value = Util::extract_value(replace_single, line);
|
||||||
let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
|
let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
|
||||||
if value.len() != 2 {
|
if value.len() != 2 {
|
||||||
continue;
|
continue;
|
||||||
|
@ -145,18 +143,17 @@ impl GrabberConfig {
|
||||||
}
|
}
|
||||||
|
|
||||||
if line.starts_with(find) {
|
if line.starts_with(find) {
|
||||||
let value1 = GrabberConfig::extract_value(find, line);
|
let to_replace = Util::extract_value(find, line).into();
|
||||||
|
|
||||||
if let Some(&Ok(ref next_line)) = iterator.peek() {
|
if let Ok(Some(ref next_line)) = lines.next_line().await {
|
||||||
let value2 = GrabberConfig::extract_value(replace, &next_line);
|
let replace_with = Util::extract_value(replace, &next_line).into();
|
||||||
|
|
||||||
let r = Replace {
|
replace_vec.push(Replace {
|
||||||
to_replace: value1.to_string(),
|
to_replace,
|
||||||
replace_with: value2.to_string(),
|
replace_with,
|
||||||
};
|
});
|
||||||
|
|
||||||
replace_vec.push(r);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -181,17 +178,4 @@ impl GrabberConfig {
|
||||||
|
|
||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
|
|
||||||
let value = &line[identifier.len()..];
|
|
||||||
let value = value.trim();
|
|
||||||
match value.find('#') {
|
|
||||||
Some(pos) => &value[..pos],
|
|
||||||
None => value,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn split_values(values: &str) -> Vec<&str> {
|
|
||||||
values.split('|').map(|s| s.trim()).collect()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
24
src/lib.rs
24
src/lib.rs
|
@ -3,6 +3,7 @@ mod config;
|
||||||
mod error;
|
mod error;
|
||||||
pub mod images;
|
pub mod images;
|
||||||
mod youtube;
|
mod youtube;
|
||||||
|
mod util;
|
||||||
|
|
||||||
use self::error::{ScraperError, ScraperErrorKind};
|
use self::error::{ScraperError, ScraperErrorKind};
|
||||||
use crate::article::Article;
|
use crate::article::Article;
|
||||||
|
@ -21,7 +22,6 @@ use std::collections::HashMap;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread;
|
|
||||||
|
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
pub image_downloader: ImageDownloader,
|
pub image_downloader: ImageDownloader,
|
||||||
|
@ -29,18 +29,14 @@ pub struct ArticleScraper {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArticleScraper {
|
impl ArticleScraper {
|
||||||
pub fn new(config_path: PathBuf) -> Self {
|
pub async fn new(config_path: PathBuf) -> Self {
|
||||||
let config_files = Arc::new(RwLock::new(None));
|
let config_files = Arc::new(RwLock::new(None));
|
||||||
|
|
||||||
let locked_config_files = config_files.clone();
|
if let Ok(loaded_config_files) = GrabberConfig::parse_directory(&config_path).await {
|
||||||
thread::spawn(move || {
|
config_files.write().replace(loaded_config_files);
|
||||||
if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
|
} else {
|
||||||
locked_config_files.write().replace(config_files);
|
config_files.write().replace(HashMap::new());
|
||||||
} else {
|
}
|
||||||
locked_config_files.write().replace(HashMap::new());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
ArticleScraper {
|
ArticleScraper {
|
||||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||||
config_files,
|
config_files,
|
||||||
|
@ -791,7 +787,7 @@ mod tests {
|
||||||
let out_path = PathBuf::from(r"./test_output");
|
let out_path = PathBuf::from(r"./test_output");
|
||||||
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
||||||
|
|
||||||
let grabber = ArticleScraper::new(config_path);
|
let grabber = ArticleScraper::new(config_path).await;
|
||||||
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
||||||
article.save_html(&out_path).unwrap();
|
article.save_html(&out_path).unwrap();
|
||||||
|
|
||||||
|
@ -813,7 +809,7 @@ mod tests {
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let grabber = ArticleScraper::new(config_path);
|
let grabber = ArticleScraper::new(config_path).await;
|
||||||
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
|
||||||
article.save_html(&out_path).unwrap();
|
article.save_html(&out_path).unwrap();
|
||||||
|
|
||||||
|
@ -830,7 +826,7 @@ mod tests {
|
||||||
let config_path = PathBuf::from(r"./resources/tests/");
|
let config_path = PathBuf::from(r"./resources/tests/");
|
||||||
let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
|
let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
|
||||||
|
|
||||||
let grabber = ArticleScraper::new(config_path);
|
let grabber = ArticleScraper::new(config_path).await;
|
||||||
let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
|
let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|
26
src/util.rs
Normal file
26
src/util.rs
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
use tokio::fs::DirEntry;
|
||||||
|
|
||||||
|
/// Stateless namespace for helper functions used by the config parser.
pub struct Util;
|
||||||
|
|
||||||
|
impl Util {
|
||||||
|
pub fn check_extension(path: &DirEntry, extension: &str) -> bool {
|
||||||
|
if let Some(ext) = path.path().extension() {
|
||||||
|
ext.to_str() == Some(extension)
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the value part of an `identifier value # comment` config line.
///
/// The first `identifier.len()` bytes of `line` are skipped, everything from
/// the first `#` onwards is dropped as a trailing comment, and the remaining
/// text is returned with surrounding whitespace trimmed.
///
/// Callers normally check `line.starts_with(identifier)` first. If the line is
/// shorter than `identifier`, or the offset does not fall on a character
/// boundary, an empty string is returned instead of panicking.
pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
    // `get` is the non-panicking form of `&line[identifier.len()..]`.
    let value = line.get(identifier.len()..).unwrap_or("").trim();
    match value.find('#') {
        // Trim again so the whitespace that separated the value from the
        // comment is not part of the result.
        Some(pos) => value[..pos].trim_end(),
        None => value,
    }
}
|
||||||
|
|
||||||
|
/// Splits a `|`-separated list into its parts, trimming whitespace around each one.
///
/// Empty segments are kept, so an empty input yields a single empty string.
pub fn split_values(values: &str) -> Vec<&str> {
    values.split('|').map(str::trim).collect()
}
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue