Mirror of https://gitlab.com/news-flash/article_scraper.git

Add command to use the readability extractor

Author: Jan Lukas Gernert
Date:   2023-04-07 11:51:14 +02:00
Parent: 063996d62f
Commit: d978059709

5 changed files with 65 additions and 9 deletions

@@ -9,7 +9,7 @@ mod tests;
 use self::config::{ConfigCollection, ConfigEntry};
 use self::error::FullTextParserError;
-use self::readability::Readability;
+pub use self::readability::Readability;
 use crate::article::Article;
 use crate::constants;
 use crate::util::Util;
@@ -269,7 +269,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    async fn download(
+    pub async fn download(
         url: &url::Url,
         client: &Client,
         headers: HeaderMap,
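
Making download public turns the parser's HTTP fetch into a reusable building block. A minimal sketch of calling it from a downstream crate, assuming the crate-root re-exports added later in this diff; the URL is a placeholder:

    use article_scraper::FullTextParser;
    use reqwest::{header::HeaderMap, Client};

    #[tokio::main]
    async fn main() {
        // Placeholder URL; any article page works.
        let url = url::Url::parse("https://example.com/article").expect("invalid url");
        // download() is an associated function: no parser instance is needed.
        let html = FullTextParser::download(&url, &Client::new(), HeaderMap::new())
            .await
            .expect("download failed");
        println!("fetched {} bytes of html", html.len());
    }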

@@ -14,7 +14,7 @@ use crate::{constants, util::Util};
 pub struct Readability;
 
 impl Readability {
-    async fn extract_from_str(
+    pub async fn extract_from_str(
         html: &str,
         base_url: Option<url::Url>,
     ) -> Result<String, FullTextParserError> {
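
And with extract_from_str public, the readability pass can be run directly on any HTML string. A minimal sketch, again assuming the crate-root Readability re-export; the inline HTML is only a stand-in:

    use article_scraper::Readability;

    #[tokio::main]
    async fn main() {
        let html = "<html><body><article><p>Hello world</p></article></body></html>";
        // base_url is optional; it is used when the HTML contains relative links.
        let content = Readability::extract_from_str(html, None)
            .await
            .expect("readability extraction failed");
        println!("{content}");
    }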

@@ -7,7 +7,8 @@ mod util;
 use article::Article;
 use error::ScraperError;
-use full_text_parser::FullTextParser;
+pub use full_text_parser::FullTextParser;
+pub use full_text_parser::Readability;
 use images::ImageDownloader;
 use reqwest::Client;
 use std::path::Path;
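
This hunk adds the crate-root re-exports that the snippets above rely on: downstream code can now write use article_scraper::{FullTextParser, Readability}; without reaching into the crate's private module layout.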

@@ -12,4 +12,6 @@ article_scraper = { path = "../article_scraper/" }
 clap = { version = "4.2", features = [ "derive" ] }
 simplelog = "0.12"
 log = "0.4"
 url = "2.3"
+reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
+tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
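
The CLI's manifest grows its own async stack for the new code path: tokio's "macros" feature provides the #[tokio::main] attribute used below and "rt-multi-thread" the runtime behind it, while reqwest supplies the HTTP client passed to FullTextParser::download. The TLS and compression features presumably mirror the library crate's own reqwest configuration.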

@@ -1,13 +1,18 @@
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FullTextParser;
+use article_scraper::Readability;
 use clap::Parser;
+use reqwest::header::HeaderMap;
+use reqwest::Client;
 use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
 use url::Url;
 
 mod args;
 
-pub fn main() {
+#[tokio::main]
+async fn main() {
     let args = Args::parse();
 
     let level = if args.debug {
@@ -28,22 +33,70 @@ pub fn main() {
             html,
             base_url,
             source_url,
-        } => extract_readability(html, source_url, base_url),
+        } => extract_readability(html, source_url, base_url, args.output).await,
     }
 
     log::info!("hello world");
 }
 
-fn extract_readability(
+async fn extract_readability(
     html_file: Option<PathBuf>,
     source_url: Option<String>,
     base_url: Option<String>,
+    output: Option<PathBuf>,
 ) {
     if html_file.is_none() && source_url.is_none() {
-        log::error!("");
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
         exit(0);
     }
 
     let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+
+    let html = if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    };
+
+    let result = match Readability::extract_from_str(&html, base_url).await {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with readability: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
 }
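
Taken together, the new subcommand loads HTML from exactly one of a local file or a URL, runs the readability extractor, and writes the result to the chosen output path (defaulting to result.html). A hypothetical invocation, assuming the args module (not part of this diff) derives clap's usual kebab-case flag names:

    article_scraper_cli readability --source-url https://example.com/article --output extracted.html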