diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 207e408..6798045 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -9,7 +9,7 @@ mod tests;
 
 use self::config::{ConfigCollection, ConfigEntry};
 use self::error::FullTextParserError;
-use self::readability::Readability;
+pub use self::readability::Readability;
 use crate::article::Article;
 use crate::constants;
 use crate::util::Util;
@@ -269,7 +269,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    async fn download(
+    pub async fn download(
         url: &url::Url,
         client: &Client,
         headers: HeaderMap,
diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs
index fedf563..2de2d85 100644
--- a/article_scraper/src/full_text_parser/readability/mod.rs
+++ b/article_scraper/src/full_text_parser/readability/mod.rs
@@ -14,7 +14,7 @@ use crate::{constants, util::Util};
 pub struct Readability;
 
 impl Readability {
-    async fn extract_from_str(
+    pub async fn extract_from_str(
         html: &str,
         base_url: Option<url::Url>,
     ) -> Result<String, FullTextParserError> {
diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs
index 76244d9..b051199 100644
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@@ -7,7 +7,8 @@ mod util;
 
 use article::Article;
 use error::ScraperError;
-use full_text_parser::FullTextParser;
+pub use full_text_parser::FullTextParser;
+pub use full_text_parser::Readability;
 use images::ImageDownloader;
 use reqwest::Client;
 use std::path::Path;
diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml
index 24f8435..cccc64b 100644
--- a/article_scraper_cli/Cargo.toml
+++ b/article_scraper_cli/Cargo.toml
@@ -12,4 +12,6 @@ article_scraper = { path = "../article_scraper/" }
 clap = { version = "4.2", features = [ "derive" ] }
 simplelog = "0.12"
 log = "0.4"
-url = "2.3"
\ No newline at end of file
+url = "2.3"
+reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
+tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
\ No newline at end of file
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index ccec235..ceda501 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -1,13 +1,18 @@
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FullTextParser;
+use article_scraper::Readability;
 use clap::Parser;
+use reqwest::header::HeaderMap;
+use reqwest::Client;
 use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
 use url::Url;
 
 mod args;
 
-pub fn main() {
+#[tokio::main]
+async fn main() {
     let args = Args::parse();
 
     let level = if args.debug {
@@ -28,22 +33,70 @@ pub fn main() {
             html,
             base_url,
             source_url,
-        } => extract_readability(html, source_url, base_url),
+        } => extract_readability(html, source_url, base_url, args.output).await,
     }
 
     log::info!("hello world");
 }
 
-fn extract_readability(
+async fn extract_readability(
     html_file: Option<PathBuf>,
     source_url: Option<String>,
     base_url: Option<String>,
+    output: Option<PathBuf>,
 ) {
     if html_file.is_none() && source_url.is_none() {
-        log::error!("");
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
         exit(0);
     }
 
     let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+
+    let html = if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    };
+
+    let result = match Readability::extract_from_str(&html, base_url).await {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with readability: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
 }