diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 6798045..8473b30 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -35,7 +35,69 @@ impl FullTextParser {
         Self { config_files }
     }
 
-    pub async fn parse(
+    pub async fn parse_offline(
+        &self,
+        html: &str,
+        config: Option<&ConfigEntry>,
+        base_url: Option<url::Url>,
+    ) -> Result<String, FullTextParserError> {
+        libxml::tree::node::set_node_rc_guard(10);
+
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or(FullTextParserError::Config)?;
+
+        let url =
+            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
+
+        let mut article = Article {
+            title: None,
+            author: None,
+            url,
+            date: None,
+            thumbnail_url: None,
+            document: None,
+        };
+
+        let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
+        let mut root =
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        document.set_root_element(&root);
+
+        Self::generate_head(&mut root, &document)?;
+
+        let document = Self::parse_html(html, config, global_config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
+
+        metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
+        if article.thumbnail_url.is_none() {
+            Self::check_for_thumbnail(&xpath_ctx, &mut article);
+        }
+        Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
+        let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
+        if !found_body {
+            log::error!("Ftr failed to find content");
+            return Err(FullTextParserError::Scrape);
+        }
+
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
+            log::info!("Next page url: {url}");
+        }
+
+        if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
+            log::error!("Preventing self closing tags failed - '{error}'");
+            return Err(error);
+        }
+
+        Self::post_process_document(&document)?;
+
+        article.document = Some(document);
+        let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
+        Ok(html)
+    }
+
+    pub(crate) async fn parse(
         &self,
         url: &url::Url,
         client: &Client,
diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs
index b051199..4344cd7 100644
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@@ -7,6 +7,7 @@ mod util;
 
 use article::Article;
 use error::ScraperError;
+pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
 pub use full_text_parser::FullTextParser;
 pub use full_text_parser::Readability;
 use images::ImageDownloader;
diff --git a/article_scraper_cli/src/args.rs b/article_scraper_cli/src/args.rs
index a736c2d..d80a15c 100644
--- a/article_scraper_cli/src/args.rs
+++ b/article_scraper_cli/src/args.rs
@@ -32,4 +32,22 @@ pub enum Commands {
         #[arg(long, value_name = "URL")]
         source_url: Option<String>,
     },
+    Ftr {
+        /// Source HTML file
+        #[arg(long, value_name = "FILE")]
+        html: Option<PathBuf>,
+
+        /// Base to complete relative Url
+        #[arg(long, value_name = "URL")]
+        base_url: Option<String>,
+
+        /// Source Url to download HTML from
+        #[arg(long, value_name = "URL")]
+        source_url: Option<String>,
+
+        /// The Ftr config to use
+        /// Otherwise source_url and base_url will be used
+        #[arg(long, value_name = "domain")]
+        config: Option<String>,
+    },
 }
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index ceda501..5798bf9 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -1,6 +1,8 @@
+use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FtrConfigEntry;
 use article_scraper::FullTextParser;
 use article_scraper::Readability;
 use clap::Parser;
@@ -34,9 +36,62 @@ async fn main() {
             base_url,
             source_url,
         } => extract_readability(html, source_url, base_url, args.output).await,
+        Commands::Ftr {
+            html,
+            base_url,
+            source_url,
+            config,
+        } => extract_ftr(html, source_url, base_url, config, args.output).await,
     }
+}
 
-    log::info!("hello world");
+async fn extract_ftr(
+    html_file: Option<PathBuf>,
+    source_url: Option<String>,
+    base_url: Option<String>,
+    config: Option<String>,
+    output: Option<PathBuf>,
+) {
+    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+    let html = get_html(html_file, source_url).await;
+
+    let config = if let Some(config_path) = config {
+        match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
+            Ok(entry) => Some(entry),
+            Err(error) => {
+                log::error!("Failed to parse config entry {config_path}: {error}");
+                exit(0);
+            }
+        }
+    } else {
+        None
+    };
+
+    let full_text_parser = FullTextParser::new(None).await;
+    let result = match full_text_parser
+        .parse_offline(&html, config.as_ref(), base_url)
+        .await
+    {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with ftr: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
 }
 
 async fn extract_readability(
@@ -45,39 +100,8 @@
     base_url: Option<String>,
     output: Option<PathBuf>,
 ) {
-    if html_file.is_none() && source_url.is_none() {
-        log::error!("either need a source html file or source url");
-        exit(0);
-    }
-
-    if html_file.is_some() && source_url.is_some() {
-        log::error!("load source from html file or url? only specify one of the two options");
-        exit(0);
-    }
-
-    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
-
-    let html = if let Some(source_url) = source_url {
-        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to download html from url: {err}");
-                exit(0);
-            }
-        }
-    } else if let Some(source_file) = html_file {
-        match std::fs::read_to_string(&source_file) {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to read file {source_file:?}: {err}");
-                exit(0);
-            }
-        }
-    } else {
-        unreachable!()
-    };
-
+    let html = get_html(html_file, source_url).await;
     let result = match Readability::extract_from_str(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
@@ -100,3 +124,37 @@
         }
     }
 }
+
+async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
+    if html_file.is_none() && source_url.is_none() {
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
+        exit(0);
+    }
+
+    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
+
+    if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    }
+}
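
Since this diff also exports FtrConfigEntry and parse_offline from the library crate, offline extraction becomes usable outside the CLI. Below is a minimal sketch of driving the new API from library code, assuming a Tokio runtime and the url crate; the file names page.html and example.com.txt and the example.com base URL are illustrative placeholders, not part of this change.

use std::path::Path;

use article_scraper::{FtrConfigEntry, FullTextParser};

#[tokio::main]
async fn main() {
    // Placeholder input: HTML obtained earlier by any means (download, test fixture, ...).
    let html = std::fs::read_to_string("page.html").expect("failed to read page.html");

    // Optional base URL used to complete relative links in the extracted content.
    let base_url = url::Url::parse("https://example.com/articles/").ok();

    // Optional site-specific ftr config; with None, only global.txt applies.
    let config = FtrConfigEntry::parse_path(Path::new("example.com.txt"))
        .await
        .ok();

    let parser = FullTextParser::new(None).await;
    match parser.parse_offline(&html, config.as_ref(), base_url).await {
        Ok(article_html) => println!("{article_html}"),
        Err(err) => eprintln!("extraction failed: {err}"),
    }
}

The CLI path added in this diff would then be invoked along the lines of "article_scraper_cli ftr --html page.html --base-url https://example.com/articles/" (binary name depending on installation), with --source-url as the alternative input and --config selecting a specific ftr config file.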