From 063996d62f80f7a3074d3cab7d5ee1a121163078 Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Thu, 6 Apr 2023 08:53:19 +0200
Subject: [PATCH] readability cli

---
 .../src/full_text_parser/readability/mod.rs   | 39 ++++++++++++++++
 .../src/full_text_parser/readability/tests.rs |  2 -
 article_scraper_cli/Cargo.toml                |  5 ++-
 article_scraper_cli/src/args.rs               | 14 +++---
 article_scraper_cli/src/main.rs               | 45 ++++++++++++++++++-
 5 files changed, 95 insertions(+), 10 deletions(-)
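Notes:

The new `extract_from_str` helper runs the full Readability pipeline
(parse, prep, metadata extraction, body extraction, post-processing) on a
raw HTML string and returns the extracted article as an HTML string. It is
still private to the readability module in this commit. A minimal usage
sketch, assuming the function later gets exposed; the import paths below
are assumptions, not part of this patch:

    // Sketch only: assumes extract_from_str is made public and that
    // Readability and FullTextParserError are re-exported by the crate.
    use article_scraper::{FullTextParserError, Readability};

    async fn demo(html: &str) -> Result<String, FullTextParserError> {
        // The base URL is used to absolutize relative links; passing None
        // falls back to the internal http://fakehost/test/base/ placeholder.
        let base = url::Url::parse("https://example.com/posts/").ok();
        Readability::extract_from_str(html, base).await
    }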
diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs
index 941e5ee..fedf563 100644
--- a/article_scraper/src/full_text_parser/readability/mod.rs
+++ b/article_scraper/src/full_text_parser/readability/mod.rs
@@ -14,6 +14,45 @@ use crate::{constants, util::Util};
 pub struct Readability;
 
 impl Readability {
+    async fn extract_from_str(
+        html: &str,
+        base_url: Option<url::Url>,
+    ) -> Result<String, FullTextParserError> {
+        libxml::tree::node::set_node_rc_guard(10);
+        let empty_config = crate::full_text_parser::config::ConfigEntry::default();
+
+        let url =
+            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
+        let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
+        let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
+
+        crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
+        let mut article = crate::article::Article {
+            title: None,
+            author: None,
+            url,
+            date: None,
+            thumbnail_url: None,
+            document: None,
+        };
+
+        let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
+        let mut root =
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        article_document.set_root_element(&root);
+
+        crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
+        super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
+        crate::FullTextParser::post_process_document(&article_document)?;
+
+        article.document = Some(article_document);
+        let html = article
+            .get_content()
+            .ok_or(FullTextParserError::Readability)?;
+
+        Ok(html)
+    }
+
     pub fn extract_body(
         document: Document,
         root: &mut Node,
diff --git a/article_scraper/src/full_text_parser/readability/tests.rs b/article_scraper/src/full_text_parser/readability/tests.rs
index 20ec703..27aa2c2 100644
--- a/article_scraper/src/full_text_parser/readability/tests.rs
+++ b/article_scraper/src/full_text_parser/readability/tests.rs
@@ -43,8 +43,6 @@ async fn run_test(name: &str) {
     article.document = Some(article_document);
     let html = article.get_content().unwrap();
 
-    std::fs::write("expected.html", &html).unwrap();
-
     let expected = std::fs::read_to_string(format!(
         "./resources/tests/readability/{name}/expected.html"
     ))
diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml
index 5fdedc4..24f8435 100644
--- a/article_scraper_cli/Cargo.toml
+++ b/article_scraper_cli/Cargo.toml
@@ -9,4 +9,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
 
 [dependencies]
 article_scraper = { path = "../article_scraper/" }
-clap = { version = "4.2", features = [ "derive" ] }
\ No newline at end of file
+clap = { version = "4.2", features = [ "derive" ] }
+simplelog = "0.12"
+log = "0.4"
+url = "2.3"
\ No newline at end of file
diff --git a/article_scraper_cli/src/args.rs b/article_scraper_cli/src/args.rs
index ffb1686..a736c2d 100644
--- a/article_scraper_cli/src/args.rs
+++ b/article_scraper_cli/src/args.rs
@@ -6,25 +6,29 @@ use std::path::PathBuf;
 pub struct Args {
     /// Turn debug logging on
     #[arg(short, long)]
-    debug: bool,
+    pub debug: bool,
 
     #[command(subcommand)]
-    command: Option<Commands>,
+    pub command: Commands,
 
     /// Destination of resulting HTML file
     #[arg(short, long, value_name = "FILE")]
-    output: Option<PathBuf>,
+    pub output: Option<PathBuf>,
 }
 
 #[derive(Subcommand)]
-enum Commands {
+pub enum Commands {
     /// Only use the Readability parser
     Readability {
         /// Source HTML file
         #[arg(long, value_name = "FILE")]
         html: Option<PathBuf>,
 
-        /// Source Url
+        /// Base to complete relative Url
+        #[arg(long, value_name = "URL")]
+        base_url: Option<String>,
+
+        /// Source Url to download HTML from
         #[arg(long, value_name = "URL")]
         source_url: Option<String>,
     },
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index be7f10e..ccec235 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -1,8 +1,49 @@
+use std::{path::PathBuf, process::exit};
+
+use crate::args::{Args, Commands};
 use clap::Parser;
+use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
+use url::Url;
 
 mod args;
 
 pub fn main() {
-    let _args = args::Args::parse();
-    println!("hello world");
+    let args = Args::parse();
+
+    let level = if args.debug {
+        LevelFilter::Debug
+    } else {
+        LevelFilter::Info
+    };
+    TermLogger::init(
+        level,
+        Config::default(),
+        TerminalMode::Mixed,
+        ColorChoice::Auto,
+    )
+    .unwrap();
+
+    match args.command {
+        Commands::Readability {
+            html,
+            base_url,
+            source_url,
+        } => extract_readability(html, source_url, base_url),
+    }
+
+    log::info!("hello world");
+}
+
+fn extract_readability(
+    html_file: Option<PathBuf>,
+    source_url: Option<String>,
+    base_url: Option<String>,
+) {
+    if html_file.is_none() && source_url.is_none() {
+        log::error!("either the --html or the --source-url parameter is required");
+        exit(1);
+    }
+
+    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
+    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
 }
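
The readability subcommand currently stops after validating its arguments;
loading the HTML and invoking the parser is left for a follow-up. A rough
sketch of how `extract_readability` could continue. Everything here is
hypothetical: it assumes `extract_from_str` is reachable from the CLI
crate, `finish` is an invented helper name, and the futures crate is added
as a dependency to block on the async call:

    // Hypothetical tail of extract_readability(): read the input, run the
    // parser, then write or print the result. Reuses the PathBuf and Url
    // imports already present in main.rs.
    fn finish(html_file: Option<PathBuf>, base_url: Option<Url>, output: Option<PathBuf>) {
        let html = match html_file {
            Some(path) => std::fs::read_to_string(path).expect("failed to read HTML file"),
            // Fetching --source-url would also need an HTTP client, which
            // this commit does not add yet.
            None => unimplemented!("download from --source-url"),
        };

        // block_on comes from the futures crate (assumed extra dependency),
        // since extract_from_str is async but main() is not.
        let result = futures::executor::block_on(Readability::extract_from_str(&html, base_url))
            .expect("readability extraction failed");

        match output {
            Some(path) => std::fs::write(path, result).expect("failed to write output file"),
            None => println!("{result}"),
        }
    }

An invocation would then look something like
`article_scraper_cli --output result.html readability --html page.html --base-url https://example.com/`
(binary name assumed from the crate directory; with clap's defaults the
top-level flags go before the subcommand).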