1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

readability cli

This commit is contained in:
Jan Lukas Gernert 2023-04-06 08:53:19 +02:00
parent a2719c8c7e
commit 063996d62f
5 changed files with 95 additions and 10 deletions

View file

@ -14,6 +14,45 @@ use crate::{constants, util::Util};
pub struct Readability; pub struct Readability;
impl Readability { impl Readability {
/// Run the Readability extraction pipeline on a raw HTML string and return
/// the cleaned article content as an HTML string.
///
/// * `html` - the raw page markup to process.
/// * `base_url` - base used to resolve relative links; when `None`, a fake
///   placeholder host is substituted so relative URLs still parse.
///
/// Returns the extracted content HTML, or a `FullTextParserError` if parsing,
/// XML document construction, or readability extraction fails.
async fn extract_from_str(
html: &str,
base_url: Option<url::Url>,
) -> Result<String, FullTextParserError> {
// Raise libxml's node reference-count guard; presumably needed because the
// parser moves many nodes between documents — TODO confirm rationale.
libxml::tree::node::set_node_rc_guard(10);
let empty_config = crate::full_text_parser::config::ConfigEntry::default();
// Fall back to a dummy base so relative-URL resolution never fails.
// The literal parses unconditionally, so the unwrap cannot panic.
let url =
base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
// Pre-clean the DOM (ads, boilerplate, etc.) before extraction.
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
// Start from an empty article; metadata extraction below fills in fields.
let mut article = crate::article::Article {
title: None,
author: None,
url,
date: None,
thumbnail_url: None,
document: None,
};
// Fresh output document with a single <article> root that receives the
// extracted body content.
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
article_document.set_root_element(&root);
crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
// Consumes the source document; the title (if found) guides extraction.
super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
crate::FullTextParser::post_process_document(&article_document)?;
article.document = Some(article_document);
// Serialize the result; absence of content is reported as a Readability error.
let html = article
.get_content()
.ok_or(FullTextParserError::Readability)?;
Ok(html)
}
pub fn extract_body( pub fn extract_body(
document: Document, document: Document,
root: &mut Node, root: &mut Node,

View file

@ -43,8 +43,6 @@ async fn run_test(name: &str) {
article.document = Some(article_document); article.document = Some(article_document);
let html = article.get_content().unwrap(); let html = article.get_content().unwrap();
std::fs::write("expected.html", &html).unwrap();
let expected = std::fs::read_to_string(format!( let expected = std::fs::read_to_string(format!(
"./resources/tests/readability/{name}/expected.html" "./resources/tests/readability/{name}/expected.html"
)) ))

View file

@ -9,4 +9,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
[dependencies] [dependencies]
article_scraper = { path = "../article_scraper/" } article_scraper = { path = "../article_scraper/" }
clap = { version = "4.2", features = [ "derive" ] } clap = { version = "4.2", features = [ "derive" ] }
simplelog = "0.12"
log = "0.4"
url = "2.3"

View file

@ -6,25 +6,29 @@ use std::path::PathBuf;
pub struct Args { pub struct Args {
/// Turn debug logging on /// Turn debug logging on
#[arg(short, long)] #[arg(short, long)]
debug: bool, pub debug: bool,
#[command(subcommand)] #[command(subcommand)]
command: Option<Commands>, pub command: Commands,
/// Destination of resulting HTML file /// Destination of resulting HTML file
#[arg(short, long, value_name = "FILE")] #[arg(short, long, value_name = "FILE")]
output: Option<PathBuf>, pub output: Option<PathBuf>,
} }
#[derive(Subcommand)] #[derive(Subcommand)]
enum Commands { pub enum Commands {
/// Only use the Readability parser /// Only use the Readability parser
Readability { Readability {
/// Source HTML file /// Source HTML file
#[arg(long, value_name = "FILE")] #[arg(long, value_name = "FILE")]
html: Option<PathBuf>, html: Option<PathBuf>,
/// Source Url /// Base to complete relative Url
#[arg(long, value_name = "URL")]
base_url: Option<String>,
/// Source Url to download HTML from
#[arg(long, value_name = "URL")] #[arg(long, value_name = "URL")]
source_url: Option<String>, source_url: Option<String>,
}, },

View file

@ -1,8 +1,49 @@
use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands};
use clap::Parser; use clap::Parser;
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
use url::Url;
mod args; mod args;
pub fn main() { pub fn main() {
let _args = args::Args::parse(); let args = Args::parse();
println!("hello world");
let level = if args.debug {
LevelFilter::Debug
} else {
LevelFilter::Info
};
TermLogger::init(
level,
Config::default(),
TerminalMode::Mixed,
ColorChoice::Auto,
)
.unwrap();
match args.command {
Commands::Readability {
html,
base_url,
source_url,
} => extract_readability(html, source_url, base_url),
}
log::info!("hello world");
}
fn extract_readability(
html_file: Option<PathBuf>,
source_url: Option<String>,
base_url: Option<String>,
) {
if html_file.is_none() && source_url.is_none() {
log::error!("");
exit(0);
}
let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
} }