1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

readability cli

This commit is contained in:
Jan Lukas Gernert 2023-04-06 08:53:19 +02:00
parent a2719c8c7e
commit 063996d62f
5 changed files with 95 additions and 10 deletions

View file

@ -14,6 +14,45 @@ use crate::{constants, util::Util};
pub struct Readability;
impl Readability {
/// Parse a raw HTML string and return the readability-extracted article
/// content as a serialized HTML string.
///
/// * `html` - the full page markup to extract the article from.
/// * `base_url` - used to resolve relative links; when `None`, a fake
///   host (`http://fakehost/test/base/`) is substituted so relative
///   URLs still parse.
///
/// Returns the extracted article HTML, or a `FullTextParserError` when
/// parsing, extraction, or post-processing fails.
async fn extract_from_str(
html: &str,
base_url: Option<url::Url>,
) -> Result<String, FullTextParserError> {
// Raise libxml's node reference-count guard so deeply shared nodes
// created during extraction do not trip the default limit.
libxml::tree::node::set_node_rc_guard(10);
// No site-specific scraping rules: run with an empty config entry.
let empty_config = crate::full_text_parser::config::ConfigEntry::default();
let url =
base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
// Strip boilerplate / prepare the DOM in place before extraction.
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
// Start with an empty article; metadata is filled in below.
let mut article = crate::article::Article {
title: None,
author: None,
url,
date: None,
thumbnail_url: None,
document: None,
};
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
// NOTE(review): `root` is created against the parsed `document` but then
// installed as the root of the fresh `article_document` — confirm libxml
// permits adopting a node across documents here.
let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
article_document.set_root_element(&root);
// Pull title/author/date out of the source document into `article`.
crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
crate::FullTextParser::post_process_document(&article_document)?;
article.document = Some(article_document);
let html = article
.get_content()
.ok_or(FullTextParserError::Readability)?;
Ok(html)
}
pub fn extract_body(
document: Document,
root: &mut Node,

View file

@ -43,8 +43,6 @@ async fn run_test(name: &str) {
article.document = Some(article_document);
let html = article.get_content().unwrap();
std::fs::write("expected.html", &html).unwrap();
let expected = std::fs::read_to_string(format!(
"./resources/tests/readability/{name}/expected.html"
))

View file

@ -9,4 +9,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
[dependencies]
article_scraper = { path = "../article_scraper/" }
clap = { version = "4.2", features = [ "derive" ] }
simplelog = "0.12"
log = "0.4"
url = "2.3"

View file

@ -6,25 +6,29 @@ use std::path::PathBuf;
pub struct Args {
/// Turn debug logging on
#[arg(short, long)]
debug: bool,
pub debug: bool,
#[command(subcommand)]
command: Option<Commands>,
pub command: Commands,
/// Destination of resulting HTML file
#[arg(short, long, value_name = "FILE")]
output: Option<PathBuf>,
pub output: Option<PathBuf>,
}
#[derive(Subcommand)]
enum Commands {
pub enum Commands {
/// Only use the Readability parser
Readability {
/// Source HTML file
#[arg(long, value_name = "FILE")]
html: Option<PathBuf>,
/// Base to complete relative Url
#[arg(long, value_name = "URL")]
base_url: Option<String>,
/// Source Url to download HTML from
#[arg(long, value_name = "URL")]
source_url: Option<String>,
},

View file

@ -1,8 +1,49 @@
use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands};
use clap::Parser;
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
use url::Url;
mod args;
/// CLI entry point: parse arguments, set up terminal logging, and
/// dispatch to the selected subcommand.
///
/// (The diff rendering left the pre-commit placeholder lines —
/// `let _args = args::Args::parse();` and `println!("hello world");` —
/// interleaved with the new body; they are removed here so the function
/// reflects the committed version and parses arguments exactly once.)
pub fn main() {
    let args = Args::parse();

    // --debug raises the log level from Info to Debug.
    let level = if args.debug {
        LevelFilter::Debug
    } else {
        LevelFilter::Info
    };

    // Logger initialization can only fail if a logger is already set;
    // that is a startup bug, so unwrap is acceptable here.
    TermLogger::init(
        level,
        Config::default(),
        TerminalMode::Mixed,
        ColorChoice::Auto,
    )
    .unwrap();

    match args.command {
        Commands::Readability {
            html,
            base_url,
            source_url,
        } => extract_readability(html, source_url, base_url),
    }

    log::info!("hello world");
}
fn extract_readability(
html_file: Option<PathBuf>,
source_url: Option<String>,
base_url: Option<String>,
) {
if html_file.is_none() && source_url.is_none() {
log::error!("");
exit(0);
}
let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
}