mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
readability cli
This commit is contained in:
parent
a2719c8c7e
commit
063996d62f
5 changed files with 95 additions and 10 deletions
|
@ -14,6 +14,45 @@ use crate::{constants, util::Util};
|
|||
pub struct Readability;
|
||||
|
||||
impl Readability {
|
||||
async fn extract_from_str(
|
||||
html: &str,
|
||||
base_url: Option<url::Url>,
|
||||
) -> Result<String, FullTextParserError> {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
let empty_config = crate::full_text_parser::config::ConfigEntry::default();
|
||||
|
||||
let url =
|
||||
base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
|
||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
|
||||
|
||||
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
|
||||
let mut article = crate::article::Article {
|
||||
title: None,
|
||||
author: None,
|
||||
url,
|
||||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
};
|
||||
|
||||
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
let mut root =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||
article_document.set_root_element(&root);
|
||||
|
||||
crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
|
||||
super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
|
||||
crate::FullTextParser::post_process_document(&article_document)?;
|
||||
|
||||
article.document = Some(article_document);
|
||||
let html = article
|
||||
.get_content()
|
||||
.ok_or(FullTextParserError::Readability)?;
|
||||
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
pub fn extract_body(
|
||||
document: Document,
|
||||
root: &mut Node,
|
||||
|
|
|
@ -43,8 +43,6 @@ async fn run_test(name: &str) {
|
|||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
"./resources/tests/readability/{name}/expected.html"
|
||||
))
|
||||
|
|
|
@ -9,4 +9,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
|||
|
||||
[dependencies]
|
||||
article_scraper = { path = "../article_scraper/" }
|
||||
clap = { version = "4.2", features = [ "derive" ] }
|
||||
simplelog = "0.12"
|
||||
log = "0.4"
|
||||
url = "2.3"
|
|
@ -6,25 +6,29 @@ use std::path::PathBuf;
|
|||
pub struct Args {
|
||||
/// Turn debug logging on
|
||||
#[arg(short, long)]
|
||||
debug: bool,
|
||||
pub debug: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Option<Commands>,
|
||||
pub command: Commands,
|
||||
|
||||
/// Destination of resulting HTML file
|
||||
#[arg(short, long, value_name = "FILE")]
|
||||
output: Option<PathBuf>,
|
||||
pub output: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
pub enum Commands {
|
||||
/// Only use the Readability parser
|
||||
Readability {
|
||||
/// Source HTML file
|
||||
#[arg(long, value_name = "FILE")]
|
||||
html: Option<PathBuf>,
|
||||
|
||||
/// Source Url
|
||||
/// Base to complete relative Url
|
||||
#[arg(long, value_name = "URL")]
|
||||
base_url: Option<String>,
|
||||
|
||||
/// Source Url to download HTML from
|
||||
#[arg(long, value_name = "URL")]
|
||||
source_url: Option<String>,
|
||||
},
|
||||
|
|
|
@ -1,8 +1,49 @@
|
|||
use std::{path::PathBuf, process::exit};
|
||||
|
||||
use crate::args::{Args, Commands};
|
||||
use clap::Parser;
|
||||
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
|
||||
use url::Url;
|
||||
|
||||
mod args;
|
||||
|
||||
pub fn main() {
|
||||
let _args = args::Args::parse();
|
||||
println!("hello world");
|
||||
let args = Args::parse();
|
||||
|
||||
let level = if args.debug {
|
||||
LevelFilter::Debug
|
||||
} else {
|
||||
LevelFilter::Info
|
||||
};
|
||||
TermLogger::init(
|
||||
level,
|
||||
Config::default(),
|
||||
TerminalMode::Mixed,
|
||||
ColorChoice::Auto,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
match args.command {
|
||||
Commands::Readability {
|
||||
html,
|
||||
base_url,
|
||||
source_url,
|
||||
} => extract_readability(html, source_url, base_url),
|
||||
}
|
||||
|
||||
log::info!("hello world");
|
||||
}
|
||||
|
||||
fn extract_readability(
|
||||
html_file: Option<PathBuf>,
|
||||
source_url: Option<String>,
|
||||
base_url: Option<String>,
|
||||
) {
|
||||
if html_file.is_none() && source_url.is_none() {
|
||||
log::error!("");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
|
||||
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue