mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

cli to parse single page with ftr

Jan Lukas Gernert 2023-04-10 13:47:45 +02:00
parent d978059709
commit fa41633e11
4 changed files with 173 additions and 34 deletions
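
The commit adds an ftr subcommand next to the existing readability one, so a single page can be run through the full-text-parser (ftr) pipeline from the command line. A hypothetical invocation (the binary name and the kebab-case flag spellings are inferred from the clap derive below, not shown in this diff):

    article_scraper_cli ftr --source-url <URL> --config <domain> --output result.html

Per the doc comment on the new variant, --config selects an explicit Ftr config entry; if it is omitted, source_url and base_url are used instead.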

View file

@@ -32,4 +32,22 @@ pub enum Commands {
        #[arg(long, value_name = "URL")]
        source_url: Option<String>,
    },
    Ftr {
        /// Source HTML file
        #[arg(long, value_name = "FILE")]
        html: Option<PathBuf>,
        /// Base to complete relative Url
        #[arg(long, value_name = "URL")]
        base_url: Option<String>,
        /// Source Url to download HTML from
        #[arg(long, value_name = "URL")]
        source_url: Option<String>,
        /// The Ftr config to use
        /// Otherwise source_url and base_url will be used
        #[arg(long, value_name = "domain")]
        config: Option<String>,
    },
}

View file

@@ -1,6 +1,8 @@
use std::path::Path;
use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands};
use article_scraper::FtrConfigEntry;
use article_scraper::FullTextParser;
use article_scraper::Readability;
use clap::Parser;
@@ -34,9 +36,62 @@ async fn main() {
            base_url,
            source_url,
        } => extract_readability(html, source_url, base_url, args.output).await,
        Commands::Ftr {
            html,
            base_url,
            source_url,
            config,
        } => extract_ftr(html, source_url, base_url, config, args.output).await,
    }
    log::info!("hello world");
}
async fn extract_ftr(
    html_file: Option<PathBuf>,
    source_url: Option<String>,
    base_url: Option<String>,
    config: Option<String>,
    output: Option<PathBuf>,
) {
    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
    let html = get_html(html_file, source_url).await;
    let config = if let Some(config_path) = config {
        match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
            Ok(entry) => Some(entry),
            Err(error) => {
                log::error!("Failed to parse config entry {config_path}: {error}");
                exit(0);
            }
        }
    } else {
        None
    };
    let full_text_parser = FullTextParser::new(None).await;
    let result = match full_text_parser
        .parse_offline(&html, config.as_ref(), base_url)
        .await
    {
        Ok(res) => res,
        Err(err) => {
            log::error!("Failed to extract content with ftr: {err}");
            exit(0);
        }
    };
    let output = if let Some(output) = output {
        output
    } else {
        PathBuf::from("result.html")
    };
    match std::fs::write(&output, result) {
        Ok(()) => log::info!("successfully written result to {output:?}"),
        Err(err) => {
            log::error!("Failed to write to file {output:?}: {err}");
            exit(0);
        }
    }
}
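
The subcommand is essentially a thin wrapper around FullTextParser::parse_offline: load the HTML (file or download), optionally parse a config entry, run the parser, write the result. A minimal library-side sketch of the same flow, with the CLI error handling replaced by an expect; the parse_offline signature is taken from the call above and not verified against the crate documentation:

// Sketch only: mirrors the extract_ftr flow without the CLI plumbing.
async fn run_ftr(html: &str, config: Option<&FtrConfigEntry>, base_url: Option<Url>) -> String {
    // No user config directory, matching FullTextParser::new(None) above.
    let parser = FullTextParser::new(None).await;
    parser
        .parse_offline(html, config, base_url)
        .await
        .expect("ftr extraction failed")
}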
async fn extract_readability(
@@ -45,39 +100,8 @@ async fn extract_readability(
    base_url: Option<String>,
    output: Option<PathBuf>,
) {
    if html_file.is_none() && source_url.is_none() {
        log::error!("either need a source html file or source url");
        exit(0);
    }
    if html_file.is_some() && source_url.is_some() {
        log::error!("load source from html file or url? only specify one of the two options");
        exit(0);
    }
    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
    let html = if let Some(source_url) = source_url {
        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
            Ok(html) => html,
            Err(err) => {
                log::error!("Failed to download html from url: {err}");
                exit(0);
            }
        }
    } else if let Some(source_file) = html_file {
        match std::fs::read_to_string(&source_file) {
            Ok(html) => html,
            Err(err) => {
                log::error!("Failed to read file {source_file:?}: {err}");
                exit(0);
            }
        }
    } else {
        unreachable!()
    };
    let html = get_html(html_file, source_url).await;
    let result = match Readability::extract_from_str(&html, base_url).await {
        Ok(res) => res,
        Err(err) => {
@@ -100,3 +124,37 @@ async fn extract_readability(
        }
    }
}
async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
    if html_file.is_none() && source_url.is_none() {
        log::error!("either need a source html file or source url");
        exit(0);
    }
    if html_file.is_some() && source_url.is_some() {
        log::error!("load source from html file or url? only specify one of the two options");
        exit(0);
    }
    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
    if let Some(source_url) = source_url {
        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
            Ok(html) => html,
            Err(err) => {
                log::error!("Failed to download html from url: {err}");
                exit(0);
            }
        }
    } else if let Some(source_file) = html_file {
        match std::fs::read_to_string(&source_file) {
            Ok(html) => html,
            Err(err) => {
                log::error!("Failed to read file {source_file:?}: {err}");
                exit(0);
            }
        }
    } else {
        unreachable!()
    }
}
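
Both subcommands now obtain their input through the shared get_html helper, which replaces the download-or-read block that previously lived inside extract_readability. Purely as an illustration (not part of this commit), the same helper can be written as one match over the option pair, which removes the need for the unreachable!() arm; it assumes the same imports as the file above:

// Illustrative alternative with identical behavior to get_html above.
async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
    match (html_file, source_url) {
        (None, None) => {
            log::error!("either need a source html file or source url");
            exit(0)
        }
        (Some(_), Some(_)) => {
            log::error!("load source from html file or url? only specify one of the two options");
            exit(0)
        }
        (None, Some(url)) => {
            let url = Url::parse(&url).expect("invalid source url");
            match FullTextParser::download(&url, &Client::new(), HeaderMap::new()).await {
                Ok(html) => html,
                Err(err) => {
                    log::error!("Failed to download html from url: {err}");
                    exit(0)
                }
            }
        }
        (Some(path), None) => match std::fs::read_to_string(&path) {
            Ok(html) => html,
            Err(err) => {
                log::error!("Failed to read file {path:?}: {err}");
                exit(0)
            }
        },
    }
}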