mirror of https://gitlab.com/news-flash/article_scraper.git

cli to parse single page with ftr

Jan Lukas Gernert, 2023-04-10 13:47:45 +02:00
commit fa41633e11 (parent d978059709)
4 changed files with 173 additions and 34 deletions
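In short, this commit gives FullTextParser a `parse_offline` method that runs already-downloaded HTML through the full-text (ftr) extraction pipeline without touching the network, re-exports the ftr `ConfigEntry` type as `FtrConfigEntry`, and adds a matching `ftr` subcommand to the CLI. Going by the clap definitions below, an invocation would look something like `article_scraper_cli ftr --html page.html --base-url https://example.com/ --config example.com.txt` (the binary name is assumed; it is not part of this diff).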


@@ -35,7 +35,69 @@ impl FullTextParser {
         Self { config_files }
     }
 
-    pub async fn parse(
+    pub async fn parse_offline(
+        &self,
+        html: &str,
+        config: Option<&ConfigEntry>,
+        base_url: Option<Url>,
+    ) -> Result<String, FullTextParserError> {
+        libxml::tree::node::set_node_rc_guard(10);
+
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or(FullTextParserError::Config)?;
+
+        let url =
+            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
+
+        let mut article = Article {
+            title: None,
+            author: None,
+            url,
+            date: None,
+            thumbnail_url: None,
+            document: None,
+        };
+
+        let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
+        let mut root =
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        document.set_root_element(&root);
+
+        Self::generate_head(&mut root, &document)?;
+
+        let document = Self::parse_html(html, config, global_config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
+
+        metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
+        if article.thumbnail_url.is_none() {
+            Self::check_for_thumbnail(&xpath_ctx, &mut article);
+        }
+        Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
+        let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
+        if !found_body {
+            log::error!("Ftr failed to find content");
+            return Err(FullTextParserError::Scrape);
+        }
+
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
+            log::info!("Next page url: {url}");
+        }
+
+        if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
+            log::error!("Preventing self closing tags failed - '{error}'");
+            return Err(error);
+        }
+
+        Self::post_process_document(&document)?;
+
+        article.document = Some(document);
+        let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
+        Ok(html)
+    }
+
+    pub(crate) async fn parse(
         &self,
         url: &url::Url,
         client: &Client,
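For illustration, a minimal sketch of driving the new method from library code, mirroring how the CLI below calls it; the tokio runtime, the input file name, and the base URL are assumptions, not part of the commit:

use article_scraper::FullTextParser;
use url::Url;

#[tokio::main]
async fn main() {
    // HTML obtained elsewhere; parse_offline itself never downloads anything.
    let html = std::fs::read_to_string("page.html").expect("failed to read input");

    // Optional base to resolve relative links; without it the method above
    // falls back to the http://fakehost/test/base/ placeholder.
    let base_url = Url::parse("https://example.com/").ok();

    // Constructed the same way the new CLI subcommand constructs it.
    let parser = FullTextParser::new(None).await;

    // No site-specific ConfigEntry is passed here, so presumably only the
    // global.txt rules apply.
    match parser.parse_offline(&html, None, base_url).await {
        Ok(article_html) => println!("{article_html}"),
        Err(err) => eprintln!("ftr extraction failed: {err}"),
    }
}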


@@ -7,6 +7,7 @@ mod util;
 use article::Article;
 use error::ScraperError;
 
+pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
 pub use full_text_parser::FullTextParser;
 pub use full_text_parser::Readability;
 use images::ImageDownloader;


@@ -32,4 +32,22 @@ pub enum Commands {
         #[arg(long, value_name = "URL")]
         source_url: Option<String>,
     },
+    Ftr {
+        /// Source HTML file
+        #[arg(long, value_name = "FILE")]
+        html: Option<PathBuf>,
+
+        /// Base to complete relative Url
+        #[arg(long, value_name = "URL")]
+        base_url: Option<String>,
+
+        /// Source Url to download HTML from
+        #[arg(long, value_name = "URL")]
+        source_url: Option<String>,
+
+        /// The Ftr config to use
+        /// Otherwise source_url and base_url will be used
+        #[arg(long, value_name = "domain")]
+        config: Option<String>,
+    },
 }
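A quick sanity check of the new flag wiring, as a sketch: it assumes `Args` is the clap entry point whose `command` field main.rs matches on, that `parse_from` ignores its first element (the program name), and that the file names are placeholders:

#[test]
fn ftr_subcommand_parses() {
    use clap::Parser;

    let args = Args::parse_from([
        "article_scraper_cli",
        "ftr",
        "--html", "page.html",
        "--base-url", "https://example.com/",
        "--config", "example.com.txt",
    ]);

    // clap derives the subcommand name "ftr" from the Ftr variant.
    match args.command {
        Commands::Ftr { html, base_url, config, .. } => {
            assert!(html.is_some());
            assert_eq!(base_url.as_deref(), Some("https://example.com/"));
            assert_eq!(config.as_deref(), Some("example.com.txt"));
        }
        _ => panic!("expected the ftr subcommand"),
    }
}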


@@ -1,6 +1,8 @@
+use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FtrConfigEntry;
 use article_scraper::FullTextParser;
 use article_scraper::Readability;
 use clap::Parser;
@@ -34,9 +36,62 @@ async fn main() {
             base_url,
             source_url,
         } => extract_readability(html, source_url, base_url, args.output).await,
+        Commands::Ftr {
+            html,
+            base_url,
+            source_url,
+            config,
+        } => extract_ftr(html, source_url, base_url, config, args.output).await,
     }
+}
 
-    log::info!("hello world");
+async fn extract_ftr(
+    html_file: Option<PathBuf>,
+    source_url: Option<String>,
+    base_url: Option<String>,
+    config: Option<String>,
+    output: Option<PathBuf>,
+) {
+    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+    let html = get_html(html_file, source_url).await;
+
+    let config = if let Some(config_path) = config {
+        match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
+            Ok(entry) => Some(entry),
+            Err(error) => {
+                log::error!("Failed to parse config entry {config_path}: {error}");
+                exit(0);
+            }
+        }
+    } else {
+        None
+    };
+
+    let full_text_parser = FullTextParser::new(None).await;
+    let result = match full_text_parser
+        .parse_offline(&html, config.as_ref(), base_url)
+        .await
+    {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with ftr: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
 }
 
 async fn extract_readability(
@@ -45,39 +100,8 @@ async fn extract_readability(
     base_url: Option<String>,
     output: Option<PathBuf>,
 ) {
-    if html_file.is_none() && source_url.is_none() {
-        log::error!("either need a source html file or source url");
-        exit(0);
-    }
-
-    if html_file.is_some() && source_url.is_some() {
-        log::error!("load source from html file or url? only specify one of the two options");
-        exit(0);
-    }
-
-    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
-
-    let html = if let Some(source_url) = source_url {
-        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to download html from url: {err}");
-                exit(0);
-            }
-        }
-    } else if let Some(source_file) = html_file {
-        match std::fs::read_to_string(&source_file) {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to read file {source_file:?}: {err}");
-                exit(0);
-            }
-        }
-    } else {
-        unreachable!()
-    };
+    let html = get_html(html_file, source_url).await;
 
     let result = match Readability::extract_from_str(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
@@ -100,3 +124,37 @@ async fn extract_readability(
         }
     }
 }
+
+async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
+    if html_file.is_none() && source_url.is_none() {
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
+        exit(0);
+    }
+
+    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
+
+    if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    }
+}