mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
CLI to parse a single page with ftr
This commit is contained in:
parent
d978059709
commit
fa41633e11
4 changed files with 173 additions and 34 deletions
|
@ -35,7 +35,69 @@ impl FullTextParser {
|
|||
Self { config_files }
|
||||
}
|
||||
|
||||
pub async fn parse(
|
||||
pub async fn parse_offline(
|
||||
&self,
|
||||
html: &str,
|
||||
config: Option<&ConfigEntry>,
|
||||
base_url: Option<Url>,
|
||||
) -> Result<String, FullTextParserError> {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
|
||||
let global_config = self
|
||||
.config_files
|
||||
.get("global.txt")
|
||||
.ok_or(FullTextParserError::Config)?;
|
||||
|
||||
let url =
|
||||
base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
|
||||
|
||||
let mut article = Article {
|
||||
title: None,
|
||||
author: None,
|
||||
url,
|
||||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
};
|
||||
|
||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
let mut root =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||
document.set_root_element(&root);
|
||||
|
||||
Self::generate_head(&mut root, &document)?;
|
||||
|
||||
let document = Self::parse_html(html, config, global_config)?;
|
||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
|
||||
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
||||
if article.thumbnail_url.is_none() {
|
||||
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
||||
}
|
||||
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
||||
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
||||
if !found_body {
|
||||
log::error!("Ftr failed to find content");
|
||||
return Err(FullTextParserError::Scrape);
|
||||
}
|
||||
|
||||
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
||||
log::info!("Next page url: {url}");
|
||||
}
|
||||
|
||||
if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
|
||||
log::error!("Preventing self closing tags failed - '{error}'");
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
Self::post_process_document(&document)?;
|
||||
|
||||
article.document = Some(document);
|
||||
let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
pub(crate) async fn parse(
|
||||
&self,
|
||||
url: &url::Url,
|
||||
client: &Client,
|
||||
|
|
|
@ -7,6 +7,7 @@ mod util;
|
|||
|
||||
use article::Article;
|
||||
use error::ScraperError;
|
||||
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
||||
pub use full_text_parser::FullTextParser;
|
||||
pub use full_text_parser::Readability;
|
||||
use images::ImageDownloader;
|
||||
|
|
|
@ -32,4 +32,22 @@ pub enum Commands {
|
|||
#[arg(long, value_name = "URL")]
|
||||
source_url: Option<String>,
|
||||
},
|
||||
Ftr {
|
||||
/// Source HTML file
|
||||
#[arg(long, value_name = "FILE")]
|
||||
html: Option<PathBuf>,
|
||||
|
||||
/// Base to complete relative Url
|
||||
#[arg(long, value_name = "URL")]
|
||||
base_url: Option<String>,
|
||||
|
||||
/// Source Url to download HTML from
|
||||
#[arg(long, value_name = "URL")]
|
||||
source_url: Option<String>,
|
||||
|
||||
/// The Ftr config to use
|
||||
/// Otherwise source_url and base_url will be used
|
||||
#[arg(long, value_name = "domain")]
|
||||
config: Option<String>,
|
||||
},
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use std::path::Path;
|
||||
use std::{path::PathBuf, process::exit};
|
||||
|
||||
use crate::args::{Args, Commands};
|
||||
use article_scraper::FtrConfigEntry;
|
||||
use article_scraper::FullTextParser;
|
||||
use article_scraper::Readability;
|
||||
use clap::Parser;
|
||||
|
@ -34,9 +36,62 @@ async fn main() {
|
|||
base_url,
|
||||
source_url,
|
||||
} => extract_readability(html, source_url, base_url, args.output).await,
|
||||
Commands::Ftr {
|
||||
html,
|
||||
base_url,
|
||||
source_url,
|
||||
config,
|
||||
} => extract_ftr(html, source_url, base_url, config, args.output).await,
|
||||
}
|
||||
}
|
||||
|
||||
log::info!("hello world");
|
||||
async fn extract_ftr(
|
||||
html_file: Option<PathBuf>,
|
||||
source_url: Option<String>,
|
||||
base_url: Option<String>,
|
||||
config: Option<String>,
|
||||
output: Option<PathBuf>,
|
||||
) {
|
||||
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
|
||||
let html = get_html(html_file, source_url).await;
|
||||
|
||||
let config = if let Some(config_path) = config {
|
||||
match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
|
||||
Ok(entry) => Some(entry),
|
||||
Err(error) => {
|
||||
log::error!("Failed to parse config entry {config_path}: {error}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let full_text_parser = FullTextParser::new(None).await;
|
||||
let result = match full_text_parser
|
||||
.parse_offline(&html, config.as_ref(), base_url)
|
||||
.await
|
||||
{
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
log::error!("Failed to extract content with ftr: {err}");
|
||||
exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
let output = if let Some(output) = output {
|
||||
output
|
||||
} else {
|
||||
PathBuf::from("result.html")
|
||||
};
|
||||
|
||||
match std::fs::write(&output, result) {
|
||||
Ok(()) => log::info!("successfully written result to {output:?}"),
|
||||
Err(err) => {
|
||||
log::error!("Failed to write to file {output:?}: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn extract_readability(
|
||||
|
@ -45,39 +100,8 @@ async fn extract_readability(
|
|||
base_url: Option<String>,
|
||||
output: Option<PathBuf>,
|
||||
) {
|
||||
if html_file.is_none() && source_url.is_none() {
|
||||
log::error!("either need a source html file or source url");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if html_file.is_some() && source_url.is_some() {
|
||||
log::error!("load source from html file or url? only specify one of the two options");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
|
||||
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
|
||||
|
||||
let html = if let Some(source_url) = source_url {
|
||||
match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
|
||||
Ok(html) => html,
|
||||
Err(err) => {
|
||||
log::error!("Failed to download html from url: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
} else if let Some(source_file) = html_file {
|
||||
match std::fs::read_to_string(&source_file) {
|
||||
Ok(html) => html,
|
||||
Err(err) => {
|
||||
log::error!("Failed to read file {source_file:?}: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
let html = get_html(html_file, source_url).await;
|
||||
let result = match Readability::extract_from_str(&html, base_url).await {
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
|
@ -100,3 +124,37 @@ async fn extract_readability(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
|
||||
if html_file.is_none() && source_url.is_none() {
|
||||
log::error!("either need a source html file or source url");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if html_file.is_some() && source_url.is_some() {
|
||||
log::error!("load source from html file or url? only specify one of the two options");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
|
||||
|
||||
if let Some(source_url) = source_url {
|
||||
match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
|
||||
Ok(html) => html,
|
||||
Err(err) => {
|
||||
log::error!("Failed to download html from url: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
} else if let Some(source_file) = html_file {
|
||||
match std::fs::read_to_string(&source_file) {
|
||||
Ok(html) => html,
|
||||
Err(err) => {
|
||||
log::error!("Failed to read file {source_file:?}: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue