Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)
cli to parse single page with ftr
commit fa41633e11 (parent d978059709)
4 changed files with 173 additions and 34 deletions
@@ -35,7 +35,69 @@ impl FullTextParser {
         Self { config_files }
     }
 
-    pub async fn parse(
+    pub async fn parse_offline(
+        &self,
+        html: &str,
+        config: Option<&ConfigEntry>,
+        base_url: Option<Url>,
+    ) -> Result<String, FullTextParserError> {
+        libxml::tree::node::set_node_rc_guard(10);
+
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or(FullTextParserError::Config)?;
+
+        let url =
+            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
+
+        let mut article = Article {
+            title: None,
+            author: None,
+            url,
+            date: None,
+            thumbnail_url: None,
+            document: None,
+        };
+
+        let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
+        let mut root =
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        document.set_root_element(&root);
+
+        Self::generate_head(&mut root, &document)?;
+
+        let document = Self::parse_html(html, config, global_config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
+
+        metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
+        if article.thumbnail_url.is_none() {
+            Self::check_for_thumbnail(&xpath_ctx, &mut article);
+        }
+        Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
+        let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
+        if !found_body {
+            log::error!("Ftr failed to find content");
+            return Err(FullTextParserError::Scrape);
+        }
+
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
+            log::info!("Next page url: {url}");
+        }
+
+        if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
+            log::error!("Preventing self closing tags failed - '{error}'");
+            return Err(error);
+        }
+
+        Self::post_process_document(&document)?;
+
+        article.document = Some(document);
+        let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
+
+        Ok(html)
+    }
+
+    pub(crate) async fn parse(
         &self,
         url: &url::Url,
         client: &Client,
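As a usage note: the new parse_offline entry point runs the full extraction pipeline on an HTML string without any network access. A minimal sketch of calling it from library code follows; the tokio runtime, the input file name, and the error handling are illustrative assumptions, while FullTextParser::new and parse_offline are taken from this diff:

    use article_scraper::FullTextParser;

    #[tokio::main]
    async fn main() {
        // Any previously downloaded page works as input (file name is hypothetical).
        let html = std::fs::read_to_string("page.html").expect("failed to read page.html");

        // Passing None skips a user config directory; only the bundled
        // global.txt rules are loaded.
        let parser = FullTextParser::new(None).await;

        // With no site config and no base URL, parse_offline falls back to the
        // global rules and the http://fakehost/ placeholder base seen above.
        match parser.parse_offline(&html, None, None).await {
            Ok(content) => println!("{content}"),
            Err(err) => eprintln!("ftr extraction failed: {err}"),
        }
    }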
@@ -7,6 +7,7 @@ mod util;
 
 use article::Article;
 use error::ScraperError;
+pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
 pub use full_text_parser::FullTextParser;
 pub use full_text_parser::Readability;
 use images::ImageDownloader;
@@ -32,4 +32,22 @@ pub enum Commands {
         #[arg(long, value_name = "URL")]
         source_url: Option<String>,
     },
+    Ftr {
+        /// Source HTML file
+        #[arg(long, value_name = "FILE")]
+        html: Option<PathBuf>,
+
+        /// Base to complete relative Url
+        #[arg(long, value_name = "URL")]
+        base_url: Option<String>,
+
+        /// Source Url to download HTML from
+        #[arg(long, value_name = "URL")]
+        source_url: Option<String>,
+
+        /// The Ftr config to use
+        /// Otherwise source_url and base_url will be used
+        #[arg(long, value_name = "domain")]
+        config: Option<String>,
+    },
 }
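With clap's derive syntax, each field above becomes a long flag on the new ftr subcommand (base_url turns into --base-url, and so on). An illustrative invocation; the binary name and file names are assumptions, and --output is assumed to be the existing top-level output option used elsewhere in this commit:

    article_scraper_cli ftr --source-url https://example.com/post --config example.com.txt --output result.html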
@@ -1,6 +1,8 @@
+use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FtrConfigEntry;
 use article_scraper::FullTextParser;
 use article_scraper::Readability;
 use clap::Parser;
@@ -34,9 +36,62 @@ async fn main() {
             base_url,
             source_url,
         } => extract_readability(html, source_url, base_url, args.output).await,
+        Commands::Ftr {
+            html,
+            base_url,
+            source_url,
+            config,
+        } => extract_ftr(html, source_url, base_url, config, args.output).await,
     }
-
-    log::info!("hello world");
 }
 
+async fn extract_ftr(
+    html_file: Option<PathBuf>,
+    source_url: Option<String>,
+    base_url: Option<String>,
+    config: Option<String>,
+    output: Option<PathBuf>,
+) {
+    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+    let html = get_html(html_file, source_url).await;
+
+    let config = if let Some(config_path) = config {
+        match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
+            Ok(entry) => Some(entry),
+            Err(error) => {
+                log::error!("Failed to parse config entry {config_path}: {error}");
+                exit(0);
+            }
+        }
+    } else {
+        None
+    };
+
+    let full_text_parser = FullTextParser::new(None).await;
+    let result = match full_text_parser
+        .parse_offline(&html, config.as_ref(), base_url)
+        .await
+    {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with ftr: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
+}
+
 async fn extract_readability(
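For reference, FtrConfigEntry::parse_path above reads a single site-config entry from disk. These files follow the full-text-rss (ftr) site-config convention of one rule per line; a minimal illustrative entry is sketched below, with made-up XPath rules, since real entries differ per site:

    title: //h1[@class="entry-title"]
    body: //div[@class="entry-content"]
    strip: //div[@class="share-buttons"]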
@@ -45,39 +100,8 @@ async fn extract_readability(
     base_url: Option<String>,
     output: Option<PathBuf>,
 ) {
-    if html_file.is_none() && source_url.is_none() {
-        log::error!("either need a source html file or source url");
-        exit(0);
-    }
-
-    if html_file.is_some() && source_url.is_some() {
-        log::error!("load source from html file or url? only specify one of the two options");
-        exit(0);
-    }
-
-    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
-
-    let html = if let Some(source_url) = source_url {
-        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to download html from url: {err}");
-                exit(0);
-            }
-        }
-    } else if let Some(source_file) = html_file {
-        match std::fs::read_to_string(&source_file) {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to read file {source_file:?}: {err}");
-                exit(0);
-            }
-        }
-    } else {
-        unreachable!()
-    };
-
+    let html = get_html(html_file, source_url).await;
 
     let result = match Readability::extract_from_str(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
@@ -100,3 +124,37 @@ async fn extract_readability(
         }
     }
 }
+
+async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
+    if html_file.is_none() && source_url.is_none() {
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
+        exit(0);
+    }
+
+    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
+
+    if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    }
+}