1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

start improving image download

This commit is contained in:
Jan Lukas Gernert 2023-04-12 08:27:22 +02:00
parent c198225012
commit 35a14b0a5f
6 changed files with 189 additions and 105 deletions

View file

@ -22,7 +22,11 @@ pub enum Commands {
All {
/// Source Url to download HTML from
#[arg(long, value_name = "URL")]
source_url: Option<String>,
source_url: String,
/// Whether to download images
#[arg(short, long)]
download_images: bool,
},
/// Only use the Readability parser
Readability {

View file

@ -2,9 +2,7 @@ use std::path::Path;
use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands};
use article_scraper::FtrConfigEntry;
use article_scraper::FullTextParser;
use article_scraper::Readability;
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
use clap::Parser;
use reqwest::header::HeaderMap;
use reqwest::Client;
@ -31,7 +29,10 @@ async fn main() {
.unwrap();
match args.command {
Commands::All { source_url: _ } => unimplemented!(),
Commands::All {
source_url,
download_images,
} => extract_full(source_url, download_images, args.output).await,
Commands::Readability {
html,
base_url,
@ -46,6 +47,51 @@ async fn main() {
}
}
/// Scrapes the article at `source_url` and writes its content to a file.
///
/// * `source_url` - URL of the article; must be parseable by `Url::parse`.
/// * `download_images` - forwarded unchanged to `ArticleScraper::parse`.
/// * `output` - destination file; defaults to `result.html` in the current
///   working directory when `None`.
///
/// This function does not return an error. Any failure (invalid URL, scraping
/// error, article without content, failed file write) is logged and the
/// process terminates with exit status 1 so callers and scripts can detect it.
async fn extract_full(source_url: String, download_images: bool, output: Option<PathBuf>) {
    let scraper = ArticleScraper::new(None).await;

    // The parsed `Url` shadows the raw string; the error message still refers
    // to the original string because the shadowing only takes effect after
    // the `match` has been evaluated.
    let source_url = match Url::parse(&source_url) {
        Ok(url) => url,
        Err(error) => {
            log::error!("Failed to parse url {source_url}: {error}");
            exit(1);
        }
    };

    let res = scraper
        .parse(&source_url, download_images, &Client::new())
        .await;
    let article = match res {
        Ok(article) => article,
        Err(error) => {
            log::error!("Failed to grab article: {error}");
            exit(1);
        }
    };

    let output = output.unwrap_or_else(|| PathBuf::from("result.html"));

    let content = match article.get_content() {
        Some(content) => content,
        None => {
            log::error!("No Content");
            exit(1);
        }
    };

    match std::fs::write(&output, content) {
        Ok(()) => log::info!("successfully written result to {output:?}"),
        Err(err) => {
            log::error!("Failed to write to file {output:?}: {err}");
            exit(1);
        }
    }
}
async fn extract_ftr(
html_file: Option<PathBuf>,
source_url: Option<String>,