mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
start improving image download
This commit is contained in:
parent
c198225012
commit
35a14b0a5f
6 changed files with 189 additions and 105 deletions
|
@ -22,7 +22,11 @@ pub enum Commands {
|
|||
All {
|
||||
/// Source Url to download HTML from
|
||||
#[arg(long, value_name = "URL")]
|
||||
source_url: Option<String>,
|
||||
source_url: String,
|
||||
|
||||
/// Download images and embed them into the resulting HTML
|
||||
#[arg(short, long)]
|
||||
download_images: bool,
|
||||
},
|
||||
/// Only use the Readability parser
|
||||
Readability {
|
||||
|
|
|
@ -2,9 +2,7 @@ use std::path::Path;
|
|||
use std::{path::PathBuf, process::exit};
|
||||
|
||||
use crate::args::{Args, Commands};
|
||||
use article_scraper::FtrConfigEntry;
|
||||
use article_scraper::FullTextParser;
|
||||
use article_scraper::Readability;
|
||||
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
|
||||
use clap::Parser;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
|
@ -31,7 +29,10 @@ async fn main() {
|
|||
.unwrap();
|
||||
|
||||
match args.command {
|
||||
Commands::All { source_url: _ } => unimplemented!(),
|
||||
Commands::All {
|
||||
source_url,
|
||||
download_images,
|
||||
} => extract_full(source_url, download_images, args.output).await,
|
||||
Commands::Readability {
|
||||
html,
|
||||
base_url,
|
||||
|
@ -46,6 +47,51 @@ async fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
async fn extract_full(source_url: String, download_images: bool, output: Option<PathBuf>) {
|
||||
let scraper = ArticleScraper::new(None).await;
|
||||
|
||||
let source_url = match Url::parse(&source_url) {
|
||||
Ok(url) => url,
|
||||
Err(error) => {
|
||||
log::error!("Failed to parse url {source_url}: {error}");
|
||||
exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
let res = scraper
|
||||
.parse(&source_url, download_images, &Client::new())
|
||||
.await;
|
||||
let article = match res {
|
||||
Ok(article) => article,
|
||||
Err(error) => {
|
||||
log::error!("Failed to grab article: {error}");
|
||||
exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
let output = if let Some(output) = output {
|
||||
output
|
||||
} else {
|
||||
PathBuf::from("result.html")
|
||||
};
|
||||
|
||||
let content = match article.get_content() {
|
||||
Some(content) => content,
|
||||
None => {
|
||||
log::error!("No Content");
|
||||
exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
match std::fs::write(&output, content) {
|
||||
Ok(()) => log::info!("successfully written result to {output:?}"),
|
||||
Err(err) => {
|
||||
log::error!("Failed to write to file {output:?}: {err}");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn extract_ftr(
|
||||
html_file: Option<PathBuf>,
|
||||
source_url: Option<String>,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue