
cli: progress bar for image download

Jan Lukas Gernert 2023-04-16 21:31:11 +02:00
parent 3dd7c7d57a
commit f427b7c36f
7 changed files with 71 additions and 18 deletions


@@ -23,7 +23,6 @@ rust-embed="6.6"
once_cell = "1.17"
escaper = "0.1"
futures = "0.3"
byte-unit = "4.0"
[dev-dependencies]
env_logger = "0.10"


@@ -959,10 +959,6 @@ impl FullTextParser {
}
}
if !found_something {
log::error!("no body found");
}
Ok(found_something)
}


@@ -2,16 +2,17 @@ pub use self::error::ImageDownloadError;
use self::request::ImageRequest;
use crate::util::Util;
use base64::Engine;
use byte_unit::Byte;
use image::ImageOutputFormat;
use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context;
pub use progress::Progress;
use reqwest::{Client, Url};
use std::io::Cursor;
use tokio::sync::mpsc::{self, Sender};
mod error;
mod progress;
mod request;
pub struct ImageDownloader {
@@ -27,13 +28,15 @@ impl ImageDownloader {
&self,
html: &str,
client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<String, ImageDownloadError> {
let parser = Parser::default_html();
let doc = parser
.parse_string(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
self.download_images_from_document(&doc, client).await?;
self.download_images_from_document(&doc, client, progress)
.await?;
let options = SaveOptions {
format: false,
@@ -52,6 +55,7 @@ impl ImageDownloader {
&self,
doc: &Document,
client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<(), ImageDownloadError> {
let xpath_ctx = Context::new(doc).map_err(|()| {
log::error!("Failed to create xpath context for document");
@@ -74,7 +78,7 @@ impl ImageDownloader {
.filter_map(|r| r.ok())
.collect::<Vec<_>>();
let size = res
let total_size = res
.iter()
.map(|(req, parent_req)| {
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
@@ -95,16 +99,18 @@ impl ImageDownloader {
tokio::spawn(async move {
let mut received = 0_usize;
let size = Byte::from_bytes(size as u128);
let adjusted_size = size.get_appropriate_unit(true);
println!("downloading {adjusted_size}");
while let Some(i) = rx.recv().await {
received += i;
let received_bytes = Byte::from_bytes(received as u128);
let received_adjusted = received_bytes.get_appropriate_unit(true);
println!("received {received_adjusted} / {adjusted_size}");
if let Some(progress) = progress.as_ref() {
_ = progress
.send(Progress {
total_size,
downloaded: received,
})
.await;
}
}
});
@@ -262,7 +268,7 @@ mod tests {
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
.expect("Failed to read HTML");
let result = image_dowloader
.download_images_from_string(&html, &Client::new())
.download_images_from_string(&html, &Client::new(), None)
.await
.expect("Failed to downalod images");
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")


@@ -0,0 +1,4 @@
pub struct Progress {
pub total_size: usize,
pub downloaded: usize,
}
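For orientation only: Progress carries plain byte counts, so a consumer can derive a completion ratio directly from the two fields. A minimal sketch (the helper is illustrative and not part of this commit):

use article_scraper::images::Progress;

// Illustrative helper: completion ratio for a Progress update.
// Treats a zero total as already complete to avoid division by zero.
fn completion(p: &Progress) -> f64 {
    if p.total_size == 0 {
        1.0
    } else {
        p.downloaded as f64 / p.total_size as f64
    }
}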


@@ -5,6 +5,7 @@ mod full_text_parser;
pub mod images;
mod util;
use crate::images::Progress;
use article::Article;
use error::ScraperError;
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
@@ -13,6 +14,7 @@ pub use full_text_parser::Readability;
use images::ImageDownloader;
use reqwest::Client;
use std::path::Path;
use tokio::sync::mpsc::Sender;
pub struct ArticleScraper {
full_text_parser: FullTextParser,
@@ -32,6 +34,7 @@ impl ArticleScraper {
url: &url::Url,
download_images: bool,
client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<Article, ScraperError> {
let res = self.full_text_parser.parse(url, client).await?;
@@ -39,7 +42,7 @@
if let Some(document) = res.document.as_ref() {
let _image_res = self
.image_downloader
.download_images_from_document(document, client)
.download_images_from_document(document, client, progress)
.await;
}
}
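With the extra parameter on ArticleScraper::parse, a library consumer can opt into progress reporting by passing the sending half of a tokio channel. A minimal sketch under assumptions: only parse is changed in this commit, the ArticleScraper constructor shape and the reporter task here are illustrative.

use article_scraper::images::Progress;
use article_scraper::ArticleScraper;
use reqwest::Client;
use tokio::sync::mpsc;
use url::Url;

async fn scrape_with_progress(url: Url) {
    // Assumption: constructor takes an optional config path; not part of this diff.
    let scraper = ArticleScraper::new(None).await;

    // Same channel capacity the CLI uses in monitor_progress below.
    let (tx, mut rx) = mpsc::channel::<Progress>(2);

    // Plain-text reporter; the CLI renders the same updates with an indicatif bar instead.
    tokio::spawn(async move {
        while let Some(p) = rx.recv().await {
            println!("downloaded {} of {} bytes", p.downloaded, p.total_size);
        }
    });

    let res = scraper
        .parse(&url, true, &Client::new(), Some(tx))
        .await;
    println!("scrape finished: ok={}", res.is_ok());
}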


@@ -15,3 +15,4 @@ log = "0.4"
url = "2.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
indicatif = "0.17"


@@ -2,11 +2,14 @@ use std::path::Path;
use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands};
use article_scraper::images::Progress;
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
use clap::Parser;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use reqwest::header::HeaderMap;
use reqwest::Client;
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
use tokio::sync::mpsc::{self, Sender};
use url::Url;
mod args;
@@ -58,8 +61,10 @@ async fn extract_full(source_url: String, download_images: bool, output: Option<
}
};
let tx = monitor_progress();
let res = scraper
.parse(&source_url, download_images, &Client::new())
.parse(&source_url, download_images, &Client::new(), Some(tx))
.await;
let article = match res {
Ok(article) => article,
@@ -205,3 +210,42 @@ async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> Str
unreachable!()
}
}
fn monitor_progress() -> Sender<Progress> {
let (tx, mut rx) = mpsc::channel::<Progress>(2);
tokio::spawn(async move {
let mut progress_bar: Option<ProgressBar> = None;
while let Some(progress) = rx.recv().await {
if let Some(progress_bar) = progress_bar.as_ref() {
if progress.downloaded >= progress.total_size {
progress_bar.finish_with_message("done");
} else {
progress_bar.set_position(progress.downloaded as u64);
}
} else {
let pb = ProgressBar::new(progress.total_size as u64);
pb.set_style(
ProgressStyle::with_template(
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})",
)
.unwrap()
.with_key(
"eta",
|state: &ProgressState, w: &mut dyn std::fmt::Write| {
write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap()
},
)
.progress_chars("#>-"),
);
pb.set_position(progress.downloaded as u64);
progress_bar = Some(pb);
}
}
});
tx
}