From f427b7c36f62550d7514eb04630cbcd237176f83 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 16 Apr 2023 21:31:11 +0200 Subject: [PATCH] cli: progress bar for image download --- article_scraper/Cargo.toml | 1 - article_scraper/src/full_text_parser/mod.rs | 4 -- article_scraper/src/images/mod.rs | 26 +++++++----- article_scraper/src/images/progress.rs | 4 ++ article_scraper/src/lib.rs | 5 ++- article_scraper_cli/Cargo.toml | 3 +- article_scraper_cli/src/main.rs | 46 ++++++++++++++++++++- 7 files changed, 71 insertions(+), 18 deletions(-) create mode 100644 article_scraper/src/images/progress.rs diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 30da478..3d26507 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -23,7 +23,6 @@ rust-embed="6.6" once_cell = "1.17" escaper = "0.1" futures = "0.3" -byte-unit = "4.0" [dev-dependencies] env_logger = "0.10" \ No newline at end of file diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 2d3a65f..72a36b8 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -959,10 +959,6 @@ impl FullTextParser { } } - if !found_something { - log::error!("no body found"); - } - Ok(found_something) } diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index dc2d1b5..4ecd873 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -2,16 +2,17 @@ pub use self::error::ImageDownloadError; use self::request::ImageRequest; use crate::util::Util; use base64::Engine; -use byte_unit::Byte; use image::ImageOutputFormat; use libxml::parser::Parser; use libxml::tree::{Document, Node, SaveOptions}; use libxml::xpath::Context; +pub use progress::Progress; use reqwest::{Client, Url}; use std::io::Cursor; use tokio::sync::mpsc::{self, Sender}; mod error; +mod progress; mod request; pub struct ImageDownloader { @@ -27,13 
+28,15 @@ impl ImageDownloader { &self, html: &str, client: &Client, + progress: Option<Sender<Progress>>, ) -> Result<String, ImageDownloadError> { let parser = Parser::default_html(); let doc = parser .parse_string(html) .map_err(|_| ImageDownloadError::HtmlParse)?; - self.download_images_from_document(&doc, client).await?; + self.download_images_from_document(&doc, client, progress) + .await?; let options = SaveOptions { format: false, @@ -52,6 +55,7 @@ impl ImageDownloader { &self, doc: &Document, client: &Client, + progress: Option<Sender<Progress>>, ) -> Result<(), ImageDownloadError> { let xpath_ctx = Context::new(doc).map_err(|()| { log::error!("Failed to create xpath context for document"); @@ -74,7 +78,7 @@ impl ImageDownloader { .filter_map(|r| r.ok()) .collect::<Vec<_>>(); - let size = res + let total_size = res .iter() .map(|(req, parent_req)| { req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0) @@ -95,16 +99,18 @@ impl ImageDownloader { tokio::spawn(async move { let mut received = 0_usize; - let size = Byte::from_bytes(size as u128); - let adjusted_size = size.get_appropriate_unit(true); - println!("downloading {adjusted_size}"); while let Some(i) = rx.recv().await { received += i; - let received_bytes = Byte::from_bytes(received as u128); - let received_adjusted = received_bytes.get_appropriate_unit(true); - println!("received {received_adjusted} / {adjusted_size}"); + if let Some(progress) = progress.as_ref() { + _ = progress + .send(Progress { + total_size, + downloaded: received, + }) + .await; + } } }); @@ -262,7 +268,7 @@ mod tests { let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html") .expect("Failed to read HTML"); let result = image_dowloader - .download_images_from_string(&html, &Client::new()) + .download_images_from_string(&html, &Client::new(), None) .await .expect("Failed to downalod images"); let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html") diff --git a/article_scraper/src/images/progress.rs 
b/article_scraper/src/images/progress.rs new file mode 100644 index 0000000..8218fb9 --- /dev/null +++ b/article_scraper/src/images/progress.rs @@ -0,0 +1,4 @@ +pub struct Progress { + pub total_size: usize, + pub downloaded: usize, +} diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs index 0818eba..9e14fb8 100644 --- a/article_scraper/src/lib.rs +++ b/article_scraper/src/lib.rs @@ -5,6 +5,7 @@ mod full_text_parser; pub mod images; mod util; +use crate::images::Progress; use article::Article; use error::ScraperError; pub use full_text_parser::config::ConfigEntry as FtrConfigEntry; @@ -13,6 +14,7 @@ pub use full_text_parser::Readability; use images::ImageDownloader; use reqwest::Client; use std::path::Path; +use tokio::sync::mpsc::Sender; pub struct ArticleScraper { full_text_parser: FullTextParser, @@ -32,6 +34,7 @@ impl ArticleScraper { url: &url::Url, download_images: bool, client: &Client, + progress: Option<Sender<Progress>>, ) -> Result<Article, ScraperError> { let res = self.full_text_parser.parse(url, client).await?; @@ -39,7 +42,7 @@ impl ArticleScraper { if let Some(document) = res.document.as_ref() { let _image_res = self .image_downloader - .download_images_from_document(document, client) + .download_images_from_document(document, client, progress) .await; } } diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index ff77545..1ef293e 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -14,4 +14,5 @@ simplelog = "0.12" log = "0.4" url = "2.3" reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } \ No newline at end of file +tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } +indicatif = "0.17" \ No newline at end of file diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs index ac595e1..47d3e0a 100644 --- 
a/article_scraper_cli/src/main.rs +++ b/article_scraper_cli/src/main.rs @@ -2,11 +2,14 @@ use std::path::Path; use std::{path::PathBuf, process::exit}; use crate::args::{Args, Commands}; +use article_scraper::images::Progress; use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability}; use clap::Parser; +use indicatif::{ProgressBar, ProgressState, ProgressStyle}; use reqwest::header::HeaderMap; use reqwest::Client; use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode}; +use tokio::sync::mpsc::{self, Sender}; use url::Url; mod args; @@ -58,8 +61,10 @@ async fn extract_full(source_url: String, download_images: bool, output: Option< } }; + let tx = monitor_progress(); + let res = scraper - .parse(&source_url, download_images, &Client::new()) + .parse(&source_url, download_images, &Client::new(), Some(tx)) .await; let article = match res { Ok(article) => article, @@ -205,3 +210,42 @@ async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> Str unreachable!() } } + +fn monitor_progress() -> Sender<Progress> { + let (tx, mut rx) = mpsc::channel::<Progress>(2); + + tokio::spawn(async move { + let mut progress_bar: Option<ProgressBar> = None; + + while let Some(progress) = rx.recv().await { + if let Some(progress_bar) = progress_bar.as_ref() { + if progress.downloaded >= progress.total_size { + progress_bar.finish_with_message("done"); + } else { + progress_bar.set_position(progress.downloaded as u64); + } + } else { + let pb = ProgressBar::new(progress.total_size as u64); + pb.set_style( + ProgressStyle::with_template( + "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})", + ) + .unwrap() + .with_key( + "eta", + |state: &ProgressState, w: &mut dyn std::fmt::Write| { + write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap() + }, + ) + .progress_chars("#>-"), + ); + + pb.set_position(progress.downloaded as u64); + + progress_bar = Some(pb); + } + } + }); + + tx +}