1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

cli: progress bar for image download

This commit is contained in:
Jan Lukas Gernert 2023-04-16 21:31:11 +02:00
parent 3dd7c7d57a
commit f427b7c36f
7 changed files with 71 additions and 18 deletions

View file

@ -23,7 +23,6 @@ rust-embed="6.6"
once_cell = "1.17" once_cell = "1.17"
escaper = "0.1" escaper = "0.1"
futures = "0.3" futures = "0.3"
byte-unit = "4.0"
[dev-dependencies] [dev-dependencies]
env_logger = "0.10" env_logger = "0.10"

View file

@ -959,10 +959,6 @@ impl FullTextParser {
} }
} }
if !found_something {
log::error!("no body found");
}
Ok(found_something) Ok(found_something)
} }

View file

@ -2,16 +2,17 @@ pub use self::error::ImageDownloadError;
use self::request::ImageRequest; use self::request::ImageRequest;
use crate::util::Util; use crate::util::Util;
use base64::Engine; use base64::Engine;
use byte_unit::Byte;
use image::ImageOutputFormat; use image::ImageOutputFormat;
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions}; use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context; use libxml::xpath::Context;
pub use progress::Progress;
use reqwest::{Client, Url}; use reqwest::{Client, Url};
use std::io::Cursor; use std::io::Cursor;
use tokio::sync::mpsc::{self, Sender}; use tokio::sync::mpsc::{self, Sender};
mod error; mod error;
mod progress;
mod request; mod request;
pub struct ImageDownloader { pub struct ImageDownloader {
@ -27,13 +28,15 @@ impl ImageDownloader {
&self, &self,
html: &str, html: &str,
client: &Client, client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<String, ImageDownloadError> { ) -> Result<String, ImageDownloadError> {
let parser = Parser::default_html(); let parser = Parser::default_html();
let doc = parser let doc = parser
.parse_string(html) .parse_string(html)
.map_err(|_| ImageDownloadError::HtmlParse)?; .map_err(|_| ImageDownloadError::HtmlParse)?;
self.download_images_from_document(&doc, client).await?; self.download_images_from_document(&doc, client, progress)
.await?;
let options = SaveOptions { let options = SaveOptions {
format: false, format: false,
@ -52,6 +55,7 @@ impl ImageDownloader {
&self, &self,
doc: &Document, doc: &Document,
client: &Client, client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<(), ImageDownloadError> { ) -> Result<(), ImageDownloadError> {
let xpath_ctx = Context::new(doc).map_err(|()| { let xpath_ctx = Context::new(doc).map_err(|()| {
log::error!("Failed to create xpath context for document"); log::error!("Failed to create xpath context for document");
@ -74,7 +78,7 @@ impl ImageDownloader {
.filter_map(|r| r.ok()) .filter_map(|r| r.ok())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let size = res let total_size = res
.iter() .iter()
.map(|(req, parent_req)| { .map(|(req, parent_req)| {
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0) req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
@ -95,16 +99,18 @@ impl ImageDownloader {
tokio::spawn(async move { tokio::spawn(async move {
let mut received = 0_usize; let mut received = 0_usize;
let size = Byte::from_bytes(size as u128);
let adjusted_size = size.get_appropriate_unit(true);
println!("downloading {adjusted_size}");
while let Some(i) = rx.recv().await { while let Some(i) = rx.recv().await {
received += i; received += i;
let received_bytes = Byte::from_bytes(received as u128); if let Some(progress) = progress.as_ref() {
let received_adjusted = received_bytes.get_appropriate_unit(true); _ = progress
println!("received {received_adjusted} / {adjusted_size}"); .send(Progress {
total_size,
downloaded: received,
})
.await;
}
} }
}); });
@ -262,7 +268,7 @@ mod tests {
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html") let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
.expect("Failed to read HTML"); .expect("Failed to read HTML");
let result = image_dowloader let result = image_dowloader
.download_images_from_string(&html, &Client::new()) .download_images_from_string(&html, &Client::new(), None)
.await .await
.expect("Failed to downalod images"); .expect("Failed to downalod images");
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html") let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")

View file

@ -0,0 +1,4 @@
/// Snapshot of an in-flight image download, sent over a progress channel.
///
/// Both fields are byte counts. `downloaded` is the running total received so
/// far; `total_size` is the sum of the content lengths reported for all
/// requests that make up the download.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Progress {
    /// Expected overall size of the download in bytes.
    pub total_size: usize,
    /// Number of bytes received so far.
    pub downloaded: usize,
}

View file

@ -5,6 +5,7 @@ mod full_text_parser;
pub mod images; pub mod images;
mod util; mod util;
use crate::images::Progress;
use article::Article; use article::Article;
use error::ScraperError; use error::ScraperError;
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry; pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
@ -13,6 +14,7 @@ pub use full_text_parser::Readability;
use images::ImageDownloader; use images::ImageDownloader;
use reqwest::Client; use reqwest::Client;
use std::path::Path; use std::path::Path;
use tokio::sync::mpsc::Sender;
pub struct ArticleScraper { pub struct ArticleScraper {
full_text_parser: FullTextParser, full_text_parser: FullTextParser,
@ -32,6 +34,7 @@ impl ArticleScraper {
url: &url::Url, url: &url::Url,
download_images: bool, download_images: bool,
client: &Client, client: &Client,
progress: Option<Sender<Progress>>,
) -> Result<Article, ScraperError> { ) -> Result<Article, ScraperError> {
let res = self.full_text_parser.parse(url, client).await?; let res = self.full_text_parser.parse(url, client).await?;
@ -39,7 +42,7 @@ impl ArticleScraper {
if let Some(document) = res.document.as_ref() { if let Some(document) = res.document.as_ref() {
let _image_res = self let _image_res = self
.image_downloader .image_downloader
.download_images_from_document(document, client) .download_images_from_document(document, client, progress)
.await; .await;
} }
} }

View file

@ -14,4 +14,5 @@ simplelog = "0.12"
log = "0.4" log = "0.4"
url = "2.3" url = "2.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
indicatif = "0.17"

View file

@ -2,11 +2,14 @@ use std::path::Path;
use std::{path::PathBuf, process::exit}; use std::{path::PathBuf, process::exit};
use crate::args::{Args, Commands}; use crate::args::{Args, Commands};
use article_scraper::images::Progress;
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability}; use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
use clap::Parser; use clap::Parser;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::Client; use reqwest::Client;
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode}; use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
use tokio::sync::mpsc::{self, Sender};
use url::Url; use url::Url;
mod args; mod args;
@ -58,8 +61,10 @@ async fn extract_full(source_url: String, download_images: bool, output: Option<
} }
}; };
let tx = monitor_progress();
let res = scraper let res = scraper
.parse(&source_url, download_images, &Client::new()) .parse(&source_url, download_images, &Client::new(), Some(tx))
.await; .await;
let article = match res { let article = match res {
Ok(article) => article, Ok(article) => article,
@ -205,3 +210,42 @@ async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> Str
unreachable!() unreachable!()
} }
} }
/// Spawns a background task that renders image-download progress on the
/// terminal and returns the channel sender used to feed it.
///
/// The task is detached: it keeps running until `recv` yields `None`, i.e.
/// until every clone of the returned [`Sender`] has been dropped. The channel
/// is bounded (capacity 2), so a sender awaits when the renderer falls behind.
///
/// The progress bar is created lazily from the first [`Progress`] message,
/// because the total size is only known once a message arrives. Every message,
/// including that first one, is then applied the same way: the bar is finished
/// once `downloaded >= total_size`, otherwise its position is advanced.
fn monitor_progress() -> Sender<Progress> {
    let (tx, mut rx) = mpsc::channel::<Progress>(2);

    tokio::spawn(async move {
        let mut progress_bar: Option<ProgressBar> = None;

        while let Some(progress) = rx.recv().await {
            // Build the bar on first use; later messages reuse it.
            let bar = progress_bar
                .get_or_insert_with(|| new_download_bar(progress.total_size as u64));

            // Handling the first message here as well means a download that
            // completes within a single update still finishes the bar.
            if progress.downloaded >= progress.total_size {
                // NOTE(review): the template below has no `{msg}` placeholder,
                // so "done" is presumably not rendered; confirm whether that is intended.
                bar.finish_with_message("done");
            } else {
                bar.set_position(progress.downloaded as u64);
            }
        }
    });

    tx
}

/// Builds the styled byte-count progress bar for a download of `total_size` bytes.
fn new_download_bar(total_size: u64) -> ProgressBar {
    let style = ProgressStyle::with_template(
        "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})",
    )
    .expect("static progress bar template is valid")
    // Custom `eta` key: print the remaining time as seconds with one decimal.
    .with_key(
        "eta",
        |state: &ProgressState, w: &mut dyn std::fmt::Write| {
            write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap()
        },
    )
    .progress_chars("#>-");

    let bar = ProgressBar::new(total_size);
    bar.set_style(style);
    bar
}