mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
cli: progress bar for image download
This commit is contained in:
parent
3dd7c7d57a
commit
f427b7c36f
7 changed files with 71 additions and 18 deletions
|
@ -23,7 +23,6 @@ rust-embed="6.6"
|
|||
once_cell = "1.17"
|
||||
escaper = "0.1"
|
||||
futures = "0.3"
|
||||
byte-unit = "4.0"
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger = "0.10"
|
|
@ -959,10 +959,6 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
if !found_something {
|
||||
log::error!("no body found");
|
||||
}
|
||||
|
||||
Ok(found_something)
|
||||
}
|
||||
|
||||
|
|
|
@ -2,16 +2,17 @@ pub use self::error::ImageDownloadError;
|
|||
use self::request::ImageRequest;
|
||||
use crate::util::Util;
|
||||
use base64::Engine;
|
||||
use byte_unit::Byte;
|
||||
use image::ImageOutputFormat;
|
||||
use libxml::parser::Parser;
|
||||
use libxml::tree::{Document, Node, SaveOptions};
|
||||
use libxml::xpath::Context;
|
||||
pub use progress::Progress;
|
||||
use reqwest::{Client, Url};
|
||||
use std::io::Cursor;
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
|
||||
mod error;
|
||||
mod progress;
|
||||
mod request;
|
||||
|
||||
pub struct ImageDownloader {
|
||||
|
@ -27,13 +28,15 @@ impl ImageDownloader {
|
|||
&self,
|
||||
html: &str,
|
||||
client: &Client,
|
||||
progress: Option<Sender<Progress>>,
|
||||
) -> Result<String, ImageDownloadError> {
|
||||
let parser = Parser::default_html();
|
||||
let doc = parser
|
||||
.parse_string(html)
|
||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||
|
||||
self.download_images_from_document(&doc, client).await?;
|
||||
self.download_images_from_document(&doc, client, progress)
|
||||
.await?;
|
||||
|
||||
let options = SaveOptions {
|
||||
format: false,
|
||||
|
@ -52,6 +55,7 @@ impl ImageDownloader {
|
|||
&self,
|
||||
doc: &Document,
|
||||
client: &Client,
|
||||
progress: Option<Sender<Progress>>,
|
||||
) -> Result<(), ImageDownloadError> {
|
||||
let xpath_ctx = Context::new(doc).map_err(|()| {
|
||||
log::error!("Failed to create xpath context for document");
|
||||
|
@ -74,7 +78,7 @@ impl ImageDownloader {
|
|||
.filter_map(|r| r.ok())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let size = res
|
||||
let total_size = res
|
||||
.iter()
|
||||
.map(|(req, parent_req)| {
|
||||
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
|
||||
|
@ -95,16 +99,18 @@ impl ImageDownloader {
|
|||
|
||||
tokio::spawn(async move {
|
||||
let mut received = 0_usize;
|
||||
let size = Byte::from_bytes(size as u128);
|
||||
let adjusted_size = size.get_appropriate_unit(true);
|
||||
println!("downloading {adjusted_size}");
|
||||
|
||||
while let Some(i) = rx.recv().await {
|
||||
received += i;
|
||||
|
||||
let received_bytes = Byte::from_bytes(received as u128);
|
||||
let received_adjusted = received_bytes.get_appropriate_unit(true);
|
||||
println!("received {received_adjusted} / {adjusted_size}");
|
||||
if let Some(progress) = progress.as_ref() {
|
||||
_ = progress
|
||||
.send(Progress {
|
||||
total_size,
|
||||
downloaded: received,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -262,7 +268,7 @@ mod tests {
|
|||
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
|
||||
.expect("Failed to read HTML");
|
||||
let result = image_dowloader
|
||||
.download_images_from_string(&html, &Client::new())
|
||||
.download_images_from_string(&html, &Client::new(), None)
|
||||
.await
|
||||
.expect("Failed to downalod images");
|
||||
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")
|
||||
|
|
4
article_scraper/src/images/progress.rs
Normal file
4
article_scraper/src/images/progress.rs
Normal file
|
@ -0,0 +1,4 @@
|
|||
/// Snapshot of the image download state, sent over the progress channel
/// each time more data has been received.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Progress {
    /// Sum of the content lengths of all image requests that are part of
    /// this download.
    pub total_size: usize,
    /// Running total of what has been received so far, in the same unit as
    /// `total_size`.
    pub downloaded: usize,
}
|
|
@ -5,6 +5,7 @@ mod full_text_parser;
|
|||
pub mod images;
|
||||
mod util;
|
||||
|
||||
use crate::images::Progress;
|
||||
use article::Article;
|
||||
use error::ScraperError;
|
||||
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
||||
|
@ -13,6 +14,7 @@ pub use full_text_parser::Readability;
|
|||
use images::ImageDownloader;
|
||||
use reqwest::Client;
|
||||
use std::path::Path;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
pub struct ArticleScraper {
|
||||
full_text_parser: FullTextParser,
|
||||
|
@ -32,6 +34,7 @@ impl ArticleScraper {
|
|||
url: &url::Url,
|
||||
download_images: bool,
|
||||
client: &Client,
|
||||
progress: Option<Sender<Progress>>,
|
||||
) -> Result<Article, ScraperError> {
|
||||
let res = self.full_text_parser.parse(url, client).await?;
|
||||
|
||||
|
@ -39,7 +42,7 @@ impl ArticleScraper {
|
|||
if let Some(document) = res.document.as_ref() {
|
||||
let _image_res = self
|
||||
.image_downloader
|
||||
.download_images_from_document(document, client)
|
||||
.download_images_from_document(document, client, progress)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,4 +14,5 @@ simplelog = "0.12"
|
|||
log = "0.4"
|
||||
url = "2.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
||||
indicatif = "0.17"
|
|
@ -2,11 +2,14 @@ use std::path::Path;
|
|||
use std::{path::PathBuf, process::exit};
|
||||
|
||||
use crate::args::{Args, Commands};
|
||||
use article_scraper::images::Progress;
|
||||
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
|
||||
use clap::Parser;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
use url::Url;
|
||||
|
||||
mod args;
|
||||
|
@ -58,8 +61,10 @@ async fn extract_full(source_url: String, download_images: bool, output: Option<
|
|||
}
|
||||
};
|
||||
|
||||
let tx = monitor_progress();
|
||||
|
||||
let res = scraper
|
||||
.parse(&source_url, download_images, &Client::new())
|
||||
.parse(&source_url, download_images, &Client::new(), Some(tx))
|
||||
.await;
|
||||
let article = match res {
|
||||
Ok(article) => article,
|
||||
|
@ -205,3 +210,42 @@ async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> Str
|
|||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
fn monitor_progress() -> Sender<Progress> {
|
||||
let (tx, mut rx) = mpsc::channel::<Progress>(2);
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut progress_bar: Option<ProgressBar> = None;
|
||||
|
||||
while let Some(progress) = rx.recv().await {
|
||||
if let Some(progress_bar) = progress_bar.as_ref() {
|
||||
if progress.downloaded >= progress.total_size {
|
||||
progress_bar.finish_with_message("done");
|
||||
} else {
|
||||
progress_bar.set_position(progress.downloaded as u64);
|
||||
}
|
||||
} else {
|
||||
let pb = ProgressBar::new(progress.total_size as u64);
|
||||
pb.set_style(
|
||||
ProgressStyle::with_template(
|
||||
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})",
|
||||
)
|
||||
.unwrap()
|
||||
.with_key(
|
||||
"eta",
|
||||
|state: &ProgressState, w: &mut dyn std::fmt::Write| {
|
||||
write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap()
|
||||
},
|
||||
)
|
||||
.progress_chars("#>-"),
|
||||
);
|
||||
|
||||
pb.set_position(progress.downloaded as u64);
|
||||
|
||||
progress_bar = Some(pb);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
tx
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue