mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
cli: progress bar for image download
This commit is contained in:
parent
3dd7c7d57a
commit
f427b7c36f
7 changed files with 71 additions and 18 deletions
|
@ -23,7 +23,6 @@ rust-embed="6.6"
|
||||||
once_cell = "1.17"
|
once_cell = "1.17"
|
||||||
escaper = "0.1"
|
escaper = "0.1"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
byte-unit = "4.0"
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
env_logger = "0.10"
|
env_logger = "0.10"
|
|
@ -959,10 +959,6 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !found_something {
|
|
||||||
log::error!("no body found");
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(found_something)
|
Ok(found_something)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,16 +2,17 @@ pub use self::error::ImageDownloadError;
|
||||||
use self::request::ImageRequest;
|
use self::request::ImageRequest;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
use byte_unit::Byte;
|
|
||||||
use image::ImageOutputFormat;
|
use image::ImageOutputFormat;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
|
pub use progress::Progress;
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use tokio::sync::mpsc::{self, Sender};
|
use tokio::sync::mpsc::{self, Sender};
|
||||||
|
|
||||||
mod error;
|
mod error;
|
||||||
|
mod progress;
|
||||||
mod request;
|
mod request;
|
||||||
|
|
||||||
pub struct ImageDownloader {
|
pub struct ImageDownloader {
|
||||||
|
@ -27,13 +28,15 @@ impl ImageDownloader {
|
||||||
&self,
|
&self,
|
||||||
html: &str,
|
html: &str,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
|
progress: Option<Sender<Progress>>,
|
||||||
) -> Result<String, ImageDownloadError> {
|
) -> Result<String, ImageDownloadError> {
|
||||||
let parser = Parser::default_html();
|
let parser = Parser::default_html();
|
||||||
let doc = parser
|
let doc = parser
|
||||||
.parse_string(html)
|
.parse_string(html)
|
||||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
|
|
||||||
self.download_images_from_document(&doc, client).await?;
|
self.download_images_from_document(&doc, client, progress)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let options = SaveOptions {
|
let options = SaveOptions {
|
||||||
format: false,
|
format: false,
|
||||||
|
@ -52,6 +55,7 @@ impl ImageDownloader {
|
||||||
&self,
|
&self,
|
||||||
doc: &Document,
|
doc: &Document,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
|
progress: Option<Sender<Progress>>,
|
||||||
) -> Result<(), ImageDownloadError> {
|
) -> Result<(), ImageDownloadError> {
|
||||||
let xpath_ctx = Context::new(doc).map_err(|()| {
|
let xpath_ctx = Context::new(doc).map_err(|()| {
|
||||||
log::error!("Failed to create xpath context for document");
|
log::error!("Failed to create xpath context for document");
|
||||||
|
@ -74,7 +78,7 @@ impl ImageDownloader {
|
||||||
.filter_map(|r| r.ok())
|
.filter_map(|r| r.ok())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let size = res
|
let total_size = res
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(req, parent_req)| {
|
.map(|(req, parent_req)| {
|
||||||
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
|
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
|
||||||
|
@ -95,16 +99,18 @@ impl ImageDownloader {
|
||||||
|
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let mut received = 0_usize;
|
let mut received = 0_usize;
|
||||||
let size = Byte::from_bytes(size as u128);
|
|
||||||
let adjusted_size = size.get_appropriate_unit(true);
|
|
||||||
println!("downloading {adjusted_size}");
|
|
||||||
|
|
||||||
while let Some(i) = rx.recv().await {
|
while let Some(i) = rx.recv().await {
|
||||||
received += i;
|
received += i;
|
||||||
|
|
||||||
let received_bytes = Byte::from_bytes(received as u128);
|
if let Some(progress) = progress.as_ref() {
|
||||||
let received_adjusted = received_bytes.get_appropriate_unit(true);
|
_ = progress
|
||||||
println!("received {received_adjusted} / {adjusted_size}");
|
.send(Progress {
|
||||||
|
total_size,
|
||||||
|
downloaded: received,
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -262,7 +268,7 @@ mod tests {
|
||||||
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
|
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
|
||||||
.expect("Failed to read HTML");
|
.expect("Failed to read HTML");
|
||||||
let result = image_dowloader
|
let result = image_dowloader
|
||||||
.download_images_from_string(&html, &Client::new())
|
.download_images_from_string(&html, &Client::new(), None)
|
||||||
.await
|
.await
|
||||||
.expect("Failed to downalod images");
|
.expect("Failed to downalod images");
|
||||||
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")
|
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")
|
||||||
|
|
4
article_scraper/src/images/progress.rs
Normal file
4
article_scraper/src/images/progress.rs
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
/// Snapshot of an image download's progress, sent over a channel to observers.
///
/// Both values use the same unit; the CLI consumer renders them as byte
/// counts, and the downloader fills `total_size` from the summed content
/// lengths of the image requests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Progress {
    /// Expected total amount to download.
    pub total_size: usize,
    /// Amount received so far.
    pub downloaded: usize,
}
|
|
@ -5,6 +5,7 @@ mod full_text_parser;
|
||||||
pub mod images;
|
pub mod images;
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
|
use crate::images::Progress;
|
||||||
use article::Article;
|
use article::Article;
|
||||||
use error::ScraperError;
|
use error::ScraperError;
|
||||||
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
||||||
|
@ -13,6 +14,7 @@ pub use full_text_parser::Readability;
|
||||||
use images::ImageDownloader;
|
use images::ImageDownloader;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
full_text_parser: FullTextParser,
|
full_text_parser: FullTextParser,
|
||||||
|
@ -32,6 +34,7 @@ impl ArticleScraper {
|
||||||
url: &url::Url,
|
url: &url::Url,
|
||||||
download_images: bool,
|
download_images: bool,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
|
progress: Option<Sender<Progress>>,
|
||||||
) -> Result<Article, ScraperError> {
|
) -> Result<Article, ScraperError> {
|
||||||
let res = self.full_text_parser.parse(url, client).await?;
|
let res = self.full_text_parser.parse(url, client).await?;
|
||||||
|
|
||||||
|
@ -39,7 +42,7 @@ impl ArticleScraper {
|
||||||
if let Some(document) = res.document.as_ref() {
|
if let Some(document) = res.document.as_ref() {
|
||||||
let _image_res = self
|
let _image_res = self
|
||||||
.image_downloader
|
.image_downloader
|
||||||
.download_images_from_document(document, client)
|
.download_images_from_document(document, client, progress)
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,4 +14,5 @@ simplelog = "0.12"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
url = "2.3"
|
url = "2.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
||||||
|
indicatif = "0.17"
|
|
@ -2,11 +2,14 @@ use std::path::Path;
|
||||||
use std::{path::PathBuf, process::exit};
|
use std::{path::PathBuf, process::exit};
|
||||||
|
|
||||||
use crate::args::{Args, Commands};
|
use crate::args::{Args, Commands};
|
||||||
|
use article_scraper::images::Progress;
|
||||||
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
|
use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
|
use simplelog::{ColorChoice, Config, LevelFilter, TermLogger, TerminalMode};
|
||||||
|
use tokio::sync::mpsc::{self, Sender};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
mod args;
|
mod args;
|
||||||
|
@ -58,8 +61,10 @@ async fn extract_full(source_url: String, download_images: bool, output: Option<
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let tx = monitor_progress();
|
||||||
|
|
||||||
let res = scraper
|
let res = scraper
|
||||||
.parse(&source_url, download_images, &Client::new())
|
.parse(&source_url, download_images, &Client::new(), Some(tx))
|
||||||
.await;
|
.await;
|
||||||
let article = match res {
|
let article = match res {
|
||||||
Ok(article) => article,
|
Ok(article) => article,
|
||||||
|
@ -205,3 +210,42 @@ async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> Str
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn monitor_progress() -> Sender<Progress> {
|
||||||
|
let (tx, mut rx) = mpsc::channel::<Progress>(2);
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut progress_bar: Option<ProgressBar> = None;
|
||||||
|
|
||||||
|
while let Some(progress) = rx.recv().await {
|
||||||
|
if let Some(progress_bar) = progress_bar.as_ref() {
|
||||||
|
if progress.downloaded >= progress.total_size {
|
||||||
|
progress_bar.finish_with_message("done");
|
||||||
|
} else {
|
||||||
|
progress_bar.set_position(progress.downloaded as u64);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let pb = ProgressBar::new(progress.total_size as u64);
|
||||||
|
pb.set_style(
|
||||||
|
ProgressStyle::with_template(
|
||||||
|
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.with_key(
|
||||||
|
"eta",
|
||||||
|
|state: &ProgressState, w: &mut dyn std::fmt::Write| {
|
||||||
|
write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.progress_chars("#>-"),
|
||||||
|
);
|
||||||
|
|
||||||
|
pb.set_position(progress.downloaded as u64);
|
||||||
|
|
||||||
|
progress_bar = Some(pb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
tx
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue