1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

tmp: calc download size & print progress

This commit is contained in:
Jan Lukas Gernert 2023-04-16 18:10:43 +02:00
parent ccc8223db0
commit 3dd7c7d57a
4 changed files with 55 additions and 16 deletions

View file

@ -10,7 +10,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
[dependencies] [dependencies]
thiserror = "1.0" thiserror = "1.0"
libxml = "0.3" libxml = "0.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] } reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] } tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
url = "2.3" url = "2.3"
regex = "1.7" regex = "1.7"
@ -23,6 +23,7 @@ rust-embed="6.6"
once_cell = "1.17" once_cell = "1.17"
escaper = "0.1" escaper = "0.1"
futures = "0.3" futures = "0.3"
byte-unit = "4.0"
[dev-dependencies] [dev-dependencies]
env_logger = "0.10" env_logger = "0.10"

View file

@ -2,12 +2,14 @@ pub use self::error::ImageDownloadError;
use self::request::ImageRequest; use self::request::ImageRequest;
use crate::util::Util; use crate::util::Util;
use base64::Engine; use base64::Engine;
use byte_unit::Byte;
use image::ImageOutputFormat; use image::ImageOutputFormat;
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions}; use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context; use libxml::xpath::Context;
use reqwest::{Client, Url}; use reqwest::{Client, Url};
use std::io::Cursor; use std::io::Cursor;
use tokio::sync::mpsc::{self, Sender};
mod error; mod error;
mod request; mod request;
@ -72,12 +74,40 @@ impl ImageDownloader {
.filter_map(|r| r.ok()) .filter_map(|r| r.ok())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let size = res
.iter()
.map(|(req, parent_req)| {
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
})
.sum::<usize>();
let (tx, mut rx) = mpsc::channel::<usize>(2);
let mut download_futures = Vec::new(); let mut download_futures = Vec::new();
for (request, parent_request) in res { for (request, parent_request) in res {
download_futures.push(self.download_and_replace_image(request, parent_request)); download_futures.push(self.download_and_replace_image(
request,
parent_request,
tx.clone(),
));
} }
tokio::spawn(async move {
let mut received = 0_usize;
let size = Byte::from_bytes(size as u128);
let adjusted_size = size.get_appropriate_unit(true);
println!("downloading {adjusted_size}");
while let Some(i) = rx.recv().await {
received += i;
let received_bytes = Byte::from_bytes(received as u128);
let received_adjusted = received_bytes.get_appropriate_unit(true);
println!("received {received_adjusted} / {adjusted_size}");
}
});
_ = futures::future::join_all(download_futures).await; _ = futures::future::join_all(download_futures).await;
Ok(()) Ok(())
@ -118,13 +148,14 @@ impl ImageDownloader {
&self, &self,
mut request: ImageRequest, mut request: ImageRequest,
mut parent_request: Option<ImageRequest>, mut parent_request: Option<ImageRequest>,
tx: Sender<usize>,
) -> Result<(), ImageDownloadError> { ) -> Result<(), ImageDownloadError> {
let mut image = request.download().await?; let mut image = request.download(&tx).await?;
let mut parent_image: Option<Vec<u8>> = None; let mut parent_image: Option<Vec<u8>> = None;
if let Some(parent_request) = parent_request.as_mut() { if let Some(parent_request) = parent_request.as_mut() {
if parent_request.content_length() > request.content_length() { if parent_request.content_length() > request.content_length() {
parent_image = parent_request.download().await.ok(); parent_image = parent_request.download(&tx).await.ok();
} }
} }

View file

@ -1,12 +1,14 @@
use futures::StreamExt;
use libxml::tree::Node; use libxml::tree::Node;
use reqwest::{header::CONTENT_TYPE, Client, Response, Url}; use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
use tokio::sync::mpsc::Sender;
use super::ImageDownloadError; use super::ImageDownloadError;
pub struct ImageRequest { pub struct ImageRequest {
node: Node, node: Node,
http_response: Option<Response>, http_response: Option<Response>,
content_length: u64, content_length: usize,
content_type: String, content_type: String,
} }
@ -33,14 +35,19 @@ impl ImageRequest {
}) })
} }
pub async fn download(&mut self) -> Result<Vec<u8>, ImageDownloadError> { pub async fn download(&mut self, tx: &Sender<usize>) -> Result<Vec<u8>, ImageDownloadError> {
if let Some(http_response) = self.http_response.take() { if let Some(http_response) = self.http_response.take() {
let result = http_response let mut stream = http_response.bytes_stream();
.bytes()
.await let mut result = Vec::with_capacity(self.content_length);
.map_err(|_| ImageDownloadError::Http)? while let Some(item) = stream.next().await {
.as_ref() let chunk = item.map_err(|_| ImageDownloadError::Http)?;
.to_vec(); _ = tx.send(chunk.len()).await;
for byte in chunk {
result.push(byte);
}
}
Ok(result) Ok(result)
} else { } else {
log::warn!("imagerequest already consumed"); log::warn!("imagerequest already consumed");
@ -52,7 +59,7 @@ impl ImageRequest {
&self.content_type &self.content_type
} }
pub fn content_length(&self) -> u64 { pub fn content_length(&self) -> usize {
self.content_length self.content_length
} }
@ -60,7 +67,7 @@ impl ImageRequest {
_ = self.node.set_property(prop_name, data); _ = self.node.set_property(prop_name, data);
} }
fn get_content_length(response: &Response) -> Result<u64, ImageDownloadError> { fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
let status_code = response.status(); let status_code = response.status();
if !status_code.is_success() { if !status_code.is_success() {
@ -72,7 +79,7 @@ impl ImageRequest {
.headers() .headers()
.get(reqwest::header::CONTENT_LENGTH) .get(reqwest::header::CONTENT_LENGTH)
.and_then(|content_length| content_length.to_str().ok()) .and_then(|content_length| content_length.to_str().ok())
.and_then(|content_length| content_length.parse::<u64>().ok()) .and_then(|content_length| content_length.parse::<usize>().ok())
.ok_or(ImageDownloadError::ContentLength) .ok_or(ImageDownloadError::ContentLength)
} }

View file

@ -13,5 +13,5 @@ clap = { version = "4.2", features = [ "derive" ] }
simplelog = "0.12" simplelog = "0.12"
log = "0.4" log = "0.4"
url = "2.3" url = "2.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] } reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }