mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
tmp: calc download size & print progress
This commit is contained in:
parent
ccc8223db0
commit
3dd7c7d57a
4 changed files with 55 additions and 16 deletions
|
@ -10,7 +10,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
|||
[dependencies]
|
||||
thiserror = "1.0"
|
||||
libxml = "0.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
|
||||
url = "2.3"
|
||||
regex = "1.7"
|
||||
|
@ -23,6 +23,7 @@ rust-embed="6.6"
|
|||
once_cell = "1.17"
|
||||
escaper = "0.1"
|
||||
futures = "0.3"
|
||||
byte-unit = "4.0"
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger = "0.10"
|
|
@ -2,12 +2,14 @@ pub use self::error::ImageDownloadError;
|
|||
use self::request::ImageRequest;
|
||||
use crate::util::Util;
|
||||
use base64::Engine;
|
||||
use byte_unit::Byte;
|
||||
use image::ImageOutputFormat;
|
||||
use libxml::parser::Parser;
|
||||
use libxml::tree::{Document, Node, SaveOptions};
|
||||
use libxml::xpath::Context;
|
||||
use reqwest::{Client, Url};
|
||||
use std::io::Cursor;
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
|
||||
mod error;
|
||||
mod request;
|
||||
|
@ -72,12 +74,40 @@ impl ImageDownloader {
|
|||
.filter_map(|r| r.ok())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let size = res
|
||||
.iter()
|
||||
.map(|(req, parent_req)| {
|
||||
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
|
||||
})
|
||||
.sum::<usize>();
|
||||
|
||||
let (tx, mut rx) = mpsc::channel::<usize>(2);
|
||||
|
||||
let mut download_futures = Vec::new();
|
||||
|
||||
for (request, parent_request) in res {
|
||||
download_futures.push(self.download_and_replace_image(request, parent_request));
|
||||
download_futures.push(self.download_and_replace_image(
|
||||
request,
|
||||
parent_request,
|
||||
tx.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut received = 0_usize;
|
||||
let size = Byte::from_bytes(size as u128);
|
||||
let adjusted_size = size.get_appropriate_unit(true);
|
||||
println!("downloading {adjusted_size}");
|
||||
|
||||
while let Some(i) = rx.recv().await {
|
||||
received += i;
|
||||
|
||||
let received_bytes = Byte::from_bytes(received as u128);
|
||||
let received_adjusted = received_bytes.get_appropriate_unit(true);
|
||||
println!("received {received_adjusted} / {adjusted_size}");
|
||||
}
|
||||
});
|
||||
|
||||
_ = futures::future::join_all(download_futures).await;
|
||||
|
||||
Ok(())
|
||||
|
@ -118,13 +148,14 @@ impl ImageDownloader {
|
|||
&self,
|
||||
mut request: ImageRequest,
|
||||
mut parent_request: Option<ImageRequest>,
|
||||
tx: Sender<usize>,
|
||||
) -> Result<(), ImageDownloadError> {
|
||||
let mut image = request.download().await?;
|
||||
let mut image = request.download(&tx).await?;
|
||||
let mut parent_image: Option<Vec<u8>> = None;
|
||||
|
||||
if let Some(parent_request) = parent_request.as_mut() {
|
||||
if parent_request.content_length() > request.content_length() {
|
||||
parent_image = parent_request.download().await.ok();
|
||||
parent_image = parent_request.download(&tx).await.ok();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
use futures::StreamExt;
|
||||
use libxml::tree::Node;
|
||||
use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use super::ImageDownloadError;
|
||||
|
||||
pub struct ImageRequest {
|
||||
node: Node,
|
||||
http_response: Option<Response>,
|
||||
content_length: u64,
|
||||
content_length: usize,
|
||||
content_type: String,
|
||||
}
|
||||
|
||||
|
@ -33,14 +35,19 @@ impl ImageRequest {
|
|||
})
|
||||
}
|
||||
|
||||
pub async fn download(&mut self) -> Result<Vec<u8>, ImageDownloadError> {
|
||||
pub async fn download(&mut self, tx: &Sender<usize>) -> Result<Vec<u8>, ImageDownloadError> {
|
||||
if let Some(http_response) = self.http_response.take() {
|
||||
let result = http_response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| ImageDownloadError::Http)?
|
||||
.as_ref()
|
||||
.to_vec();
|
||||
let mut stream = http_response.bytes_stream();
|
||||
|
||||
let mut result = Vec::with_capacity(self.content_length);
|
||||
while let Some(item) = stream.next().await {
|
||||
let chunk = item.map_err(|_| ImageDownloadError::Http)?;
|
||||
_ = tx.send(chunk.len()).await;
|
||||
for byte in chunk {
|
||||
result.push(byte);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
} else {
|
||||
log::warn!("imagerequest already consumed");
|
||||
|
@ -52,7 +59,7 @@ impl ImageRequest {
|
|||
&self.content_type
|
||||
}
|
||||
|
||||
pub fn content_length(&self) -> u64 {
|
||||
pub fn content_length(&self) -> usize {
|
||||
self.content_length
|
||||
}
|
||||
|
||||
|
@ -60,7 +67,7 @@ impl ImageRequest {
|
|||
_ = self.node.set_property(prop_name, data);
|
||||
}
|
||||
|
||||
fn get_content_length(response: &Response) -> Result<u64, ImageDownloadError> {
|
||||
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||
let status_code = response.status();
|
||||
|
||||
if !status_code.is_success() {
|
||||
|
@ -72,7 +79,7 @@ impl ImageRequest {
|
|||
.headers()
|
||||
.get(reqwest::header::CONTENT_LENGTH)
|
||||
.and_then(|content_length| content_length.to_str().ok())
|
||||
.and_then(|content_length| content_length.parse::<u64>().ok())
|
||||
.and_then(|content_length| content_length.parse::<usize>().ok())
|
||||
.ok_or(ImageDownloadError::ContentLength)
|
||||
}
|
||||
|
||||
|
|
|
@ -13,5 +13,5 @@ clap = { version = "4.2", features = [ "derive" ] }
|
|||
simplelog = "0.12"
|
||||
log = "0.4"
|
||||
url = "2.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
Loading…
Add table
Add a link
Reference in a new issue