mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
tmp: calc download size & print progress
This commit is contained in:
parent
ccc8223db0
commit
3dd7c7d57a
4 changed files with 55 additions and 16 deletions
|
@ -10,7 +10,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
libxml = "0.3"
|
libxml = "0.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
|
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
|
||||||
url = "2.3"
|
url = "2.3"
|
||||||
regex = "1.7"
|
regex = "1.7"
|
||||||
|
@ -23,6 +23,7 @@ rust-embed="6.6"
|
||||||
once_cell = "1.17"
|
once_cell = "1.17"
|
||||||
escaper = "0.1"
|
escaper = "0.1"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
|
byte-unit = "4.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
env_logger = "0.10"
|
env_logger = "0.10"
|
|
@ -2,12 +2,14 @@ pub use self::error::ImageDownloadError;
|
||||||
use self::request::ImageRequest;
|
use self::request::ImageRequest;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
|
use byte_unit::Byte;
|
||||||
use image::ImageOutputFormat;
|
use image::ImageOutputFormat;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
use tokio::sync::mpsc::{self, Sender};
|
||||||
|
|
||||||
mod error;
|
mod error;
|
||||||
mod request;
|
mod request;
|
||||||
|
@ -72,12 +74,40 @@ impl ImageDownloader {
|
||||||
.filter_map(|r| r.ok())
|
.filter_map(|r| r.ok())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let size = res
|
||||||
|
.iter()
|
||||||
|
.map(|(req, parent_req)| {
|
||||||
|
req.content_length() + parent_req.as_ref().map(|r| r.content_length()).unwrap_or(0)
|
||||||
|
})
|
||||||
|
.sum::<usize>();
|
||||||
|
|
||||||
|
let (tx, mut rx) = mpsc::channel::<usize>(2);
|
||||||
|
|
||||||
let mut download_futures = Vec::new();
|
let mut download_futures = Vec::new();
|
||||||
|
|
||||||
for (request, parent_request) in res {
|
for (request, parent_request) in res {
|
||||||
download_futures.push(self.download_and_replace_image(request, parent_request));
|
download_futures.push(self.download_and_replace_image(
|
||||||
|
request,
|
||||||
|
parent_request,
|
||||||
|
tx.clone(),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut received = 0_usize;
|
||||||
|
let size = Byte::from_bytes(size as u128);
|
||||||
|
let adjusted_size = size.get_appropriate_unit(true);
|
||||||
|
println!("downloading {adjusted_size}");
|
||||||
|
|
||||||
|
while let Some(i) = rx.recv().await {
|
||||||
|
received += i;
|
||||||
|
|
||||||
|
let received_bytes = Byte::from_bytes(received as u128);
|
||||||
|
let received_adjusted = received_bytes.get_appropriate_unit(true);
|
||||||
|
println!("received {received_adjusted} / {adjusted_size}");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
_ = futures::future::join_all(download_futures).await;
|
_ = futures::future::join_all(download_futures).await;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -118,13 +148,14 @@ impl ImageDownloader {
|
||||||
&self,
|
&self,
|
||||||
mut request: ImageRequest,
|
mut request: ImageRequest,
|
||||||
mut parent_request: Option<ImageRequest>,
|
mut parent_request: Option<ImageRequest>,
|
||||||
|
tx: Sender<usize>,
|
||||||
) -> Result<(), ImageDownloadError> {
|
) -> Result<(), ImageDownloadError> {
|
||||||
let mut image = request.download().await?;
|
let mut image = request.download(&tx).await?;
|
||||||
let mut parent_image: Option<Vec<u8>> = None;
|
let mut parent_image: Option<Vec<u8>> = None;
|
||||||
|
|
||||||
if let Some(parent_request) = parent_request.as_mut() {
|
if let Some(parent_request) = parent_request.as_mut() {
|
||||||
if parent_request.content_length() > request.content_length() {
|
if parent_request.content_length() > request.content_length() {
|
||||||
parent_image = parent_request.download().await.ok();
|
parent_image = parent_request.download(&tx).await.ok();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
|
use futures::StreamExt;
|
||||||
use libxml::tree::Node;
|
use libxml::tree::Node;
|
||||||
use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
|
use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
|
||||||
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
use super::ImageDownloadError;
|
use super::ImageDownloadError;
|
||||||
|
|
||||||
pub struct ImageRequest {
|
pub struct ImageRequest {
|
||||||
node: Node,
|
node: Node,
|
||||||
http_response: Option<Response>,
|
http_response: Option<Response>,
|
||||||
content_length: u64,
|
content_length: usize,
|
||||||
content_type: String,
|
content_type: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,14 +35,19 @@ impl ImageRequest {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn download(&mut self) -> Result<Vec<u8>, ImageDownloadError> {
|
pub async fn download(&mut self, tx: &Sender<usize>) -> Result<Vec<u8>, ImageDownloadError> {
|
||||||
if let Some(http_response) = self.http_response.take() {
|
if let Some(http_response) = self.http_response.take() {
|
||||||
let result = http_response
|
let mut stream = http_response.bytes_stream();
|
||||||
.bytes()
|
|
||||||
.await
|
let mut result = Vec::with_capacity(self.content_length);
|
||||||
.map_err(|_| ImageDownloadError::Http)?
|
while let Some(item) = stream.next().await {
|
||||||
.as_ref()
|
let chunk = item.map_err(|_| ImageDownloadError::Http)?;
|
||||||
.to_vec();
|
_ = tx.send(chunk.len()).await;
|
||||||
|
for byte in chunk {
|
||||||
|
result.push(byte);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(result)
|
Ok(result)
|
||||||
} else {
|
} else {
|
||||||
log::warn!("imagerequest already consumed");
|
log::warn!("imagerequest already consumed");
|
||||||
|
@ -52,7 +59,7 @@ impl ImageRequest {
|
||||||
&self.content_type
|
&self.content_type
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn content_length(&self) -> u64 {
|
pub fn content_length(&self) -> usize {
|
||||||
self.content_length
|
self.content_length
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,7 +67,7 @@ impl ImageRequest {
|
||||||
_ = self.node.set_property(prop_name, data);
|
_ = self.node.set_property(prop_name, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_content_length(response: &Response) -> Result<u64, ImageDownloadError> {
|
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||||
let status_code = response.status();
|
let status_code = response.status();
|
||||||
|
|
||||||
if !status_code.is_success() {
|
if !status_code.is_success() {
|
||||||
|
@ -72,7 +79,7 @@ impl ImageRequest {
|
||||||
.headers()
|
.headers()
|
||||||
.get(reqwest::header::CONTENT_LENGTH)
|
.get(reqwest::header::CONTENT_LENGTH)
|
||||||
.and_then(|content_length| content_length.to_str().ok())
|
.and_then(|content_length| content_length.to_str().ok())
|
||||||
.and_then(|content_length| content_length.parse::<u64>().ok())
|
.and_then(|content_length| content_length.parse::<usize>().ok())
|
||||||
.ok_or(ImageDownloadError::ContentLength)
|
.ok_or(ImageDownloadError::ContentLength)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -13,5 +13,5 @@ clap = { version = "4.2", features = [ "derive" ] }
|
||||||
simplelog = "0.12"
|
simplelog = "0.12"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
url = "2.3"
|
url = "2.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
Loading…
Add table
Add a link
Reference in a new issue