diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 02933b5..14087c3 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -4,6 +4,7 @@ use self::pair::Pair; use self::request::ImageRequest; use crate::util::Util; use base64::Engine; +use futures::StreamExt; use image::ImageOutputFormat; use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; @@ -28,6 +29,45 @@ impl ImageDownloader { ImageDownloader { max_size } } + pub async fn single_from_url( + url: &str, + client: &Client, + progress: Option>, + ) -> Result, ImageDownloadError> { + let response = client.get(url).send().await?; + + let content_type = Util::get_content_type(&response)?; + let content_length = Util::get_content_length(&response)?; + + if !content_type.contains("image") { + return Err(ImageDownloadError::ContentType); + } + + let mut stream = response.bytes_stream(); + let mut downloaded_bytes = 0; + + let mut result = Vec::with_capacity(content_length); + while let Some(item) = stream.next().await { + let chunk = item?; + downloaded_bytes += chunk.len(); + + if let Some(sender) = progress.as_ref() { + _ = sender + .send(Progress { + total_size: content_length, + downloaded: downloaded_bytes, + }) + .await; + } + + for byte in chunk { + result.push(byte); + } + } + + Ok(result) + } + pub async fn download_images_from_string( &self, html: &str, diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index c145598..b7086ce 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -1,7 +1,9 @@ use futures::StreamExt; -use reqwest::{header::CONTENT_TYPE, Client, Response}; +use reqwest::{Client, Response}; use tokio::sync::mpsc::Sender; +use crate::util::Util; + use super::{image_data::ImageData, ImageDownloadError}; #[derive(Debug)] @@ -16,8 +18,8 @@ impl ImageRequest { pub async fn new(url: String, client: &Client) -> Result { let response = client.get(&url).send().await?; - let content_type = Self::get_content_type(&response)?; - let content_length = Self::get_content_length(&response)?; + let content_type = Util::get_content_type(&response)?; + let content_length = Util::get_content_length(&response)?; if !content_type.contains("image") { return Err(ImageDownloadError::ContentType); @@ -58,33 +60,4 @@ impl ImageRequest { pub fn content_length(&self) -> usize { self.content_length } - - fn get_content_length(response: &Response) -> Result { - let status_code = response.status(); - - if !status_code.is_success() { - log::warn!("response: {status_code}"); - return Err(ImageDownloadError::Http); - } - - response - .headers() - .get(reqwest::header::CONTENT_LENGTH) - .and_then(|content_length| content_length.to_str().ok()) - .and_then(|content_length| content_length.parse::().ok()) - .ok_or(ImageDownloadError::ContentLength) - } - - fn get_content_type(response: &Response) -> Result { - if response.status().is_success() { - response - .headers() - .get(CONTENT_TYPE) - .and_then(|val| val.to_str().ok()) - .map(|val| val.to_string()) - .ok_or(ImageDownloadError::ContentType) - } else { - Err(ImageDownloadError::ContentType) - } - } } diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 57cd1b2..73adee8 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -5,7 +5,7 @@ use libxml::{ xpath::Context, }; use reqwest::{ - header::{HeaderMap, HeaderName, HeaderValue}, + header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE}, Response, }; use tokio::fs::DirEntry; @@ -14,6 +14,7 @@ use crate::{ constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX}, full_text_parser::{config::ConfigEntry, error::FullTextParserError}, image_object::ImageObject, + images::ImageDownloadError, video_object::VideoObject, }; @@ -1191,6 +1192,35 @@ impl Util { pub fn score_by_position(len: usize, index: usize) -> i32 { ((len as f32 / 2.0) - index as f32) as i32 } + + pub fn get_content_length(response: &Response) -> Result { + let status_code = response.status(); + + if !status_code.is_success() { + log::warn!("response: {status_code}"); + return Err(ImageDownloadError::Http); + } + + response + .headers() + .get(CONTENT_LENGTH) + .and_then(|content_length| content_length.to_str().ok()) + .and_then(|content_length| content_length.parse::().ok()) + .ok_or(ImageDownloadError::ContentLength) + } + + pub fn get_content_type(response: &Response) -> Result { + if response.status().is_success() { + response + .headers() + .get(CONTENT_TYPE) + .and_then(|val| val.to_str().ok()) + .map(|val| val.to_string()) + .ok_or(ImageDownloadError::ContentType) + } else { + Err(ImageDownloadError::ContentType) + } + } } #[cfg(test)]