images: fetch through ImageRequest and write "src"/"big-src" in one place

Changes relative to the first version of this patch:
- harvest_image_urls: a failing parent request no longer discards the image itself.
- download_and_replace_image: "big-src" is labelled with the content type of the bytes
  it actually contains, and a failed download is logged again.
- ImageRequest::get_content_type: a non-success status is reported as Http, like
  get_content_length does.

The blob ids on the index lines of mod.rs and request.rs are those of the first version.

diff --git a/article_scraper/src/images/error.rs b/article_scraper/src/images/error.rs
index 831313b..7135f56 100644
--- a/article_scraper/src/images/error.rs
+++ b/article_scraper/src/images/error.rs
@@ -11,7 +11,7 @@ pub enum ImageDownloadError {
     #[error("Generating image name failed")]
     ImageName,
     #[error("Getting the content-length property failed")]
-    ContentLenght,
+    ContentLength,
     #[error("Content-type suggest no image")]
     ContentType,
     #[error("Http error")]
diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs
index 7a89ad1..2a422c9 100644
--- a/article_scraper/src/images/mod.rs
+++ b/article_scraper/src/images/mod.rs
@@ -6,8 +6,7 @@ use image::ImageOutputFormat;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
-use reqwest::header::{HeaderValue, CONTENT_TYPE};
-use reqwest::{Client, Response, Url};
+use reqwest::{Client, Url};
 use std::io::Cursor;
 
 mod error;
@@ -76,14 +75,7 @@ impl ImageDownloader {
         let mut download_futures = Vec::new();
 
         for (request, parent_request) in res {
-            if let Some(parent_request) = parent_request {
-                if parent_request.content_lenght > request.content_lenght {
-                    download_futures
-                        .push(self.download_and_replace_image(parent_request, "big-src"));
-                }
-            }
-
-            download_futures.push(self.download_and_replace_image(request, "src"));
+            download_futures.push(self.download_and_replace_image(request, parent_request));
         }
 
         _ = futures::future::join_all(download_futures).await;
@@ -91,23 +83,6 @@ impl ImageDownloader {
         Ok(())
     }
 
-    async fn download_and_replace_image(&self, request: ImageRequest, prop_name: &str) {
-        let ImageRequest {
-            mut node,
-            http_response,
-            content_lenght,
-            content_type,
-        } = request;
-
-        _ = self
-            .download_image_base64(http_response, content_lenght, content_type)
-            .await
-            .map(|image| {
-                _ = node.set_property(prop_name, &image);
-            })
-            .map_err(|error| log::error!("Failed to download image: {error}"));
-    }
-
     async fn harvest_image_urls(
         node: Node,
         client: &Client,
@@ -128,71 +103,68 @@ impl ImageDownloader {
         };
 
         let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
-        let parent_request = Self::check_image_parent(&node, client).await.ok();
+        let parent_url = Self::check_image_parent(&node).await.ok();
 
-        println!("url: {url}");
-
-        let response = client
-            .get(url)
-            .send()
-            .await
-            .map_err(|_| ImageDownloadError::Http)?;
-        let content_type = ImageDownloader::get_content_type(&response);
-        let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
-
-        let request = ImageRequest {
-            node,
-            http_response: response,
-            content_lenght,
-            content_type,
+        let request = ImageRequest::new(node.clone(), &url, client).await?;
+        // The parent link is only a candidate for a larger image. It often points to a
+        // regular page, so a failing parent request must not discard the image itself.
+        let parent_request = match parent_url {
+            Some(parent_url) => ImageRequest::new(node, &parent_url, client).await.ok(),
+            None => None,
         };
 
         Ok((request, parent_request))
     }
 
-    async fn download_image_base64(
+    /// Downloads the image of `request` and embeds it as a base64 data URL in `src`.
+    ///
+    /// A larger variant, if there is one, is embedded in `big-src`: the image behind the
+    /// parent link when it announces a bigger content length, otherwise the unscaled
+    /// original whenever the image had to be scaled down.
+    async fn download_and_replace_image(
         &self,
-        http_response: Response,
-        content_length: u64,
-        content_type: Option<HeaderValue>,
-    ) -> Result<String, ImageDownloadError> {
-        if content_length == 0 {
-            return Err(ImageDownloadError::ContentLenght);
+        mut request: ImageRequest,
+        mut parent_request: Option<ImageRequest>,
+    ) -> Result<(), ImageDownloadError> {
+        let mut image = request.download().await.map_err(|error| {
+            // The caller joins all downloads and drops their results, so log here.
+            log::error!("Failed to download image: {error}");
+            error
+        })?;
+        // Bytes of the larger variant together with the content type they were served as.
+        let mut big_image: Option<(Vec<u8>, String)> = None;
+
+        if let Some(parent_request) = parent_request.as_mut() {
+            if parent_request.content_length() > request.content_length() {
+                big_image = parent_request
+                    .download()
+                    .await
+                    .ok()
+                    .map(|bytes| (bytes, parent_request.content_type().to_string()));
+            }
         }
 
-        let content_type = content_type
-            .as_ref()
-            .and_then(|content_type| content_type.to_str().ok())
-            .ok_or(ImageDownloadError::ContentType)?;
-
-        if !content_type.contains("image") {
-            return Err(ImageDownloadError::ContentType);
-        }
-
-        let mut image = http_response
-            .bytes()
-            .await
-            .map_err(|_| ImageDownloadError::Http)?
-            .as_ref()
-            .to_vec();
-
-        if content_type != "image/svg+xml" && content_type != "image/gif" {
+        if request.content_type() != "image/svg+xml" && request.content_type() != "image/gif" {
             if let Some(resized_image) = Self::scale_image(&image, self.max_size) {
+                if big_image.is_none() {
+                    big_image = Some((image, request.content_type().to_string()));
+                }
                 image = resized_image;
             }
         }
 
         let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image);
-        let image_string = format!("data:{};base64,{}", content_type, image_base64);
-        Ok(image_string)
-    }
+        let image_string = format!("data:{};base64,{}", request.content_type(), image_base64);
+        request.write_image_to_property("src", &image_string);
 
-    fn get_content_type(response: &Response) -> Option<HeaderValue> {
-        if response.status().is_success() {
-            response.headers().get(CONTENT_TYPE).cloned()
-        } else {
-            None
+        if let Some((big_image, content_type)) = big_image {
+            let big_image_base64 = base64::engine::general_purpose::STANDARD.encode(big_image);
+            let big_image_string = format!("data:{};base64,{}", content_type, big_image_base64);
+
+            request.write_image_to_property("big-src", &big_image_string);
         }
+
+        Ok(())
     }
 
     fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Option<Vec<u8>> {
@@ -225,10 +197,7 @@ impl ImageDownloader {
         }
     }
 
-    async fn check_image_parent(
-        node: &Node,
-        client: &Client,
-    ) -> Result<ImageRequest, ImageDownloadError> {
+    async fn check_image_parent(node: &Node) -> Result<Url, ImageDownloadError> {
         let parent = match node.get_parent() {
             Some(parent) => parent,
             None => {
@@ -255,38 +224,7 @@ impl ImageDownloader {
             ImageDownloadError::InvalidUrl(err)
         })?;
 
-        println!("parent url: {parent_url}");
-
-        let response = client
-            .get(parent_url.clone())
-            .send()
-            .await
-            .map_err(|_| ImageDownloadError::Http)?;
-        let content_type = ImageDownloader::get_content_type(&response);
-        let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
-
-        Ok(ImageRequest {
-            node: parent,
-            http_response: response,
-            content_lenght,
-            content_type,
-        })
-    }
-
-    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
-        let status_code = response.status();
-
-        if !status_code.is_success() {
-            log::warn!("response: {status_code}");
-            return Err(ImageDownloadError::Http);
-        }
-
-        response
-            .headers()
-            .get(reqwest::header::CONTENT_LENGTH)
-            .and_then(|content_length| content_length.to_str().ok())
-            .and_then(|content_length| content_length.parse::<u64>().ok())
-            .ok_or(ImageDownloadError::ContentLenght)
+        Ok(parent_url)
     }
 }
 
diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs
index bf91326..ee7cbfd 100644
--- a/article_scraper/src/images/request.rs
+++ b/article_scraper/src/images/request.rs
@@ -1,9 +1,106 @@
 use libxml::tree::Node;
-use reqwest::{header::HeaderValue, Response};
+use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
+
+use super::ImageDownloadError;
 
+/// A pending image download: the response headers have arrived, the body has not been
+/// read yet. Holds the node that receives the embedded image.
 pub struct ImageRequest {
-    pub node: Node,
-    pub http_response: Response,
-    pub content_lenght: u64,
-    pub content_type: Option<HeaderValue>,
+    node: Node,
+    http_response: Option<Response>,
+    content_length: u64,
+    content_type: String,
+}
+
+impl ImageRequest {
+    /// Requests `url` and validates the response headers.
+    ///
+    /// # Errors
+    ///
+    /// [`ImageDownloadError::Http`] if the request fails or the status is not a success,
+    /// [`ImageDownloadError::ContentType`] if the content type is missing or not an image,
+    /// [`ImageDownloadError::ContentLength`] if the content length is missing or malformed.
+    pub async fn new(node: Node, url: &Url, client: &Client) -> Result<Self, ImageDownloadError> {
+        let response = client
+            .get(url.clone())
+            .send()
+            .await
+            .map_err(|_| ImageDownloadError::Http)?;
+
+        let content_type = Self::get_content_type(&response)?;
+        let content_length = Self::get_content_length(&response)?;
+
+        if !content_type.contains("image") {
+            return Err(ImageDownloadError::ContentType);
+        }
+
+        Ok(Self {
+            node,
+            http_response: Some(response),
+            content_length,
+            content_type,
+        })
+    }
+
+    /// Reads the response body. The response is consumed, so a second call fails with
+    /// [`ImageDownloadError::Http`].
+    pub async fn download(&mut self) -> Result<Vec<u8>, ImageDownloadError> {
+        if let Some(http_response) = self.http_response.take() {
+            let result = http_response
+                .bytes()
+                .await
+                .map_err(|_| ImageDownloadError::Http)?
+                .as_ref()
+                .to_vec();
+            Ok(result)
+        } else {
+            log::warn!("imagerequest already consumed");
+            Err(ImageDownloadError::Http)
+        }
+    }
+
+    /// Content type reported by the server; always contains `image`.
+    pub fn content_type(&self) -> &str {
+        &self.content_type
+    }
+
+    /// Size of the body in bytes as announced by the `Content-Length` header.
+    pub fn content_length(&self) -> u64 {
+        self.content_length
+    }
+
+    /// Sets `prop_name` on the image node to `data`. Failures are ignored.
+    pub fn write_image_to_property(&mut self, prop_name: &str, data: &str) {
+        _ = self.node.set_property(prop_name, data);
+    }
+
+    fn get_content_length(response: &Response) -> Result<u64, ImageDownloadError> {
+        let status_code = response.status();
+
+        if !status_code.is_success() {
+            log::warn!("response: {status_code}");
+            return Err(ImageDownloadError::Http);
+        }
+
+        response
+            .headers()
+            .get(reqwest::header::CONTENT_LENGTH)
+            .and_then(|content_length| content_length.to_str().ok())
+            .and_then(|content_length| content_length.parse::<u64>().ok())
+            .ok_or(ImageDownloadError::ContentLength)
+    }
+
+    fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
+        if response.status().is_success() {
+            response
+                .headers()
+                .get(CONTENT_TYPE)
+                .and_then(|val| val.to_str().ok())
+                .map(|val| val.to_string())
+                .ok_or(ImageDownloadError::ContentType)
+        } else {
+            // A failed request is an HTTP problem, not a content-type problem.
+            Err(ImageDownloadError::Http)
+        }
+    }
 }