mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
somehow made things much slower
This commit is contained in:
parent
4fd4dd39db
commit
3a465f2619
2 changed files with 110 additions and 151 deletions
|
@ -1,6 +1,8 @@
|
||||||
pub use self::error::ImageDownloadError;
|
pub use self::error::ImageDownloadError;
|
||||||
|
use self::request::ImageRequest;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
|
use image::ImageOutputFormat;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
|
@ -9,6 +11,7 @@ use reqwest::{Client, Response, Url};
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
mod error;
|
mod error;
|
||||||
|
mod request;
|
||||||
|
|
||||||
pub struct ImageDownloader {
|
pub struct ImageDownloader {
|
||||||
max_size: (u32, u32),
|
max_size: (u32, u32),
|
||||||
|
@ -72,8 +75,15 @@ impl ImageDownloader {
|
||||||
|
|
||||||
let mut download_futures = Vec::new();
|
let mut download_futures = Vec::new();
|
||||||
|
|
||||||
for (node, url, parent_url) in res {
|
for (request, parent_request) in res {
|
||||||
download_futures.push(self.download_and_replace_image(node, url, parent_url, client));
|
if let Some(parent_request) = parent_request {
|
||||||
|
if parent_request.content_lenght > request.content_lenght {
|
||||||
|
download_futures
|
||||||
|
.push(self.download_and_replace_image(parent_request, "big-src"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
download_futures.push(self.download_and_replace_image(request, "src"));
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = futures::future::join_all(download_futures).await;
|
_ = futures::future::join_all(download_futures).await;
|
||||||
|
@ -81,22 +91,19 @@ impl ImageDownloader {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_and_replace_image(
|
async fn download_and_replace_image(&self, request: ImageRequest, prop_name: &str) {
|
||||||
&self,
|
let ImageRequest {
|
||||||
mut node: Node,
|
mut node,
|
||||||
image_url: Url,
|
http_response,
|
||||||
parent_url: Option<Url>,
|
content_lenght,
|
||||||
client: &Client,
|
content_type,
|
||||||
) {
|
} = request;
|
||||||
_ = self
|
|
||||||
.download_image_base64(&image_url, parent_url.as_ref(), client)
|
|
||||||
.await
|
|
||||||
.map(|(small, big)| {
|
|
||||||
_ = node.set_property("src", &small);
|
|
||||||
|
|
||||||
if let Some(big) = big {
|
_ = self
|
||||||
_ = node.set_property("big-src", &big);
|
.download_image_base64(http_response, content_lenght, content_type)
|
||||||
}
|
.await
|
||||||
|
.map(|image| {
|
||||||
|
_ = node.set_property(prop_name, &image);
|
||||||
})
|
})
|
||||||
.map_err(|error| log::error!("Failed to download image: {error}"));
|
.map_err(|error| log::error!("Failed to download image: {error}"));
|
||||||
}
|
}
|
||||||
|
@ -104,7 +111,7 @@ impl ImageDownloader {
|
||||||
async fn harvest_image_urls(
|
async fn harvest_image_urls(
|
||||||
node: Node,
|
node: Node,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<(Node, Url, Option<Url>), ImageDownloadError> {
|
) -> Result<(ImageRequest, Option<ImageRequest>), ImageDownloadError> {
|
||||||
let src = match node.get_property("src") {
|
let src = match node.get_property("src") {
|
||||||
Some(src) => {
|
Some(src) => {
|
||||||
if src.starts_with("data:") {
|
if src.starts_with("data:") {
|
||||||
|
@ -121,132 +128,81 @@ impl ImageDownloader {
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
|
let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
|
||||||
let parent_url = Self::check_image_parent(&node, &url, client).await.ok();
|
let parent_request = Self::check_image_parent(&node, client).await.ok();
|
||||||
|
|
||||||
Ok((node, url, parent_url))
|
println!("url: {url}");
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.get(url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
|
let content_type = ImageDownloader::get_content_type(&response);
|
||||||
|
let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
|
||||||
|
|
||||||
|
let request = ImageRequest {
|
||||||
|
node,
|
||||||
|
http_response: response,
|
||||||
|
content_lenght,
|
||||||
|
content_type,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok((request, parent_request))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_image_base64(
|
async fn download_image_base64(
|
||||||
&self,
|
&self,
|
||||||
image_url: &Url,
|
http_response: Response,
|
||||||
parent_url: Option<&Url>,
|
content_length: u64,
|
||||||
client: &Client,
|
content_type: Option<HeaderValue>,
|
||||||
) -> Result<(String, Option<String>), ImageDownloadError> {
|
) -> Result<String, ImageDownloadError> {
|
||||||
let response = client.get(image_url.clone()).send().await.map_err(|err| {
|
if content_length == 0 {
|
||||||
log::error!("GET {} failed - {}", image_url.as_str(), err);
|
return Err(ImageDownloadError::ContentLenght);
|
||||||
ImageDownloadError::Http
|
}
|
||||||
})?;
|
|
||||||
|
|
||||||
let content_type_small = ImageDownloader::check_image_content_type(&response)?;
|
let content_type = content_type
|
||||||
let content_type_small = content_type_small
|
.as_ref()
|
||||||
.to_str()
|
.and_then(|content_type| content_type.to_str().ok())
|
||||||
.map_err(|_| ImageDownloadError::ContentType)?;
|
.ok_or_else(|| ImageDownloadError::ContentType)?;
|
||||||
let mut content_type_big: Option<String> = None;
|
|
||||||
|
|
||||||
let mut small_image = response
|
if !content_type.contains("image") {
|
||||||
|
return Err(ImageDownloadError::ContentType);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut image = http_response
|
||||||
.bytes()
|
.bytes()
|
||||||
.await
|
.await
|
||||||
.map_err(|_| ImageDownloadError::Http)?
|
.map_err(|_| ImageDownloadError::Http)?
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.to_vec();
|
.to_vec();
|
||||||
|
|
||||||
let mut big_image: Option<Vec<u8>> = None;
|
if content_type != "image/svg+xml" && content_type != "image/gif" {
|
||||||
|
if let Some(resized_image) = Self::scale_image(&image, self.max_size) {
|
||||||
if let Some(parent_url) = parent_url {
|
image = resized_image;
|
||||||
let response_big = client
|
|
||||||
.get(parent_url.clone())
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?;
|
|
||||||
content_type_big = Some(
|
|
||||||
ImageDownloader::check_image_content_type(&response_big)?
|
|
||||||
.to_str()
|
|
||||||
.map_err(|_| ImageDownloadError::ContentType)?
|
|
||||||
.to_owned(),
|
|
||||||
);
|
|
||||||
big_image = Some(
|
|
||||||
response_big
|
|
||||||
.bytes()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?
|
|
||||||
.to_vec(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if content_type_small != "image/svg+xml" && content_type_small != "image/gif" {
|
|
||||||
let (original_image, resized_image) = Self::scale_image(&small_image, self.max_size)?;
|
|
||||||
if let Some(resized_image) = resized_image {
|
|
||||||
small_image = resized_image;
|
|
||||||
if big_image.is_none() {
|
|
||||||
big_image = Some(original_image);
|
|
||||||
content_type_big = Some(content_type_small.to_owned());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
small_image = original_image;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
|
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image);
|
||||||
let big_image_base64 =
|
let image_string = format!("data:{};base64,{}", content_type, image_base64);
|
||||||
big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
|
Ok(image_string)
|
||||||
let small_image_string =
|
|
||||||
format!("data:{};base64,{}", content_type_small, small_image_base64);
|
|
||||||
let big_image_string = match big_image_base64 {
|
|
||||||
Some(big_image_base64) => {
|
|
||||||
let content_type_big = content_type_big.ok_or_else(|| {
|
|
||||||
log::debug!("content_type_big should not be None when a big image exists");
|
|
||||||
ImageDownloadError::ParentDownload
|
|
||||||
})?;
|
|
||||||
Some(format!(
|
|
||||||
"data:{};base64,{}",
|
|
||||||
content_type_big, big_image_base64
|
|
||||||
))
|
|
||||||
}
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
Ok((small_image_string, big_image_string))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_image_content_type(response: &Response) -> Result<HeaderValue, ImageDownloadError> {
|
fn get_content_type(response: &Response) -> Option<HeaderValue> {
|
||||||
if response.status().is_success() {
|
if response.status().is_success() {
|
||||||
if let Some(content_type) = response.headers().get(CONTENT_TYPE) {
|
response.headers().get(CONTENT_TYPE).cloned()
|
||||||
if content_type
|
|
||||||
.to_str()
|
|
||||||
.map_err(|_| ImageDownloadError::ContentType)?
|
|
||||||
.contains("image")
|
|
||||||
{
|
|
||||||
return Ok(content_type.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log::warn!("{} is not an image", response.url());
|
|
||||||
Err(ImageDownloadError::ContentType)
|
|
||||||
} else {
|
} else {
|
||||||
Err(ImageDownloadError::Http)
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scale_image(
|
fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Option<Vec<u8>> {
|
||||||
image_buffer: &[u8],
|
let mut image = match image::load_from_memory(image_buffer) {
|
||||||
max_dimensions: (u32, u32),
|
Err(error) => {
|
||||||
) -> Result<(Vec<u8>, Option<Vec<u8>>), ImageDownloadError> {
|
log::error!("Failed to open image to resize: {}", error);
|
||||||
let mut original_image: Vec<u8> = Vec::new();
|
return None;
|
||||||
let mut resized_image: Option<Vec<u8>> = None;
|
}
|
||||||
|
Ok(image) => image,
|
||||||
let mut image = image::load_from_memory(image_buffer).map_err(|err| {
|
};
|
||||||
log::error!("Failed to open image to resize: {}", err);
|
|
||||||
ImageDownloadError::ImageScale
|
|
||||||
})?;
|
|
||||||
|
|
||||||
image
|
|
||||||
.write_to(
|
|
||||||
&mut Cursor::new(&mut original_image),
|
|
||||||
image::ImageOutputFormat::Png,
|
|
||||||
)
|
|
||||||
.map_err(|err| {
|
|
||||||
log::error!("Failed to save resized image to resize: {}", err);
|
|
||||||
ImageDownloadError::ImageScale
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let dimensions = (image.width(), image.height());
|
let dimensions = (image.width(), image.height());
|
||||||
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
|
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
|
||||||
|
@ -256,26 +212,23 @@ impl ImageDownloader {
|
||||||
image::imageops::FilterType::Lanczos3,
|
image::imageops::FilterType::Lanczos3,
|
||||||
);
|
);
|
||||||
let mut resized_buf: Vec<u8> = Vec::new();
|
let mut resized_buf: Vec<u8> = Vec::new();
|
||||||
image
|
if let Err(error) =
|
||||||
.write_to(
|
image.write_to(&mut Cursor::new(&mut resized_buf), ImageOutputFormat::Png)
|
||||||
&mut Cursor::new(&mut resized_buf),
|
{
|
||||||
image::ImageOutputFormat::Png,
|
log::error!("Failed to save resized image to resize: {}", error);
|
||||||
)
|
return None;
|
||||||
.map_err(|err| {
|
}
|
||||||
log::error!("Failed to save resized image to resize: {}", err);
|
|
||||||
ImageDownloadError::ImageScale
|
|
||||||
})?;
|
|
||||||
resized_image = Some(resized_buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((original_image, resized_image))
|
Some(resized_buf)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn check_image_parent(
|
async fn check_image_parent(
|
||||||
node: &Node,
|
node: &Node,
|
||||||
child_url: &Url,
|
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<Url, ImageDownloadError> {
|
) -> Result<ImageRequest, ImageDownloadError> {
|
||||||
let parent = match node.get_parent() {
|
let parent = match node.get_parent() {
|
||||||
Some(parent) => parent,
|
Some(parent) => parent,
|
||||||
None => {
|
None => {
|
||||||
|
@ -301,26 +254,23 @@ impl ImageDownloader {
|
||||||
log::debug!("Failed to parse parent image url: {}", err);
|
log::debug!("Failed to parse parent image url: {}", err);
|
||||||
ImageDownloadError::InvalidUrl(err)
|
ImageDownloadError::InvalidUrl(err)
|
||||||
})?;
|
})?;
|
||||||
let parent_response = client
|
|
||||||
.head(parent_url.clone())
|
println!("parent url: {parent_url}");
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.get(parent_url.clone())
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.map_err(|_| ImageDownloadError::Http)?;
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
let _ = ImageDownloader::check_image_content_type(&parent_response)?;
|
let content_type = ImageDownloader::get_content_type(&response);
|
||||||
let child_response = client
|
let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
|
||||||
.head(child_url.clone())
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?;
|
|
||||||
let parent_length = Self::get_content_lenght(&parent_response)?;
|
|
||||||
let child_length = Self::get_content_lenght(&child_response)?;
|
|
||||||
|
|
||||||
if parent_length > child_length {
|
Ok(ImageRequest {
|
||||||
return Ok(parent_url);
|
node: parent,
|
||||||
}
|
http_response: response,
|
||||||
|
content_lenght,
|
||||||
log::debug!("Image parent element not relevant");
|
content_type,
|
||||||
Err(ImageDownloadError::ParentDownload)
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
||||||
|
|
9
article_scraper/src/images/request.rs
Normal file
9
article_scraper/src/images/request.rs
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
use libxml::tree::Node;
|
||||||
|
use reqwest::{header::HeaderValue, Response};
|
||||||
|
|
||||||
|
/// Pairs a libxml node with the HTTP response fetched for the image it refers to.
///
/// Instances are built after the GET request for an image URL has been sent and are
/// later consumed by the download step, which encodes the response body as a `data:` URI
/// and stores it on `node`.
pub struct ImageRequest {
|
||||||
|
/// Node that receives the downloaded image: the download step writes the resulting
/// `data:` URI into one of its properties (`src` or `big-src`).
pub node: Node,
|
||||||
|
/// Response to the GET request for the image; its body is consumed via `bytes()` when
/// the image is encoded.
pub http_response: Response,
|
||||||
|
/// Value obtained from `get_content_lenght` for the response, or `0` when that lookup
/// failed. A value of `0` is rejected by the download step with
/// `ImageDownloadError::ContentLenght`. The `lenght` spelling is kept because the rest
/// of the module uses it.
pub content_lenght: u64,
|
||||||
|
/// `Content-Type` header of the response; `None` when the status was not successful or
/// the header was absent.
pub content_type: Option<HeaderValue>,
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue