mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
cleanup & fixes
This commit is contained in:
parent
57f74c635b
commit
ccc8223db0
3 changed files with 127 additions and 117 deletions
|
@ -11,7 +11,7 @@ pub enum ImageDownloadError {
|
||||||
#[error("Generating image name failed")]
|
#[error("Generating image name failed")]
|
||||||
ImageName,
|
ImageName,
|
||||||
#[error("Getting the content-length property failed")]
|
#[error("Getting the content-length property failed")]
|
||||||
ContentLenght,
|
ContentLength,
|
||||||
#[error("Content-type suggest no image")]
|
#[error("Content-type suggest no image")]
|
||||||
ContentType,
|
ContentType,
|
||||||
#[error("Http error")]
|
#[error("Http error")]
|
||||||
|
|
|
@ -6,8 +6,7 @@ use image::ImageOutputFormat;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use reqwest::header::{HeaderValue, CONTENT_TYPE};
|
use reqwest::{Client, Url};
|
||||||
use reqwest::{Client, Response, Url};
|
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
mod error;
|
mod error;
|
||||||
|
@ -76,14 +75,7 @@ impl ImageDownloader {
|
||||||
let mut download_futures = Vec::new();
|
let mut download_futures = Vec::new();
|
||||||
|
|
||||||
for (request, parent_request) in res {
|
for (request, parent_request) in res {
|
||||||
if let Some(parent_request) = parent_request {
|
download_futures.push(self.download_and_replace_image(request, parent_request));
|
||||||
if parent_request.content_lenght > request.content_lenght {
|
|
||||||
download_futures
|
|
||||||
.push(self.download_and_replace_image(parent_request, "big-src"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
download_futures.push(self.download_and_replace_image(request, "src"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = futures::future::join_all(download_futures).await;
|
_ = futures::future::join_all(download_futures).await;
|
||||||
|
@ -91,23 +83,6 @@ impl ImageDownloader {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_and_replace_image(&self, request: ImageRequest, prop_name: &str) {
|
|
||||||
let ImageRequest {
|
|
||||||
mut node,
|
|
||||||
http_response,
|
|
||||||
content_lenght,
|
|
||||||
content_type,
|
|
||||||
} = request;
|
|
||||||
|
|
||||||
_ = self
|
|
||||||
.download_image_base64(http_response, content_lenght, content_type)
|
|
||||||
.await
|
|
||||||
.map(|image| {
|
|
||||||
_ = node.set_property(prop_name, &image);
|
|
||||||
})
|
|
||||||
.map_err(|error| log::error!("Failed to download image: {error}"));
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn harvest_image_urls(
|
async fn harvest_image_urls(
|
||||||
node: Node,
|
node: Node,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
|
@ -128,71 +103,58 @@ impl ImageDownloader {
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
|
let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
|
||||||
let parent_request = Self::check_image_parent(&node, client).await.ok();
|
let parent_url = Self::check_image_parent(&node).await.ok();
|
||||||
|
|
||||||
println!("url: {url}");
|
let request = ImageRequest::new(node.clone(), &url, client).await?;
|
||||||
|
let parent_request = match parent_url {
|
||||||
let response = client
|
Some(parent_url) => Some(ImageRequest::new(node, &parent_url, client).await?),
|
||||||
.get(url)
|
None => None,
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?;
|
|
||||||
let content_type = ImageDownloader::get_content_type(&response);
|
|
||||||
let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
|
|
||||||
|
|
||||||
let request = ImageRequest {
|
|
||||||
node,
|
|
||||||
http_response: response,
|
|
||||||
content_lenght,
|
|
||||||
content_type,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok((request, parent_request))
|
Ok((request, parent_request))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_image_base64(
|
async fn download_and_replace_image(
|
||||||
&self,
|
&self,
|
||||||
http_response: Response,
|
mut request: ImageRequest,
|
||||||
content_length: u64,
|
mut parent_request: Option<ImageRequest>,
|
||||||
content_type: Option<HeaderValue>,
|
) -> Result<(), ImageDownloadError> {
|
||||||
) -> Result<String, ImageDownloadError> {
|
let mut image = request.download().await?;
|
||||||
if content_length == 0 {
|
let mut parent_image: Option<Vec<u8>> = None;
|
||||||
return Err(ImageDownloadError::ContentLenght);
|
|
||||||
|
if let Some(parent_request) = parent_request.as_mut() {
|
||||||
|
if parent_request.content_length() > request.content_length() {
|
||||||
|
parent_image = parent_request.download().await.ok();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let content_type = content_type
|
if request.content_type() != "image/svg+xml" && request.content_type() != "image/gif" {
|
||||||
.as_ref()
|
|
||||||
.and_then(|content_type| content_type.to_str().ok())
|
|
||||||
.ok_or(ImageDownloadError::ContentType)?;
|
|
||||||
|
|
||||||
if !content_type.contains("image") {
|
|
||||||
return Err(ImageDownloadError::ContentType);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut image = http_response
|
|
||||||
.bytes()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?
|
|
||||||
.as_ref()
|
|
||||||
.to_vec();
|
|
||||||
|
|
||||||
if content_type != "image/svg+xml" && content_type != "image/gif" {
|
|
||||||
if let Some(resized_image) = Self::scale_image(&image, self.max_size) {
|
if let Some(resized_image) = Self::scale_image(&image, self.max_size) {
|
||||||
|
if parent_image.is_none() {
|
||||||
|
parent_image = Some(image);
|
||||||
|
}
|
||||||
image = resized_image;
|
image = resized_image;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image);
|
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image);
|
||||||
let image_string = format!("data:{};base64,{}", content_type, image_base64);
|
let image_string = format!("data:{};base64,{}", request.content_type(), image_base64);
|
||||||
Ok(image_string)
|
request.write_image_to_property("src", &image_string);
|
||||||
}
|
|
||||||
|
|
||||||
fn get_content_type(response: &Response) -> Option<HeaderValue> {
|
if let Some(parent_image) = parent_image {
|
||||||
if response.status().is_success() {
|
let parent_image_base64 =
|
||||||
response.headers().get(CONTENT_TYPE).cloned()
|
base64::engine::general_purpose::STANDARD.encode(parent_image);
|
||||||
} else {
|
|
||||||
None
|
let content_type = parent_request
|
||||||
|
.map(|pr| pr.content_type().to_string())
|
||||||
|
.unwrap_or(request.content_type().to_string());
|
||||||
|
let parent_image_string =
|
||||||
|
format!("data:{};base64,{}", content_type, parent_image_base64);
|
||||||
|
|
||||||
|
request.write_image_to_property("big-src", &parent_image_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Option<Vec<u8>> {
|
fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Option<Vec<u8>> {
|
||||||
|
@ -225,10 +187,7 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn check_image_parent(
|
async fn check_image_parent(node: &Node) -> Result<Url, ImageDownloadError> {
|
||||||
node: &Node,
|
|
||||||
client: &Client,
|
|
||||||
) -> Result<ImageRequest, ImageDownloadError> {
|
|
||||||
let parent = match node.get_parent() {
|
let parent = match node.get_parent() {
|
||||||
Some(parent) => parent,
|
Some(parent) => parent,
|
||||||
None => {
|
None => {
|
||||||
|
@ -255,38 +214,7 @@ impl ImageDownloader {
|
||||||
ImageDownloadError::InvalidUrl(err)
|
ImageDownloadError::InvalidUrl(err)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
println!("parent url: {parent_url}");
|
Ok(parent_url)
|
||||||
|
|
||||||
let response = client
|
|
||||||
.get(parent_url.clone())
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|_| ImageDownloadError::Http)?;
|
|
||||||
let content_type = ImageDownloader::get_content_type(&response);
|
|
||||||
let content_lenght = Self::get_content_lenght(&response).unwrap_or(0);
|
|
||||||
|
|
||||||
Ok(ImageRequest {
|
|
||||||
node: parent,
|
|
||||||
http_response: response,
|
|
||||||
content_lenght,
|
|
||||||
content_type,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
|
||||||
let status_code = response.status();
|
|
||||||
|
|
||||||
if !status_code.is_success() {
|
|
||||||
log::warn!("response: {status_code}");
|
|
||||||
return Err(ImageDownloadError::Http);
|
|
||||||
}
|
|
||||||
|
|
||||||
response
|
|
||||||
.headers()
|
|
||||||
.get(reqwest::header::CONTENT_LENGTH)
|
|
||||||
.and_then(|content_length| content_length.to_str().ok())
|
|
||||||
.and_then(|content_length| content_length.parse::<u64>().ok())
|
|
||||||
.ok_or(ImageDownloadError::ContentLenght)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,91 @@
|
||||||
use libxml::tree::Node;
|
use libxml::tree::Node;
|
||||||
use reqwest::{header::HeaderValue, Response};
|
use reqwest::{header::CONTENT_TYPE, Client, Response, Url};
|
||||||
|
|
||||||
|
use super::ImageDownloadError;
|
||||||
|
|
||||||
pub struct ImageRequest {
|
pub struct ImageRequest {
|
||||||
pub node: Node,
|
node: Node,
|
||||||
pub http_response: Response,
|
http_response: Option<Response>,
|
||||||
pub content_lenght: u64,
|
content_length: u64,
|
||||||
pub content_type: Option<HeaderValue>,
|
content_type: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ImageRequest {
|
||||||
|
pub async fn new(node: Node, url: &Url, client: &Client) -> Result<Self, ImageDownloadError> {
|
||||||
|
let response = client
|
||||||
|
.get(url.clone())
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
|
|
||||||
|
let content_type = Self::get_content_type(&response)?;
|
||||||
|
let content_length = Self::get_content_length(&response)?;
|
||||||
|
|
||||||
|
if !content_type.contains("image") {
|
||||||
|
return Err(ImageDownloadError::ContentType);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
node,
|
||||||
|
http_response: Some(response),
|
||||||
|
content_length,
|
||||||
|
content_type,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn download(&mut self) -> Result<Vec<u8>, ImageDownloadError> {
|
||||||
|
if let Some(http_response) = self.http_response.take() {
|
||||||
|
let result = http_response
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.map_err(|_| ImageDownloadError::Http)?
|
||||||
|
.as_ref()
|
||||||
|
.to_vec();
|
||||||
|
Ok(result)
|
||||||
|
} else {
|
||||||
|
log::warn!("imagerequest already consumed");
|
||||||
|
Err(ImageDownloadError::Http)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn content_type(&self) -> &str {
|
||||||
|
&self.content_type
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn content_length(&self) -> u64 {
|
||||||
|
self.content_length
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write_image_to_property(&mut self, prop_name: &str, data: &str) {
|
||||||
|
_ = self.node.set_property(prop_name, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_content_length(response: &Response) -> Result<u64, ImageDownloadError> {
|
||||||
|
let status_code = response.status();
|
||||||
|
|
||||||
|
if !status_code.is_success() {
|
||||||
|
log::warn!("response: {status_code}");
|
||||||
|
return Err(ImageDownloadError::Http);
|
||||||
|
}
|
||||||
|
|
||||||
|
response
|
||||||
|
.headers()
|
||||||
|
.get(reqwest::header::CONTENT_LENGTH)
|
||||||
|
.and_then(|content_length| content_length.to_str().ok())
|
||||||
|
.and_then(|content_length| content_length.parse::<u64>().ok())
|
||||||
|
.ok_or(ImageDownloadError::ContentLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
|
||||||
|
if response.status().is_success() {
|
||||||
|
response
|
||||||
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|val| val.to_str().ok())
|
||||||
|
.map(|val| val.to_string())
|
||||||
|
.ok_or(ImageDownloadError::ContentType)
|
||||||
|
} else {
|
||||||
|
Err(ImageDownloadError::ContentType)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue