1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

download single image

This commit is contained in:
Jan Lukas Gernert 2023-07-16 21:40:10 +02:00
parent be40383b1a
commit d562d41b81
3 changed files with 76 additions and 33 deletions

View file

@ -4,6 +4,7 @@ use self::pair::Pair;
use self::request::ImageRequest;
use crate::util::Util;
use base64::Engine;
use futures::StreamExt;
use image::ImageOutputFormat;
use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions};
@ -28,6 +29,45 @@ impl ImageDownloader {
ImageDownloader { max_size }
}
/// Downloads a single image from `url` and returns its raw bytes.
///
/// The response is validated with `Util::get_content_type` and
/// `Util::get_content_length` before the body is read, and the content type
/// has to contain the substring `image`.
///
/// If `progress` is `Some`, a `Progress` value carrying the announced total
/// size and the number of bytes received so far is sent after every chunk.
/// The result of sending is deliberately ignored, so a failed send does not
/// abort the download.
///
/// # Errors
///
/// * Whatever error sending the request or reading a body chunk produces
///   (propagated with `?`).
/// * The errors returned by `Util::get_content_type` and
///   `Util::get_content_length`.
/// * `ImageDownloadError::ContentType` if the content type does not contain
///   `image`.
pub async fn single_from_url(
    url: &str,
    client: &Client,
    progress: Option<Sender<Progress>>,
) -> Result<Vec<u8>, ImageDownloadError> {
    // `Content-Length` is chosen by the remote server. Cap only the up-front
    // allocation so a bogus header cannot force a huge reservation; the
    // vector still grows to whatever size the body really has.
    const MAX_PREALLOCATION: usize = 16 * 1024 * 1024;

    let response = client.get(url).send().await?;

    let content_type = Util::get_content_type(&response)?;
    let content_length = Util::get_content_length(&response)?;

    if !content_type.contains("image") {
        return Err(ImageDownloadError::ContentType);
    }

    let mut stream = response.bytes_stream();
    let mut downloaded_bytes = 0;
    let mut result: Vec<u8> = Vec::with_capacity(content_length.min(MAX_PREALLOCATION));

    while let Some(item) = stream.next().await {
        let chunk = item?;
        downloaded_bytes += chunk.len();

        if let Some(sender) = progress.as_ref() {
            // Best effort: progress reporting must not fail the download.
            _ = sender
                .send(Progress {
                    total_size: content_length,
                    downloaded: downloaded_bytes,
                })
                .await;
        }

        // Append the whole chunk at once instead of pushing byte by byte.
        result.extend_from_slice(&chunk);
    }

    Ok(result)
}
pub async fn download_images_from_string(
&self,
html: &str,

View file

@ -1,7 +1,9 @@
use futures::StreamExt;
use reqwest::{header::CONTENT_TYPE, Client, Response};
use reqwest::{Client, Response};
use tokio::sync::mpsc::Sender;
use crate::util::Util;
use super::{image_data::ImageData, ImageDownloadError};
#[derive(Debug)]
@ -16,8 +18,8 @@ impl ImageRequest {
pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> {
let response = client.get(&url).send().await?;
let content_type = Self::get_content_type(&response)?;
let content_length = Self::get_content_length(&response)?;
let content_type = Util::get_content_type(&response)?;
let content_length = Util::get_content_length(&response)?;
if !content_type.contains("image") {
return Err(ImageDownloadError::ContentType);
@ -58,33 +60,4 @@ impl ImageRequest {
/// Returns the value of this request's `content_length` field.
pub fn content_length(&self) -> usize {
self.content_length
}
/// Reads the `Content-Length` header of `response` as a `usize`.
///
/// A non-success status is logged as a warning and reported as
/// `ImageDownloadError::Http`. A header that is missing, not valid text or
/// not parsable as `usize` is reported as `ImageDownloadError::ContentLength`.
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
    let status_code = response.status();

    if status_code.is_success() {
        let header_value = response.headers().get(reqwest::header::CONTENT_LENGTH);
        let header_text = header_value.and_then(|raw| raw.to_str().ok());
        let parsed_length = header_text.and_then(|text| text.parse::<usize>().ok());

        parsed_length.ok_or(ImageDownloadError::ContentLength)
    } else {
        log::warn!("response: {status_code}");
        Err(ImageDownloadError::Http)
    }
}
/// Reads the `Content-Type` header of `response` as an owned `String`.
///
/// Every failure is reported as `ImageDownloadError::ContentType`: a
/// non-success status, a missing header, or a header value that cannot be
/// represented as a string.
fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
    if !response.status().is_success() {
        return Err(ImageDownloadError::ContentType);
    }

    let header_value = response
        .headers()
        .get(CONTENT_TYPE)
        .ok_or(ImageDownloadError::ContentType)?;

    header_value
        .to_str()
        .map(String::from)
        .map_err(|_| ImageDownloadError::ContentType)
}
}

View file

@ -5,7 +5,7 @@ use libxml::{
xpath::Context,
};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE},
Response,
};
use tokio::fs::DirEntry;
@ -14,6 +14,7 @@ use crate::{
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
image_object::ImageObject,
images::ImageDownloadError,
video_object::VideoObject,
};
@ -1191,6 +1192,35 @@ impl Util {
/// Scores `index` by its distance from the middle of a sequence of `len`
/// items.
///
/// The result is `len / 2 - index`, computed in `f32` and converted to `i32`
/// with an `as` cast. Indices before the midpoint score positive, indices
/// after it score negative.
pub fn score_by_position(len: usize, index: usize) -> i32 {
    let midpoint = len as f32 / 2.0;
    let distance = midpoint - index as f32;
    distance as i32
}
/// Reads the `Content-Length` header of `response` as a `usize`.
///
/// # Errors
///
/// * `ImageDownloadError::Http` if the response status is not a success;
///   the status is also logged as a warning.
/// * `ImageDownloadError::ContentLength` if the header is missing, is not
///   valid text, or cannot be parsed as `usize`.
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
    let status_code = response.status();

    if status_code.is_success() {
        let headers = response.headers();
        headers
            .get(CONTENT_LENGTH)
            .and_then(|value| value.to_str().ok())
            .and_then(|text| text.parse().ok())
            .ok_or(ImageDownloadError::ContentLength)
    } else {
        log::warn!("response: {status_code}");
        Err(ImageDownloadError::Http)
    }
}
/// Reads the `Content-Type` header of `response` as an owned `String`.
///
/// # Errors
///
/// Returns `ImageDownloadError::ContentType` if the response status is not a
/// success, if the header is missing, or if its value cannot be represented
/// as a string.
pub fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
    // Both failure paths end in the same error, so collapse them into an
    // `Option` first and convert once at the end.
    let content_type = if response.status().is_success() {
        response
            .headers()
            .get(CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .map(String::from)
    } else {
        None
    };

    content_type.ok_or(ImageDownloadError::ContentType)
}
}
#[cfg(test)]