1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

download single image

This commit is contained in:
Jan Lukas Gernert 2023-07-16 21:40:10 +02:00
parent be40383b1a
commit d562d41b81
3 changed files with 76 additions and 33 deletions

View file

@ -4,6 +4,7 @@ use self::pair::Pair;
use self::request::ImageRequest; use self::request::ImageRequest;
use crate::util::Util; use crate::util::Util;
use base64::Engine; use base64::Engine;
use futures::StreamExt;
use image::ImageOutputFormat; use image::ImageOutputFormat;
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions}; use libxml::tree::{Node, SaveOptions};
@ -28,6 +29,45 @@ impl ImageDownloader {
ImageDownloader { max_size } ImageDownloader { max_size }
} }
/// Downloads a single image from `url` and returns its raw bytes.
///
/// The response headers are inspected via `Util::get_content_type` and
/// `Util::get_content_length` before the body is streamed chunk by chunk.
/// If `progress` is `Some`, a `Progress` value carrying the announced
/// `total_size` and the number of bytes `downloaded` so far is sent after
/// every received chunk. A failed send is deliberately ignored so that a
/// dropped receiver does not abort the download.
///
/// # Errors
///
/// * Any error produced by sending the request or by reading a body chunk
///   (propagated with `?`).
/// * Any error returned by `Util::get_content_type` or
///   `Util::get_content_length`.
/// * `ImageDownloadError::ContentType` if the content type does not contain
///   the substring `"image"`.
pub async fn single_from_url(
    url: &str,
    client: &Client,
    progress: Option<Sender<Progress>>,
) -> Result<Vec<u8>, ImageDownloadError> {
    // `Content-Length` is controlled by the remote server. Only trust it up to
    // this bound for the up-front reservation; the buffer still grows on demand
    // if the body really is larger.
    const MAX_PREALLOCATION: usize = 16 * 1024 * 1024;

    let response = client.get(url).send().await?;
    let content_type = Util::get_content_type(&response)?;
    let content_length = Util::get_content_length(&response)?;

    if !content_type.contains("image") {
        return Err(ImageDownloadError::ContentType);
    }

    let mut stream = response.bytes_stream();
    let mut downloaded_bytes: usize = 0;
    let mut result: Vec<u8> = Vec::with_capacity(content_length.min(MAX_PREALLOCATION));

    while let Some(item) = stream.next().await {
        let chunk = item?;
        downloaded_bytes += chunk.len();

        if let Some(sender) = progress.as_ref() {
            // Best effort: progress reporting must never fail the download.
            _ = sender
                .send(Progress {
                    total_size: content_length,
                    downloaded: downloaded_bytes,
                })
                .await;
        }

        // Copy the whole chunk at once instead of pushing byte by byte.
        result.extend_from_slice(&chunk);
    }

    Ok(result)
}
pub async fn download_images_from_string( pub async fn download_images_from_string(
&self, &self,
html: &str, html: &str,

View file

@ -1,7 +1,9 @@
use futures::StreamExt; use futures::StreamExt;
use reqwest::{header::CONTENT_TYPE, Client, Response}; use reqwest::{Client, Response};
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
use crate::util::Util;
use super::{image_data::ImageData, ImageDownloadError}; use super::{image_data::ImageData, ImageDownloadError};
#[derive(Debug)] #[derive(Debug)]
@ -16,8 +18,8 @@ impl ImageRequest {
pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> { pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> {
let response = client.get(&url).send().await?; let response = client.get(&url).send().await?;
let content_type = Self::get_content_type(&response)?; let content_type = Util::get_content_type(&response)?;
let content_length = Self::get_content_length(&response)?; let content_length = Util::get_content_length(&response)?;
if !content_type.contains("image") { if !content_type.contains("image") {
return Err(ImageDownloadError::ContentType); return Err(ImageDownloadError::ContentType);
@ -58,33 +60,4 @@ impl ImageRequest {
pub fn content_length(&self) -> usize { pub fn content_length(&self) -> usize {
self.content_length self.content_length
} }
/// Extracts the `Content-Length` header of `response` as a `usize`.
///
/// A non-success status is logged as a warning and reported as
/// `ImageDownloadError::Http`. A header that is missing, is not a valid
/// string, or does not parse as `usize` yields
/// `ImageDownloadError::ContentLength`.
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
    let status_code = response.status();

    if status_code.is_success() {
        let header_text = response
            .headers()
            .get(reqwest::header::CONTENT_LENGTH)
            .and_then(|raw| raw.to_str().ok());

        match header_text.map(|text| text.parse::<usize>()) {
            Some(Ok(length)) => Ok(length),
            _ => Err(ImageDownloadError::ContentLength),
        }
    } else {
        log::warn!("response: {status_code}");
        Err(ImageDownloadError::Http)
    }
}
/// Returns the `Content-Type` header of `response` as an owned `String`.
///
/// Yields `ImageDownloadError::ContentType` when the status is not a success,
/// when the header is absent, or when its value is not a valid string.
fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
    // Only look at the headers of successful responses.
    let header_text = if response.status().is_success() {
        response
            .headers()
            .get(CONTENT_TYPE)
            .and_then(|raw| raw.to_str().ok())
    } else {
        None
    };

    match header_text {
        Some(text) => Ok(text.to_string()),
        None => Err(ImageDownloadError::ContentType),
    }
}
} }

View file

@ -5,7 +5,7 @@ use libxml::{
xpath::Context, xpath::Context,
}; };
use reqwest::{ use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue}, header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE},
Response, Response,
}; };
use tokio::fs::DirEntry; use tokio::fs::DirEntry;
@ -14,6 +14,7 @@ use crate::{
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX}, constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
full_text_parser::{config::ConfigEntry, error::FullTextParserError}, full_text_parser::{config::ConfigEntry, error::FullTextParserError},
image_object::ImageObject, image_object::ImageObject,
images::ImageDownloadError,
video_object::VideoObject, video_object::VideoObject,
}; };
@ -1191,6 +1192,35 @@ impl Util {
pub fn score_by_position(len: usize, index: usize) -> i32 { pub fn score_by_position(len: usize, index: usize) -> i32 {
((len as f32 / 2.0) - index as f32) as i32 ((len as f32 / 2.0) - index as f32) as i32
} }
/// Reads the `Content-Length` header of `response` and parses it as `usize`.
///
/// # Errors
///
/// * `ImageDownloadError::Http` if the response status is not a success; the
///   status is logged as a warning first.
/// * `ImageDownloadError::ContentLength` if the header is missing, is not a
///   valid string, or cannot be parsed as `usize`.
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
    let status_code = response.status();
    if !status_code.is_success() {
        log::warn!("response: {status_code}");
        return Err(ImageDownloadError::Http);
    }

    let header = response.headers().get(CONTENT_LENGTH);
    let text = header.and_then(|value| value.to_str().ok());
    let length = text.and_then(|value| value.parse::<usize>().ok());

    match length {
        Some(length) => Ok(length),
        None => Err(ImageDownloadError::ContentLength),
    }
}
/// Reads the `Content-Type` header of `response` as an owned `String`.
///
/// # Errors
///
/// Returns `ImageDownloadError::ContentType` if the response status is not a
/// success, if the header is missing, or if its value is not a valid string.
pub fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
    // Guard clause: unsuccessful responses are reported with the same variant
    // as a missing header.
    if !response.status().is_success() {
        return Err(ImageDownloadError::ContentType);
    }

    match response.headers().get(CONTENT_TYPE).map(|val| val.to_str()) {
        Some(Ok(content_type)) => Ok(content_type.to_string()),
        _ => Err(ImageDownloadError::ContentType),
    }
}
} }
#[cfg(test)] #[cfg(test)]