mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 00:19:59 +02:00
download single image
This commit is contained in:
parent
be40383b1a
commit
d562d41b81
3 changed files with 76 additions and 33 deletions
|
@ -4,6 +4,7 @@ use self::pair::Pair;
|
|||
use self::request::ImageRequest;
|
||||
use crate::util::Util;
|
||||
use base64::Engine;
|
||||
use futures::StreamExt;
|
||||
use image::ImageOutputFormat;
|
||||
use libxml::parser::Parser;
|
||||
use libxml::tree::{Node, SaveOptions};
|
||||
|
@ -28,6 +29,45 @@ impl ImageDownloader {
|
|||
ImageDownloader { max_size }
|
||||
}
|
||||
|
||||
pub async fn single_from_url(
|
||||
url: &str,
|
||||
client: &Client,
|
||||
progress: Option<Sender<Progress>>,
|
||||
) -> Result<Vec<u8>, ImageDownloadError> {
|
||||
let response = client.get(url).send().await?;
|
||||
|
||||
let content_type = Util::get_content_type(&response)?;
|
||||
let content_length = Util::get_content_length(&response)?;
|
||||
|
||||
if !content_type.contains("image") {
|
||||
return Err(ImageDownloadError::ContentType);
|
||||
}
|
||||
|
||||
let mut stream = response.bytes_stream();
|
||||
let mut downloaded_bytes = 0;
|
||||
|
||||
let mut result = Vec::with_capacity(content_length);
|
||||
while let Some(item) = stream.next().await {
|
||||
let chunk = item?;
|
||||
downloaded_bytes += chunk.len();
|
||||
|
||||
if let Some(sender) = progress.as_ref() {
|
||||
_ = sender
|
||||
.send(Progress {
|
||||
total_size: content_length,
|
||||
downloaded: downloaded_bytes,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
for byte in chunk {
|
||||
result.push(byte);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub async fn download_images_from_string(
|
||||
&self,
|
||||
html: &str,
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
use futures::StreamExt;
|
||||
use reqwest::{header::CONTENT_TYPE, Client, Response};
|
||||
use reqwest::{Client, Response};
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::util::Util;
|
||||
|
||||
use super::{image_data::ImageData, ImageDownloadError};
|
||||
|
||||
#[derive(Debug)]
|
||||
|
@ -16,8 +18,8 @@ impl ImageRequest {
|
|||
pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> {
|
||||
let response = client.get(&url).send().await?;
|
||||
|
||||
let content_type = Self::get_content_type(&response)?;
|
||||
let content_length = Self::get_content_length(&response)?;
|
||||
let content_type = Util::get_content_type(&response)?;
|
||||
let content_length = Util::get_content_length(&response)?;
|
||||
|
||||
if !content_type.contains("image") {
|
||||
return Err(ImageDownloadError::ContentType);
|
||||
|
@ -58,33 +60,4 @@ impl ImageRequest {
|
|||
pub fn content_length(&self) -> usize {
|
||||
self.content_length
|
||||
}
|
||||
|
||||
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||
let status_code = response.status();
|
||||
|
||||
if !status_code.is_success() {
|
||||
log::warn!("response: {status_code}");
|
||||
return Err(ImageDownloadError::Http);
|
||||
}
|
||||
|
||||
response
|
||||
.headers()
|
||||
.get(reqwest::header::CONTENT_LENGTH)
|
||||
.and_then(|content_length| content_length.to_str().ok())
|
||||
.and_then(|content_length| content_length.parse::<usize>().ok())
|
||||
.ok_or(ImageDownloadError::ContentLength)
|
||||
}
|
||||
|
||||
fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
|
||||
if response.status().is_success() {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|val| val.to_str().ok())
|
||||
.map(|val| val.to_string())
|
||||
.ok_or(ImageDownloadError::ContentType)
|
||||
} else {
|
||||
Err(ImageDownloadError::ContentType)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ use libxml::{
|
|||
xpath::Context,
|
||||
};
|
||||
use reqwest::{
|
||||
header::{HeaderMap, HeaderName, HeaderValue},
|
||||
header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE},
|
||||
Response,
|
||||
};
|
||||
use tokio::fs::DirEntry;
|
||||
|
@ -14,6 +14,7 @@ use crate::{
|
|||
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
|
||||
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
||||
image_object::ImageObject,
|
||||
images::ImageDownloadError,
|
||||
video_object::VideoObject,
|
||||
};
|
||||
|
||||
|
@ -1191,6 +1192,35 @@ impl Util {
|
|||
/// Scores an element by its position: half of `len` minus `index`, computed
/// in `f32` and then cast to `i32` (the cast drops the fractional part).
///
/// Indices in the first half yield positive scores, indices past the middle
/// yield negative ones.
pub fn score_by_position(len: usize, index: usize) -> i32 {
    let midpoint = len as f32 / 2.0;
    let offset = index as f32;
    (midpoint - offset) as i32
}
|
||||
|
||||
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||
let status_code = response.status();
|
||||
|
||||
if !status_code.is_success() {
|
||||
log::warn!("response: {status_code}");
|
||||
return Err(ImageDownloadError::Http);
|
||||
}
|
||||
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_LENGTH)
|
||||
.and_then(|content_length| content_length.to_str().ok())
|
||||
.and_then(|content_length| content_length.parse::<usize>().ok())
|
||||
.ok_or(ImageDownloadError::ContentLength)
|
||||
}
|
||||
|
||||
pub fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
|
||||
if response.status().is_success() {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|val| val.to_str().ok())
|
||||
.map(|val| val.to_string())
|
||||
.ok_or(ImageDownloadError::ContentType)
|
||||
} else {
|
||||
Err(ImageDownloadError::ContentType)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue