mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
download single image
This commit is contained in:
parent
be40383b1a
commit
d562d41b81
3 changed files with 76 additions and 33 deletions
|
@ -4,6 +4,7 @@ use self::pair::Pair;
|
||||||
use self::request::ImageRequest;
|
use self::request::ImageRequest;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
|
use futures::StreamExt;
|
||||||
use image::ImageOutputFormat;
|
use image::ImageOutputFormat;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Node, SaveOptions};
|
use libxml::tree::{Node, SaveOptions};
|
||||||
|
@ -28,6 +29,45 @@ impl ImageDownloader {
|
||||||
ImageDownloader { max_size }
|
ImageDownloader { max_size }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn single_from_url(
|
||||||
|
url: &str,
|
||||||
|
client: &Client,
|
||||||
|
progress: Option<Sender<Progress>>,
|
||||||
|
) -> Result<Vec<u8>, ImageDownloadError> {
|
||||||
|
let response = client.get(url).send().await?;
|
||||||
|
|
||||||
|
let content_type = Util::get_content_type(&response)?;
|
||||||
|
let content_length = Util::get_content_length(&response)?;
|
||||||
|
|
||||||
|
if !content_type.contains("image") {
|
||||||
|
return Err(ImageDownloadError::ContentType);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut stream = response.bytes_stream();
|
||||||
|
let mut downloaded_bytes = 0;
|
||||||
|
|
||||||
|
let mut result = Vec::with_capacity(content_length);
|
||||||
|
while let Some(item) = stream.next().await {
|
||||||
|
let chunk = item?;
|
||||||
|
downloaded_bytes += chunk.len();
|
||||||
|
|
||||||
|
if let Some(sender) = progress.as_ref() {
|
||||||
|
_ = sender
|
||||||
|
.send(Progress {
|
||||||
|
total_size: content_length,
|
||||||
|
downloaded: downloaded_bytes,
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
for byte in chunk {
|
||||||
|
result.push(byte);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn download_images_from_string(
|
pub async fn download_images_from_string(
|
||||||
&self,
|
&self,
|
||||||
html: &str,
|
html: &str,
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use reqwest::{header::CONTENT_TYPE, Client, Response};
|
use reqwest::{Client, Response};
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
|
use crate::util::Util;
|
||||||
|
|
||||||
use super::{image_data::ImageData, ImageDownloadError};
|
use super::{image_data::ImageData, ImageDownloadError};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
@ -16,8 +18,8 @@ impl ImageRequest {
|
||||||
pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> {
|
pub async fn new(url: String, client: &Client) -> Result<Self, ImageDownloadError> {
|
||||||
let response = client.get(&url).send().await?;
|
let response = client.get(&url).send().await?;
|
||||||
|
|
||||||
let content_type = Self::get_content_type(&response)?;
|
let content_type = Util::get_content_type(&response)?;
|
||||||
let content_length = Self::get_content_length(&response)?;
|
let content_length = Util::get_content_length(&response)?;
|
||||||
|
|
||||||
if !content_type.contains("image") {
|
if !content_type.contains("image") {
|
||||||
return Err(ImageDownloadError::ContentType);
|
return Err(ImageDownloadError::ContentType);
|
||||||
|
@ -58,33 +60,4 @@ impl ImageRequest {
|
||||||
pub fn content_length(&self) -> usize {
|
pub fn content_length(&self) -> usize {
|
||||||
self.content_length
|
self.content_length
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
|
||||||
let status_code = response.status();
|
|
||||||
|
|
||||||
if !status_code.is_success() {
|
|
||||||
log::warn!("response: {status_code}");
|
|
||||||
return Err(ImageDownloadError::Http);
|
|
||||||
}
|
|
||||||
|
|
||||||
response
|
|
||||||
.headers()
|
|
||||||
.get(reqwest::header::CONTENT_LENGTH)
|
|
||||||
.and_then(|content_length| content_length.to_str().ok())
|
|
||||||
.and_then(|content_length| content_length.parse::<usize>().ok())
|
|
||||||
.ok_or(ImageDownloadError::ContentLength)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
|
|
||||||
if response.status().is_success() {
|
|
||||||
response
|
|
||||||
.headers()
|
|
||||||
.get(CONTENT_TYPE)
|
|
||||||
.and_then(|val| val.to_str().ok())
|
|
||||||
.map(|val| val.to_string())
|
|
||||||
.ok_or(ImageDownloadError::ContentType)
|
|
||||||
} else {
|
|
||||||
Err(ImageDownloadError::ContentType)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ use libxml::{
|
||||||
xpath::Context,
|
xpath::Context,
|
||||||
};
|
};
|
||||||
use reqwest::{
|
use reqwest::{
|
||||||
header::{HeaderMap, HeaderName, HeaderValue},
|
header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE},
|
||||||
Response,
|
Response,
|
||||||
};
|
};
|
||||||
use tokio::fs::DirEntry;
|
use tokio::fs::DirEntry;
|
||||||
|
@ -14,6 +14,7 @@ use crate::{
|
||||||
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
|
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
|
||||||
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
||||||
image_object::ImageObject,
|
image_object::ImageObject,
|
||||||
|
images::ImageDownloadError,
|
||||||
video_object::VideoObject,
|
video_object::VideoObject,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1191,6 +1192,35 @@ impl Util {
|
||||||
pub fn score_by_position(len: usize, index: usize) -> i32 {
|
pub fn score_by_position(len: usize, index: usize) -> i32 {
|
||||||
((len as f32 / 2.0) - index as f32) as i32
|
((len as f32 / 2.0) - index as f32) as i32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||||
|
let status_code = response.status();
|
||||||
|
|
||||||
|
if !status_code.is_success() {
|
||||||
|
log::warn!("response: {status_code}");
|
||||||
|
return Err(ImageDownloadError::Http);
|
||||||
|
}
|
||||||
|
|
||||||
|
response
|
||||||
|
.headers()
|
||||||
|
.get(CONTENT_LENGTH)
|
||||||
|
.and_then(|content_length| content_length.to_str().ok())
|
||||||
|
.and_then(|content_length| content_length.parse::<usize>().ok())
|
||||||
|
.ok_or(ImageDownloadError::ContentLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_content_type(response: &Response) -> Result<String, ImageDownloadError> {
|
||||||
|
if response.status().is_success() {
|
||||||
|
response
|
||||||
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|val| val.to_str().ok())
|
||||||
|
.map(|val| val.to_string())
|
||||||
|
.ok_or(ImageDownloadError::ContentType)
|
||||||
|
} else {
|
||||||
|
Err(ImageDownloadError::ContentType)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue