1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 08:05:31 +02:00

Allow downloads without a content type if they are smaller than 5 MB

This commit is contained in:
Jan Lukas Gernert 2023-07-28 07:03:50 +02:00
parent db007f752c
commit 40f065d9cd
2 changed files with 11 additions and 3 deletions

View file

@ -3,6 +3,7 @@ use std::collections::HashSet;
use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder};
/// Upper bound (5 * 1024 * 1024, i.e. 5 MiB if the length is counted in bytes)
/// on the reported content length of a download whose content type could not
/// be determined; the image downloader rejects larger responses.
pub const UNKNOWN_CONTENT_SIZE_LIMIT: usize = 5 * 1024 * 1024;
/// Default character-count threshold.
// NOTE(review): presumably the minimum amount of text for extracted content
// to be accepted — the use site is not visible here; confirm against callers.
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)

View file

@ -2,6 +2,7 @@ pub use self::error::ImageDownloadError;
use self::image_data::ImageDataBase64;
use self::pair::Pair;
use self::request::ImageRequest;
use crate::constants;
use crate::util::Util;
use base64::Engine;
use futures::StreamExt;
@ -36,12 +37,18 @@ impl ImageDownloader {
) -> Result<Vec<u8>, ImageDownloadError> {
let response = client.get(url).send().await?;
let content_type = Util::get_content_type(&response)?;
let content_length = Util::get_content_length(&response).unwrap_or(0);
let content_type = Util::get_content_type(&response);
let content_length = Util::get_content_length(&response);
if !content_type.contains("image") {
if let (Err(_), Ok(content_length)) = (&content_type, &content_length) {
if *content_length > constants::UNKNOWN_CONTENT_SIZE_LIMIT {
return Err(ImageDownloadError::ContentType);
}
} else if !content_type?.contains("image") {
return Err(ImageDownloadError::ContentType);
}
let content_length = content_length.unwrap_or(0);
let mut stream = response.bytes_stream();
let mut downloaded_bytes = 0;