mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
allow downloads without a content type if they are no larger than 5 MiB
This commit is contained in:
parent
db007f752c
commit
40f065d9cd
2 changed files with 11 additions and 3 deletions
|
@ -3,6 +3,7 @@ use std::collections::HashSet;
|
|||
use once_cell::sync::Lazy;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
|
||||
/// Size limit in bytes (5 * 1024 * 1024, i.e. 5 MiB) applied to downloads whose
/// content type could not be read: the image downloader compares the response's
/// reported content length against this value and rejects the download with a
/// content-type error when the length is larger.
pub const UNKNOWN_CONTENT_SIZE_LIMIT: usize = 5 * 1024 * 1024;
|
||||
/// Default character threshold (500).
// NOTE(review): presumably the minimum number of characters extracted content
// must contain to be accepted — no use is visible here; confirm against callers.
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||
pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)
|
||||
|
|
|
@ -2,6 +2,7 @@ pub use self::error::ImageDownloadError;
|
|||
use self::image_data::ImageDataBase64;
|
||||
use self::pair::Pair;
|
||||
use self::request::ImageRequest;
|
||||
use crate::constants;
|
||||
use crate::util::Util;
|
||||
use base64::Engine;
|
||||
use futures::StreamExt;
|
||||
|
@ -36,12 +37,18 @@ impl ImageDownloader {
|
|||
) -> Result<Vec<u8>, ImageDownloadError> {
|
||||
let response = client.get(url).send().await?;
|
||||
|
||||
let content_type = Util::get_content_type(&response)?;
|
||||
let content_length = Util::get_content_length(&response).unwrap_or(0);
|
||||
let content_type = Util::get_content_type(&response);
|
||||
let content_length = Util::get_content_length(&response);
|
||||
|
||||
if !content_type.contains("image") {
|
||||
if let (Err(_), Ok(content_length)) = (&content_type, &content_length) {
|
||||
if *content_length > constants::UNKNOWN_CONTENT_SIZE_LIMIT {
|
||||
return Err(ImageDownloadError::ContentType);
|
||||
}
|
||||
} else if !content_type?.contains("image") {
|
||||
return Err(ImageDownloadError::ContentType);
|
||||
}
|
||||
|
||||
let content_length = content_length.unwrap_or(0);
|
||||
|
||||
let mut stream = response.bytes_stream();
|
||||
let mut downloaded_bytes = 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue