mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
allow downloads without a content type if they are no larger than 5 MiB
This commit is contained in:
parent
db007f752c
commit
40f065d9cd
2 changed files with 11 additions and 3 deletions
|
@ -3,6 +3,7 @@ use std::collections::HashSet;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use regex::{Regex, RegexBuilder};
|
use regex::{Regex, RegexBuilder};
|
||||||
|
|
||||||
|
pub const UNKNOWN_CONTENT_SIZE_LIMIT: usize = 5 * 1024 * 1024;
|
||||||
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||||
pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
|
pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
|
||||||
RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)
|
RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)
|
||||||
|
|
|
@ -2,6 +2,7 @@ pub use self::error::ImageDownloadError;
|
||||||
use self::image_data::ImageDataBase64;
|
use self::image_data::ImageDataBase64;
|
||||||
use self::pair::Pair;
|
use self::pair::Pair;
|
||||||
use self::request::ImageRequest;
|
use self::request::ImageRequest;
|
||||||
|
use crate::constants;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
@ -36,13 +37,19 @@ impl ImageDownloader {
|
||||||
) -> Result<Vec<u8>, ImageDownloadError> {
|
) -> Result<Vec<u8>, ImageDownloadError> {
|
||||||
let response = client.get(url).send().await?;
|
let response = client.get(url).send().await?;
|
||||||
|
|
||||||
let content_type = Util::get_content_type(&response)?;
|
let content_type = Util::get_content_type(&response);
|
||||||
let content_length = Util::get_content_length(&response).unwrap_or(0);
|
let content_length = Util::get_content_length(&response);
|
||||||
|
|
||||||
if !content_type.contains("image") {
|
if let (Err(_), Ok(content_length)) = (&content_type, &content_length) {
|
||||||
|
if *content_length > constants::UNKNOWN_CONTENT_SIZE_LIMIT {
|
||||||
|
return Err(ImageDownloadError::ContentType);
|
||||||
|
}
|
||||||
|
} else if !content_type?.contains("image") {
|
||||||
return Err(ImageDownloadError::ContentType);
|
return Err(ImageDownloadError::ContentType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let content_length = content_length.unwrap_or(0);
|
||||||
|
|
||||||
let mut stream = response.bytes_stream();
|
let mut stream = response.bytes_stream();
|
||||||
let mut downloaded_bytes = 0;
|
let mut downloaded_bytes = 0;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue