1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

Add option to set a custom reqwest client

This commit is contained in:
Jan Lukas Gernert 2020-02-03 17:46:54 +01:00
parent 71055eed1c
commit 1ecc0fc4b4
2 changed files with 36 additions and 19 deletions

View file

@ -7,7 +7,7 @@ use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions}; use libxml::tree::{Node, SaveOptions};
use libxml::xpath::Context; use libxml::xpath::Context;
use log::{debug, error}; use log::{debug, error};
use reqwest; use reqwest::{Client, Response};
use std; use std;
use std::error::Error; use std::error::Error;
use url; use url;
@ -15,18 +15,26 @@ use url;
mod error; mod error;
pub struct ImageDownloader { pub struct ImageDownloader {
client: reqwest::Client, client: Client,
max_size: (u32, u32), max_size: (u32, u32),
} }
impl ImageDownloader { impl ImageDownloader {
pub fn new(max_size: (u32, u32)) -> ImageDownloader { pub fn new(max_size: (u32, u32)) -> Self {
Self::new_with_client(max_size, Client::new())
}
pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
ImageDownloader { ImageDownloader {
client: reqwest::Client::new(), client,
max_size: max_size, max_size,
} }
} }
pub fn set_client(&mut self, client: Client) {
self.client = client;
}
pub async fn download_images_from_string( pub async fn download_images_from_string(
&self, &self,
html: &str, html: &str,
@ -185,7 +193,7 @@ impl ImageDownloader {
} }
fn check_image_content_type( fn check_image_content_type(
response: &reqwest::Response, response: &Response,
) -> Result<reqwest::header::HeaderValue, ImageDownloadError> { ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) { if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@ -301,7 +309,7 @@ impl ImageDownloader {
Err(ImageDownloadErrorKind::ParentDownload)? Err(ImageDownloadErrorKind::ParentDownload)?
} }
fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> { fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) { if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
if let Ok(content_length) = content_length.to_str() { if let Ok(content_length) = content_length.to_str() {

View file

@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context; use libxml::xpath::Context;
use log::{debug, error, info, warn}; use log::{debug, error, info, warn};
use regex; use regex;
use reqwest; use reqwest::{Client, Response};
use std::collections; use std::collections;
use std::error::Error; use std::error::Error;
use std::path::PathBuf; use std::path::PathBuf;
@ -27,11 +27,15 @@ use url;
pub struct ArticleScraper { pub struct ArticleScraper {
pub image_downloader: ImageDownloader, pub image_downloader: ImageDownloader,
config_files: Arc<RwLock<Option<ConfigCollection>>>, config_files: Arc<RwLock<Option<ConfigCollection>>>,
client: reqwest::Client, client: Client,
} }
impl ArticleScraper { impl ArticleScraper {
pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> { pub fn new(config_path: PathBuf) -> Self {
Self::new_with_client(config_path, Client::new())
}
pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
let config_files = Arc::new(RwLock::new(None)); let config_files = Arc::new(RwLock::new(None));
let locked_config_files = config_files.clone(); let locked_config_files = config_files.clone();
@ -49,11 +53,16 @@ impl ArticleScraper {
} }
}); });
Ok(ArticleScraper { ArticleScraper {
image_downloader: ImageDownloader::new((2048, 2048)), image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
config_files, config_files,
client: reqwest::Client::new(), client,
}) }
}
pub fn set_client(&mut self, client: Client) {
self.client = client.clone();
self.image_downloader.set_client(client);
} }
pub async fn parse( pub async fn parse(
@ -263,7 +272,7 @@ impl ArticleScraper {
Ok(()) Ok(())
} }
async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> { async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
let response = client let response = client
.get(url.as_str()) .get(url.as_str())
.send() .send()
@ -373,7 +382,7 @@ impl ArticleScraper {
} }
} }
fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> { fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) { if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
if let Ok(content_type) = content_type.to_str() { if let Ok(content_type) = content_type.to_str() {
@ -391,7 +400,7 @@ impl ArticleScraper {
Err(ScraperErrorKind::Http)? Err(ScraperErrorKind::Http)?
} }
fn check_redirect(response: &reqwest::Response) -> Option<url::Url> { fn check_redirect(response: &Response) -> Option<url::Url> {
if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT { if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
debug!("Article url redirects to '{}'", response.url().as_str()); debug!("Article url redirects to '{}'", response.url().as_str());
return Some(response.url().clone()); return Some(response.url().clone());
@ -809,7 +818,7 @@ mod tests {
let out_path = PathBuf::from(r"./test_output"); let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap(); let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
let grabber = ArticleScraper::new(config_path).unwrap(); let grabber = ArticleScraper::new(config_path);
let article = grabber.parse(url, true).await.unwrap(); let article = grabber.parse(url, true).await.unwrap();
article.save_html(&out_path).unwrap(); article.save_html(&out_path).unwrap();
@ -831,7 +840,7 @@ mod tests {
) )
.unwrap(); .unwrap();
let grabber = ArticleScraper::new(config_path).unwrap(); let grabber = ArticleScraper::new(config_path);
let article = grabber.parse(url, true).await.unwrap(); let article = grabber.parse(url, true).await.unwrap();
article.save_html(&out_path).unwrap(); article.save_html(&out_path).unwrap();