mirror of https://gitlab.com/news-flash/article_scraper.git

commit 1ecc0fc4b4
parent 71055eed1c
Author: Jan Lukas Gernert
Date:   2020-02-03 17:46:54 +01:00

    option to set custom reqwest client

2 changed files with 36 additions and 19 deletions
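In short, both ImageDownloader and ArticleScraper gain a new_with_client constructor and a set_client setter, while the existing new() constructors delegate to new_with_client with a default Client (ArticleScraper::new also stops returning a Result). Below is a minimal sketch of the ArticleScraper side of the new API; the import path, config path, and client settings are illustrative assumptions, not part of this commit.

    use article_scraper::ArticleScraper; // assumed import path
    use reqwest::Client;
    use std::path::PathBuf;
    use std::time::Duration;

    fn build_scraper() -> ArticleScraper {
        // Any reqwest::Client works; the custom timeout is just an example.
        let client = Client::builder()
            .timeout(Duration::from_secs(30))
            .build()
            .expect("failed to build reqwest client");

        // New in this commit: inject the client at construction time...
        let mut scraper = ArticleScraper::new_with_client(PathBuf::from("./config"), client);

        // ...or replace it later; set_client also forwards the new client
        // to the inner ImageDownloader.
        scraper.set_client(Client::new());
        scraper
    }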

File 1 of 2: the ImageDownloader module (file path not shown in this view)

@@ -7,7 +7,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
-use reqwest;
+use reqwest::{Client, Response};
 use std;
 use std::error::Error;
 use url;
@@ -15,18 +15,26 @@ use url;
 mod error;
 
 pub struct ImageDownloader {
-    client: reqwest::Client,
+    client: Client,
     max_size: (u32, u32),
 }
 
 impl ImageDownloader {
-    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
+    pub fn new(max_size: (u32, u32)) -> Self {
+        Self::new_with_client(max_size, Client::new())
+    }
+
+    pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
         ImageDownloader {
-            client: reqwest::Client::new(),
-            max_size: max_size,
+            client,
+            max_size,
         }
     }
 
+    pub fn set_client(&mut self, client: Client) {
+        self.client = client;
+    }
+
     pub async fn download_images_from_string(
         &self,
         html: &str,
@@ -185,7 +193,7 @@ impl ImageDownloader {
     }
 
     fn check_image_content_type(
-        response: &reqwest::Response,
+        response: &Response,
     ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@@ -301,7 +309,7 @@ impl ImageDownloader {
         Err(ImageDownloadErrorKind::ParentDownload)?
     }
 
-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
                 if let Ok(content_length) = content_length.to_str() {
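Taken together, the ImageDownloader changes mirror the scraper: new(max_size) now falls back to a default Client, new_with_client accepts any caller-supplied client, and set_client swaps it afterwards. A small usage sketch, assuming the type is reachable at article_scraper::images::ImageDownloader (the exact module path is not shown in this diff):

    use article_scraper::images::ImageDownloader; // assumed module path
    use reqwest::Client;

    fn downloaders() -> (ImageDownloader, ImageDownloader) {
        // Reuse an application-wide client for image downloads...
        let shared = Client::new();
        let mut with_shared = ImageDownloader::new_with_client((2048, 2048), shared.clone());

        // ...and replace it later if needed.
        with_shared.set_client(shared);

        // The old constructor still works and falls back to Client::new().
        let with_default = ImageDownloader::new((1024, 1024));

        (with_shared, with_default)
    }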

File 2 of 2: the ArticleScraper module (file path not shown in this view)

@@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use regex;
-use reqwest;
+use reqwest::{Client, Response};
 use std::collections;
 use std::error::Error;
 use std::path::PathBuf;
@@ -27,11 +27,15 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: reqwest::Client,
+    client: Client,
 }
 
 impl ArticleScraper {
-    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf) -> Self {
+        Self::new_with_client(config_path, Client::new())
+    }
+
+    pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
         let config_files = Arc::new(RwLock::new(None));
 
         let locked_config_files = config_files.clone();
@@ -49,11 +53,16 @@ impl ArticleScraper {
             }
         });
 
-        Ok(ArticleScraper {
-            image_downloader: ImageDownloader::new((2048, 2048)),
+        ArticleScraper {
+            image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
             config_files,
-            client: reqwest::Client::new(),
-        })
+            client,
+        }
+    }
+
+    pub fn set_client(&mut self, client: Client) {
+        self.client = client.clone();
+        self.image_downloader.set_client(client);
     }
 
     pub async fn parse(
@@ -263,7 +272,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
             .send()
@@ -373,7 +382,7 @@ impl ArticleScraper {
         }
     }
 
-    fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
+    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                 if let Ok(content_type) = content_type.to_str() {
@@ -391,7 +400,7 @@ impl ArticleScraper {
         Err(ScraperErrorKind::Http)?
     }
 
-    fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
+    fn check_redirect(response: &Response) -> Option<url::Url> {
         if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
             debug!("Article url redirects to '{}'", response.url().as_str());
             return Some(response.url().clone());
@@ -809,7 +818,7 @@ mod tests {
         let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
+        let grabber = ArticleScraper::new(config_path);
         let article = grabber.parse(url, true).await.unwrap();
         article.save_html(&out_path).unwrap();
@@ -831,7 +840,7 @@ mod tests {
         )
         .unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
+        let grabber = ArticleScraper::new(config_path);
         let article = grabber.parse(url, true).await.unwrap();
         article.save_html(&out_path).unwrap();
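For completeness, an end-to-end sketch mirroring the updated tests: construction is now infallible (no .unwrap() on new), and a custom client can be injected up front. The import path, config path, and tokio runtime are assumptions; parse(url, true), save_html, and the test URL come from the diff above.

    use article_scraper::ArticleScraper; // assumed import path
    use reqwest::Client;
    use std::path::PathBuf;

    #[tokio::main]
    async fn main() {
        let config_path = PathBuf::from("./config"); // placeholder config directory
        let out_path = PathBuf::from("./test_output");

        // Inject a custom client instead of relying on the default one.
        let grabber = ArticleScraper::new_with_client(config_path, Client::new());

        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
        let article = grabber.parse(url, true).await.unwrap();
        article.save_html(&out_path).unwrap();
    }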