mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 00:19:59 +02:00
option to set custom reqwest client
This commit is contained in:
parent
71055eed1c
commit
1ecc0fc4b4
2 changed files with 36 additions and 19 deletions
|
@ -7,7 +7,7 @@ use libxml::parser::Parser;
|
|||
use libxml::tree::{Node, SaveOptions};
|
||||
use libxml::xpath::Context;
|
||||
use log::{debug, error};
|
||||
use reqwest;
|
||||
use reqwest::{Client, Response};
|
||||
use std;
|
||||
use std::error::Error;
|
||||
use url;
|
||||
|
@ -15,18 +15,26 @@ use url;
|
|||
mod error;
|
||||
|
||||
pub struct ImageDownloader {
|
||||
client: reqwest::Client,
|
||||
client: Client,
|
||||
max_size: (u32, u32),
|
||||
}
|
||||
|
||||
impl ImageDownloader {
|
||||
pub fn new(max_size: (u32, u32)) -> ImageDownloader {
|
||||
pub fn new(max_size: (u32, u32)) -> Self {
|
||||
Self::new_with_client(max_size, Client::new())
|
||||
}
|
||||
|
||||
pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
|
||||
ImageDownloader {
|
||||
client: reqwest::Client::new(),
|
||||
max_size: max_size,
|
||||
client,
|
||||
max_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Replaces the `Client` held by this downloader with the given `client`.
pub fn set_client(&mut self, client: Client) {
|
||||
self.client = client;
|
||||
}
|
||||
|
||||
pub async fn download_images_from_string(
|
||||
&self,
|
||||
html: &str,
|
||||
|
@ -185,7 +193,7 @@ impl ImageDownloader {
|
|||
}
|
||||
|
||||
fn check_image_content_type(
|
||||
response: &reqwest::Response,
|
||||
response: &Response,
|
||||
) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
|
||||
if response.status().is_success() {
|
||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||
|
@ -301,7 +309,7 @@ impl ImageDownloader {
|
|||
Err(ImageDownloadErrorKind::ParentDownload)?
|
||||
}
|
||||
|
||||
fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
|
||||
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
||||
if response.status().is_success() {
|
||||
if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
|
||||
if let Ok(content_length) = content_length.to_str() {
|
||||
|
|
33
src/lib.rs
33
src/lib.rs
|
@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
|
|||
use libxml::xpath::Context;
|
||||
use log::{debug, error, info, warn};
|
||||
use regex;
|
||||
use reqwest;
|
||||
use reqwest::{Client, Response};
|
||||
use std::collections;
|
||||
use std::error::Error;
|
||||
use std::path::PathBuf;
|
||||
|
@ -27,11 +27,15 @@ use url;
|
|||
pub struct ArticleScraper {
|
||||
pub image_downloader: ImageDownloader,
|
||||
config_files: Arc<RwLock<Option<ConfigCollection>>>,
|
||||
client: reqwest::Client,
|
||||
client: Client,
|
||||
}
|
||||
|
||||
impl ArticleScraper {
|
||||
pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
|
||||
pub fn new(config_path: PathBuf) -> Self {
|
||||
Self::new_with_client(config_path, Client::new())
|
||||
}
|
||||
|
||||
pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
|
||||
let config_files = Arc::new(RwLock::new(None));
|
||||
|
||||
let locked_config_files = config_files.clone();
|
||||
|
@ -49,11 +53,16 @@ impl ArticleScraper {
|
|||
}
|
||||
});
|
||||
|
||||
Ok(ArticleScraper {
|
||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||
ArticleScraper {
|
||||
image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
|
||||
config_files,
|
||||
client: reqwest::Client::new(),
|
||||
})
|
||||
client,
|
||||
}
|
||||
}
|
||||
|
||||
/// Replaces the `Client` used by this scraper and by its `image_downloader`.
///
/// A clone of `client` is stored on the scraper itself; the original value
/// is then handed to `image_downloader.set_client`.
pub fn set_client(&mut self, client: Client) {
|
||||
self.client = client.clone();
|
||||
self.image_downloader.set_client(client);
|
||||
}
|
||||
|
||||
pub async fn parse(
|
||||
|
@ -263,7 +272,7 @@ impl ArticleScraper {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
|
||||
async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
|
||||
let response = client
|
||||
.get(url.as_str())
|
||||
.send()
|
||||
|
@ -373,7 +382,7 @@ impl ArticleScraper {
|
|||
}
|
||||
}
|
||||
|
||||
fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
|
||||
fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
|
||||
if response.status().is_success() {
|
||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||
if let Ok(content_type) = content_type.to_str() {
|
||||
|
@ -391,7 +400,7 @@ impl ArticleScraper {
|
|||
Err(ScraperErrorKind::Http)?
|
||||
}
|
||||
|
||||
fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
|
||||
fn check_redirect(response: &Response) -> Option<url::Url> {
|
||||
if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
|
||||
debug!("Article url redirects to '{}'", response.url().as_str());
|
||||
return Some(response.url().clone());
|
||||
|
@ -809,7 +818,7 @@ mod tests {
|
|||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
||||
|
||||
let grabber = ArticleScraper::new(config_path).unwrap();
|
||||
let grabber = ArticleScraper::new(config_path);
|
||||
let article = grabber.parse(url, true).await.unwrap();
|
||||
article.save_html(&out_path).unwrap();
|
||||
|
||||
|
@ -831,7 +840,7 @@ mod tests {
|
|||
)
|
||||
.unwrap();
|
||||
|
||||
let grabber = ArticleScraper::new(config_path).unwrap();
|
||||
let grabber = ArticleScraper::new(config_path);
|
||||
let article = grabber.parse(url, true).await.unwrap();
|
||||
article.save_html(&out_path).unwrap();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue