Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-08 00:19:59 +02:00
Commit 1ecc0fc4b4 (parent 71055eed1c): option to set custom reqwest client

2 changed files with 36 additions and 19 deletions (the ImageDownloader module and src/lib.rs).
@@ -7,7 +7,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
-use reqwest;
+use reqwest::{Client, Response};
 use std;
 use std::error::Error;
 use url;
@@ -15,18 +15,26 @@ use url;
 mod error;

 pub struct ImageDownloader {
-    client: reqwest::Client,
+    client: Client,
     max_size: (u32, u32),
 }

 impl ImageDownloader {
-    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
+    pub fn new(max_size: (u32, u32)) -> Self {
+        Self::new_with_client(max_size, Client::new())
+    }
+
+    pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
         ImageDownloader {
-            client: reqwest::Client::new(),
-            max_size: max_size,
+            client,
+            max_size,
         }
     }

+    pub fn set_client(&mut self, client: Client) {
+        self.client = client;
+    }
+
     pub async fn download_images_from_string(
         &self,
         html: &str,
@@ -185,7 +193,7 @@ impl ImageDownloader {
     }

     fn check_image_content_type(
-        response: &reqwest::Response,
+        response: &Response,
     ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@@ -301,7 +309,7 @@ impl ImageDownloader {
             Err(ImageDownloadErrorKind::ParentDownload)?
     }

-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
                 if let Ok(content_length) = content_length.to_str() {
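The new constructor and setter can be used roughly as follows. This is a minimal sketch, not code from the commit: the import path for ImageDownloader and the client builder settings are assumptions.

    use reqwest::Client;

    // The module path is an assumption; adjust to wherever the crate exports ImageDownloader.
    use article_scraper::images::ImageDownloader;

    fn build_downloader() -> ImageDownloader {
        // Illustrative client configuration; any reqwest::Client works here.
        let client = Client::builder()
            .user_agent("my-feed-reader/1.0")
            .build()
            .expect("failed to build reqwest client");

        // Inject the client at construction time; ImageDownloader::new() now
        // just delegates to new_with_client() with a default Client.
        let mut downloader = ImageDownloader::new_with_client((2048, 2048), client);

        // The client can also be swapped out later.
        downloader.set_client(Client::new());

        downloader
    }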
src/lib.rs (33 changes)

@@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use regex;
-use reqwest;
+use reqwest::{Client, Response};
 use std::collections;
 use std::error::Error;
 use std::path::PathBuf;
@@ -27,11 +27,15 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: reqwest::Client,
+    client: Client,
 }

 impl ArticleScraper {
-    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf) -> Self {
+        Self::new_with_client(config_path, Client::new())
+    }
+
+    pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
         let config_files = Arc::new(RwLock::new(None));

         let locked_config_files = config_files.clone();
@@ -49,11 +53,16 @@ impl ArticleScraper {
             }
         });

-        Ok(ArticleScraper {
-            image_downloader: ImageDownloader::new((2048, 2048)),
+        ArticleScraper {
+            image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
             config_files,
-            client: reqwest::Client::new(),
-        })
+            client,
+        }
     }

+    pub fn set_client(&mut self, client: Client) {
+        self.client = client.clone();
+        self.image_downloader.set_client(client);
+    }
+
     pub async fn parse(
@@ -263,7 +272,7 @@ impl ArticleScraper {
         Ok(())
     }

-    async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
             .send()
@@ -373,7 +382,7 @@ impl ArticleScraper {
         }
     }

-    fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
+    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                 if let Ok(content_type) = content_type.to_str() {
@@ -391,7 +400,7 @@ impl ArticleScraper {
             Err(ScraperErrorKind::Http)?
         }

-    fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
+    fn check_redirect(response: &Response) -> Option<url::Url> {
         if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
             debug!("Article url redirects to '{}'", response.url().as_str());
             return Some(response.url().clone());
@@ -809,7 +818,7 @@ mod tests {
         let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();

-        let grabber = ArticleScraper::new(config_path).unwrap();
+        let grabber = ArticleScraper::new(config_path);
         let article = grabber.parse(url, true).await.unwrap();
         article.save_html(&out_path).unwrap();
@@ -831,7 +840,7 @@ mod tests {
         )
         .unwrap();

-        let grabber = ArticleScraper::new(config_path).unwrap();
+        let grabber = ArticleScraper::new(config_path);
         let article = grabber.parse(url, true).await.unwrap();
         article.save_html(&out_path).unwrap();
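Taken together, the two files let a caller hand one reqwest Client to the scraper, which then shares it with its internal ImageDownloader. A minimal usage sketch follows; it assumes the crate root exports ArticleScraper, that the caller runs inside an async runtime such as tokio, and the config path, client settings, and URLs are illustrative only.

    use article_scraper::ArticleScraper;
    use reqwest::Client;
    use std::path::PathBuf;

    #[tokio::main]
    async fn main() {
        // Illustrative client; proxies, timeouts, or cookie stores could be configured here.
        let client = Client::builder()
            .user_agent("my-feed-reader/1.0")
            .build()
            .expect("failed to build reqwest client");

        // new_with_client() clones the client into the internal ImageDownloader,
        // so article downloads and image downloads share the same configuration.
        let mut scraper = ArticleScraper::new_with_client(PathBuf::from("./ftr-site-config"), client);

        // Replacing the client later also forwards it to the image downloader.
        scraper.set_client(Client::new());

        let url = url::Url::parse("https://example.com/some-article").unwrap();
        let article = scraper.parse(url, true).await.unwrap();
        article.save_html(&PathBuf::from("./test_output")).unwrap();
    }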