Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)
require client for parsing

parent a7c247549a
commit d2960d8539

2 changed files with 31 additions and 48 deletions
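
For orientation before the diff: this change removes the reqwest::Client that ImageDownloader and ArticleScraper used to store internally and instead requires the caller to pass a &Client into parse() and the image-download methods. A minimal sketch of the new call shape, mirroring the updated tests further down; the crate path, config directory, and output path are assumptions for illustration only:

// Sketch only, not part of the diff: driving ArticleScraper after this commit.
use std::path::PathBuf;
use article_scraper::ArticleScraper; // assumed crate/module path
use reqwest::Client;

#[tokio::main]
async fn main() {
    let client = Client::new(); // the caller now owns the HTTP client
    let scraper = ArticleScraper::new(PathBuf::from("./ftr-site-config")); // assumed config dir
    let url = url::Url::parse("https://example.com/article.html").unwrap();
    // parse() now takes the client as an explicit argument.
    let article = scraper.parse(url, true, &client).await.unwrap();
    article.save_html(&PathBuf::from("./output")).unwrap(); // assumed output path
}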
@@ -15,29 +15,18 @@ use url;
 mod error;
 
 pub struct ImageDownloader {
-    client: Client,
     max_size: (u32, u32),
 }
 
 impl ImageDownloader {
     pub fn new(max_size: (u32, u32)) -> Self {
-        Self::new_with_client(max_size, Client::new())
-    }
-
-    pub fn new_with_client(max_size: (u32, u32), client: Client) -> Self {
-        ImageDownloader {
-            client,
-            max_size,
-        }
-    }
-
-    pub fn set_client(&mut self, client: Client) {
-        self.client = client;
+        ImageDownloader { max_size }
     }
 
     pub async fn download_images_from_string(
         &self,
         html: &str,
+        client: &Client,
     ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
@@ -50,7 +39,8 @@ impl ImageDownloader {
             ImageDownloadErrorKind::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx).await?;
+        self.download_images_from_context(&xpath_ctx, client)
+            .await?;
 
         let options = SaveOptions {
             format: false,
@@ -68,6 +58,7 @@ impl ImageDownloader {
     pub async fn download_images_from_context(
         &self,
         context: &Context,
+        client: &Client,
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
@@ -76,13 +67,13 @@ impl ImageDownloader {
             if let Some(url) = node.get_property("src") {
                 if !url.starts_with("data:") {
                     if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url).await {
+                        let parent_url = match self.check_image_parent(&node, &url, client).await {
                             Ok(url) => Some(url),
                             Err(_) => None,
                         };
 
                         if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url).await
+                            self.save_image(&url, &parent_url, client).await
                         {
                             if let Err(_) = node.set_property("src", &small_image) {
                                 return Err(ImageDownloadErrorKind::HtmlParse)?;
@@ -105,9 +96,9 @@ impl ImageDownloader {
         &self,
         image_url: &url::Url,
         parent_url: &Option<url::Url>,
+        client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = self
-            .client
+        let response = client
            .get(image_url.clone())
            .send()
            .await
@@ -133,8 +124,7 @@ impl ImageDownloader {
         let mut big_image: Option<Vec<u8>> = None;
 
         if let Some(parent_url) = parent_url {
-            let response_big = self
-                .client
+            let response_big = client
                .get(parent_url.clone())
                .send()
                .await
@@ -271,22 +261,21 @@ impl ImageDownloader {
         &self,
         node: &Node,
         child_url: &url::Url,
+        client: &Client,
     ) -> Result<url::Url, ImageDownloadError> {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
                     let parent_url =
                         url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self
-                        .client
+                    let parent_response = client
                        .head(parent_url.clone())
                        .send()
                        .await
                        .context(ImageDownloadErrorKind::ParentDownload)?;
                     let _ = ImageDownloader::check_image_content_type(&parent_response)
                        .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self
-                        .client
+                    let child_response = client
                        .get(child_url.clone())
                        .send()
                        .await
@@ -326,6 +315,7 @@ impl ImageDownloader {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
     use std::fs;
     use std::io::Write;
 
@@ -335,7 +325,7 @@ mod tests {
         let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
         let result = image_dowloader
-            .download_images_from_string(&hdyleaflet)
+            .download_images_from_string(&hdyleaflet, &Client::new())
            .await
            .expect("Failed to downalod images");
         let mut file =
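
That concludes the image-downloader changes. For quick reference, a sketch of how the reworked ImageDownloader is driven, following the adjusted test above; the surrounding function and the module paths of ImageDownloader and ImageDownloadError are assumptions:

// Sketch only, not part of the diff: ImageDownloader after this commit.
async fn embed_images(html: &str, client: &reqwest::Client) -> Result<String, ImageDownloadError> {
    // The client is no longer stored in the struct; it is passed per call.
    let downloader = ImageDownloader::new((2048, 2048)); // max_size tuple, as in the struct above
    downloader.download_images_from_string(html, client).await
}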
src/lib.rs (39 changed lines)
@@ -27,15 +27,10 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: Client,
 }
 
 impl ArticleScraper {
     pub fn new(config_path: PathBuf) -> Self {
-        Self::new_with_client(config_path, Client::new())
-    }
-
-    pub fn new_with_client(config_path: PathBuf, client: Client) -> Self {
         let config_files = Arc::new(RwLock::new(None));
 
         let locked_config_files = config_files.clone();
@@ -54,25 +49,19 @@ impl ArticleScraper {
         });
 
         ArticleScraper {
-            image_downloader: ImageDownloader::new_with_client((2048, 2048), client.clone()),
+            image_downloader: ImageDownloader::new((2048, 2048)),
             config_files,
-            client,
         }
     }
 
-    pub fn set_client(&mut self, client: Client) {
-        self.client = client.clone();
-        self.image_downloader.set_client(client);
-    }
-
     pub async fn parse(
         &self,
         url: url::Url,
         download_images: bool,
+        client: &Client,
     ) -> Result<Article, ScraperError> {
         info!("Scraping article: '{}'", url.as_str());
-        let response = self
-            .client
+        let response = client
            .head(url.clone())
            .send()
            .await
@@ -117,7 +106,7 @@ impl ArticleScraper {
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, &config)
+        self.parse_pages(&mut article, &url, &mut root, &config, client)
             .await?;
 
         let context = Context::new(&document).map_err(|()| {
@@ -138,7 +127,7 @@ impl ArticleScraper {
         if download_images {
             if let Err(error) = self
                 .image_downloader
-                .download_images_from_context(&context)
+                .download_images_from_context(&context, client)
                 .await
             {
                 error!("Downloading images failed: '{}'", error);
@@ -168,8 +157,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let mut document = Self::parse_html(html, config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -183,9 +173,10 @@ impl ArticleScraper {
             if !result.trim().is_empty() {
                 // parse again with single page url
                 debug!("Single page link found '{}'", result);
-                let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
+                let single_page_url =
+                    url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                 return self
-                    .parse_single_page(article, &single_page_url, root, config)
+                    .parse_single_page(article, &single_page_url, root, config, client)
                     .await;
             }
         }
@@ -197,7 +188,7 @@ impl ArticleScraper {
 
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, &self.client).await?;
+                let html = ArticleScraper::download(&url, client).await?;
                 document = Self::parse_html(html, config)?;
                 xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
@@ -261,8 +252,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let document = Self::parse_html(html, config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
@@ -811,6 +803,7 @@ impl ArticleScraper {
 #[cfg(test)]
 mod tests {
     use crate::*;
+    use reqwest::Client;
 
     #[tokio::test(basic_scheduler)]
     async fn golem() {
@@ -819,7 +812,7 @@ mod tests {
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
         let grabber = ArticleScraper::new(config_path);
-        let article = grabber.parse(url, true).await.unwrap();
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(
@@ -841,7 +834,7 @@ mod tests {
         .unwrap();
 
         let grabber = ArticleScraper::new(config_path);
-        let article = grabber.parse(url, true).await.unwrap();
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(