
Merge branch 'master' of gitlab.com:news-flash/article_scraper

Jan Lukas Gernert 2020-04-28 02:34:24 +02:00
commit 1fbce6413d
3 changed files with 64 additions and 66 deletions
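Note: this diff removes the reqwest::Client fields from ImageDownloader and ArticleScraper and instead passes a borrowed &Client into the public async methods, so callers create and share one HTTP client (and its connection pool) themselves. Below is a minimal sketch of the resulting call pattern, modeled on the updated tests in this commit; the config path, article URL, output path, and the crate import name article_scraper are placeholders/assumptions:

use article_scraper::ArticleScraper; // crate import name is an assumption
use reqwest::Client;
use std::path::PathBuf;

#[tokio::main]
async fn main() {
    // One shared Client: reqwest keeps its connection pool inside the Client,
    // so reusing it across calls is the point of this refactor.
    let client = Client::new();

    // After this commit, new() returns Self directly instead of a Result.
    let scraper = ArticleScraper::new(PathBuf::from("./config_files"));

    // The client is borrowed per call; `true` also downloads and inlines images.
    let url = url::Url::parse("https://example.com/article.html").unwrap();
    let article = scraper
        .parse(url, true, &client)
        .await
        .expect("failed to scrape article");

    article
        .save_html(&PathBuf::from("./test_output"))
        .expect("failed to save html");
}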

Changed file 1 of 3

@@ -19,7 +19,7 @@ impl Article {
         if let Some(ref html) = self.html {
             if let Ok(()) = std::fs::create_dir_all(&path) {
                 let mut file_name = match self.title.clone() {
-                    Some(file_name) => file_name,
+                    Some(file_name) => file_name.replace("/", "_"),
                     None => "Unknown Title".to_owned(),
                 };
                 file_name.push_str(".html");

Changed file 2 of 3

@@ -7,7 +7,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
-use reqwest;
+use reqwest::{Client, Response};
 use std;
 use std::error::Error;
 use url;
@@ -15,21 +15,18 @@ use url;
 mod error;
 
 pub struct ImageDownloader {
-    client: reqwest::Client,
     max_size: (u32, u32),
 }
 
 impl ImageDownloader {
-    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
-        ImageDownloader {
-            client: reqwest::Client::new(),
-            max_size: max_size,
-        }
+    pub fn new(max_size: (u32, u32)) -> Self {
+        ImageDownloader { max_size }
     }
 
     pub async fn download_images_from_string(
         &self,
         html: &str,
+        client: &Client,
     ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
@@ -42,7 +39,8 @@ impl ImageDownloader {
             ImageDownloadErrorKind::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx).await?;
+        self.download_images_from_context(&xpath_ctx, client)
+            .await?;
 
         let options = SaveOptions {
             format: false,
@@ -60,6 +58,7 @@ impl ImageDownloader {
     pub async fn download_images_from_context(
         &self,
         context: &Context,
+        client: &Client,
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
@@ -68,13 +67,13 @@ impl ImageDownloader {
             if let Some(url) = node.get_property("src") {
                 if !url.starts_with("data:") {
                     if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url).await {
+                        let parent_url = match self.check_image_parent(&node, &url, client).await {
                             Ok(url) => Some(url),
                             Err(_) => None,
                         };
 
                         if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url).await
+                            self.save_image(&url, &parent_url, client).await
                         {
                             if let Err(_) = node.set_property("src", &small_image) {
                                 return Err(ImageDownloadErrorKind::HtmlParse)?;
@@ -97,9 +96,9 @@ impl ImageDownloader {
         &self,
         image_url: &url::Url,
         parent_url: &Option<url::Url>,
+        client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = self
-            .client
+        let response = client
             .get(image_url.clone())
             .send()
             .await
@@ -125,8 +124,7 @@ impl ImageDownloader {
         let mut big_image: Option<Vec<u8>> = None;
 
         if let Some(parent_url) = parent_url {
-            let response_big = self
-                .client
+            let response_big = client
                 .get(parent_url.clone())
                 .send()
                 .await
@@ -185,7 +183,7 @@ impl ImageDownloader {
     }
 
     fn check_image_content_type(
-        response: &reqwest::Response,
+        response: &Response,
     ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@@ -263,22 +261,21 @@ impl ImageDownloader {
         &self,
         node: &Node,
         child_url: &url::Url,
+        client: &Client,
     ) -> Result<url::Url, ImageDownloadError> {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
                     let parent_url =
                         url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self
-                        .client
+                    let parent_response = client
                         .head(parent_url.clone())
                         .send()
                         .await
                         .context(ImageDownloadErrorKind::ParentDownload)?;
                     let _ = ImageDownloader::check_image_content_type(&parent_response)
                         .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self
-                        .client
+                    let child_response = client
                         .get(child_url.clone())
                         .send()
                         .await
@@ -301,7 +298,7 @@ impl ImageDownloader {
         Err(ImageDownloadErrorKind::ParentDownload)?
     }
 
-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
                 if let Ok(content_length) = content_length.to_str() {
@@ -318,6 +315,7 @@ impl ImageDownloader {
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
     use std::fs;
     use std::io::Write;
@@ -327,7 +325,7 @@ mod tests {
         let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
         let result = image_dowloader
-            .download_images_from_string(&hdyleaflet)
+            .download_images_from_string(&hdyleaflet, &Client::new())
             .await
             .expect("Failed to downalod images");
         let mut file =
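With the client supplied by the caller, one connection pool can serve many documents. A small sketch of reusing a single Client across repeated download_images_from_string calls; the HTML inputs are placeholders and the images::ImageDownloader import path is an assumption:

use article_scraper::images::ImageDownloader; // import path is an assumption
use reqwest::Client;

async fn inline_all(pages: Vec<String>) -> Vec<String> {
    let downloader = ImageDownloader::new((2048, 2048));
    // One client for every page; each call borrows it instead of the
    // struct owning its own client as before this commit.
    let client = Client::new();

    let mut out = Vec::new();
    for html in pages {
        let rewritten = downloader
            .download_images_from_string(&html, &client)
            .await
            .expect("failed to download images");
        out.push(rewritten);
    }
    out
}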

Changed file 3 of 3

@@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use regex;
-use reqwest;
+use reqwest::{Client, Response};
 use std::collections;
 use std::error::Error;
 use std::path::PathBuf;
@@ -27,11 +27,10 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: reqwest::Client,
 }
 
 impl ArticleScraper {
-    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf) -> Self {
         let config_files = Arc::new(RwLock::new(None));
         let locked_config_files = config_files.clone();
 
@@ -49,21 +48,20 @@ impl ArticleScraper {
             }
         });
 
-        Ok(ArticleScraper {
+        ArticleScraper {
             image_downloader: ImageDownloader::new((2048, 2048)),
             config_files,
-            client: reqwest::Client::new(),
-        })
+        }
     }
 
     pub async fn parse(
         &self,
         url: url::Url,
         download_images: bool,
+        client: &Client,
     ) -> Result<Article, ScraperError> {
         info!("Scraping article: '{}'", url.as_str());
-        let response = self
-            .client
+        let response = client
             .head(url.clone())
             .send()
             .await
@@ -108,7 +106,7 @@ impl ArticleScraper {
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, &config)
+        self.parse_pages(&mut article, &url, &mut root, &config, client)
             .await?;
 
         let context = Context::new(&document).map_err(|()| {
@@ -121,15 +119,15 @@ impl ArticleScraper {
             return Err(error);
         }
 
-        if let Err(error) = ArticleScraper::eliminate_noscrip_tag(&context) {
-            error!("Eliminating <noscript> tag failed - '{}'", error);
-            return Err(error);
-        }
+        // if let Err(error) = ArticleScraper::eliminate_noscript_tag(&context) {
+        //     error!("Eliminating <noscript> tag failed - {}", error);
+        //     return Err(error)
+        // }
 
         if download_images {
             if let Err(error) = self
                 .image_downloader
-                .download_images_from_context(&context)
+                .download_images_from_context(&context, client)
                 .await
             {
                 error!("Downloading images failed: '{}'", error);
@@ -159,8 +157,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
 
         let mut document = Self::parse_html(html, config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
@@ -174,9 +173,10 @@ impl ArticleScraper {
             if !result.trim().is_empty() {
                 // parse again with single page url
                 debug!("Single page link found '{}'", result);
-                let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
+                let single_page_url =
+                    url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                 return self
-                    .parse_single_page(article, &single_page_url, root, config)
+                    .parse_single_page(article, &single_page_url, root, config, client)
                     .await;
             }
         }
@@ -188,7 +188,7 @@ impl ArticleScraper {
 
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, &self.client).await?;
+                let html = ArticleScraper::download(&url, client).await?;
                 document = Self::parse_html(html, config)?;
                 xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
@@ -252,8 +252,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let document = Self::parse_html(html, config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
@@ -263,7 +264,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
             .send()
@@ -373,7 +374,7 @@ impl ArticleScraper {
         }
     }
 
-    fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
+    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                 if let Ok(content_type) = content_type.to_str() {
@@ -391,7 +392,7 @@ impl ArticleScraper {
         Err(ScraperErrorKind::Http)?
     }
 
-    fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
+    fn check_redirect(response: &Response) -> Option<url::Url> {
         if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
             debug!("Article url redirects to '{}'", response.url().as_str());
             return Some(response.url().clone());
@@ -646,7 +647,7 @@ impl ArticleScraper {
         );
 
         // strip all scripts
-        let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
+        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
 
         // strip all comments
         let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@@ -782,28 +783,27 @@ impl ArticleScraper {
         Ok(())
     }
 
-    fn eliminate_noscrip_tag(context: &Context) -> Result<(), ScraperError> {
-        let xpath = "//noscript";
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
-
-        for mut node in node_vec {
-            if let Some(mut parent) = node.get_parent() {
-                node.unlink();
-                let children = node.get_child_nodes();
-                for mut child in children {
-                    child.unlink();
-                    let _ = parent.add_child(&mut child);
-                }
-            }
-        }
-
-        Ok(())
-    }
+    // fn eliminate_noscript_tag(context: &Context) -> Result<(), ScraperError> {
+    //     let xpath = "//noscript";
+    //     let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+    //     for mut node in node_vec {
+    //         if let Some(mut parent) = node.get_parent() {
+    //             node.unlink();
+    //             let children = node.get_child_nodes();
+    //             for mut child in children {
+    //                 child.unlink();
+    //                 let _ = parent.add_child(&mut child);
+    //             }
+    //         }
+    //     }
+    //     Ok(())
+    // }
 }
 
 #[cfg(test)]
 mod tests {
     use crate::*;
+    use reqwest::Client;
 
     #[tokio::test(basic_scheduler)]
     async fn golem() {
@@ -811,8 +811,8 @@ mod tests {
         let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(
@@ -833,8 +833,8 @@ mod tests {
         )
         .unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(