1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

port failure -> thiserror

This commit is contained in:
Jan Lukas Gernert 2022-12-01 09:22:08 +01:00
parent d906f6b7fe
commit 27be5a3204
11 changed files with 137 additions and 366 deletions

View file

@ -8,16 +8,16 @@ description = "Scrap article contents from the web. Powered by fivefilters full
repository = "https://gitlab.com/news-flash/article_scraper"
[dependencies]
failure = "0.1"
thiserror = "1.0"
libxml = "0.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
url = "2.2"
regex = "1.4"
tokio = { version = "1.22", features = ["macros", "fs", "io-util"] }
url = "2.3"
regex = "1.7"
encoding_rs = "0.8"
chrono = "0.4"
base64 = "0.13"
image = "0.24"
log = "0.4"
rust-embed="6.4"
once_cell = "1.15"
once_cell = "1.16"

View file

@ -1,57 +1,15 @@
use failure::{Backtrace, Context, Error, Fail};
use std::fmt;
use crate::{
full_text_parser::{config::ConfigError, error::FullTextParserError},
images::ImageDownloadError,
};
use thiserror::Error;
#[derive(Debug)]
pub struct ScraperError {
inner: Context<ScraperErrorKind>,
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
pub enum ScraperErrorKind {
#[fail(display = "Unknown Error")]
Unknown,
}
impl Fail for ScraperError {
fn cause(&self) -> Option<&dyn Fail> {
self.inner.cause()
}
fn backtrace(&self) -> Option<&Backtrace> {
self.inner.backtrace()
}
}
impl fmt::Display for ScraperError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(&self.inner, f)
}
}
impl ScraperError {
pub fn kind(&self) -> ScraperErrorKind {
*self.inner.get_context()
}
}
impl From<ScraperErrorKind> for ScraperError {
fn from(kind: ScraperErrorKind) -> ScraperError {
ScraperError {
inner: Context::new(kind),
}
}
}
impl From<Context<ScraperErrorKind>> for ScraperError {
fn from(inner: Context<ScraperErrorKind>) -> ScraperError {
ScraperError { inner }
}
}
impl From<Error> for ScraperError {
fn from(_: Error) -> ScraperError {
ScraperError {
inner: Context::new(ScraperErrorKind::Unknown),
}
}
#[derive(Error, Debug)]
pub enum ScraperError {
#[error("")]
Config(#[from] ConfigError),
#[error("")]
Image(#[from] ImageDownloadError),
#[error("")]
Scrap(#[from] FullTextParserError),
}

View file

@ -1,7 +1,6 @@
use crate::util::Util;
use super::error::{ConfigError, ConfigErrorKind};
use failure::ResultExt;
use super::error::ConfigError;
use std::borrow::Cow;
use std::io::Cursor;
use std::path::Path;
@ -37,9 +36,7 @@ pub struct ConfigEntry {
impl ConfigEntry {
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
let mut file = fs::File::open(&config_path)
.await
.context(ConfigErrorKind::IO)?;
let mut file = fs::File::open(&config_path).await?;
let buffer = BufReader::new(&mut file);
Self::parse(buffer).await

View file

@ -1,59 +1,9 @@
use failure::{Backtrace, Context, Error, Fail};
use std::fmt;
use thiserror::Error;
#[derive(Debug)]
pub struct ConfigError {
inner: Context<ConfigErrorKind>,
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
pub enum ConfigErrorKind {
#[fail(display = "IO Error")]
IO,
#[fail(display = "Unknown Error")]
/// Error type of the configuration module.
///
/// Returned by configuration parsing such as `ConfigEntry::parse_path`.
#[derive(Error, Debug)]
pub enum ConfigError {
/// An I/O operation failed; wraps the originating `std::io::Error`.
/// `#[from]` lets `?` convert a `std::io::Error` into this variant.
#[error("IO error")]
IO(#[from] std::io::Error),
/// Failure without a more specific classification.
#[error("Unknown Error")]
Unknown,
}
impl Fail for ConfigError {
fn cause(&self) -> Option<&dyn Fail> {
self.inner.cause()
}
fn backtrace(&self) -> Option<&Backtrace> {
self.inner.backtrace()
}
}
impl fmt::Display for ConfigError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(&self.inner, f)
}
}
// impl ConfigError {
// pub fn kind(&self) -> ConfigErrorKind {
// *self.inner.get_context()
// }
// }
impl From<ConfigErrorKind> for ConfigError {
fn from(kind: ConfigErrorKind) -> ConfigError {
ConfigError {
inner: Context::new(kind),
}
}
}
impl From<Context<ConfigErrorKind>> for ConfigError {
fn from(inner: Context<ConfigErrorKind>) -> ConfigError {
ConfigError { inner }
}
}
impl From<Error> for ConfigError {
fn from(_: Error) -> ConfigError {
ConfigError {
inner: Context::new(ConfigErrorKind::Unknown),
}
}
}

View file

@ -6,3 +6,4 @@ mod error;
pub use config_collection::ConfigCollection;
pub use config_entry::ConfigEntry;
pub use error::ConfigError;

View file

@ -1,71 +1,21 @@
use failure::{Backtrace, Context, Error, Fail};
use std::fmt;
use thiserror::Error;
#[derive(Debug)]
pub struct FullTextParserError {
inner: Context<FullTextParserErrorKind>,
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
pub enum FullTextParserErrorKind {
#[fail(display = "libXml Error")]
#[derive(Error, Debug)]
pub enum FullTextParserError {
#[error("libXml Error")]
Xml,
#[fail(display = "No content found")]
#[error("No content found")]
Scrape,
#[fail(display = "Url Error")]
Url,
#[fail(display = "Http request failed")]
#[error("Url Error")]
Url(#[from] url::ParseError),
#[error("Http request failed")]
Http,
#[fail(display = "Config Error")]
#[error("Config Error")]
Config,
#[fail(display = "IO Error")]
#[error("IO Error")]
IO,
#[fail(display = "Content-type suggest no html")]
#[error("Content-type suggest no html")]
ContentType,
#[fail(display = "Unknown Error")]
#[error("Unknown Error")]
Unknown,
}
impl Fail for FullTextParserError {
fn cause(&self) -> Option<&dyn Fail> {
self.inner.cause()
}
fn backtrace(&self) -> Option<&Backtrace> {
self.inner.backtrace()
}
}
impl fmt::Display for FullTextParserError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(&self.inner, f)
}
}
impl FullTextParserError {
pub fn kind(&self) -> FullTextParserErrorKind {
*self.inner.get_context()
}
}
impl From<FullTextParserErrorKind> for FullTextParserError {
fn from(kind: FullTextParserErrorKind) -> FullTextParserError {
FullTextParserError {
inner: Context::new(kind),
}
}
}
impl From<Context<FullTextParserErrorKind>> for FullTextParserError {
fn from(inner: Context<FullTextParserErrorKind>) -> FullTextParserError {
FullTextParserError { inner }
}
}
impl From<Error> for FullTextParserError {
fn from(_: Error) -> FullTextParserError {
FullTextParserError {
inner: Context::new(FullTextParserErrorKind::Unknown),
}
}
}

View file

@ -6,12 +6,11 @@ mod fingerprints;
mod tests;
use self::config::{ConfigCollection, ConfigEntry};
use self::error::{FullTextParserError, FullTextParserErrorKind};
use self::error::FullTextParserError;
use crate::article::Article;
use crate::util::Util;
use chrono::DateTime;
use encoding_rs::Encoding;
use failure::ResultExt;
use fingerprints::Fingerprints;
use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions};
@ -44,7 +43,7 @@ impl FullTextParser {
let global_config = self
.config_files
.get("global.txt")
.ok_or(FullTextParserErrorKind::Config)?;
.ok_or(FullTextParserError::Config)?;
let headers = Util::generate_headers(config, global_config)?;
@ -55,9 +54,8 @@ impl FullTextParser {
.await
.map_err(|err| {
error!("Failed head request to: '{}' - '{}'", url.as_str(), err);
err
})
.context(FullTextParserErrorKind::Http)?;
FullTextParserError::Http
})?;
// check if url redirects and we need to pick up the new url
let url = if let Some(new_url) = Util::check_redirect(&response, url) {
@ -69,7 +67,7 @@ impl FullTextParser {
// check if we are dealing with text/html
if !Util::check_content_type(&response)? {
return Err(FullTextParserErrorKind::ContentType.into());
return Err(FullTextParserError::ContentType);
}
let mut article = Article {
@ -80,9 +78,9 @@ impl FullTextParser {
html: None,
};
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
document.set_root_element(&root);
Self::generate_head(&mut root, &document)?;
@ -92,7 +90,7 @@ impl FullTextParser {
let context = Context::new(&document).map_err(|()| {
error!("Failed to create xpath context for extracted article");
FullTextParserErrorKind::Xml
FullTextParserError::Xml
})?;
if let Err(error) = Self::prevent_self_closing_tags(&context) {
@ -209,14 +207,14 @@ impl FullTextParser {
let parser = Parser::default_html();
Ok(parser.parse_string(html.as_str()).map_err(|err| {
error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserErrorKind::Xml
FullTextParserError::Xml
})?)
}
fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
Ok(Context::new(doc).map_err(|()| {
error!("Creating xpath context failed for downloaded HTML");
FullTextParserErrorKind::Xml
FullTextParserError::Xml
})?)
}
@ -256,16 +254,15 @@ impl FullTextParser {
url.as_str(),
err
);
err
})
.context(FullTextParserErrorKind::Http)?;
FullTextParserError::Http
})?;
if response.status().is_success() {
let headers = response.headers().clone();
let text = response
.text()
.await
.context(FullTextParserErrorKind::Http)?;
.map_err(|_| FullTextParserError::Http)?;
{
if let Some(decoded_html) =
Self::decode_html(&text, Self::get_encoding_from_html(&text))
@ -284,7 +281,7 @@ impl FullTextParser {
return Ok(text);
}
Err(FullTextParserErrorKind::Http.into())
Err(FullTextParserError::Http)
}
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
@ -338,7 +335,7 @@ impl FullTextParser {
}
None => {
error!("Getting config failed due to bad Url");
Err(FullTextParserErrorKind::Config.into())
Err(FullTextParserError::Config)
}
}
}
@ -366,7 +363,7 @@ impl FullTextParser {
for mut node in node_vec {
if let Some(correct_url) = node.get_property(property_url) {
if node.set_property("src", &correct_url).is_err() {
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
}
}
@ -385,7 +382,7 @@ impl FullTextParser {
node.unlink();
video_wrapper.add_child(&mut node).map_err(|_| {
error!("Failed to add iframe as child of video wrapper <div>");
FullTextParserErrorKind::Xml
FullTextParserError::Xml
})?;
}
}
@ -393,7 +390,7 @@ impl FullTextParser {
}
error!("Failed to add video wrapper <div> as parent of iframe");
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
error!("Failed to get parent of iframe");
@ -413,7 +410,7 @@ impl FullTextParser {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if node.remove_property(attribute).is_err() {
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
}
Ok(())
@ -431,7 +428,7 @@ impl FullTextParser {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if node.set_attribute(attribute, value).is_err() {
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
}
Ok(())
@ -449,7 +446,7 @@ impl FullTextParser {
}
}
Err(FullTextParserErrorKind::Xml.into())
Err(FullTextParserError::Xml)
}
fn repair_urls(
@ -464,7 +461,7 @@ impl FullTextParser {
if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) {
if let Ok(fixed_url) = Self::complete_url(article_url, &val) {
if node.set_attribute(attribute, fixed_url.as_str()).is_err() {
return Err(FullTextParserErrorKind::Scrape.into());
return Err(FullTextParserError::Scrape);
}
}
}
@ -486,7 +483,7 @@ impl FullTextParser {
completed_url.push_str("//");
completed_url.push_str(host);
}
_ => return Err(FullTextParserErrorKind::Scrape.into()),
_ => return Err(FullTextParserError::Scrape),
};
}
@ -494,7 +491,7 @@ impl FullTextParser {
completed_url.push('/');
}
completed_url.push_str(incomplete_url);
let url = url::Url::parse(&completed_url).context(FullTextParserErrorKind::Url)?;
let url = url::Url::parse(&completed_url)?;
Ok(url)
}
@ -678,7 +675,7 @@ impl FullTextParser {
if !found_something {
log::error!("no body found");
return Err(FullTextParserErrorKind::Scrape.into());
return Err(FullTextParserError::Scrape);
}
Ok(())
@ -694,7 +691,7 @@ impl FullTextParser {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if node.get_property("style").is_some() && node.remove_property("style").is_err() {
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
node.unlink();
@ -702,7 +699,7 @@ impl FullTextParser {
found_something = true;
} else {
error!("Failed to add body to prepared document");
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
}
}
@ -748,7 +745,7 @@ impl FullTextParser {
}
}
Err(FullTextParserErrorKind::Xml.into())
Err(FullTextParserError::Xml)
}
fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> {

View file

@ -1,89 +1,25 @@
use crate::full_text_parser::error::FullTextParserErrorKind;
use failure::{Backtrace, Context, Error, Fail};
use std::fmt;
use thiserror::Error;
#[derive(Debug)]
pub struct ImageDownloadError {
inner: Context<ImageDownloadErrorKind>,
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
pub enum ImageDownloadErrorKind {
#[fail(display = "Parsing the supplied html string failed")]
#[derive(Error, Debug)]
pub enum ImageDownloadError {
#[error("Parsing the supplied html string failed")]
HtmlParse,
#[fail(display = "Scaling down a downloaded image failed")]
#[error("Scaling down a downloaded image failed")]
ImageScale,
#[fail(display = "Downloading the parent element of an image failed")]
#[error("Downloading the parent element of an image failed")]
ParentDownload,
#[fail(display = "Generating image name failed")]
#[error("Generating image name failed")]
ImageName,
#[fail(display = "Getting the content-length property failed")]
#[error("Getting the content-length property failed")]
ContentLenght,
#[fail(display = "Content-type suggest no image")]
#[error("Content-type suggest no image")]
ContentType,
#[fail(display = "Http error")]
#[error("Http error")]
Http,
#[fail(display = "IO error")]
#[error("IO error")]
IO,
#[fail(display = "Invalid URL")]
InvalidUrl,
#[fail(display = "Unknown Error")]
#[error("Invalid URL")]
InvalidUrl(#[from] url::ParseError),
#[error("Unknown Error")]
Unknown,
}
impl Fail for ImageDownloadError {
fn cause(&self) -> Option<&dyn Fail> {
self.inner.cause()
}
fn backtrace(&self) -> Option<&Backtrace> {
self.inner.backtrace()
}
}
impl fmt::Display for ImageDownloadError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(&self.inner, f)
}
}
impl ImageDownloadError {
pub fn kind(&self) -> ImageDownloadErrorKind {
*self.inner.get_context()
}
}
impl From<ImageDownloadErrorKind> for ImageDownloadError {
fn from(kind: ImageDownloadErrorKind) -> ImageDownloadError {
ImageDownloadError {
inner: Context::new(kind),
}
}
}
impl From<Context<ImageDownloadErrorKind>> for ImageDownloadError {
fn from(inner: Context<ImageDownloadErrorKind>) -> ImageDownloadError {
ImageDownloadError { inner }
}
}
impl From<FullTextParserErrorKind> for ImageDownloadError {
fn from(kind: FullTextParserErrorKind) -> ImageDownloadError {
let kind = match kind {
FullTextParserErrorKind::Xml => ImageDownloadErrorKind::HtmlParse,
_ => ImageDownloadErrorKind::Unknown,
};
ImageDownloadError {
inner: Context::new(kind),
}
}
}
impl From<Error> for ImageDownloadError {
fn from(_: Error) -> ImageDownloadError {
ImageDownloadError {
inner: Context::new(ImageDownloadErrorKind::Unknown),
}
}
}

View file

@ -1,6 +1,5 @@
use self::error::{ImageDownloadError, ImageDownloadErrorKind};
pub use self::error::ImageDownloadError;
use crate::util::Util;
use failure::ResultExt;
use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions};
use libxml::xpath::Context;
@ -25,14 +24,13 @@ impl ImageDownloader {
client: &Client,
) -> Result<String, ImageDownloadError> {
let parser = Parser::default_html();
let doc = parser.parse_string(html).map_err(|_| {
error!("Failed to parse HTML string");
ImageDownloadErrorKind::HtmlParse
})?;
let doc = parser
.parse_string(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let xpath_ctx = Context::new(&doc).map_err(|()| {
error!("Failed to create xpath context for document");
ImageDownloadErrorKind::HtmlParse
ImageDownloadError::HtmlParse
})?;
self.download_images_from_context(&xpath_ctx, client)
@ -58,7 +56,7 @@ impl ImageDownloader {
) -> Result<(), ImageDownloadError> {
let xpath = "//img";
let node_vec = Util::evaluate_xpath(context, xpath, false)
.context(ImageDownloadErrorKind::HtmlParse)?;
.map_err(|_| ImageDownloadError::HtmlParse)?;
for mut node in node_vec {
if let Some(url) = node.get_property("src") {
if !url.starts_with("data:") {
@ -72,11 +70,11 @@ impl ImageDownloader {
self.save_image(&url, &parent_url, client).await
{
if node.set_property("src", &small_image).is_err() {
return Err(ImageDownloadErrorKind::HtmlParse.into());
return Err(ImageDownloadError::HtmlParse);
}
if let Some(big_image) = big_image {
if node.set_property("big-src", &big_image).is_err() {
return Err(ImageDownloadErrorKind::HtmlParse.into());
return Err(ImageDownloadError::HtmlParse);
}
}
}
@ -94,26 +92,21 @@ impl ImageDownloader {
parent_url: &Option<url::Url>,
client: &Client,
) -> Result<(String, Option<String>), ImageDownloadError> {
let response = client
.get(image_url.clone())
.send()
.await
.map_err(|err| {
error!("GET {} failed - {}", image_url.as_str(), err);
err
})
.context(ImageDownloadErrorKind::Http)?;
let response = client.get(image_url.clone()).send().await.map_err(|err| {
error!("GET {} failed - {}", image_url.as_str(), err);
ImageDownloadError::Http
})?;
let content_type_small = ImageDownloader::check_image_content_type(&response)?;
let content_type_small = content_type_small
.to_str()
.context(ImageDownloadErrorKind::ContentType)?;
.map_err(|_| ImageDownloadError::ContentType)?;
let mut content_type_big: Option<String> = None;
let mut small_image = response
.bytes()
.await
.context(ImageDownloadErrorKind::IO)?
.map_err(|_| ImageDownloadError::Http)?
.as_ref()
.to_vec();
@ -124,18 +117,18 @@ impl ImageDownloader {
.get(parent_url.clone())
.send()
.await
.context(ImageDownloadErrorKind::Http)?;
.map_err(|_| ImageDownloadError::Http)?;
content_type_big = Some(
ImageDownloader::check_image_content_type(&response_big)?
.to_str()
.context(ImageDownloadErrorKind::ContentType)?
.map_err(|_| ImageDownloadError::ContentType)?
.to_owned(),
);
big_image = Some(
response_big
.bytes()
.await
.context(ImageDownloadErrorKind::IO)?
.map_err(|_| ImageDownloadError::Http)?
.to_vec(),
);
}
@ -159,12 +152,10 @@ impl ImageDownloader {
format!("data:{};base64,{}", content_type_small, small_image_base64);
let big_image_string = match big_image_base64 {
Some(big_image_base64) => {
let content_type_big = content_type_big
.ok_or(ImageDownloadErrorKind::ParentDownload)
.map_err(|err| {
debug!("content_type_big should not be None when a big image exists");
err
})?;
let content_type_big = content_type_big.ok_or_else(|| {
debug!("content_type_big should not be None when a big image exists");
ImageDownloadError::ParentDownload
})?;
Some(format!(
"data:{};base64,{}",
content_type_big, big_image_base64
@ -182,7 +173,7 @@ impl ImageDownloader {
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
if content_type
.to_str()
.context(ImageDownloadErrorKind::ContentType)?
.map_err(|_| ImageDownloadError::ContentType)?
.contains("image")
{
return Ok(content_type.clone());
@ -190,10 +181,10 @@ impl ImageDownloader {
}
error!("{} is not an image", response.url());
return Err(ImageDownloadErrorKind::ContentType.into());
Err(ImageDownloadError::ContentType)
} else {
Err(ImageDownloadError::Http)
}
Err(ImageDownloadErrorKind::Http.into())
}
fn scale_image(
@ -203,12 +194,10 @@ impl ImageDownloader {
let mut original_image: Vec<u8> = Vec::new();
let mut resized_image: Option<Vec<u8>> = None;
let mut image = image::load_from_memory(image_buffer)
.map_err(|err| {
error!("Failed to open image to resize");
err
})
.context(ImageDownloadErrorKind::ImageScale)?;
let mut image = image::load_from_memory(image_buffer).map_err(|err| {
error!("Failed to open image to resize: {}", err);
ImageDownloadError::ImageScale
})?;
image
.write_to(
@ -216,10 +205,9 @@ impl ImageDownloader {
image::ImageOutputFormat::Png,
)
.map_err(|err| {
error!("Failed to save resized image to resize");
err
})
.context(ImageDownloadErrorKind::ImageScale)?;
error!("Failed to save resized image to resize: {}", err);
ImageDownloadError::ImageScale
})?;
let dimensions = (image.width(), image.height());
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
@ -235,10 +223,9 @@ impl ImageDownloader {
image::ImageOutputFormat::Png,
)
.map_err(|err| {
error!("Failed to save resized image to resize");
err
})
.context(ImageDownloadErrorKind::ImageScale)?;
error!("Failed to save resized image to resize: {}", err);
ImageDownloadError::ImageScale
})?;
resized_image = Some(resized_buf);
}
@ -254,24 +241,23 @@ impl ImageDownloader {
if let Some(parent) = node.get_parent() {
if parent.get_name() == "a" {
if let Some(url) = parent.get_property("href") {
let parent_url =
url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
let parent_url = url::Url::parse(&url).map_err(|err| {
error!("Failed to parse parent image url: {}", err);
ImageDownloadError::InvalidUrl(err)
})?;
let parent_response = client
.head(parent_url.clone())
.send()
.await
.context(ImageDownloadErrorKind::ParentDownload)?;
let _ = ImageDownloader::check_image_content_type(&parent_response)
.context(ImageDownloadErrorKind::ParentDownload)?;
.map_err(|_| ImageDownloadError::Http)?;
let _ = ImageDownloader::check_image_content_type(&parent_response)?;
let child_response = client
.get(child_url.clone())
.send()
.await
.context(ImageDownloadErrorKind::ParentDownload)?;
let parent_length = Self::get_content_lenght(&parent_response)
.context(ImageDownloadErrorKind::ParentDownload)?;
let child_length = Self::get_content_lenght(&child_response)
.context(ImageDownloadErrorKind::ParentDownload)?;
.map_err(|_| ImageDownloadError::Http)?;
let parent_length = Self::get_content_lenght(&parent_response)?;
let child_length = Self::get_content_lenght(&child_response)?;
if parent_length > child_length {
return Ok(parent_url);
@ -283,7 +269,7 @@ impl ImageDownloader {
}
debug!("Image parent element not relevant");
Err(ImageDownloadErrorKind::ParentDownload.into())
Err(ImageDownloadError::ParentDownload)
}
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
@ -296,7 +282,7 @@ impl ImageDownloader {
}
}
}
Err(ImageDownloadErrorKind::ContentLenght.into())
Err(ImageDownloadError::ContentLenght)
}
}

View file

@ -6,7 +6,7 @@ mod readability;
mod util;
use article::Article;
use error::{ScraperError, ScraperErrorKind};
use error::ScraperError;
use full_text_parser::FullTextParser;
use images::ImageDownloader;
use readability::Readability;

View file

@ -1,4 +1,3 @@
use failure::ResultExt;
use libxml::{tree::Node, xpath::Context};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
@ -6,10 +5,7 @@ use reqwest::{
};
use tokio::fs::DirEntry;
use crate::full_text_parser::{
config::ConfigEntry,
error::{FullTextParserError, FullTextParserErrorKind},
};
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
pub struct Util;
@ -55,22 +51,22 @@ impl Util {
if let Some(config) = site_specific_rule {
for header in &config.header {
let name = HeaderName::from_bytes(header.name.as_bytes())
.context(FullTextParserErrorKind::Config)?;
.map_err(|_| FullTextParserError::Config)?;
let value = header
.value
.parse::<HeaderValue>()
.context(FullTextParserErrorKind::Config)?;
.map_err(|_| FullTextParserError::Config)?;
headers.insert(name, value);
}
}
for header in &global_rule.header {
let name = HeaderName::from_bytes(header.name.as_bytes())
.context(FullTextParserErrorKind::Config)?;
.map_err(|_| FullTextParserError::Config)?;
let value = header
.value
.parse::<HeaderValue>()
.context(FullTextParserErrorKind::Config)?;
.map_err(|_| FullTextParserError::Config)?;
headers.insert(name, value);
}
@ -105,7 +101,7 @@ impl Util {
) -> Result<Vec<Node>, FullTextParserError> {
let res = xpath_ctx.evaluate(xpath).map_err(|()| {
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
FullTextParserErrorKind::Xml
FullTextParserError::Xml
})?;
let node_vec = res.get_nodes_as_vec();
@ -113,7 +109,7 @@ impl Util {
if node_vec.is_empty() {
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
if thorw_if_empty {
return Err(FullTextParserErrorKind::Xml.into());
return Err(FullTextParserError::Xml);
}
}
@ -135,7 +131,7 @@ impl Util {
}
log::error!("Failed to determine content type");
Err(FullTextParserErrorKind::Http.into())
Err(FullTextParserError::Http)
}
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
@ -155,7 +151,7 @@ impl Util {
return Ok(val.get_content());
}
Err(FullTextParserErrorKind::Xml.into())
Err(FullTextParserError::Xml)
}
pub fn extract_value_merge(