mirror of https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
port failure -> thiserror
This commit is contained in:
parent d906f6b7fe
commit 27be5a3204
11 changed files with 137 additions and 366 deletions
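
The commit swaps the failure crate for thiserror 1.0: each hand-written error struct wrapping a failure::Context is collapsed into a plain enum deriving thiserror::Error, and call sites trade .context(ErrorKind::Variant) for map_err or a #[from] conversion. A minimal sketch of that target pattern, using a hypothetical ExampleError type (not part of this crate) and assuming the thiserror dependency added in the Cargo.toml hunk below:

use thiserror::Error;

// Illustrative only: #[error(...)] supplies the Display impl that failure's
// #[fail(display = ...)] used to provide, and #[from] generates the From impl
// that `?` relies on for automatic conversion.
#[derive(Error, Debug)]
pub enum ExampleError {
    #[error("IO Error")]
    IO(#[from] std::io::Error),
    #[error("Unknown Error")]
    Unknown,
}

// With #[from], `?` converts std::io::Error into ExampleError::IO directly,
// where the failure-based code needed `.context(ErrorKind::IO)?`.
fn read_config(path: &std::path::Path) -> Result<String, ExampleError> {
    Ok(std::fs::read_to_string(path)?)
}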
Cargo.toml (10 changed lines)
@@ -8,16 +8,16 @@ description = "Scrap article contents from the web. Powered by fivefilters full
 repository = "https://gitlab.com/news-flash/article_scraper"
 
 [dependencies]
-failure = "0.1"
+thiserror = "1.0"
 libxml = "0.3"
 reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
-tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
-url = "2.2"
-regex = "1.4"
+tokio = { version = "1.22", features = ["macros", "fs", "io-util"] }
+url = "2.3"
+regex = "1.7"
 encoding_rs = "0.8"
 chrono = "0.4"
 base64 = "0.13"
 image = "0.24"
 log = "0.4"
 rust-embed="6.4"
-once_cell = "1.15"
+once_cell = "1.16"
src/error.rs (68 changed lines)
@@ -1,57 +1,15 @@
-use failure::{Backtrace, Context, Error, Fail};
-use std::fmt;
+use crate::{
+    full_text_parser::{config::ConfigError, error::FullTextParserError},
+    images::ImageDownloadError,
+};
+use thiserror::Error;
 
-#[derive(Debug)]
-pub struct ScraperError {
-    inner: Context<ScraperErrorKind>,
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
-pub enum ScraperErrorKind {
-    #[fail(display = "Unknown Error")]
-    Unknown,
-}
-
-impl Fail for ScraperError {
-    fn cause(&self) -> Option<&dyn Fail> {
-        self.inner.cause()
-    }
-
-    fn backtrace(&self) -> Option<&Backtrace> {
-        self.inner.backtrace()
-    }
-}
-
-impl fmt::Display for ScraperError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        fmt::Display::fmt(&self.inner, f)
-    }
-}
-
-impl ScraperError {
-    pub fn kind(&self) -> ScraperErrorKind {
-        *self.inner.get_context()
-    }
-}
-
-impl From<ScraperErrorKind> for ScraperError {
-    fn from(kind: ScraperErrorKind) -> ScraperError {
-        ScraperError {
-            inner: Context::new(kind),
-        }
-    }
-}
-
-impl From<Context<ScraperErrorKind>> for ScraperError {
-    fn from(inner: Context<ScraperErrorKind>) -> ScraperError {
-        ScraperError { inner }
-    }
-}
-
-impl From<Error> for ScraperError {
-    fn from(_: Error) -> ScraperError {
-        ScraperError {
-            inner: Context::new(ScraperErrorKind::Unknown),
-        }
-    }
+#[derive(Error, Debug)]
+pub enum ScraperError {
+    #[error("")]
+    Config(#[from] ConfigError),
+    #[error("")]
+    Image(#[from] ImageDownloadError),
+    #[error("")]
+    Scrap(#[from] FullTextParserError),
 }
@@ -1,7 +1,6 @@
 use crate::util::Util;
 
-use super::error::{ConfigError, ConfigErrorKind};
-use failure::ResultExt;
+use super::error::ConfigError;
 use std::borrow::Cow;
 use std::io::Cursor;
 use std::path::Path;
@@ -37,9 +36,7 @@ pub struct ConfigEntry {
 
 impl ConfigEntry {
     pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
-        let mut file = fs::File::open(&config_path)
-            .await
-            .context(ConfigErrorKind::IO)?;
+        let mut file = fs::File::open(&config_path).await?;
         let buffer = BufReader::new(&mut file);
 
         Self::parse(buffer).await
@@ -1,59 +1,9 @@
-use failure::{Backtrace, Context, Error, Fail};
-use std::fmt;
+use thiserror::Error;
 
-#[derive(Debug)]
-pub struct ConfigError {
-    inner: Context<ConfigErrorKind>,
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
-pub enum ConfigErrorKind {
-    #[fail(display = "IO Error")]
-    IO,
-    #[fail(display = "Unknown Error")]
+#[derive(Error, Debug)]
+pub enum ConfigError {
+    #[error("IO error")]
+    IO(#[from] std::io::Error),
+    #[error("Unknown Error")]
     Unknown,
 }
-
-impl Fail for ConfigError {
-    fn cause(&self) -> Option<&dyn Fail> {
-        self.inner.cause()
-    }
-
-    fn backtrace(&self) -> Option<&Backtrace> {
-        self.inner.backtrace()
-    }
-}
-
-impl fmt::Display for ConfigError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        fmt::Display::fmt(&self.inner, f)
-    }
-}
-
-// impl ConfigError {
-//     pub fn kind(&self) -> ConfigErrorKind {
-//         *self.inner.get_context()
-//     }
-// }
-
-impl From<ConfigErrorKind> for ConfigError {
-    fn from(kind: ConfigErrorKind) -> ConfigError {
-        ConfigError {
-            inner: Context::new(kind),
-        }
-    }
-}
-
-impl From<Context<ConfigErrorKind>> for ConfigError {
-    fn from(inner: Context<ConfigErrorKind>) -> ConfigError {
-        ConfigError { inner }
-    }
-}
-
-impl From<Error> for ConfigError {
-    fn from(_: Error) -> ConfigError {
-        ConfigError {
-            inner: Context::new(ConfigErrorKind::Unknown),
-        }
-    }
-}
@@ -6,3 +6,4 @@ mod error;
 
 pub use config_collection::ConfigCollection;
 pub use config_entry::ConfigEntry;
+pub use error::ConfigError;
@@ -1,71 +1,21 @@
-use failure::{Backtrace, Context, Error, Fail};
-use std::fmt;
+use thiserror::Error;
 
-#[derive(Debug)]
-pub struct FullTextParserError {
-    inner: Context<FullTextParserErrorKind>,
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
-pub enum FullTextParserErrorKind {
-    #[fail(display = "libXml Error")]
+#[derive(Error, Debug)]
+pub enum FullTextParserError {
+    #[error("libXml Error")]
     Xml,
-    #[fail(display = "No content found")]
+    #[error("No content found")]
     Scrape,
-    #[fail(display = "Url Error")]
-    Url,
-    #[fail(display = "Http request failed")]
+    #[error("Url Error")]
+    Url(#[from] url::ParseError),
+    #[error("Http request failed")]
     Http,
-    #[fail(display = "Config Error")]
+    #[error("Config Error")]
     Config,
-    #[fail(display = "IO Error")]
+    #[error("IO Error")]
     IO,
-    #[fail(display = "Content-type suggest no html")]
+    #[error("Content-type suggest no html")]
     ContentType,
-    #[fail(display = "Unknown Error")]
+    #[error("Unknown Error")]
     Unknown,
 }
-
-impl Fail for FullTextParserError {
-    fn cause(&self) -> Option<&dyn Fail> {
-        self.inner.cause()
-    }
-
-    fn backtrace(&self) -> Option<&Backtrace> {
-        self.inner.backtrace()
-    }
-}
-
-impl fmt::Display for FullTextParserError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        fmt::Display::fmt(&self.inner, f)
-    }
-}
-
-impl FullTextParserError {
-    pub fn kind(&self) -> FullTextParserErrorKind {
-        *self.inner.get_context()
-    }
-}
-
-impl From<FullTextParserErrorKind> for FullTextParserError {
-    fn from(kind: FullTextParserErrorKind) -> FullTextParserError {
-        FullTextParserError {
-            inner: Context::new(kind),
-        }
-    }
-}
-
-impl From<Context<FullTextParserErrorKind>> for FullTextParserError {
-    fn from(inner: Context<FullTextParserErrorKind>) -> FullTextParserError {
-        FullTextParserError { inner }
-    }
-}
-
-impl From<Error> for FullTextParserError {
-    fn from(_: Error) -> FullTextParserError {
-        FullTextParserError {
-            inner: Context::new(FullTextParserErrorKind::Unknown),
-        }
-    }
-}
@@ -6,12 +6,11 @@ mod fingerprints;
 mod tests;
 
 use self::config::{ConfigCollection, ConfigEntry};
-use self::error::{FullTextParserError, FullTextParserErrorKind};
+use self::error::FullTextParserError;
 use crate::article::Article;
 use crate::util::Util;
 use chrono::DateTime;
 use encoding_rs::Encoding;
-use failure::ResultExt;
 use fingerprints::Fingerprints;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
@@ -44,7 +43,7 @@ impl FullTextParser {
         let global_config = self
             .config_files
             .get("global.txt")
-            .ok_or(FullTextParserErrorKind::Config)?;
+            .ok_or(FullTextParserError::Config)?;
 
         let headers = Util::generate_headers(config, global_config)?;
 
@@ -55,9 +54,8 @@ impl FullTextParser {
             .await
             .map_err(|err| {
                 error!("Failed head request to: '{}' - '{}'", url.as_str(), err);
-                err
-            })
-            .context(FullTextParserErrorKind::Http)?;
+                FullTextParserError::Http
+            })?;
 
         // check if url redirects and we need to pick up the new url
         let url = if let Some(new_url) = Util::check_redirect(&response, url) {
@@ -69,7 +67,7 @@ impl FullTextParser {
 
         // check if we are dealing with text/html
         if !Util::check_content_type(&response)? {
-            return Err(FullTextParserErrorKind::ContentType.into());
+            return Err(FullTextParserError::ContentType);
         }
 
         let mut article = Article {
@@ -80,9 +78,9 @@ impl FullTextParser {
             html: None,
         };
 
-        let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
+        let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
         let mut root =
-            Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
         document.set_root_element(&root);
 
         Self::generate_head(&mut root, &document)?;
@@ -92,7 +90,7 @@ impl FullTextParser {
 
         let context = Context::new(&document).map_err(|()| {
             error!("Failed to create xpath context for extracted article");
-            FullTextParserErrorKind::Xml
+            FullTextParserError::Xml
         })?;
 
         if let Err(error) = Self::prevent_self_closing_tags(&context) {
@@ -209,14 +207,14 @@ impl FullTextParser {
         let parser = Parser::default_html();
         Ok(parser.parse_string(html.as_str()).map_err(|err| {
             error!("Parsing HTML failed for downloaded HTML {:?}", err);
-            FullTextParserErrorKind::Xml
+            FullTextParserError::Xml
         })?)
     }
 
     fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
         Ok(Context::new(doc).map_err(|()| {
             error!("Creating xpath context failed for downloaded HTML");
-            FullTextParserErrorKind::Xml
+            FullTextParserError::Xml
         })?)
     }
 
@@ -256,16 +254,15 @@ impl FullTextParser {
                     url.as_str(),
                     err
                 );
-                err
-            })
-            .context(FullTextParserErrorKind::Http)?;
+                FullTextParserError::Http
+            })?;
 
         if response.status().is_success() {
             let headers = response.headers().clone();
             let text = response
                 .text()
                 .await
-                .context(FullTextParserErrorKind::Http)?;
+                .map_err(|_| FullTextParserError::Http)?;
             {
                 if let Some(decoded_html) =
                     Self::decode_html(&text, Self::get_encoding_from_html(&text))
@@ -284,7 +281,7 @@ impl FullTextParser {
             return Ok(text);
         }
 
-        Err(FullTextParserErrorKind::Http.into())
+        Err(FullTextParserError::Http)
     }
 
     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
@@ -338,7 +335,7 @@ impl FullTextParser {
             }
             None => {
                 error!("Getting config failed due to bad Url");
-                Err(FullTextParserErrorKind::Config.into())
+                Err(FullTextParserError::Config)
             }
         }
     }
@@ -366,7 +363,7 @@ impl FullTextParser {
         for mut node in node_vec {
             if let Some(correct_url) = node.get_property(property_url) {
                 if node.set_property("src", &correct_url).is_err() {
-                    return Err(FullTextParserErrorKind::Xml.into());
+                    return Err(FullTextParserError::Xml);
                 }
             }
         }
@@ -385,7 +382,7 @@ impl FullTextParser {
                     node.unlink();
                     video_wrapper.add_child(&mut node).map_err(|_| {
                         error!("Failed to add iframe as child of video wrapper <div>");
-                        FullTextParserErrorKind::Xml
+                        FullTextParserError::Xml
                     })?;
                 }
             }
@@ -393,7 +390,7 @@ impl FullTextParser {
                 }
 
                 error!("Failed to add video wrapper <div> as parent of iframe");
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
             }
 
             error!("Failed to get parent of iframe");
@@ -413,7 +410,7 @@ impl FullTextParser {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.remove_property(attribute).is_err() {
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
             }
         }
         Ok(())
@@ -431,7 +428,7 @@ impl FullTextParser {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
            if node.set_attribute(attribute, value).is_err() {
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
            }
         }
         Ok(())
@@ -449,7 +446,7 @@ impl FullTextParser {
             }
         }
 
-        Err(FullTextParserErrorKind::Xml.into())
+        Err(FullTextParserError::Xml)
     }
 
     fn repair_urls(
@@ -464,7 +461,7 @@ impl FullTextParser {
             if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) {
                 if let Ok(fixed_url) = Self::complete_url(article_url, &val) {
                     if node.set_attribute(attribute, fixed_url.as_str()).is_err() {
-                        return Err(FullTextParserErrorKind::Scrape.into());
+                        return Err(FullTextParserError::Scrape);
                    }
                 }
             }
@@ -486,7 +483,7 @@ impl FullTextParser {
                     completed_url.push_str("//");
                     completed_url.push_str(host);
                 }
-                _ => return Err(FullTextParserErrorKind::Scrape.into()),
+                _ => return Err(FullTextParserError::Scrape),
             };
         }
 
@@ -494,7 +491,7 @@ impl FullTextParser {
             completed_url.push('/');
         }
         completed_url.push_str(incomplete_url);
-        let url = url::Url::parse(&completed_url).context(FullTextParserErrorKind::Url)?;
+        let url = url::Url::parse(&completed_url)?;
         Ok(url)
     }
 
@@ -678,7 +675,7 @@ impl FullTextParser {
 
         if !found_something {
             log::error!("no body found");
-            return Err(FullTextParserErrorKind::Scrape.into());
+            return Err(FullTextParserError::Scrape);
         }
 
         Ok(())
@@ -694,7 +691,7 @@ impl FullTextParser {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.get_property("style").is_some() && node.remove_property("style").is_err() {
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
             }
 
             node.unlink();
@@ -702,7 +699,7 @@ impl FullTextParser {
                 found_something = true;
             } else {
                 error!("Failed to add body to prepared document");
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
             }
         }
     }
@@ -748,7 +745,7 @@ impl FullTextParser {
             }
         }
 
-        Err(FullTextParserErrorKind::Xml.into())
+        Err(FullTextParserError::Xml)
     }
 
     fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> {
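
In the hunks above, failure's .context(FullTextParserErrorKind::Variant)? combinator is gone; thiserror derives no blanket conversion from arbitrary source errors, so each fallible call now maps its error onto a variant explicitly with map_err, or leans on a #[from] conversion such as url::ParseError into FullTextParserError::Url. A small self-contained sketch of that call-site style, with a hypothetical FetchError enum rather than the crate's own types, assuming the thiserror and url dependencies from Cargo.toml:

use thiserror::Error;

#[derive(Error, Debug)]
enum FetchError {
    #[error("Http request failed")]
    Http,
    #[error("Url Error")]
    Url(#[from] url::ParseError),
}

// Explicit mapping replaces failure's `.context(ErrorKind::Http)?`.
fn content_length(raw: &str) -> Result<u64, FetchError> {
    raw.trim().parse::<u64>().map_err(|_| FetchError::Http)
}

// A #[from] variant lets `?` convert the source error without a closure.
fn parse_link(raw: &str) -> Result<url::Url, FetchError> {
    Ok(url::Url::parse(raw)?)
}

fn main() {
    assert!(content_length("1024").is_ok());
    assert!(parse_link("not a url").is_err());
}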
@@ -1,89 +1,25 @@
-use crate::full_text_parser::error::FullTextParserErrorKind;
-use failure::{Backtrace, Context, Error, Fail};
-use std::fmt;
+use thiserror::Error;
 
-#[derive(Debug)]
-pub struct ImageDownloadError {
-    inner: Context<ImageDownloadErrorKind>,
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
-pub enum ImageDownloadErrorKind {
-    #[fail(display = "Parsing the supplied html string failed")]
+#[derive(Error, Debug)]
+pub enum ImageDownloadError {
+    #[error("Parsing the supplied html string failed")]
     HtmlParse,
-    #[fail(display = "Scaling down a downloaded image failed")]
+    #[error("Scaling down a downloaded image failed")]
     ImageScale,
-    #[fail(display = "Downloading the parent element of an image failed")]
+    #[error("Downloading the parent element of an image failed")]
     ParentDownload,
-    #[fail(display = "Generating image name failed")]
+    #[error("Generating image name failed")]
     ImageName,
-    #[fail(display = "Getting the content-length property failed")]
+    #[error("Getting the content-length property failed")]
     ContentLenght,
-    #[fail(display = "Content-type suggest no image")]
+    #[error("Content-type suggest no image")]
     ContentType,
-    #[fail(display = "Http error")]
+    #[error("Http error")]
     Http,
-    #[fail(display = "IO error")]
+    #[error("IO error")]
     IO,
-    #[fail(display = "Invalid URL")]
-    InvalidUrl,
-    #[fail(display = "Unknown Error")]
+    #[error("Invalid URL")]
+    InvalidUrl(#[from] url::ParseError),
+    #[error("Unknown Error")]
     Unknown,
 }
-
-impl Fail for ImageDownloadError {
-    fn cause(&self) -> Option<&dyn Fail> {
-        self.inner.cause()
-    }
-
-    fn backtrace(&self) -> Option<&Backtrace> {
-        self.inner.backtrace()
-    }
-}
-
-impl fmt::Display for ImageDownloadError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        fmt::Display::fmt(&self.inner, f)
-    }
-}
-
-impl ImageDownloadError {
-    pub fn kind(&self) -> ImageDownloadErrorKind {
-        *self.inner.get_context()
-    }
-}
-
-impl From<ImageDownloadErrorKind> for ImageDownloadError {
-    fn from(kind: ImageDownloadErrorKind) -> ImageDownloadError {
-        ImageDownloadError {
-            inner: Context::new(kind),
-        }
-    }
-}
-
-impl From<Context<ImageDownloadErrorKind>> for ImageDownloadError {
-    fn from(inner: Context<ImageDownloadErrorKind>) -> ImageDownloadError {
-        ImageDownloadError { inner }
-    }
-}
-
-impl From<FullTextParserErrorKind> for ImageDownloadError {
-    fn from(kind: FullTextParserErrorKind) -> ImageDownloadError {
-        let kind = match kind {
-            FullTextParserErrorKind::Xml => ImageDownloadErrorKind::HtmlParse,
-            _ => ImageDownloadErrorKind::Unknown,
-        };
-
-        ImageDownloadError {
-            inner: Context::new(kind),
-        }
-    }
-}
-
-impl From<Error> for ImageDownloadError {
-    fn from(_: Error) -> ImageDownloadError {
-        ImageDownloadError {
-            inner: Context::new(ImageDownloadErrorKind::Unknown),
-        }
-    }
-}
@@ -1,6 +1,5 @@
-use self::error::{ImageDownloadError, ImageDownloadErrorKind};
+pub use self::error::ImageDownloadError;
 use crate::util::Util;
-use failure::ResultExt;
 use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
@@ -25,14 +24,13 @@ impl ImageDownloader {
         client: &Client,
     ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
-        let doc = parser.parse_string(html).map_err(|_| {
-            error!("Failed to parse HTML string");
-            ImageDownloadErrorKind::HtmlParse
-        })?;
+        let doc = parser
+            .parse_string(html)
+            .map_err(|_| ImageDownloadError::HtmlParse)?;
 
         let xpath_ctx = Context::new(&doc).map_err(|()| {
             error!("Failed to create xpath context for document");
-            ImageDownloadErrorKind::HtmlParse
+            ImageDownloadError::HtmlParse
         })?;
 
         self.download_images_from_context(&xpath_ctx, client)
@@ -58,7 +56,7 @@ impl ImageDownloader {
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = Util::evaluate_xpath(context, xpath, false)
-            .context(ImageDownloadErrorKind::HtmlParse)?;
+            .map_err(|_| ImageDownloadError::HtmlParse)?;
         for mut node in node_vec {
             if let Some(url) = node.get_property("src") {
                 if !url.starts_with("data:") {
@@ -72,11 +70,11 @@ impl ImageDownloader {
                         self.save_image(&url, &parent_url, client).await
                     {
                         if node.set_property("src", &small_image).is_err() {
-                            return Err(ImageDownloadErrorKind::HtmlParse.into());
+                            return Err(ImageDownloadError::HtmlParse);
                         }
                         if let Some(big_image) = big_image {
                             if node.set_property("big-src", &big_image).is_err() {
-                                return Err(ImageDownloadErrorKind::HtmlParse.into());
+                                return Err(ImageDownloadError::HtmlParse);
                             }
                         }
                     }
@@ -94,26 +92,21 @@ impl ImageDownloader {
         parent_url: &Option<url::Url>,
         client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = client
-            .get(image_url.clone())
-            .send()
-            .await
-            .map_err(|err| {
-                error!("GET {} failed - {}", image_url.as_str(), err);
-                err
-            })
-            .context(ImageDownloadErrorKind::Http)?;
+        let response = client.get(image_url.clone()).send().await.map_err(|err| {
+            error!("GET {} failed - {}", image_url.as_str(), err);
+            ImageDownloadError::Http
+        })?;
 
         let content_type_small = ImageDownloader::check_image_content_type(&response)?;
         let content_type_small = content_type_small
             .to_str()
-            .context(ImageDownloadErrorKind::ContentType)?;
+            .map_err(|_| ImageDownloadError::ContentType)?;
         let mut content_type_big: Option<String> = None;
 
         let mut small_image = response
             .bytes()
             .await
-            .context(ImageDownloadErrorKind::IO)?
+            .map_err(|_| ImageDownloadError::Http)?
             .as_ref()
             .to_vec();
 
@@ -124,18 +117,18 @@ impl ImageDownloader {
                 .get(parent_url.clone())
                 .send()
                 .await
-                .context(ImageDownloadErrorKind::Http)?;
+                .map_err(|_| ImageDownloadError::Http)?;
             content_type_big = Some(
                 ImageDownloader::check_image_content_type(&response_big)?
                     .to_str()
-                    .context(ImageDownloadErrorKind::ContentType)?
+                    .map_err(|_| ImageDownloadError::ContentType)?
                     .to_owned(),
             );
             big_image = Some(
                 response_big
                     .bytes()
                     .await
-                    .context(ImageDownloadErrorKind::IO)?
+                    .map_err(|_| ImageDownloadError::Http)?
                     .to_vec(),
             );
         }
@@ -159,12 +152,10 @@ impl ImageDownloader {
             format!("data:{};base64,{}", content_type_small, small_image_base64);
         let big_image_string = match big_image_base64 {
             Some(big_image_base64) => {
-                let content_type_big = content_type_big
-                    .ok_or(ImageDownloadErrorKind::ParentDownload)
-                    .map_err(|err| {
-                        debug!("content_type_big should not be None when a big image exists");
-                        err
-                    })?;
+                let content_type_big = content_type_big.ok_or_else(|| {
+                    debug!("content_type_big should not be None when a big image exists");
+                    ImageDownloadError::ParentDownload
+                })?;
                 Some(format!(
                     "data:{};base64,{}",
                     content_type_big, big_image_base64
@@ -182,7 +173,7 @@ impl ImageDownloader {
         if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
             if content_type
                 .to_str()
-                .context(ImageDownloadErrorKind::ContentType)?
+                .map_err(|_| ImageDownloadError::ContentType)?
                 .contains("image")
             {
                 return Ok(content_type.clone());
@@ -190,10 +181,10 @@ impl ImageDownloader {
             }
 
             error!("{} is not an image", response.url());
-            return Err(ImageDownloadErrorKind::ContentType.into());
+            Err(ImageDownloadError::ContentType)
+        } else {
+            Err(ImageDownloadError::Http)
         }
-
-        Err(ImageDownloadErrorKind::Http.into())
     }
 
     fn scale_image(
@@ -203,12 +194,10 @@ impl ImageDownloader {
         let mut original_image: Vec<u8> = Vec::new();
        let mut resized_image: Option<Vec<u8>> = None;
 
-        let mut image = image::load_from_memory(image_buffer)
-            .map_err(|err| {
-                error!("Failed to open image to resize");
-                err
-            })
-            .context(ImageDownloadErrorKind::ImageScale)?;
+        let mut image = image::load_from_memory(image_buffer).map_err(|err| {
+            error!("Failed to open image to resize: {}", err);
+            ImageDownloadError::ImageScale
+        })?;
 
         image
             .write_to(
@@ -216,10 +205,9 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize");
-                err
-            })
-            .context(ImageDownloadErrorKind::ImageScale)?;
+                error!("Failed to save resized image to resize: {}", err);
+                ImageDownloadError::ImageScale
+            })?;
 
         let dimensions = (image.width(), image.height());
         if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
@@ -235,10 +223,9 @@ impl ImageDownloader {
                     image::ImageOutputFormat::Png,
                 )
                 .map_err(|err| {
-                    error!("Failed to save resized image to resize");
-                    err
-                })
-                .context(ImageDownloadErrorKind::ImageScale)?;
+                    error!("Failed to save resized image to resize: {}", err);
+                    ImageDownloadError::ImageScale
+                })?;
             resized_image = Some(resized_buf);
         }
 
@@ -254,24 +241,23 @@ impl ImageDownloader {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
-                    let parent_url =
-                        url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
+                    let parent_url = url::Url::parse(&url).map_err(|err| {
+                        error!("Failed to parse parent image url: {}", err);
+                        ImageDownloadError::InvalidUrl(err)
+                    })?;
                     let parent_response = client
                         .head(parent_url.clone())
                         .send()
                         .await
-                        .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let _ = ImageDownloader::check_image_content_type(&parent_response)
-                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                        .map_err(|_| ImageDownloadError::Http)?;
+                    let _ = ImageDownloader::check_image_content_type(&parent_response)?;
                     let child_response = client
                         .get(child_url.clone())
                         .send()
                         .await
-                        .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_length = Self::get_content_lenght(&parent_response)
-                        .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_length = Self::get_content_lenght(&child_response)
-                        .context(ImageDownloadErrorKind::ParentDownload)?;
+                        .map_err(|_| ImageDownloadError::Http)?;
+                    let parent_length = Self::get_content_lenght(&parent_response)?;
+                    let child_length = Self::get_content_lenght(&child_response)?;
 
                     if parent_length > child_length {
                         return Ok(parent_url);
@@ -283,7 +269,7 @@ impl ImageDownloader {
         }
 
         debug!("Image parent element not relevant");
-        Err(ImageDownloadErrorKind::ParentDownload.into())
+        Err(ImageDownloadError::ParentDownload)
     }
 
     fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
@@ -296,7 +282,7 @@ impl ImageDownloader {
                }
            }
        }
-        Err(ImageDownloadErrorKind::ContentLenght.into())
+        Err(ImageDownloadError::ContentLenght)
     }
 }
 
@@ -6,7 +6,7 @@ mod readability;
 mod util;
 
 use article::Article;
-use error::{ScraperError, ScraperErrorKind};
+use error::ScraperError;
 use full_text_parser::FullTextParser;
 use images::ImageDownloader;
 use readability::Readability;
src/util.rs (22 changed lines)
@@ -1,4 +1,3 @@
-use failure::ResultExt;
 use libxml::{tree::Node, xpath::Context};
 use reqwest::{
     header::{HeaderMap, HeaderName, HeaderValue},
@@ -6,10 +5,7 @@ use reqwest::{
 };
 use tokio::fs::DirEntry;
 
-use crate::full_text_parser::{
-    config::ConfigEntry,
-    error::{FullTextParserError, FullTextParserErrorKind},
-};
+use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
 
 pub struct Util;
 
@@ -55,22 +51,22 @@ impl Util {
         if let Some(config) = site_specific_rule {
             for header in &config.header {
                 let name = HeaderName::from_bytes(header.name.as_bytes())
-                    .context(FullTextParserErrorKind::Config)?;
+                    .map_err(|_| FullTextParserError::Config)?;
                 let value = header
                     .value
                     .parse::<HeaderValue>()
-                    .context(FullTextParserErrorKind::Config)?;
+                    .map_err(|_| FullTextParserError::Config)?;
                 headers.insert(name, value);
             }
         }
 
         for header in &global_rule.header {
             let name = HeaderName::from_bytes(header.name.as_bytes())
-                .context(FullTextParserErrorKind::Config)?;
+                .map_err(|_| FullTextParserError::Config)?;
             let value = header
                 .value
                 .parse::<HeaderValue>()
-                .context(FullTextParserErrorKind::Config)?;
+                .map_err(|_| FullTextParserError::Config)?;
             headers.insert(name, value);
         }
 
@@ -105,7 +101,7 @@ impl Util {
     ) -> Result<Vec<Node>, FullTextParserError> {
         let res = xpath_ctx.evaluate(xpath).map_err(|()| {
             log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
-            FullTextParserErrorKind::Xml
+            FullTextParserError::Xml
         })?;
 
         let node_vec = res.get_nodes_as_vec();
@@ -113,7 +109,7 @@ impl Util {
         if node_vec.is_empty() {
             log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
             if thorw_if_empty {
-                return Err(FullTextParserErrorKind::Xml.into());
+                return Err(FullTextParserError::Xml);
             }
         }
 
@@ -135,7 +131,7 @@ impl Util {
         }
 
         log::error!("Failed to determine content type");
-        Err(FullTextParserErrorKind::Http.into())
+        Err(FullTextParserError::Http)
     }
 
     pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
@@ -155,7 +151,7 @@ impl Util {
             return Ok(val.get_content());
         }
 
-        Err(FullTextParserErrorKind::Xml.into())
+        Err(FullTextParserError::Xml)
     }
 
     pub fn extract_value_merge(