mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
port failure -> thiserror
This commit is contained in:
parent
d906f6b7fe
commit
27be5a3204
11 changed files with 137 additions and 366 deletions
10
Cargo.toml
10
Cargo.toml
|
@ -8,16 +8,16 @@ description = "Scrap article contents from the web. Powered by fivefilters full
|
||||||
repository = "https://gitlab.com/news-flash/article_scraper"
|
repository = "https://gitlab.com/news-flash/article_scraper"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
failure = "0.1"
|
thiserror = "1.0"
|
||||||
libxml = "0.3"
|
libxml = "0.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
||||||
tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
|
tokio = { version = "1.22", features = ["macros", "fs", "io-util"] }
|
||||||
url = "2.2"
|
url = "2.3"
|
||||||
regex = "1.4"
|
regex = "1.7"
|
||||||
encoding_rs = "0.8"
|
encoding_rs = "0.8"
|
||||||
chrono = "0.4"
|
chrono = "0.4"
|
||||||
base64 = "0.13"
|
base64 = "0.13"
|
||||||
image = "0.24"
|
image = "0.24"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
rust-embed="6.4"
|
rust-embed="6.4"
|
||||||
once_cell = "1.15"
|
once_cell = "1.16"
|
68
src/error.rs
68
src/error.rs
|
@ -1,57 +1,15 @@
|
||||||
use failure::{Backtrace, Context, Error, Fail};
|
use crate::{
|
||||||
use std::fmt;
|
full_text_parser::{config::ConfigError, error::FullTextParserError},
|
||||||
|
images::ImageDownloadError,
|
||||||
|
};
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub struct ScraperError {
|
pub enum ScraperError {
|
||||||
inner: Context<ScraperErrorKind>,
|
#[error("")]
|
||||||
}
|
Config(#[from] ConfigError),
|
||||||
|
#[error("")]
|
||||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
|
Image(#[from] ImageDownloadError),
|
||||||
pub enum ScraperErrorKind {
|
#[error("")]
|
||||||
#[fail(display = "Unknown Error")]
|
Scrap(#[from] FullTextParserError),
|
||||||
Unknown,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Fail for ScraperError {
|
|
||||||
fn cause(&self) -> Option<&dyn Fail> {
|
|
||||||
self.inner.cause()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn backtrace(&self) -> Option<&Backtrace> {
|
|
||||||
self.inner.backtrace()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for ScraperError {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
fmt::Display::fmt(&self.inner, f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ScraperError {
|
|
||||||
pub fn kind(&self) -> ScraperErrorKind {
|
|
||||||
*self.inner.get_context()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<ScraperErrorKind> for ScraperError {
|
|
||||||
fn from(kind: ScraperErrorKind) -> ScraperError {
|
|
||||||
ScraperError {
|
|
||||||
inner: Context::new(kind),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Context<ScraperErrorKind>> for ScraperError {
|
|
||||||
fn from(inner: Context<ScraperErrorKind>) -> ScraperError {
|
|
||||||
ScraperError { inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Error> for ScraperError {
|
|
||||||
fn from(_: Error) -> ScraperError {
|
|
||||||
ScraperError {
|
|
||||||
inner: Context::new(ScraperErrorKind::Unknown),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
|
|
||||||
use super::error::{ConfigError, ConfigErrorKind};
|
use super::error::ConfigError;
|
||||||
use failure::ResultExt;
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
@ -37,9 +36,7 @@ pub struct ConfigEntry {
|
||||||
|
|
||||||
impl ConfigEntry {
|
impl ConfigEntry {
|
||||||
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
|
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
|
||||||
let mut file = fs::File::open(&config_path)
|
let mut file = fs::File::open(&config_path).await?;
|
||||||
.await
|
|
||||||
.context(ConfigErrorKind::IO)?;
|
|
||||||
let buffer = BufReader::new(&mut file);
|
let buffer = BufReader::new(&mut file);
|
||||||
|
|
||||||
Self::parse(buffer).await
|
Self::parse(buffer).await
|
||||||
|
|
|
@ -1,59 +1,9 @@
|
||||||
use failure::{Backtrace, Context, Error, Fail};
|
use thiserror::Error;
|
||||||
use std::fmt;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub struct ConfigError {
|
pub enum ConfigError {
|
||||||
inner: Context<ConfigErrorKind>,
|
#[error("IO error")]
|
||||||
}
|
IO(#[from] std::io::Error),
|
||||||
|
#[error("Unknown Error")]
|
||||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
|
|
||||||
pub enum ConfigErrorKind {
|
|
||||||
#[fail(display = "IO Error")]
|
|
||||||
IO,
|
|
||||||
#[fail(display = "Unknown Error")]
|
|
||||||
Unknown,
|
Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Fail for ConfigError {
|
|
||||||
fn cause(&self) -> Option<&dyn Fail> {
|
|
||||||
self.inner.cause()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn backtrace(&self) -> Option<&Backtrace> {
|
|
||||||
self.inner.backtrace()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for ConfigError {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
fmt::Display::fmt(&self.inner, f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// impl ConfigError {
|
|
||||||
// pub fn kind(&self) -> ConfigErrorKind {
|
|
||||||
// *self.inner.get_context()
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
impl From<ConfigErrorKind> for ConfigError {
|
|
||||||
fn from(kind: ConfigErrorKind) -> ConfigError {
|
|
||||||
ConfigError {
|
|
||||||
inner: Context::new(kind),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Context<ConfigErrorKind>> for ConfigError {
|
|
||||||
fn from(inner: Context<ConfigErrorKind>) -> ConfigError {
|
|
||||||
ConfigError { inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Error> for ConfigError {
|
|
||||||
fn from(_: Error) -> ConfigError {
|
|
||||||
ConfigError {
|
|
||||||
inner: Context::new(ConfigErrorKind::Unknown),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -6,3 +6,4 @@ mod error;
|
||||||
|
|
||||||
pub use config_collection::ConfigCollection;
|
pub use config_collection::ConfigCollection;
|
||||||
pub use config_entry::ConfigEntry;
|
pub use config_entry::ConfigEntry;
|
||||||
|
pub use error::ConfigError;
|
||||||
|
|
|
@ -1,71 +1,21 @@
|
||||||
use failure::{Backtrace, Context, Error, Fail};
|
use thiserror::Error;
|
||||||
use std::fmt;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub struct FullTextParserError {
|
pub enum FullTextParserError {
|
||||||
inner: Context<FullTextParserErrorKind>,
|
#[error("libXml Error")]
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
|
|
||||||
pub enum FullTextParserErrorKind {
|
|
||||||
#[fail(display = "libXml Error")]
|
|
||||||
Xml,
|
Xml,
|
||||||
#[fail(display = "No content found")]
|
#[error("No content found")]
|
||||||
Scrape,
|
Scrape,
|
||||||
#[fail(display = "Url Error")]
|
#[error("Url Error")]
|
||||||
Url,
|
Url(#[from] url::ParseError),
|
||||||
#[fail(display = "Http request failed")]
|
#[error("Http request failed")]
|
||||||
Http,
|
Http,
|
||||||
#[fail(display = "Config Error")]
|
#[error("Config Error")]
|
||||||
Config,
|
Config,
|
||||||
#[fail(display = "IO Error")]
|
#[error("IO Error")]
|
||||||
IO,
|
IO,
|
||||||
#[fail(display = "Content-type suggest no html")]
|
#[error("Content-type suggest no html")]
|
||||||
ContentType,
|
ContentType,
|
||||||
#[fail(display = "Unknown Error")]
|
#[error("Unknown Error")]
|
||||||
Unknown,
|
Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Fail for FullTextParserError {
|
|
||||||
fn cause(&self) -> Option<&dyn Fail> {
|
|
||||||
self.inner.cause()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn backtrace(&self) -> Option<&Backtrace> {
|
|
||||||
self.inner.backtrace()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for FullTextParserError {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
fmt::Display::fmt(&self.inner, f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FullTextParserError {
|
|
||||||
pub fn kind(&self) -> FullTextParserErrorKind {
|
|
||||||
*self.inner.get_context()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<FullTextParserErrorKind> for FullTextParserError {
|
|
||||||
fn from(kind: FullTextParserErrorKind) -> FullTextParserError {
|
|
||||||
FullTextParserError {
|
|
||||||
inner: Context::new(kind),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Context<FullTextParserErrorKind>> for FullTextParserError {
|
|
||||||
fn from(inner: Context<FullTextParserErrorKind>) -> FullTextParserError {
|
|
||||||
FullTextParserError { inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Error> for FullTextParserError {
|
|
||||||
fn from(_: Error) -> FullTextParserError {
|
|
||||||
FullTextParserError {
|
|
||||||
inner: Context::new(FullTextParserErrorKind::Unknown),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -6,12 +6,11 @@ mod fingerprints;
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
use self::config::{ConfigCollection, ConfigEntry};
|
use self::config::{ConfigCollection, ConfigEntry};
|
||||||
use self::error::{FullTextParserError, FullTextParserErrorKind};
|
use self::error::FullTextParserError;
|
||||||
use crate::article::Article;
|
use crate::article::Article;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use chrono::DateTime;
|
use chrono::DateTime;
|
||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
use failure::ResultExt;
|
|
||||||
use fingerprints::Fingerprints;
|
use fingerprints::Fingerprints;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
|
@ -44,7 +43,7 @@ impl FullTextParser {
|
||||||
let global_config = self
|
let global_config = self
|
||||||
.config_files
|
.config_files
|
||||||
.get("global.txt")
|
.get("global.txt")
|
||||||
.ok_or(FullTextParserErrorKind::Config)?;
|
.ok_or(FullTextParserError::Config)?;
|
||||||
|
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
|
|
||||||
|
@ -55,9 +54,8 @@ impl FullTextParser {
|
||||||
.await
|
.await
|
||||||
.map_err(|err| {
|
.map_err(|err| {
|
||||||
error!("Failed head request to: '{}' - '{}'", url.as_str(), err);
|
error!("Failed head request to: '{}' - '{}'", url.as_str(), err);
|
||||||
err
|
FullTextParserError::Http
|
||||||
})
|
})?;
|
||||||
.context(FullTextParserErrorKind::Http)?;
|
|
||||||
|
|
||||||
// check if url redirects and we need to pick up the new url
|
// check if url redirects and we need to pick up the new url
|
||||||
let url = if let Some(new_url) = Util::check_redirect(&response, url) {
|
let url = if let Some(new_url) = Util::check_redirect(&response, url) {
|
||||||
|
@ -69,7 +67,7 @@ impl FullTextParser {
|
||||||
|
|
||||||
// check if we are dealing with text/html
|
// check if we are dealing with text/html
|
||||||
if !Util::check_content_type(&response)? {
|
if !Util::check_content_type(&response)? {
|
||||||
return Err(FullTextParserErrorKind::ContentType.into());
|
return Err(FullTextParserError::ContentType);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut article = Article {
|
let mut article = Article {
|
||||||
|
@ -80,9 +78,9 @@ impl FullTextParser {
|
||||||
html: None,
|
html: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
|
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
let mut root =
|
let mut root =
|
||||||
Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
|
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||||
document.set_root_element(&root);
|
document.set_root_element(&root);
|
||||||
|
|
||||||
Self::generate_head(&mut root, &document)?;
|
Self::generate_head(&mut root, &document)?;
|
||||||
|
@ -92,7 +90,7 @@ impl FullTextParser {
|
||||||
|
|
||||||
let context = Context::new(&document).map_err(|()| {
|
let context = Context::new(&document).map_err(|()| {
|
||||||
error!("Failed to create xpath context for extracted article");
|
error!("Failed to create xpath context for extracted article");
|
||||||
FullTextParserErrorKind::Xml
|
FullTextParserError::Xml
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if let Err(error) = Self::prevent_self_closing_tags(&context) {
|
if let Err(error) = Self::prevent_self_closing_tags(&context) {
|
||||||
|
@ -209,14 +207,14 @@ impl FullTextParser {
|
||||||
let parser = Parser::default_html();
|
let parser = Parser::default_html();
|
||||||
Ok(parser.parse_string(html.as_str()).map_err(|err| {
|
Ok(parser.parse_string(html.as_str()).map_err(|err| {
|
||||||
error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||||
FullTextParserErrorKind::Xml
|
FullTextParserError::Xml
|
||||||
})?)
|
})?)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
|
fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
|
||||||
Ok(Context::new(doc).map_err(|()| {
|
Ok(Context::new(doc).map_err(|()| {
|
||||||
error!("Creating xpath context failed for downloaded HTML");
|
error!("Creating xpath context failed for downloaded HTML");
|
||||||
FullTextParserErrorKind::Xml
|
FullTextParserError::Xml
|
||||||
})?)
|
})?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -256,16 +254,15 @@ impl FullTextParser {
|
||||||
url.as_str(),
|
url.as_str(),
|
||||||
err
|
err
|
||||||
);
|
);
|
||||||
err
|
FullTextParserError::Http
|
||||||
})
|
})?;
|
||||||
.context(FullTextParserErrorKind::Http)?;
|
|
||||||
|
|
||||||
if response.status().is_success() {
|
if response.status().is_success() {
|
||||||
let headers = response.headers().clone();
|
let headers = response.headers().clone();
|
||||||
let text = response
|
let text = response
|
||||||
.text()
|
.text()
|
||||||
.await
|
.await
|
||||||
.context(FullTextParserErrorKind::Http)?;
|
.map_err(|_| FullTextParserError::Http)?;
|
||||||
{
|
{
|
||||||
if let Some(decoded_html) =
|
if let Some(decoded_html) =
|
||||||
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
||||||
|
@ -284,7 +281,7 @@ impl FullTextParser {
|
||||||
return Ok(text);
|
return Ok(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(FullTextParserErrorKind::Http.into())
|
Err(FullTextParserError::Http)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||||
|
@ -338,7 +335,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
error!("Getting config failed due to bad Url");
|
error!("Getting config failed due to bad Url");
|
||||||
Err(FullTextParserErrorKind::Config.into())
|
Err(FullTextParserError::Config)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -366,7 +363,7 @@ impl FullTextParser {
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if let Some(correct_url) = node.get_property(property_url) {
|
if let Some(correct_url) = node.get_property(property_url) {
|
||||||
if node.set_property("src", &correct_url).is_err() {
|
if node.set_property("src", &correct_url).is_err() {
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -385,7 +382,7 @@ impl FullTextParser {
|
||||||
node.unlink();
|
node.unlink();
|
||||||
video_wrapper.add_child(&mut node).map_err(|_| {
|
video_wrapper.add_child(&mut node).map_err(|_| {
|
||||||
error!("Failed to add iframe as child of video wrapper <div>");
|
error!("Failed to add iframe as child of video wrapper <div>");
|
||||||
FullTextParserErrorKind::Xml
|
FullTextParserError::Xml
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -393,7 +390,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
error!("Failed to add video wrapper <div> as parent of iframe");
|
error!("Failed to add video wrapper <div> as parent of iframe");
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
|
|
||||||
error!("Failed to get parent of iframe");
|
error!("Failed to get parent of iframe");
|
||||||
|
@ -413,7 +410,7 @@ impl FullTextParser {
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if node.remove_property(attribute).is_err() {
|
if node.remove_property(attribute).is_err() {
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -431,7 +428,7 @@ impl FullTextParser {
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if node.set_attribute(attribute, value).is_err() {
|
if node.set_attribute(attribute, value).is_err() {
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -449,7 +446,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(FullTextParserErrorKind::Xml.into())
|
Err(FullTextParserError::Xml)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn repair_urls(
|
fn repair_urls(
|
||||||
|
@ -464,7 +461,7 @@ impl FullTextParser {
|
||||||
if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) {
|
if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) {
|
||||||
if let Ok(fixed_url) = Self::complete_url(article_url, &val) {
|
if let Ok(fixed_url) = Self::complete_url(article_url, &val) {
|
||||||
if node.set_attribute(attribute, fixed_url.as_str()).is_err() {
|
if node.set_attribute(attribute, fixed_url.as_str()).is_err() {
|
||||||
return Err(FullTextParserErrorKind::Scrape.into());
|
return Err(FullTextParserError::Scrape);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -486,7 +483,7 @@ impl FullTextParser {
|
||||||
completed_url.push_str("//");
|
completed_url.push_str("//");
|
||||||
completed_url.push_str(host);
|
completed_url.push_str(host);
|
||||||
}
|
}
|
||||||
_ => return Err(FullTextParserErrorKind::Scrape.into()),
|
_ => return Err(FullTextParserError::Scrape),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -494,7 +491,7 @@ impl FullTextParser {
|
||||||
completed_url.push('/');
|
completed_url.push('/');
|
||||||
}
|
}
|
||||||
completed_url.push_str(incomplete_url);
|
completed_url.push_str(incomplete_url);
|
||||||
let url = url::Url::parse(&completed_url).context(FullTextParserErrorKind::Url)?;
|
let url = url::Url::parse(&completed_url)?;
|
||||||
Ok(url)
|
Ok(url)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -678,7 +675,7 @@ impl FullTextParser {
|
||||||
|
|
||||||
if !found_something {
|
if !found_something {
|
||||||
log::error!("no body found");
|
log::error!("no body found");
|
||||||
return Err(FullTextParserErrorKind::Scrape.into());
|
return Err(FullTextParserError::Scrape);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -694,7 +691,7 @@ impl FullTextParser {
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if node.get_property("style").is_some() && node.remove_property("style").is_err() {
|
if node.get_property("style").is_some() && node.remove_property("style").is_err() {
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
|
|
||||||
node.unlink();
|
node.unlink();
|
||||||
|
@ -702,7 +699,7 @@ impl FullTextParser {
|
||||||
found_something = true;
|
found_something = true;
|
||||||
} else {
|
} else {
|
||||||
error!("Failed to add body to prepared document");
|
error!("Failed to add body to prepared document");
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -748,7 +745,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(FullTextParserErrorKind::Xml.into())
|
Err(FullTextParserError::Xml)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> {
|
fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> {
|
||||||
|
|
|
@ -1,89 +1,25 @@
|
||||||
use crate::full_text_parser::error::FullTextParserErrorKind;
|
use thiserror::Error;
|
||||||
use failure::{Backtrace, Context, Error, Fail};
|
|
||||||
use std::fmt;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub struct ImageDownloadError {
|
pub enum ImageDownloadError {
|
||||||
inner: Context<ImageDownloadErrorKind>,
|
#[error("Parsing the supplied html string failed")]
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
|
|
||||||
pub enum ImageDownloadErrorKind {
|
|
||||||
#[fail(display = "Parsing the supplied html string failed")]
|
|
||||||
HtmlParse,
|
HtmlParse,
|
||||||
#[fail(display = "Scaling down a downloaded image failed")]
|
#[error("Scaling down a downloaded image failed")]
|
||||||
ImageScale,
|
ImageScale,
|
||||||
#[fail(display = "Downloading the parent element of an image failed")]
|
#[error("Downloading the parent element of an image failed")]
|
||||||
ParentDownload,
|
ParentDownload,
|
||||||
#[fail(display = "Generating image name failed")]
|
#[error("Generating image name failed")]
|
||||||
ImageName,
|
ImageName,
|
||||||
#[fail(display = "Getting the content-length property failed")]
|
#[error("Getting the content-length property failed")]
|
||||||
ContentLenght,
|
ContentLenght,
|
||||||
#[fail(display = "Content-type suggest no image")]
|
#[error("Content-type suggest no image")]
|
||||||
ContentType,
|
ContentType,
|
||||||
#[fail(display = "Http error")]
|
#[error("Http error")]
|
||||||
Http,
|
Http,
|
||||||
#[fail(display = "IO error")]
|
#[error("IO error")]
|
||||||
IO,
|
IO,
|
||||||
#[fail(display = "Invalid URL")]
|
#[error("Invalid URL")]
|
||||||
InvalidUrl,
|
InvalidUrl(#[from] url::ParseError),
|
||||||
#[fail(display = "Unknown Error")]
|
#[error("Unknown Error")]
|
||||||
Unknown,
|
Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Fail for ImageDownloadError {
|
|
||||||
fn cause(&self) -> Option<&dyn Fail> {
|
|
||||||
self.inner.cause()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn backtrace(&self) -> Option<&Backtrace> {
|
|
||||||
self.inner.backtrace()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for ImageDownloadError {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
fmt::Display::fmt(&self.inner, f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImageDownloadError {
|
|
||||||
pub fn kind(&self) -> ImageDownloadErrorKind {
|
|
||||||
*self.inner.get_context()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<ImageDownloadErrorKind> for ImageDownloadError {
|
|
||||||
fn from(kind: ImageDownloadErrorKind) -> ImageDownloadError {
|
|
||||||
ImageDownloadError {
|
|
||||||
inner: Context::new(kind),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Context<ImageDownloadErrorKind>> for ImageDownloadError {
|
|
||||||
fn from(inner: Context<ImageDownloadErrorKind>) -> ImageDownloadError {
|
|
||||||
ImageDownloadError { inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<FullTextParserErrorKind> for ImageDownloadError {
|
|
||||||
fn from(kind: FullTextParserErrorKind) -> ImageDownloadError {
|
|
||||||
let kind = match kind {
|
|
||||||
FullTextParserErrorKind::Xml => ImageDownloadErrorKind::HtmlParse,
|
|
||||||
_ => ImageDownloadErrorKind::Unknown,
|
|
||||||
};
|
|
||||||
|
|
||||||
ImageDownloadError {
|
|
||||||
inner: Context::new(kind),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Error> for ImageDownloadError {
|
|
||||||
fn from(_: Error) -> ImageDownloadError {
|
|
||||||
ImageDownloadError {
|
|
||||||
inner: Context::new(ImageDownloadErrorKind::Unknown),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
use self::error::{ImageDownloadError, ImageDownloadErrorKind};
|
pub use self::error::ImageDownloadError;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use failure::ResultExt;
|
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Node, SaveOptions};
|
use libxml::tree::{Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
|
@ -25,14 +24,13 @@ impl ImageDownloader {
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<String, ImageDownloadError> {
|
) -> Result<String, ImageDownloadError> {
|
||||||
let parser = Parser::default_html();
|
let parser = Parser::default_html();
|
||||||
let doc = parser.parse_string(html).map_err(|_| {
|
let doc = parser
|
||||||
error!("Failed to parse HTML string");
|
.parse_string(html)
|
||||||
ImageDownloadErrorKind::HtmlParse
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
})?;
|
|
||||||
|
|
||||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||||
error!("Failed to create xpath context for document");
|
error!("Failed to create xpath context for document");
|
||||||
ImageDownloadErrorKind::HtmlParse
|
ImageDownloadError::HtmlParse
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
self.download_images_from_context(&xpath_ctx, client)
|
self.download_images_from_context(&xpath_ctx, client)
|
||||||
|
@ -58,7 +56,7 @@ impl ImageDownloader {
|
||||||
) -> Result<(), ImageDownloadError> {
|
) -> Result<(), ImageDownloadError> {
|
||||||
let xpath = "//img";
|
let xpath = "//img";
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)
|
let node_vec = Util::evaluate_xpath(context, xpath, false)
|
||||||
.context(ImageDownloadErrorKind::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if let Some(url) = node.get_property("src") {
|
if let Some(url) = node.get_property("src") {
|
||||||
if !url.starts_with("data:") {
|
if !url.starts_with("data:") {
|
||||||
|
@ -72,11 +70,11 @@ impl ImageDownloader {
|
||||||
self.save_image(&url, &parent_url, client).await
|
self.save_image(&url, &parent_url, client).await
|
||||||
{
|
{
|
||||||
if node.set_property("src", &small_image).is_err() {
|
if node.set_property("src", &small_image).is_err() {
|
||||||
return Err(ImageDownloadErrorKind::HtmlParse.into());
|
return Err(ImageDownloadError::HtmlParse);
|
||||||
}
|
}
|
||||||
if let Some(big_image) = big_image {
|
if let Some(big_image) = big_image {
|
||||||
if node.set_property("big-src", &big_image).is_err() {
|
if node.set_property("big-src", &big_image).is_err() {
|
||||||
return Err(ImageDownloadErrorKind::HtmlParse.into());
|
return Err(ImageDownloadError::HtmlParse);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -94,26 +92,21 @@ impl ImageDownloader {
|
||||||
parent_url: &Option<url::Url>,
|
parent_url: &Option<url::Url>,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<(String, Option<String>), ImageDownloadError> {
|
) -> Result<(String, Option<String>), ImageDownloadError> {
|
||||||
let response = client
|
let response = client.get(image_url.clone()).send().await.map_err(|err| {
|
||||||
.get(image_url.clone())
|
error!("GET {} failed - {}", image_url.as_str(), err);
|
||||||
.send()
|
ImageDownloadError::Http
|
||||||
.await
|
})?;
|
||||||
.map_err(|err| {
|
|
||||||
error!("GET {} failed - {}", image_url.as_str(), err);
|
|
||||||
err
|
|
||||||
})
|
|
||||||
.context(ImageDownloadErrorKind::Http)?;
|
|
||||||
|
|
||||||
let content_type_small = ImageDownloader::check_image_content_type(&response)?;
|
let content_type_small = ImageDownloader::check_image_content_type(&response)?;
|
||||||
let content_type_small = content_type_small
|
let content_type_small = content_type_small
|
||||||
.to_str()
|
.to_str()
|
||||||
.context(ImageDownloadErrorKind::ContentType)?;
|
.map_err(|_| ImageDownloadError::ContentType)?;
|
||||||
let mut content_type_big: Option<String> = None;
|
let mut content_type_big: Option<String> = None;
|
||||||
|
|
||||||
let mut small_image = response
|
let mut small_image = response
|
||||||
.bytes()
|
.bytes()
|
||||||
.await
|
.await
|
||||||
.context(ImageDownloadErrorKind::IO)?
|
.map_err(|_| ImageDownloadError::Http)?
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.to_vec();
|
.to_vec();
|
||||||
|
|
||||||
|
@ -124,18 +117,18 @@ impl ImageDownloader {
|
||||||
.get(parent_url.clone())
|
.get(parent_url.clone())
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.context(ImageDownloadErrorKind::Http)?;
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
content_type_big = Some(
|
content_type_big = Some(
|
||||||
ImageDownloader::check_image_content_type(&response_big)?
|
ImageDownloader::check_image_content_type(&response_big)?
|
||||||
.to_str()
|
.to_str()
|
||||||
.context(ImageDownloadErrorKind::ContentType)?
|
.map_err(|_| ImageDownloadError::ContentType)?
|
||||||
.to_owned(),
|
.to_owned(),
|
||||||
);
|
);
|
||||||
big_image = Some(
|
big_image = Some(
|
||||||
response_big
|
response_big
|
||||||
.bytes()
|
.bytes()
|
||||||
.await
|
.await
|
||||||
.context(ImageDownloadErrorKind::IO)?
|
.map_err(|_| ImageDownloadError::Http)?
|
||||||
.to_vec(),
|
.to_vec(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -159,12 +152,10 @@ impl ImageDownloader {
|
||||||
format!("data:{};base64,{}", content_type_small, small_image_base64);
|
format!("data:{};base64,{}", content_type_small, small_image_base64);
|
||||||
let big_image_string = match big_image_base64 {
|
let big_image_string = match big_image_base64 {
|
||||||
Some(big_image_base64) => {
|
Some(big_image_base64) => {
|
||||||
let content_type_big = content_type_big
|
let content_type_big = content_type_big.ok_or_else(|| {
|
||||||
.ok_or(ImageDownloadErrorKind::ParentDownload)
|
debug!("content_type_big should not be None when a big image exists");
|
||||||
.map_err(|err| {
|
ImageDownloadError::ParentDownload
|
||||||
debug!("content_type_big should not be None when a big image exists");
|
})?;
|
||||||
err
|
|
||||||
})?;
|
|
||||||
Some(format!(
|
Some(format!(
|
||||||
"data:{};base64,{}",
|
"data:{};base64,{}",
|
||||||
content_type_big, big_image_base64
|
content_type_big, big_image_base64
|
||||||
|
@ -182,7 +173,7 @@ impl ImageDownloader {
|
||||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||||
if content_type
|
if content_type
|
||||||
.to_str()
|
.to_str()
|
||||||
.context(ImageDownloadErrorKind::ContentType)?
|
.map_err(|_| ImageDownloadError::ContentType)?
|
||||||
.contains("image")
|
.contains("image")
|
||||||
{
|
{
|
||||||
return Ok(content_type.clone());
|
return Ok(content_type.clone());
|
||||||
|
@ -190,10 +181,10 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
error!("{} is not an image", response.url());
|
error!("{} is not an image", response.url());
|
||||||
return Err(ImageDownloadErrorKind::ContentType.into());
|
Err(ImageDownloadError::ContentType)
|
||||||
|
} else {
|
||||||
|
Err(ImageDownloadError::Http)
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(ImageDownloadErrorKind::Http.into())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scale_image(
|
fn scale_image(
|
||||||
|
@ -203,12 +194,10 @@ impl ImageDownloader {
|
||||||
let mut original_image: Vec<u8> = Vec::new();
|
let mut original_image: Vec<u8> = Vec::new();
|
||||||
let mut resized_image: Option<Vec<u8>> = None;
|
let mut resized_image: Option<Vec<u8>> = None;
|
||||||
|
|
||||||
let mut image = image::load_from_memory(image_buffer)
|
let mut image = image::load_from_memory(image_buffer).map_err(|err| {
|
||||||
.map_err(|err| {
|
error!("Failed to open image to resize: {}", err);
|
||||||
error!("Failed to open image to resize");
|
ImageDownloadError::ImageScale
|
||||||
err
|
})?;
|
||||||
})
|
|
||||||
.context(ImageDownloadErrorKind::ImageScale)?;
|
|
||||||
|
|
||||||
image
|
image
|
||||||
.write_to(
|
.write_to(
|
||||||
|
@ -216,10 +205,9 @@ impl ImageDownloader {
|
||||||
image::ImageOutputFormat::Png,
|
image::ImageOutputFormat::Png,
|
||||||
)
|
)
|
||||||
.map_err(|err| {
|
.map_err(|err| {
|
||||||
error!("Failed to save resized image to resize");
|
error!("Failed to save resized image to resize: {}", err);
|
||||||
err
|
ImageDownloadError::ImageScale
|
||||||
})
|
})?;
|
||||||
.context(ImageDownloadErrorKind::ImageScale)?;
|
|
||||||
|
|
||||||
let dimensions = (image.width(), image.height());
|
let dimensions = (image.width(), image.height());
|
||||||
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
|
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
|
||||||
|
@ -235,10 +223,9 @@ impl ImageDownloader {
|
||||||
image::ImageOutputFormat::Png,
|
image::ImageOutputFormat::Png,
|
||||||
)
|
)
|
||||||
.map_err(|err| {
|
.map_err(|err| {
|
||||||
error!("Failed to save resized image to resize");
|
error!("Failed to save resized image to resize: {}", err);
|
||||||
err
|
ImageDownloadError::ImageScale
|
||||||
})
|
})?;
|
||||||
.context(ImageDownloadErrorKind::ImageScale)?;
|
|
||||||
resized_image = Some(resized_buf);
|
resized_image = Some(resized_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -254,24 +241,23 @@ impl ImageDownloader {
|
||||||
if let Some(parent) = node.get_parent() {
|
if let Some(parent) = node.get_parent() {
|
||||||
if parent.get_name() == "a" {
|
if parent.get_name() == "a" {
|
||||||
if let Some(url) = parent.get_property("href") {
|
if let Some(url) = parent.get_property("href") {
|
||||||
let parent_url =
|
let parent_url = url::Url::parse(&url).map_err(|err| {
|
||||||
url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
|
error!("Failed to parse parent image url: {}", err);
|
||||||
|
ImageDownloadError::InvalidUrl(err)
|
||||||
|
})?;
|
||||||
let parent_response = client
|
let parent_response = client
|
||||||
.head(parent_url.clone())
|
.head(parent_url.clone())
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.context(ImageDownloadErrorKind::ParentDownload)?;
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
let _ = ImageDownloader::check_image_content_type(&parent_response)
|
let _ = ImageDownloader::check_image_content_type(&parent_response)?;
|
||||||
.context(ImageDownloadErrorKind::ParentDownload)?;
|
|
||||||
let child_response = client
|
let child_response = client
|
||||||
.get(child_url.clone())
|
.get(child_url.clone())
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.context(ImageDownloadErrorKind::ParentDownload)?;
|
.map_err(|_| ImageDownloadError::Http)?;
|
||||||
let parent_length = Self::get_content_lenght(&parent_response)
|
let parent_length = Self::get_content_lenght(&parent_response)?;
|
||||||
.context(ImageDownloadErrorKind::ParentDownload)?;
|
let child_length = Self::get_content_lenght(&child_response)?;
|
||||||
let child_length = Self::get_content_lenght(&child_response)
|
|
||||||
.context(ImageDownloadErrorKind::ParentDownload)?;
|
|
||||||
|
|
||||||
if parent_length > child_length {
|
if parent_length > child_length {
|
||||||
return Ok(parent_url);
|
return Ok(parent_url);
|
||||||
|
@ -283,7 +269,7 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("Image parent element not relevant");
|
debug!("Image parent element not relevant");
|
||||||
Err(ImageDownloadErrorKind::ParentDownload.into())
|
Err(ImageDownloadError::ParentDownload)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
|
||||||
|
@ -296,7 +282,7 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(ImageDownloadErrorKind::ContentLenght.into())
|
Err(ImageDownloadError::ContentLenght)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ mod readability;
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
use article::Article;
|
use article::Article;
|
||||||
use error::{ScraperError, ScraperErrorKind};
|
use error::ScraperError;
|
||||||
use full_text_parser::FullTextParser;
|
use full_text_parser::FullTextParser;
|
||||||
use images::ImageDownloader;
|
use images::ImageDownloader;
|
||||||
use readability::Readability;
|
use readability::Readability;
|
||||||
|
|
22
src/util.rs
22
src/util.rs
|
@ -1,4 +1,3 @@
|
||||||
use failure::ResultExt;
|
|
||||||
use libxml::{tree::Node, xpath::Context};
|
use libxml::{tree::Node, xpath::Context};
|
||||||
use reqwest::{
|
use reqwest::{
|
||||||
header::{HeaderMap, HeaderName, HeaderValue},
|
header::{HeaderMap, HeaderName, HeaderValue},
|
||||||
|
@ -6,10 +5,7 @@ use reqwest::{
|
||||||
};
|
};
|
||||||
use tokio::fs::DirEntry;
|
use tokio::fs::DirEntry;
|
||||||
|
|
||||||
use crate::full_text_parser::{
|
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
|
||||||
config::ConfigEntry,
|
|
||||||
error::{FullTextParserError, FullTextParserErrorKind},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct Util;
|
pub struct Util;
|
||||||
|
|
||||||
|
@ -55,22 +51,22 @@ impl Util {
|
||||||
if let Some(config) = site_specific_rule {
|
if let Some(config) = site_specific_rule {
|
||||||
for header in &config.header {
|
for header in &config.header {
|
||||||
let name = HeaderName::from_bytes(header.name.as_bytes())
|
let name = HeaderName::from_bytes(header.name.as_bytes())
|
||||||
.context(FullTextParserErrorKind::Config)?;
|
.map_err(|_| FullTextParserError::Config)?;
|
||||||
let value = header
|
let value = header
|
||||||
.value
|
.value
|
||||||
.parse::<HeaderValue>()
|
.parse::<HeaderValue>()
|
||||||
.context(FullTextParserErrorKind::Config)?;
|
.map_err(|_| FullTextParserError::Config)?;
|
||||||
headers.insert(name, value);
|
headers.insert(name, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for header in &global_rule.header {
|
for header in &global_rule.header {
|
||||||
let name = HeaderName::from_bytes(header.name.as_bytes())
|
let name = HeaderName::from_bytes(header.name.as_bytes())
|
||||||
.context(FullTextParserErrorKind::Config)?;
|
.map_err(|_| FullTextParserError::Config)?;
|
||||||
let value = header
|
let value = header
|
||||||
.value
|
.value
|
||||||
.parse::<HeaderValue>()
|
.parse::<HeaderValue>()
|
||||||
.context(FullTextParserErrorKind::Config)?;
|
.map_err(|_| FullTextParserError::Config)?;
|
||||||
headers.insert(name, value);
|
headers.insert(name, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,7 +101,7 @@ impl Util {
|
||||||
) -> Result<Vec<Node>, FullTextParserError> {
|
) -> Result<Vec<Node>, FullTextParserError> {
|
||||||
let res = xpath_ctx.evaluate(xpath).map_err(|()| {
|
let res = xpath_ctx.evaluate(xpath).map_err(|()| {
|
||||||
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
||||||
FullTextParserErrorKind::Xml
|
FullTextParserError::Xml
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let node_vec = res.get_nodes_as_vec();
|
let node_vec = res.get_nodes_as_vec();
|
||||||
|
@ -113,7 +109,7 @@ impl Util {
|
||||||
if node_vec.is_empty() {
|
if node_vec.is_empty() {
|
||||||
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
||||||
if thorw_if_empty {
|
if thorw_if_empty {
|
||||||
return Err(FullTextParserErrorKind::Xml.into());
|
return Err(FullTextParserError::Xml);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,7 +131,7 @@ impl Util {
|
||||||
}
|
}
|
||||||
|
|
||||||
log::error!("Failed to determine content type");
|
log::error!("Failed to determine content type");
|
||||||
Err(FullTextParserErrorKind::Http.into())
|
Err(FullTextParserError::Http)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
|
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
|
||||||
|
@ -155,7 +151,7 @@ impl Util {
|
||||||
return Ok(val.get_content());
|
return Ok(val.get_content());
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(FullTextParserErrorKind::Xml.into())
|
Err(FullTextParserError::Xml)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_value_merge(
|
pub fn extract_value_merge(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue