diff --git a/Cargo.toml b/Cargo.toml index baf4283..a11749c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,3 +20,4 @@ base64 = "0.13" image = "0.24" log = "0.4" rust-embed="6.4" +once_cell = "1.15" \ No newline at end of file diff --git a/src/article.rs b/src/article.rs index 7de035f..1c6c160 100644 --- a/src/article.rs +++ b/src/article.rs @@ -1,8 +1,7 @@ -use crate::error::{ScraperError, ScraperErrorKind}; use chrono::{DateTime, Utc}; -use failure::ResultExt; -use std::io::Write; +use std::io::{Error, ErrorKind, Write}; use std::path::PathBuf; +use std::fs::File; use url::Url; pub struct Article { @@ -14,7 +13,7 @@ pub struct Article { } impl Article { - pub fn save_html(&self, path: &PathBuf) -> Result<(), ScraperError> { + pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { if let Some(ref html) = self.html { if let Ok(()) = std::fs::create_dir_all(&path) { let mut file_name = match self.title.clone() { @@ -23,14 +22,12 @@ impl Article { }; file_name.push_str(".html"); let path = path.join(file_name); - let mut html_file = std::fs::File::create(&path).context(ScraperErrorKind::IO)?; - html_file - .write_all(html.as_bytes()) - .context(ScraperErrorKind::IO)?; + let mut html_file = File::create(&path)?; + html_file.write_all(html.as_bytes())?; return Ok(()); } } - Err(ScraperErrorKind::Unknown.into()) + Err(Error::new(ErrorKind::NotFound, "Article does not contain HTML")) } } diff --git a/src/error.rs b/src/error.rs index 82473c1..297f0a9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -8,20 +8,6 @@ pub struct ScraperError { #[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)] pub enum ScraperErrorKind { - #[fail(display = "libXml Error")] - Xml, - #[fail(display = "No content found")] - Scrape, - #[fail(display = "Url Error")] - Url, - #[fail(display = "Http request failed")] - Http, - #[fail(display = "Config Error")] - Config, - #[fail(display = "IO Error")] - IO, - #[fail(display = "Content-type suggest no html")] - ContentType, #[fail(display = "Unknown Error")] Unknown, } diff --git a/src/config/config_collection.rs b/src/full_text_parser/config/config_collection.rs similarity index 93% rename from src/config/config_collection.rs rename to src/full_text_parser/config/config_collection.rs index ec72cb7..b4a7ff6 100644 --- a/src/config/config_collection.rs +++ b/src/full_text_parser/config/config_collection.rs @@ -66,8 +66,4 @@ impl ConfigCollection { self.embedded_entries.get(key) } } - - pub fn contains_config(&self, key: &str) -> bool { - self.user_entries.contains_key(key) || self.embedded_entries.contains_key(key) - } } diff --git a/src/config/config_entry.rs b/src/full_text_parser/config/config_entry.rs similarity index 100% rename from src/config/config_entry.rs rename to src/full_text_parser/config/config_entry.rs diff --git a/src/config/error.rs b/src/full_text_parser/config/error.rs similarity index 100% rename from src/config/error.rs rename to src/full_text_parser/config/error.rs diff --git a/src/config/macros.rs b/src/full_text_parser/config/macros.rs similarity index 100% rename from src/config/macros.rs rename to src/full_text_parser/config/macros.rs diff --git a/src/config/mod.rs b/src/full_text_parser/config/mod.rs similarity index 100% rename from src/config/mod.rs rename to src/full_text_parser/config/mod.rs diff --git a/src/full_text_parser/error.rs b/src/full_text_parser/error.rs new file mode 100644 index 0000000..4045e06 --- /dev/null +++ b/src/full_text_parser/error.rs @@ -0,0 +1,71 @@ +use failure::{Backtrace, Context, Error, Fail}; +use std::fmt; + 
+#[derive(Debug)]
+pub struct FullTextParserError {
+    inner: Context<FullTextParserErrorKind>,
+}
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug, Fail)]
+pub enum FullTextParserErrorKind {
+    #[fail(display = "libXml Error")]
+    Xml,
+    #[fail(display = "No content found")]
+    Scrape,
+    #[fail(display = "Url Error")]
+    Url,
+    #[fail(display = "Http request failed")]
+    Http,
+    #[fail(display = "Config Error")]
+    Config,
+    #[fail(display = "IO Error")]
+    IO,
+    #[fail(display = "Content-type suggest no html")]
+    ContentType,
+    #[fail(display = "Unknown Error")]
+    Unknown,
+}
+
+impl Fail for FullTextParserError {
+    fn cause(&self) -> Option<&dyn Fail> {
+        self.inner.cause()
+    }
+
+    fn backtrace(&self) -> Option<&Backtrace> {
+        self.inner.backtrace()
+    }
+}
+
+impl fmt::Display for FullTextParserError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Display::fmt(&self.inner, f)
+    }
+}
+
+impl FullTextParserError {
+    pub fn kind(&self) -> FullTextParserErrorKind {
+        *self.inner.get_context()
+    }
+}
+
+impl From<FullTextParserErrorKind> for FullTextParserError {
+    fn from(kind: FullTextParserErrorKind) -> FullTextParserError {
+        FullTextParserError {
+            inner: Context::new(kind),
+        }
+    }
+}
+
+impl From<Context<FullTextParserErrorKind>> for FullTextParserError {
+    fn from(inner: Context<FullTextParserErrorKind>) -> FullTextParserError {
+        FullTextParserError { inner }
+    }
+}
+
+impl From<Error> for FullTextParserError {
+    fn from(_: Error) -> FullTextParserError {
+        FullTextParserError {
+            inner: Context::new(FullTextParserErrorKind::Unknown),
+        }
+    }
+}
diff --git a/src/full_text_parser/fingerprints.rs b/src/full_text_parser/fingerprints.rs
new file mode 100644
index 0000000..ecb5bab
--- /dev/null
+++ b/src/full_text_parser/fingerprints.rs
@@ -0,0 +1,47 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+use reqwest::Url;
+use std::collections::HashMap;
+
+static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
+    let mut m = HashMap::with_capacity(4);
+    m.insert(
+        "fingerprint.blogspot.com",
+        regex::Regex::new(
+            r#"/\/i"#)
+            .expect("failed to build static regex"),
+    );
+    m
+});
+
+pub struct Fingerprints;
+
+impl Fingerprints {
+    pub fn detect(html: &str) -> Option<Url> {
+        for (url, regex) in FINGERPRINT_REGEXES.iter() {
+            if regex.captures(html).is_some() {
+                return Some(Url::parse(url).expect("failed to parse static url"));
+            }
+        }
+
+        None
+    }
+}
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
new file mode 100644
index 0000000..a2bcd54
--- /dev/null
+++ b/src/full_text_parser/mod.rs
@@ -0,0 +1,768 @@
+pub mod error;
+pub mod config;
+mod fingerprints;
+
+#[cfg(test)]
+mod tests;
+
+use self::error::{FullTextParserError, FullTextParserErrorKind};
+use crate::article::Article;
+use self::config::{ConfigCollection, ConfigEntry};
+use chrono::DateTime;
+use encoding_rs::Encoding;
+use failure::ResultExt;
+use fingerprints::Fingerprints;
+use libxml::parser::Parser;
+use libxml::tree::{Document, Node, SaveOptions};
+use libxml::xpath::Context;
+use log::{debug, error, info, warn};
+use reqwest::header::HeaderMap;
+use reqwest::Client;
+use std::path::Path;
+use std::str::FromStr;
+use crate::util::Util;
+
+pub struct FullTextParser {
+    config_files: ConfigCollection,
+}
+
+impl FullTextParser {
+    pub async fn new(config_path: Option<&Path>) -> Self {
+        let config_files = ConfigCollection::parse(config_path).await;
+        Self {
+            config_files,
+        }
+    }
+
+    pub async fn parse(
+        &self,
+        url: &url::Url,
+        client: &Client,
+    ) -> Result<Article, FullTextParserError> {
+        info!("Scraping article: '{}'", url.as_str());
+
+        // check if we have a config for the url
+        let config = self.get_grabber_config(url);
+        let
global_config = self + .config_files + .get("global.txt") + .ok_or(FullTextParserErrorKind::Config)?; + + let headers = Util::generate_headers(config, global_config)?; + + let response = client + .head(url.clone()) + .headers(headers) + .send() + .await + .map_err(|err| { + error!("Failed head request to: '{}' - '{}'", url.as_str(), err); + err + }) + .context(FullTextParserErrorKind::Http)?; + + // check if url redirects and we need to pick up the new url + let url = if let Some(new_url) = Util::check_redirect(&response, url) { + debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str()); + new_url + } else { + url.clone() + }; + + // check if we are dealing with text/html + if !Util::check_content_type(&response)? { + return Err(FullTextParserErrorKind::ContentType.into()); + } + + let mut article = Article { + title: None, + author: None, + url: url.clone(), + date: None, + html: None, + }; + + let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?; + let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?; + document.set_root_element(&root); + + Self::generate_head(&mut root, &document)?; + + self.parse_pages(&mut article, &url, &mut root, config, global_config, client) + .await?; + + let context = Context::new(&document).map_err(|()| { + error!("Failed to create xpath context for extracted article"); + FullTextParserErrorKind::Xml + })?; + + if let Err(error) = Self::prevent_self_closing_tags(&context) { + error!("Preventing self closing tags failed - '{}'", error); + return Err(error); + } + + // serialize content + let options = SaveOptions { + format: false, + no_declaration: false, + no_empty_tags: true, + no_xhtml: false, + xhtml: false, + as_xml: false, + as_html: true, + non_significant_whitespace: false, + }; + let html = document.to_string_with_options(options); + article.html = Some(html); + + Ok(article) + } + + async fn parse_pages( + &self, + article: &mut Article, + url: &url::Url, + root: &mut Node, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + client: &Client, + ) -> Result<(), FullTextParserError> { + let headers = Util::generate_headers(config, global_config)?; + let html = Self::download(url, client, headers).await?; + + // see if + let config = if config.is_none() { + if let Some(url) = Fingerprints::detect(&html) { + self.get_grabber_config(&url) + } else { + config + } + } else { + config + }; + + let mut document = Self::parse_html(html, config, global_config)?; + let mut xpath_ctx = Self::get_xpath_ctx(&document)?; + + // check for single page link + let rule = Util::select_rule( + config.and_then(|c| c.single_page_link.as_deref()), + global_config.single_page_link.as_deref(), + ); + if let Some(xpath_single_page_link) = rule { + debug!( + "Single page link xpath specified in config '{}'", + xpath_single_page_link + ); + + if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, xpath_single_page_link) { + // parse again with single page url + debug!("Single page link found '{}'", single_page_url); + + return self + .parse_single_page( + article, + &single_page_url, + root, + config, + global_config, + client, + ) + .await; + } + } + + Self::extract_metadata(&xpath_ctx, config, global_config, article); + Self::strip_junk(&xpath_ctx, config, global_config, url); + Self::extract_body(&xpath_ctx, root, config, global_config)?; + + while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { + let headers = Util::generate_headers(config, 
global_config)?;
+            let html = Self::download(&url, client, headers).await?;
+            document = Self::parse_html(html, config, global_config)?;
+            xpath_ctx = Self::get_xpath_ctx(&document)?;
+            Self::strip_junk(&xpath_ctx, config, global_config, &url);
+            Self::extract_body(&xpath_ctx, root, config, global_config)?;
+        }
+
+        Ok(())
+    }
+
+    fn parse_html(
+        html: String,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+    ) -> Result<Document, FullTextParserError> {
+        // replace matches in raw html
+
+        let mut html = html;
+        if let Some(config) = config {
+            for replace in &config.replace {
+                html = html.replace(&replace.to_replace, &replace.replace_with);
+            }
+        }
+
+        for replace in &global_config.replace {
+            html = html.replace(&replace.to_replace, &replace.replace_with);
+        }
+
+        // parse html
+        let parser = Parser::default_html();
+        Ok(parser.parse_string(html.as_str()).map_err(|err| {
+            error!("Parsing HTML failed for downloaded HTML {:?}", err);
+            FullTextParserErrorKind::Xml
+        })?)
+    }
+
+    fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
+        Ok(Context::new(doc).map_err(|()| {
+            error!("Creating xpath context failed for downloaded HTML");
+            FullTextParserErrorKind::Xml
+        })?)
+    }
+
+    async fn parse_single_page(
+        &self,
+        article: &mut Article,
+        url: &url::Url,
+        root: &mut Node,
+        config: Option<&ConfigEntry>,
+        global_config: &ConfigEntry,
+        client: &Client,
+    ) -> Result<(), FullTextParserError> {
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = Self::download(url, client, headers).await?;
+        let document = Self::parse_html(html, config, global_config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
+        Self::extract_metadata(&xpath_ctx, config, global_config, article);
+        Self::strip_junk(&xpath_ctx, config, global_config, url);
+        Self::extract_body(&xpath_ctx, root, config, global_config)?;
+
+        Ok(())
+    }
+
+    async fn download(
+        url: &url::Url,
+        client: &Client,
+        headers: HeaderMap,
+    ) -> Result<String, FullTextParserError> {
+        let response = client
+            .get(url.as_str())
+            .headers(headers)
+            .send()
+            .await
+            .map_err(|err| {
+                error!(
+                    "Downloading HTML failed: GET '{}' - '{}'",
+                    url.as_str(),
+                    err
+                );
+                err
+            })
+            .context(FullTextParserErrorKind::Http)?;
+
+        if response.status().is_success() {
+            let headers = response.headers().clone();
+            let text = response.text().await.context(FullTextParserErrorKind::Http)?;
+            {
+                if let Some(decoded_html) =
+                    Self::decode_html(&text, Self::get_encoding_from_html(&text))
+                {
+                    return Ok(decoded_html);
+                }
+
+                if let Some(decoded_html) =
+                    Self::decode_html(&text, Self::get_encoding_from_http_header(&headers))
+                {
+                    return Ok(decoded_html);
+                }
+            }
+
+            warn!("No encoding of HTML detected - assuming utf-8");
+            return Ok(text);
+        }
+
+        Err(FullTextParserErrorKind::Http.into())
+    }
+
+    fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
+        if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) {
+            if let Ok(content_type) = content_type.to_str() {
+                let regex =
+                    regex::Regex::new(r#"charset=([^"']+)"#).expect("Failed to parse regex");
+                if let Some(captures) = regex.captures(content_type) {
+                    if let Some(regex_match) = captures.get(1) {
+                        return Some(regex_match.as_str());
+                    }
+                }
+            }
+        }
+        None
+    }
+
+    fn get_encoding_from_html(html: &str) -> Option<&str> {
+        let regex =
+            regex::Regex::new(r#"<meta.*?charset=([^"']+)"#).expect("Failed to parse regex");
+        if let Some(captures) = regex.captures(html) {
+            if let Some(regex_match) = captures.get(1) {
+                return Some(regex_match.as_str());
+            }
+        }
+        None
+    }
+
+    fn decode_html(html: &str, encoding: Option<&str>) -> Option<String> {
+        if let Some(encoding) = encoding {
+            if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) {
+                let (decoded_html, _, invalid_chars) = encoding.decode(html.as_bytes());
+
+                if !invalid_chars {
+                    return
Some(decoded_html.into_owned()); + } + } + warn!("Could not decode HTML. Encoding: '{}'", encoding); + } + None + } + + fn get_host_name(url: &url::Url) -> Result { + match url.host_str() { + Some(name) => { + let mut name = name; + if name.starts_with("www.") && name.len() > 4 { + name = &name[4..] + } + Ok(name.into()) + } + None => { + error!("Getting config failed due to bad Url"); + Err(FullTextParserErrorKind::Config.into()) + } + } + } + + fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> { + let conf = Self::get_host_name(url) + .ok() + .map(|url| url + ".txt") + .and_then(|name| self.config_files.get(&name)); + + if conf.is_none() { + log::warn!("No config found for url '{}'", url); + } + + conf + } + + fn fix_lazy_images( + context: &Context, + class: &str, + property_url: &str, + ) -> Result<(), FullTextParserError> { + let xpath = &format!("//img[contains(@class, '{}')]", class); + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if let Some(correct_url) = node.get_property(property_url) { + if node.set_property("src", &correct_url).is_err() { + return Err(FullTextParserErrorKind::Xml.into()); + } + } + } + Ok(()) + } + + fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> { + let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if let Some(mut parent) = node.get_parent() { + if let Ok(mut video_wrapper) = parent.new_child(None, "div") { + if let Ok(()) = video_wrapper.set_property("class", "videoWrapper") { + if let Ok(()) = node.set_property("width", "100%") { + if let Ok(()) = node.set_property("height", "100%") { + node.unlink(); + video_wrapper.add_child(&mut node).map_err(|_| { + error!("Failed to add iframe as child of video wrapper
"); + FullTextParserErrorKind::Xml + })?; + } + } + } + } + + error!("Failed to add video wrapper
as parent of iframe"); + return Err(FullTextParserErrorKind::Xml.into()); + } + + error!("Failed to get parent of iframe"); + // return Err(ScraperErrorKind::Xml.into()); + } + Ok(()) + } + + fn remove_attribute( + context: &Context, + tag: Option<&str>, + attribute: &str, + ) -> Result<(), FullTextParserError> { + let xpath_tag = tag.unwrap_or("*"); + + let xpath = &format!("//{}[@{}]", xpath_tag, attribute); + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if node.remove_property(attribute).is_err() { + return Err(FullTextParserErrorKind::Xml.into()); + } + } + Ok(()) + } + + fn add_attribute( + context: &Context, + tag: Option<&str>, + attribute: &str, + value: &str, + ) -> Result<(), FullTextParserError> { + let xpath_tag = tag.unwrap_or("*"); + + let xpath = &format!("//{}", xpath_tag); + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if node.set_attribute(attribute, value).is_err() { + return Err(FullTextParserErrorKind::Xml.into()); + } + } + Ok(()) + } + + fn get_attribute( + context: &Context, + xpath: &str, + attribute: &str, + ) -> Result { + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for node in node_vec { + if let Some(value) = node.get_attribute(attribute) { + return Ok(value); + } + } + + Err(FullTextParserErrorKind::Xml.into()) + } + + fn repair_urls( + context: &Context, + xpath: &str, + attribute: &str, + article_url: &url::Url, + ) -> Result<(), FullTextParserError> { + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if let Some(val) = node.get_attribute(attribute) { + if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) { + if let Ok(fixed_url) = Self::complete_url(article_url, &val) { + if node.set_attribute(attribute, fixed_url.as_str()).is_err() { + return Err(FullTextParserErrorKind::Scrape.into()); + } + } + } + } + } + Ok(()) + } + + fn complete_url( + article_url: &url::Url, + incomplete_url: &str, + ) -> Result { + let mut completed_url = article_url.scheme().to_owned(); + completed_url.push(':'); + + if !incomplete_url.starts_with("//") { + match article_url.host() { + Some(url::Host::Domain(host)) => { + completed_url.push_str("//"); + completed_url.push_str(host); + } + _ => return Err(FullTextParserErrorKind::Scrape.into()), + }; + } + + if !completed_url.ends_with('/') && !incomplete_url.starts_with('/') { + completed_url.push('/'); + } + completed_url.push_str(incomplete_url); + let url = url::Url::parse(&completed_url).context(FullTextParserErrorKind::Url)?; + Ok(url) + } + + fn strip_junk( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + url: &url::Url, + ) { + // strip specified xpath + if let Some(config) = config { + for xpath_strip in &config.xpath_strip { + let _ = Util::strip_node(context, xpath_strip); + } + } + + for xpath_strip in &global_config.xpath_strip { + let _ = Util::strip_node(context, xpath_strip); + } + + // strip everything with specified 'id' or 'class' + if let Some(config) = config { + for xpaht_strip_class in &config.strip_id_or_class { + let _ = Util::strip_id_or_class(context, xpaht_strip_class); + } + } + + for xpaht_strip_class in &global_config.strip_id_or_class { + let _ = Util::strip_id_or_class(context, xpaht_strip_class); + } + + // strip any element where @src attribute contains this substring + if let Some(config) = config { + for xpath_strip_img_src in &config.strip_image_src { + let _ = Util::strip_node( + 
context, + &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), + ); + } + } + + for xpath_strip_img_src in &global_config.strip_image_src { + let _ = Util::strip_node( + context, + &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), + ); + } + + let _ = Self::fix_lazy_images(context, "lazyload", "data-src"); + let _ = Self::fix_iframe_size(context, "youtube.com"); + let _ = Self::remove_attribute(context, None, "style"); + let _ = Self::remove_attribute(context, Some("a"), "onclick"); + let _ = Self::remove_attribute(context, Some("img"), "srcset"); + let _ = Self::remove_attribute(context, Some("img"), "sizes"); + let _ = Self::add_attribute(context, Some("a"), "target", "_blank"); + + let _ = Self::repair_urls(context, "//img", "src", url); + let _ = Self::repair_urls(context, "//a", "src", url); + let _ = Self::repair_urls(context, "//a", "href", url); + let _ = Self::repair_urls(context, "//object", "data", url); + let _ = Self::repair_urls(context, "//iframe", "src", url); + + // strip elements using Readability.com and Instapaper.com ignore class names + // .entry-unrelated and .instapaper_ignore + // See http://blog.instapaper.com/post/730281947 + let _ = Util::strip_node( + context, + "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]", + ); + + // strip elements that contain style="display: none;" + let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); + + // strip all comments + let _ = Util::strip_node(context, "//comment()"); + + // strip all empty url-tags + let _ = Util::strip_node(context, "//a[not(node())]"); + + // strip all external css and fonts + let _ = Util::strip_node(context, "//*[@type='text/css']"); + } + + fn extract_metadata( + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + article: &mut Article, + ) { + // try to get title + if let Some(config) = config { + for xpath_title in &config.xpath_title { + if let Ok(title) = Util::extract_value_merge(context, xpath_title) { + debug!("Article title: '{}'", title); + article.title = Some(title); + break; + } + } + } + + if article.title.is_none() { + for xpath_title in &global_config.xpath_title { + if let Ok(title) = Util::extract_value_merge(context, xpath_title) { + debug!("Article title: '{}'", title); + article.title = Some(title); + break; + } + } + } + + // try to get the author + if let Some(config) = config { + for xpath_author in &config.xpath_author { + if let Ok(author) = Util::extract_value(context, xpath_author) { + debug!("Article author: '{}'", author); + article.author = Some(author); + break; + } + } + } + + if article.author.is_none() { + for xpath_author in &global_config.xpath_author { + if let Ok(author) = Util::extract_value(context, xpath_author) { + debug!("Article author: '{}'", author); + article.author = Some(author); + break; + } + } + } + + // try to get the date + if let Some(config) = config { + for xpath_date in &config.xpath_date { + if let Ok(date_string) = Util::extract_value(context, xpath_date) { + debug!("Article date: '{}'", date_string); + if let Ok(date) = DateTime::from_str(&date_string) { + article.date = Some(date); + break; + } else { + warn!("Parsing the date string '{}' failed", date_string); + } + } + } + } + + if article.date.is_none() { + for xpath_date in &global_config.xpath_date { + if let Ok(date_string) = Util::extract_value(context, xpath_date) { + debug!("Article date: '{}'", date_string); + if let Ok(date) = DateTime::from_str(&date_string) { + 
article.date = Some(date); + break; + } else { + warn!("Parsing the date string '{}' failed", date_string); + } + } + } + } + } + + fn extract_body( + context: &Context, + root: &mut Node, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + ) -> Result<(), FullTextParserError> { + let mut found_something = false; + + if let Some(config) = config { + for xpath_body in &config.xpath_body { + found_something = Self::extract_body_single(context, root, xpath_body)?; + } + } + + if !found_something { + for xpath_body in &global_config.xpath_body { + found_something = Self::extract_body_single(context, root, xpath_body)?; + } + } + + if !found_something { + log::error!("no body found"); + return Err(FullTextParserErrorKind::Scrape.into()); + } + + Ok(()) + } + + fn extract_body_single( + context: &Context, + root: &mut Node, + xpath: &str, + ) -> Result { + let mut found_something = false; + { + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if node.get_property("style").is_some() && node.remove_property("style").is_err() { + return Err(FullTextParserErrorKind::Xml.into()); + } + + node.unlink(); + if root.add_child(&mut node).is_ok() { + found_something = true; + } else { + error!("Failed to add body to prepared document"); + return Err(FullTextParserErrorKind::Xml.into()); + } + } + } + + Ok(found_something) + } + + fn check_for_next_page( + &self, + context: &Context, + config: Option<&ConfigEntry>, + global_config: &ConfigEntry, + ) -> Option { + if let Some(config) = config { + if let Some(next_page_xpath) = config.next_page_link.as_deref() { + if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") + { + if let Ok(next_page_url) = url::Url::parse(&next_page_string) { + return Some(next_page_url); + } + } + } + } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() { + if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") { + if let Ok(next_page_url) = url::Url::parse(&next_page_string) { + return Some(next_page_url); + } + } + } + + // last page reached + None + } + + fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> { + if let Ok(mut head_node) = Node::new("head", None, document) { + if let Ok(()) = root.add_prev_sibling(&mut head_node) { + if let Ok(mut meta) = head_node.new_child(None, "meta") { + if meta.set_property("charset", "utf-8").is_ok() { + return Ok(()); + } + } + } + } + + Err(FullTextParserErrorKind::Xml.into()) + } + + fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { + // search document for empty tags and add a empty text node as child + // this prevents libxml from self closing non void elements such as iframe + + let xpath = "//*[not(node())]"; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + for mut node in node_vec { + if node.get_name() == "meta" { + continue; + } + + let _ = node.add_text_child(None, "empty", ""); + } + + Ok(()) + } +} diff --git a/src/tests.rs b/src/full_text_parser/tests.rs similarity index 79% rename from src/tests.rs rename to src/full_text_parser/tests.rs index e82ad2d..f720312 100644 --- a/src/tests.rs +++ b/src/full_text_parser/tests.rs @@ -1,4 +1,4 @@ -use crate::*; +use super::FullTextParser; use reqwest::Client; use std::path::PathBuf; @@ -7,8 +7,8 @@ async fn golem() { let out_path = PathBuf::from(r"./test_output"); let url = 
url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap(); - let grabber = ArticleScraper::new(None).await; - let article = grabber.parse(&url, true, &Client::new()).await.unwrap(); + let grabber = FullTextParser::new(None).await; + let article = grabber.parse(&url, &Client::new()).await.unwrap(); article.save_html(&out_path).unwrap(); assert_eq!( @@ -27,8 +27,8 @@ async fn phoronix() { url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1") .unwrap(); - let grabber = ArticleScraper::new(None).await; - let article = grabber.parse(&url, false, &Client::new()).await.unwrap(); + let grabber = FullTextParser::new(None).await; + let article = grabber.parse(&url, &Client::new()).await.unwrap(); article.save_html(&out_path).unwrap(); assert_eq!( @@ -44,8 +44,8 @@ async fn youtube() { let out_path = PathBuf::from(r"./test_output"); let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap(); - let grabber = ArticleScraper::new(None).await; - let article = grabber.parse(&url, false, &Client::new()).await.unwrap(); + let grabber = FullTextParser::new(None).await; + let article = grabber.parse(&url, &Client::new()).await.unwrap(); article.save_html(&out_path).unwrap(); assert_eq!( diff --git a/src/images/error.rs b/src/images/error.rs index 34f3480..15a3dac 100644 --- a/src/images/error.rs +++ b/src/images/error.rs @@ -1,4 +1,4 @@ -use super::super::ScraperErrorKind; +use crate::full_text_parser::error::FullTextParserErrorKind; use failure::{Backtrace, Context, Error, Fail}; use std::fmt; @@ -67,10 +67,10 @@ impl From> for ImageDownloadError { } } -impl From for ImageDownloadError { - fn from(kind: ScraperErrorKind) -> ImageDownloadError { +impl From for ImageDownloadError { + fn from(kind: FullTextParserErrorKind) -> ImageDownloadError { let kind = match kind { - ScraperErrorKind::Xml => ImageDownloadErrorKind::HtmlParse, + FullTextParserErrorKind::Xml => ImageDownloadErrorKind::HtmlParse, _ => ImageDownloadErrorKind::Unknown, }; diff --git a/src/lib.rs b/src/lib.rs index db33674..a426958 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,41 +1,26 @@ -mod article; -mod config; -mod error; pub mod images; +mod article; +mod full_text_parser; mod util; -mod youtube; +mod error; -#[cfg(test)] -mod tests; - -use self::error::{ScraperError, ScraperErrorKind}; -use crate::article::Article; -use crate::config::{ConfigCollection, ConfigEntry}; -use crate::images::ImageDownloader; -use chrono::DateTime; -use encoding_rs::Encoding; -use failure::ResultExt; -use libxml::parser::Parser; -use libxml::tree::{Document, Node, SaveOptions}; -use libxml::xpath::Context; -use log::{debug, error, info, warn}; -use reqwest::header::HeaderMap; -use reqwest::Client; use std::path::Path; -use std::str::FromStr; -use util::Util; +use article::Article; +use full_text_parser::FullTextParser; +use error::{ScraperError, ScraperErrorKind}; +use images::ImageDownloader; +use reqwest::Client; pub struct ArticleScraper { - pub image_downloader: ImageDownloader, - config_files: ConfigCollection, + full_text_parser: FullTextParser, + image_downloader: ImageDownloader, } impl ArticleScraper { - pub async fn new(config_path: Option<&Path>) -> Self { - let config_files = ConfigCollection::parse(config_path).await; - ArticleScraper { + pub async fn new(user_configs: Option<&Path>) -> Self { + Self { + full_text_parser: FullTextParser::new(user_configs).await, image_downloader: ImageDownloader::new((2048, 2048)), 
- config_files, } } @@ -45,742 +30,19 @@ impl ArticleScraper { download_images: bool, client: &Client, ) -> Result { - info!("Scraping article: '{}'", url.as_str()); - // custom youtube handling, but prefer config if exists - if !self.config_files.contains_config("youtube.com.txt") { - if let Some(article) = youtube::Youtube::handle(url) { - return Ok(article); - } - } - - // check if we have a config for the url - let config = self.get_grabber_config(url); - let global_config = self - .config_files - .get("global.txt") - .ok_or(ScraperErrorKind::Config)?; - - let headers = Util::generate_headers(config, global_config)?; - - let response = client - .head(url.clone()) - .headers(headers) - .send() - .await - .map_err(|err| { - error!("Failed head request to: '{}' - '{}'", url.as_str(), err); - err - }) - .context(ScraperErrorKind::Http)?; - - // check if url redirects and we need to pick up the new url - let url = if let Some(new_url) = Util::check_redirect(&response, url) { - debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str()); - new_url - } else { - url.clone() - }; - - // check if we are dealing with text/html - if !Util::check_content_type(&response)? { - return Err(ScraperErrorKind::ContentType.into()); - } - - let mut article = Article { - title: None, - author: None, - url: url.clone(), - date: None, - html: None, - }; - - let mut document = Document::new().map_err(|()| ScraperErrorKind::Xml)?; - - let mut root = Node::new("article", None, &document).map_err(|()| ScraperErrorKind::Xml)?; - - document.set_root_element(&root); - - ArticleScraper::generate_head(&mut root, &document)?; - - self.parse_pages(&mut article, &url, &mut root, config, global_config, client) - .await?; - - let context = Context::new(&document).map_err(|()| { - error!("Failed to create xpath context for extracted article"); - ScraperErrorKind::Xml - })?; - - if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) { - error!("Preventing self closing tags failed - '{}'", error); - return Err(error); - } + let res = self.full_text_parser.parse(url, client).await; if download_images { - if let Err(error) = self - .image_downloader - .download_images_from_context(&context, client) - .await - { - error!("Downloading images failed: '{}'", error); - } + // if let Err(error) = self + // .image_downloader + // .download_images_from_context(&context, client) + // .await + // { + // log::error!("Downloading images failed: '{}'", error); + // } } - // serialize content - let options = SaveOptions { - format: false, - no_declaration: false, - no_empty_tags: true, - no_xhtml: false, - xhtml: false, - as_xml: false, - as_html: true, - non_significant_whitespace: false, - }; - let html = document.to_string_with_options(options); - article.html = Some(html); - - Ok(article) + unimplemented!() } - - async fn parse_pages( - &self, - article: &mut Article, - url: &url::Url, - root: &mut Node, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - client: &Client, - ) -> Result<(), ScraperError> { - let headers = Util::generate_headers(config, global_config)?; - let html = ArticleScraper::download(url, client, headers).await?; - let mut document = Self::parse_html(html, config, global_config)?; - let mut xpath_ctx = Self::get_xpath_ctx(&document)?; - - // check for single page link - let rule = Util::select_rule( - config.and_then(|c| c.single_page_link.as_deref()), - global_config.single_page_link.as_deref(), - ); - if let Some(xpath_single_page_link) = rule { - debug!( - "Single page link 
xpath specified in config '{}'", - xpath_single_page_link - ); - - if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, xpath_single_page_link) { - // parse again with single page url - debug!("Single page link found '{}'", single_page_url); - - return self - .parse_single_page( - article, - &single_page_url, - root, - config, - global_config, - client, - ) - .await; - } - } - - ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article); - ArticleScraper::strip_junk(&xpath_ctx, config, global_config, url); - ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?; - - while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { - let headers = Util::generate_headers(config, global_config)?; - let html = ArticleScraper::download(&url, client, headers).await?; - document = Self::parse_html(html, config, global_config)?; - xpath_ctx = Self::get_xpath_ctx(&document)?; - ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url); - ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?; - } - - Ok(()) - } - - fn parse_html( - html: String, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - ) -> Result { - // replace matches in raw html - - let mut html = html; - if let Some(config) = config { - for replace in &config.replace { - html = html.replace(&replace.to_replace, &replace.replace_with); - } - } - - for replace in &global_config.replace { - html = html.replace(&replace.to_replace, &replace.replace_with); - } - - // parse html - let parser = Parser::default_html(); - Ok(parser.parse_string(html.as_str()).map_err(|err| { - error!("Parsing HTML failed for downloaded HTML {:?}", err); - ScraperErrorKind::Xml - })?) - } - - fn get_xpath_ctx(doc: &Document) -> Result { - Ok(Context::new(doc).map_err(|()| { - error!("Creating xpath context failed for downloaded HTML"); - ScraperErrorKind::Xml - })?) 
- } - - async fn parse_single_page( - &self, - article: &mut Article, - url: &url::Url, - root: &mut Node, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - client: &Client, - ) -> Result<(), ScraperError> { - let headers = Util::generate_headers(config, global_config)?; - let html = ArticleScraper::download(url, client, headers).await?; - let document = Self::parse_html(html, config, global_config)?; - let xpath_ctx = Self::get_xpath_ctx(&document)?; - ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article); - ArticleScraper::strip_junk(&xpath_ctx, config, global_config, url); - ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?; - - Ok(()) - } - - async fn download( - url: &url::Url, - client: &Client, - headers: HeaderMap, - ) -> Result { - let response = client - .get(url.as_str()) - .headers(headers) - .send() - .await - .map_err(|err| { - error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); - err - }) - .context(ScraperErrorKind::Http)?; - - if response.status().is_success() { - let headers = response.headers().clone(); - let text = response.text().await.context(ScraperErrorKind::Http)?; - { - if let Some(decoded_html) = ArticleScraper::decode_html( - &text, - ArticleScraper::get_encoding_from_html(&text), - ) { - return Ok(decoded_html); - } - - if let Some(decoded_html) = ArticleScraper::decode_html( - &text, - ArticleScraper::get_encoding_from_http_header(&headers), - ) { - return Ok(decoded_html); - } - } - - warn!("No encoding of HTML detected - assuming utf-8"); - return Ok(text); - } - - Err(ScraperErrorKind::Http.into()) - } - - fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { - if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) { - if let Ok(content_type) = content_type.to_str() { - let regex = - regex::Regex::new(r#"charset=([^"']+)"#).expect("Failed to parse regex"); - if let Some(captures) = regex.captures(content_type) { - if let Some(regex_match) = captures.get(1) { - return Some(regex_match.as_str()); - } - } - } - } - None - } - - fn get_encoding_from_html(html: &str) -> Option<&str> { - let regex = - regex::Regex::new(r#") -> Option { - if let Some(encoding) = encoding { - if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) { - let (decoded_html, _, invalid_chars) = encoding.decode(html.as_bytes()); - - if !invalid_chars { - return Some(decoded_html.into_owned()); - } - } - warn!("Could not decode HTML. Encoding: '{}'", encoding); - } - None - } - - fn get_host_name(url: &url::Url) -> Result { - match url.host_str() { - Some(name) => { - let mut name = name; - if name.starts_with("www.") && name.len() > 4 { - name = &name[4..] 
- } - Ok(name.into()) - } - None => { - error!("Getting config failed due to bad Url"); - Err(ScraperErrorKind::Config.into()) - } - } - } - - fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> { - let conf = Self::get_host_name(url) - .ok() - .map(|url| url + ".txt") - .and_then(|name| self.config_files.get(&name)); - - if conf.is_none() { - log::warn!("No config found for url '{}'", url); - } - - conf - } - - fn fix_lazy_images( - context: &Context, - class: &str, - property_url: &str, - ) -> Result<(), ScraperError> { - let xpath = &format!("//img[contains(@class, '{}')]", class); - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if let Some(correct_url) = node.get_property(property_url) { - if node.set_property("src", &correct_url).is_err() { - return Err(ScraperErrorKind::Xml.into()); - } - } - } - Ok(()) - } - - fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), ScraperError> { - let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if let Some(mut parent) = node.get_parent() { - if let Ok(mut video_wrapper) = parent.new_child(None, "div") { - if let Ok(()) = video_wrapper.set_property("class", "videoWrapper") { - if let Ok(()) = node.set_property("width", "100%") { - if let Ok(()) = node.set_property("height", "100%") { - node.unlink(); - video_wrapper.add_child(&mut node).map_err(|_| { - error!("Failed to add iframe as child of video wrapper
"); - ScraperErrorKind::Xml - })?; - } - } - } - } - - error!("Failed to add video wrapper
as parent of iframe"); - return Err(ScraperErrorKind::Xml.into()); - } - - error!("Failed to get parent of iframe"); - // return Err(ScraperErrorKind::Xml.into()); - } - Ok(()) - } - - fn remove_attribute( - context: &Context, - tag: Option<&str>, - attribute: &str, - ) -> Result<(), ScraperError> { - let xpath_tag = tag.unwrap_or("*"); - - let xpath = &format!("//{}[@{}]", xpath_tag, attribute); - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if node.remove_property(attribute).is_err() { - return Err(ScraperErrorKind::Xml.into()); - } - } - Ok(()) - } - - fn add_attribute( - context: &Context, - tag: Option<&str>, - attribute: &str, - value: &str, - ) -> Result<(), ScraperError> { - let xpath_tag = tag.unwrap_or("*"); - - let xpath = &format!("//{}", xpath_tag); - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if node.set_attribute(attribute, value).is_err() { - return Err(ScraperErrorKind::Xml.into()); - } - } - Ok(()) - } - - fn get_attribute( - context: &Context, - xpath: &str, - attribute: &str, - ) -> Result { - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for node in node_vec { - if let Some(value) = node.get_attribute(attribute) { - return Ok(value); - } - } - - Err(ScraperErrorKind::Xml.into()) - } - - fn repair_urls( - context: &Context, - xpath: &str, - attribute: &str, - article_url: &url::Url, - ) -> Result<(), ScraperError> { - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if let Some(val) = node.get_attribute(attribute) { - if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) { - if let Ok(fixed_url) = ArticleScraper::complete_url(article_url, &val) { - if node.set_attribute(attribute, fixed_url.as_str()).is_err() { - return Err(ScraperErrorKind::Scrape.into()); - } - } - } - } - } - Ok(()) - } - - fn complete_url( - article_url: &url::Url, - incomplete_url: &str, - ) -> Result { - let mut completed_url = article_url.scheme().to_owned(); - completed_url.push(':'); - - if !incomplete_url.starts_with("//") { - match article_url.host() { - Some(url::Host::Domain(host)) => { - completed_url.push_str("//"); - completed_url.push_str(host); - } - _ => return Err(ScraperErrorKind::Scrape.into()), - }; - } - - if !completed_url.ends_with('/') && !incomplete_url.starts_with('/') { - completed_url.push('/'); - } - completed_url.push_str(incomplete_url); - let url = url::Url::parse(&completed_url).context(ScraperErrorKind::Url)?; - Ok(url) - } - - fn strip_junk( - context: &Context, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - url: &url::Url, - ) { - // strip specified xpath - if let Some(config) = config { - for xpath_strip in &config.xpath_strip { - let _ = Util::strip_node(context, xpath_strip); - } - } - - for xpath_strip in &global_config.xpath_strip { - let _ = Util::strip_node(context, xpath_strip); - } - - // strip everything with specified 'id' or 'class' - if let Some(config) = config { - for xpaht_strip_class in &config.strip_id_or_class { - let _ = Util::strip_id_or_class(context, xpaht_strip_class); - } - } - - for xpaht_strip_class in &global_config.strip_id_or_class { - let _ = Util::strip_id_or_class(context, xpaht_strip_class); - } - - // strip any element where @src attribute contains this substring - if let Some(config) = config { - for xpath_strip_img_src in &config.strip_image_src { - let _ = Util::strip_node( - context, - &format!("//img[contains(@src,'{}')]", 
xpath_strip_img_src), - ); - } - } - - for xpath_strip_img_src in &global_config.strip_image_src { - let _ = Util::strip_node( - context, - &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), - ); - } - - let _ = ArticleScraper::fix_lazy_images(context, "lazyload", "data-src"); - let _ = ArticleScraper::fix_iframe_size(context, "youtube.com"); - let _ = ArticleScraper::remove_attribute(context, None, "style"); - let _ = ArticleScraper::remove_attribute(context, Some("a"), "onclick"); - let _ = ArticleScraper::remove_attribute(context, Some("img"), "srcset"); - let _ = ArticleScraper::remove_attribute(context, Some("img"), "sizes"); - let _ = ArticleScraper::add_attribute(context, Some("a"), "target", "_blank"); - - let _ = ArticleScraper::repair_urls(context, "//img", "src", url); - let _ = ArticleScraper::repair_urls(context, "//a", "src", url); - let _ = ArticleScraper::repair_urls(context, "//a", "href", url); - let _ = ArticleScraper::repair_urls(context, "//object", "data", url); - let _ = ArticleScraper::repair_urls(context, "//iframe", "src", url); - - // strip elements using Readability.com and Instapaper.com ignore class names - // .entry-unrelated and .instapaper_ignore - // See http://blog.instapaper.com/post/730281947 - let _ = Util::strip_node( - context, - "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]", - ); - - // strip elements that contain style="display: none;" - let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); - - // strip all comments - let _ = Util::strip_node(context, "//comment()"); - - // strip all empty url-tags - let _ = Util::strip_node(context, "//a[not(node())]"); - - // strip all external css and fonts - let _ = Util::strip_node(context, "//*[@type='text/css']"); - } - - fn extract_metadata( - context: &Context, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - article: &mut Article, - ) { - // try to get title - if let Some(config) = config { - for xpath_title in &config.xpath_title { - if let Ok(title) = Util::extract_value_merge(context, xpath_title) { - debug!("Article title: '{}'", title); - article.title = Some(title); - break; - } - } - } - - if article.title.is_none() { - for xpath_title in &global_config.xpath_title { - if let Ok(title) = Util::extract_value_merge(context, xpath_title) { - debug!("Article title: '{}'", title); - article.title = Some(title); - break; - } - } - } - - // try to get the author - if let Some(config) = config { - for xpath_author in &config.xpath_author { - if let Ok(author) = Util::extract_value(context, xpath_author) { - debug!("Article author: '{}'", author); - article.author = Some(author); - break; - } - } - } - - if article.author.is_none() { - for xpath_author in &global_config.xpath_author { - if let Ok(author) = Util::extract_value(context, xpath_author) { - debug!("Article author: '{}'", author); - article.author = Some(author); - break; - } - } - } - - // try to get the date - if let Some(config) = config { - for xpath_date in &config.xpath_date { - if let Ok(date_string) = Util::extract_value(context, xpath_date) { - debug!("Article date: '{}'", date_string); - if let Ok(date) = DateTime::from_str(&date_string) { - article.date = Some(date); - break; - } else { - warn!("Parsing the date string '{}' failed", date_string); - } - } - } - } - - if article.date.is_none() { - for xpath_date in &global_config.xpath_date { - if let Ok(date_string) = Util::extract_value(context, xpath_date) { - debug!("Article date: '{}'", 
date_string); - if let Ok(date) = DateTime::from_str(&date_string) { - article.date = Some(date); - break; - } else { - warn!("Parsing the date string '{}' failed", date_string); - } - } - } - } - } - - fn extract_body( - context: &Context, - root: &mut Node, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - ) -> Result<(), ScraperError> { - let mut found_something = false; - - if let Some(config) = config { - for xpath_body in &config.xpath_body { - found_something = ArticleScraper::extract_body_single(context, root, xpath_body)?; - } - } - - if !found_something { - for xpath_body in &global_config.xpath_body { - found_something = ArticleScraper::extract_body_single(context, root, xpath_body)?; - } - } - - if !found_something { - log::error!("no body found"); - return Err(ScraperErrorKind::Scrape.into()); - } - - Ok(()) - } - - fn extract_body_single( - context: &Context, - root: &mut Node, - xpath: &str, - ) -> Result { - let mut found_something = false; - { - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if node.get_property("style").is_some() && node.remove_property("style").is_err() { - return Err(ScraperErrorKind::Xml.into()); - } - - node.unlink(); - if root.add_child(&mut node).is_ok() { - found_something = true; - } else { - error!("Failed to add body to prepared document"); - return Err(ScraperErrorKind::Xml.into()); - } - } - } - - Ok(found_something) - } - - fn check_for_next_page( - &self, - context: &Context, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - ) -> Option { - if let Some(config) = config { - if let Some(next_page_xpath) = config.next_page_link.as_deref() { - if let Ok(next_page_string) = - ArticleScraper::get_attribute(context, next_page_xpath, "href") - { - if let Ok(next_page_url) = url::Url::parse(&next_page_string) { - return Some(next_page_url); - } - } - } - } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() { - if let Ok(next_page_string) = - ArticleScraper::get_attribute(context, next_page_xpath, "href") - { - if let Ok(next_page_url) = url::Url::parse(&next_page_string) { - return Some(next_page_url); - } - } - } - - // last page reached - None - } - - fn generate_head(root: &mut Node, document: &Document) -> Result<(), ScraperError> { - if let Ok(mut head_node) = Node::new("head", None, document) { - if let Ok(()) = root.add_prev_sibling(&mut head_node) { - if let Ok(mut meta) = head_node.new_child(None, "meta") { - if meta.set_property("charset", "utf-8").is_ok() { - return Ok(()); - } - } - } - } - - Err(ScraperErrorKind::Xml.into()) - } - - fn prevent_self_closing_tags(context: &Context) -> Result<(), ScraperError> { - // search document for empty tags and add a empty text node as child - // this prevents libxml from self closing non void elements such as iframe - - let xpath = "//*[not(node())]"; - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if node.get_name() == "meta" { - continue; - } - - let _ = node.add_text_child(None, "empty", ""); - } - - Ok(()) - } -} +} \ No newline at end of file diff --git a/src/util.rs b/src/util.rs index fdfee6f..c29a7bd 100644 --- a/src/util.rs +++ b/src/util.rs @@ -6,9 +6,9 @@ use reqwest::{ }; use tokio::fs::DirEntry; -use crate::{ +use crate::full_text_parser::{ config::ConfigEntry, - error::{ScraperError, ScraperErrorKind}, + error::{FullTextParserError, FullTextParserErrorKind}, }; pub struct Util; @@ -49,28 +49,28 @@ impl Util { pub fn generate_headers( 
        site_specific_rule: Option<&ConfigEntry>,
         global_rule: &ConfigEntry,
-    ) -> Result<HeaderMap, ScraperError> {
+    ) -> Result<HeaderMap, FullTextParserError> {
         let mut headers = HeaderMap::new();
 
         if let Some(config) = site_specific_rule {
             for header in &config.header {
                 let name = HeaderName::from_bytes(header.name.as_bytes())
-                    .context(ScraperErrorKind::Config)?;
+                    .context(FullTextParserErrorKind::Config)?;
                 let value = header
                     .value
                     .parse::<HeaderValue>()
-                    .context(ScraperErrorKind::Config)?;
+                    .context(FullTextParserErrorKind::Config)?;
                 headers.insert(name, value);
             }
         }
 
         for header in &global_rule.header {
             let name =
-                HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+                HeaderName::from_bytes(header.name.as_bytes()).context(FullTextParserErrorKind::Config)?;
             let value = header
                 .value
                 .parse::<HeaderValue>()
-                .context(ScraperErrorKind::Config)?;
+                .context(FullTextParserErrorKind::Config)?;
             headers.insert(name, value);
         }
 
@@ -102,10 +102,10 @@ impl Util {
         xpath_ctx: &Context,
         xpath: &str,
         thorw_if_empty: bool,
-    ) -> Result<Vec<Node>, ScraperError> {
+    ) -> Result<Vec<Node>, FullTextParserError> {
         let res = xpath_ctx.evaluate(xpath).map_err(|()| {
             log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
-            ScraperErrorKind::Xml
+            FullTextParserErrorKind::Xml
         })?;
 
         let node_vec = res.get_nodes_as_vec();
@@ -113,14 +113,14 @@
         if node_vec.is_empty() {
             log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
             if thorw_if_empty {
-                return Err(ScraperErrorKind::Xml.into());
+                return Err(FullTextParserErrorKind::Xml.into());
             }
         }
 
         Ok(node_vec)
     }
 
-    pub fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
+    pub fn check_content_type(response: &Response) -> Result<bool, FullTextParserError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                 if let Ok(content_type) = content_type.to_str() {
@@ -135,7 +135,7 @@ impl Util {
         }
 
         log::error!("Failed to determine content type");
-        Err(ScraperErrorKind::Http.into())
+        Err(FullTextParserErrorKind::Http.into())
     }
 
     pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
@@ -149,16 +149,16 @@ impl Util {
         None
     }
 
-    pub fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+    pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         if let Some(val) = node_vec.get(0) {
             return Ok(val.get_content());
         }
-        Err(ScraperErrorKind::Xml.into())
+        Err(FullTextParserErrorKind::Xml.into())
     }
 
-    pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+    pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
         let node_vec = Util::evaluate_xpath(context, xpath, true)?;
         let mut val = String::new();
         for node in node_vec {
@@ -174,7 +174,7 @@ impl Util {
         Ok(val.trim().to_string())
     }
 
-    pub fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
+    pub fn strip_node(context: &Context, xpath: &str) -> Result<(), FullTextParserError> {
         let mut ancestor = xpath.to_string();
         if ancestor.starts_with("//") {
             ancestor = ancestor.chars().skip(2).collect();
@@ -188,7 +188,7 @@ impl Util {
         Ok(())
     }
 
-    pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
+    pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), FullTextParserError> {
         let xpath = &format!(
             "//*[contains(@class, '{}') or contains(@id, '{}')]",
             id_or_class, id_or_class
diff --git a/src/youtube.rs b/src/youtube.rs
deleted file mode 100644
index bd2c12e..0000000
--- a/src/youtube.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use
crate::article::Article; -use crate::ArticleScraper; - -pub struct Youtube; - -impl Youtube { - pub fn handle(url: &url::Url) -> Option
{ - let host_name = match ArticleScraper::get_host_name(url) { - Ok(host_name) => host_name, - Err(_) => return None, - }; - if &host_name == "youtube.com" { - let regex = - regex::Regex::new(r#"youtube\.com/watch\?v=(.*)"#).expect("Failed to parse regex"); - if let Some(captures) = regex.captures(url.as_str()) { - if let Some(video_id) = captures.get(1) { - let html = format!("", video_id.as_str()); - - return Some(Article { - title: None, - date: None, - author: None, - url: url.clone(), - html: Some(html), - }); - } - } - } - - None - } -}