pub mod config; pub mod error; mod fingerprints; mod metadata; mod readability; #[cfg(test)] mod tests; use self::config::{ConfigCollection, ConfigEntry}; use self::error::FullTextParserError; use self::readability::Readability; use crate::article::Article; use crate::constants; use crate::util::Util; use encoding_rs::Encoding; use fingerprints::Fingerprints; use libxml::parser::Parser; use libxml::tree::{Document, Node, NodeType}; use libxml::xpath::Context; use reqwest::header::HeaderMap; use reqwest::{Client, Url}; use std::path::Path; use std::str::from_utf8; pub struct FullTextParser { config_files: ConfigCollection, } impl FullTextParser { pub async fn new(config_path: Option<&Path>) -> Self { let config_files = ConfigCollection::parse(config_path).await; Self { config_files } } pub async fn parse( &self, url: &url::Url, client: &Client, ) -> Result { libxml::tree::node::set_node_rc_guard(10); log::debug!("Scraping article: '{url}'"); // check if we have a config for the url let config = self.get_grabber_config(url); let global_config = self .config_files .get("global.txt") .ok_or(FullTextParserError::Config)?; let headers = Util::generate_headers(config, global_config)?; let response = client .head(url.clone()) .headers(headers) .send() .await .map_err(|error| { log::error!("Failed head request to: '{url}' - '{error}'"); FullTextParserError::Http })?; // check if url redirects and we need to pick up the new url let url = if let Some(new_url) = Util::check_redirect(&response, url) { log::debug!("Url '{url}' redirects to '{new_url}'"); new_url } else { url.clone() }; // check if we are dealing with text/html if !Util::check_content_type(&response)? { return Err(FullTextParserError::ContentType); } let mut article = Article { title: None, author: None, url: url.clone(), date: None, thumbnail_url: None, document: None, }; let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?; document.set_root_element(&root); Self::generate_head(&mut root, &document)?; let headers = Util::generate_headers(config, global_config)?; let html = Self::download(&url, client, headers).await?; // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { self.get_grabber_config(&url) } else { config } } else { config }; self.parse_pages( &mut article, &html, &mut root, config, global_config, client, ) .await?; let context = Context::new(&document).map_err(|()| { log::error!("Failed to create xpath context for extracted article"); FullTextParserError::Xml })?; if let Err(error) = Self::prevent_self_closing_tags(&context) { log::error!("Preventing self closing tags failed - '{error}'"); return Err(error); } Self::post_process_document(&document)?; article.document = Some(document); Ok(article) } async fn parse_pages( &self, article: &mut Article, html: &str, root: &mut Node, config: Option<&ConfigEntry>, global_config: &ConfigEntry, client: &Client, ) -> Result<(), FullTextParserError> { let mut document = Self::parse_html(html, config, global_config)?; let mut xpath_ctx = Self::get_xpath_ctx(&document)?; // check for single page link let rule = Util::select_rule( config.and_then(|c| c.single_page_link.as_deref()), global_config.single_page_link.as_deref(), ); if let Some(xpath_single_page_link) = rule { log::debug!( "Single page link xpath specified in config '{}'", xpath_single_page_link ); if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, xpath_single_page_link) { // parse again with single page url log::debug!("Single page link found '{}'", single_page_url); if let Err(error) = self .parse_single_page( article, &single_page_url, root, config, global_config, client, ) .await { log::warn!("Single Page parsing: {error}"); log::info!("Continuing with regular parser."); } } } metadata::extract(&xpath_ctx, config, Some(global_config), article); if article.thumbnail_url.is_none() { Self::check_for_thumbnail(&xpath_ctx, article); } Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document); let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; if !found_body { if let Err(error) = Readability::extract_body(document, root, article.title.as_deref()) { log::error!("Both ftr and readability failed to find content: {error}"); return Err(error); } } while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { log::debug!(""); let headers = Util::generate_headers(config, global_config)?; let html = Self::download(&url, client, headers).await?; document = Self::parse_html(&html, config, global_config)?; xpath_ctx = Self::get_xpath_ctx(&document)?; Self::prep_content(&xpath_ctx, config, global_config, &url, &document); let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; if !found_body { if let Err(error) = Readability::extract_body(document, root, article.title.as_deref()) { log::error!("Both ftr and readability failed to find content: {error}"); return Err(error); } } } Ok(()) } fn parse_html( html: &str, config: Option<&ConfigEntry>, global_config: &ConfigEntry, ) -> Result { // replace matches in raw html let mut html = html.to_owned(); if let Some(config) = config { for replace in &config.replace { html = html.replace(&replace.to_replace, &replace.replace_with); } } for replace in &global_config.replace { html = html.replace(&replace.to_replace, &replace.replace_with); } // parse html let parser = Parser::default_html(); parser.parse_string(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) } fn get_xpath_ctx(doc: &Document) -> Result { Context::new(doc).map_err(|()| { log::error!("Creating xpath context failed for downloaded HTML"); FullTextParserError::Xml }) } async fn parse_single_page( &self, article: &mut Article, url: &url::Url, root: &mut Node, config: Option<&ConfigEntry>, global_config: &ConfigEntry, client: &Client, ) -> Result<(), FullTextParserError> { let headers = Util::generate_headers(config, global_config)?; let html = Self::download(url, client, headers).await?; let document = Self::parse_html(&html, config, global_config)?; let xpath_ctx = Self::get_xpath_ctx(&document)?; metadata::extract(&xpath_ctx, config, Some(global_config), article); Self::check_for_thumbnail(&xpath_ctx, article); Self::prep_content(&xpath_ctx, config, global_config, url, &document); Self::extract_body(&xpath_ctx, root, config, global_config)?; Ok(()) } async fn download( url: &url::Url, client: &Client, headers: HeaderMap, ) -> Result { let response = client .get(url.as_str()) .headers(headers) .send() .await .map_err(|err| { log::error!( "Downloading HTML failed: GET '{}' - '{}'", url.as_str(), err ); FullTextParserError::Http })?; if response.status().is_success() { let headers = response.headers().clone(); let bytes = response .bytes() .await .map_err(|_| FullTextParserError::Http)?; match from_utf8(&bytes) { Ok(utf8_str) => { log::debug!("Valid utf-8 string"); return Ok(utf8_str.into()); } Err(error) => { log::debug!("Invalid utf-8 string"); let lossy_string = std::string::String::from_utf8_lossy(&bytes); if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { log::debug!("Encoding extracted from HTML: '{}'", encoding); if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { return Ok(decoded_html); } } if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { log::debug!("Encoding extracted from headers: '{}'", encoding); if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { return Ok(decoded_html); } } return Err(FullTextParserError::Utf8(error)); } } } Err(FullTextParserError::Http) } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { headers .get(reqwest::header::CONTENT_TYPE) .and_then(|header| header.to_str().ok()) .and_then(|content_type| { regex::Regex::new(r#"charset=([^"']+)"#) .expect("Failed to parse regex") .captures(content_type) }) .and_then(|captures| captures.get(1)) .map(|regex_match| regex_match.as_str()) } fn get_encoding_from_html(html: &str) -> Option<&str> { let regex = regex::Regex::new(r#" Option { if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) { let (decoded_html, _, invalid_chars) = encoding.decode(bytes); if !invalid_chars { return Some(decoded_html.into_owned()); } } log::warn!("Could not decode HTML. Encoding: '{}'", encoding); None } fn get_host_name(url: &url::Url) -> Result { match url.host_str() { Some(name) => { let mut name = name; if name.starts_with("www.") && name.len() > 4 { name = &name[4..] } Ok(name.into()) } None => { log::error!("Getting config failed due to bad Url"); Err(FullTextParserError::Config) } } } fn get_grabber_config(&self, url: &url::Url) -> Option<&ConfigEntry> { let conf = Self::get_host_name(url) .ok() .map(|url| url + ".txt") .and_then(|name| self.config_files.get(&name)); if conf.is_none() { log::warn!("No config found for url '{}'", url); } conf } fn check_for_thumbnail(context: &Context, article: &mut Article) { if let Ok(thumb) = Util::get_attribute( context, "//meta[contains(@name, 'twitter:image')]", "content", ) { article.thumbnail_url = Some(thumb); return; } if let Ok(thumb) = Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content") { article.thumbnail_url = Some(thumb); return; } if let Ok(thumb) = Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { article.thumbnail_url = Some(thumb); } } fn fix_lazy_images( context: &Context, class: &str, property_url: &str, ) -> Result<(), FullTextParserError> { let xpath = &format!("//img[contains(@class, '{}')]", class); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node .get_property(property_url) .and_then(|correct_url| node.set_property("src", &correct_url).ok()) .is_none() { log::warn!("Failed to fix lazy loading image"); } } Ok(()) } fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> { let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { let video_wrapper = node .get_parent() .and_then(|mut parent| parent.new_child(None, "div").ok()); if let Some(mut video_wrapper) = video_wrapper { let success = video_wrapper .set_property("class", "videoWrapper") .ok() .and_then(|()| node.set_property("width", "100%").ok()) .and_then(|()| node.set_property("height", "100%").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) }) .is_err(); if !success { log::warn!("Failed to add iframe as child of video wrapper
"); } } else { log::warn!("Failed to get parent of iframe"); } } Ok(()) } fn remove_attribute( context: &Context, tag: Option<&str>, attribute: &str, ) -> Result<(), FullTextParserError> { let xpath_tag = tag.unwrap_or("*"); let xpath = &format!("//{}[@{}]", xpath_tag, attribute); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Err(err) = node.remove_property(attribute) { log::warn!( "Failed to remove attribute '{}' from node: {}", attribute, err ); } } Ok(()) } fn add_attribute( context: &Context, tag: Option<&str>, attribute: &str, value: &str, ) -> Result<(), FullTextParserError> { let xpath_tag = tag.unwrap_or("*"); let xpath = &format!("//{}", xpath_tag); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Err(err) = node.set_attribute(attribute, value) { log::warn!("Failed to set attribute '{}' on node: {}", attribute, err); } } Ok(()) } fn repair_urls( context: &Context, xpath: &str, attribute: &str, article_url: &url::Url, document: &Document, ) -> Result<(), FullTextParserError> { let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Some(url) = node.get_attribute(attribute) { let trimmed_url = url.trim(); let is_relative_url = url::Url::parse(&url) .err() .map(|err| err == url::ParseError::RelativeUrlWithoutBase) .unwrap_or(false); let is_javascript = trimmed_url.contains("javascript:"); if is_relative_url { let completed_url = match article_url.join(trimmed_url) { Ok(joined_url) => joined_url, Err(_) => continue, }; _ = node.set_attribute(attribute, completed_url.as_str()); } else if is_javascript { // if the link only contains simple text content, it can be converted to a text node let mut child_nodes = node.get_child_nodes(); let child_count = child_nodes.len(); let first_child_is_text = child_nodes .first() .and_then(|n| n.get_type()) .map(|t| t == NodeType::TextNode) .unwrap_or(false); if let Some(mut parent) = node.get_parent() { let new_node = if child_count == 1 && first_child_is_text { let link_content = node.get_content(); Node::new_text(&link_content, document) .expect("Failed to create new text node") } else { let mut container = Node::new("span", None, document) .expect("Failed to create new span container node"); for mut child in child_nodes.drain(..) { child.unlink(); _ = container.add_child(&mut child); } container }; _ = parent.replace_child_node(new_node, node); } } else if let Ok(parsed_url) = Url::parse(trimmed_url) { _ = node.set_attribute(attribute, parsed_url.as_str()); } else { _ = node.set_attribute(attribute, trimmed_url); }; } } Ok(()) } fn fix_urls(context: &Context, url: &Url, document: &Document) { _ = Self::repair_urls(context, "//img", "src", url, document); _ = Self::repair_urls(context, "//a", "src", url, document); _ = Self::repair_urls(context, "//a", "href", url, document); _ = Self::repair_urls(context, "//object", "data", url, document); _ = Self::repair_urls(context, "//iframe", "src", url, document); } fn prep_content( context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry, url: &Url, document: &Document, ) { // replace H1 with H2 as H1 should be only title that is displayed separately if let Ok(h1_nodes) = Util::evaluate_xpath(context, "//h1", false) { for mut h1_node in h1_nodes { _ = h1_node.set_name("h2"); } } _ = Util::mark_data_tables(context); // strip specified xpath if let Some(config) = config { for xpath_strip in &config.xpath_strip { _ = Util::strip_node(context, xpath_strip); } } for xpath_strip in &global_config.xpath_strip { _ = Util::strip_node(context, xpath_strip); } // strip everything with specified 'id' or 'class' if let Some(config) = config { for xpaht_strip_class in &config.strip_id_or_class { _ = Util::strip_id_or_class(context, xpaht_strip_class); } } for xpaht_strip_class in &global_config.strip_id_or_class { _ = Util::strip_id_or_class(context, xpaht_strip_class); } // strip any element where @src attribute contains this substring if let Some(config) = config { for xpath_strip_img_src in &config.strip_image_src { _ = Util::strip_node( context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); } } for xpath_strip_img_src in &global_config.strip_image_src { _ = Util::strip_node( context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); } _ = Self::unwrap_noscript_images(context); _ = Self::fix_lazy_images(context, "lazyload", "data-src"); _ = Self::fix_iframe_size(context, "youtube.com"); _ = Self::remove_attribute(context, Some("a"), "onclick"); _ = Self::remove_attribute(context, Some("img"), "srcset"); _ = Self::remove_attribute(context, Some("img"), "sizes"); _ = Self::add_attribute(context, Some("a"), "target", "_blank"); // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See http://blog.instapaper.com/post/730281947 _ = Util::strip_node( context, "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]", ); // strip elements that contain style="display: none;" _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); _ = Util::strip_node(context, "//*[contains(@style,'display: none')]"); _ = Self::remove_attribute(context, None, "style"); // strip all input elements _ = Util::strip_node(context, "//form"); _ = Util::strip_node(context, "//input"); _ = Util::strip_node(context, "//textarea"); _ = Util::strip_node(context, "//select"); _ = Util::strip_node(context, "//button"); // strip all comments _ = Util::strip_node(context, "//comment()"); // strip all scripts _ = Util::strip_node(context, "//script"); // strip all styles _ = Util::strip_node(context, "//style"); // strip all empty url-tags _ = Util::strip_node(context, "//a[not(node())]"); // strip all external css and fonts _ = Util::strip_node(context, "//*[@type='text/css']"); // other junk _ = Util::strip_node(context, "//iframe"); _ = Util::strip_node(context, "//object"); _ = Util::strip_node(context, "//embed"); _ = Util::strip_node(context, "//footer"); _ = Util::strip_node(context, "//link"); _ = Util::strip_node(context, "//aside"); Self::fix_urls(context, url, document); } /** * Find all