mod article;
mod config;
mod error;
pub mod images;

use self::error::{ScraperError, ScraperErrorKind};
use crate::article::Article;
use crate::config::{ConfigCollection, GrabberConfig};
use crate::images::ImageDownloader;
use chrono::NaiveDateTime;
use encoding_rs::Encoding;
use failure::ResultExt;
use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context;
use log::{debug, error, info, warn};
use regex;
use reqwest::{Client, Response};
use std::collections;
use std::error::Error;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::{Arc, RwLock};
use std::thread;
use url;

pub struct ArticleScraper {
    pub image_downloader: ImageDownloader,
    config_files: Arc<RwLock<Option<ConfigCollection>>>,
}

impl ArticleScraper {
    pub fn new(config_path: PathBuf) -> Self {
        let config_files = Arc::new(RwLock::new(None));

        // Parse the grabber config directory on a background thread so that
        // constructing the scraper does not block on disk I/O. On failure the
        // cache is filled with an empty map instead of staying `None`.
        let locked_config_files = config_files.clone();
        thread::spawn(move || {
            if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
                locked_config_files
                    .write()
                    .expect("Failed to lock config file cache")
                    .replace(config_files);
            } else {
                locked_config_files
                    .write()
                    .expect("Failed to lock config file cache")
                    .replace(collections::HashMap::new());
            }
        });

        ArticleScraper {
            image_downloader: ImageDownloader::new((2048, 2048)),
            config_files,
        }
    }

    pub async fn parse(
        &self,
        url: url::Url,
        download_images: bool,
        client: &Client,
    ) -> Result<Article, ScraperError> {
        info!("Scraping article: '{}'", url.as_str());

        // Start with a HEAD request: enough to detect redirects and check the
        // content type without downloading the body yet.
        let response = client
            .head(url.clone())
            .send()
            .await
            .map_err(|err| {
                error!(
                    "Failed head request to: '{}' - '{}'",
                    url.as_str(),
                    err.description()
                );
                err
            })
            .context(ScraperErrorKind::Http)?;

        // check if url redirects and we need to pick up the new url
        let mut url = url;
        if let Some(new_url) = ArticleScraper::check_redirect(&response) {
            debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str());
            url = new_url;
        }

        // check if we are dealing with text/html
        if !ArticleScraper::check_content_type(&response)? {
            return Err(ScraperErrorKind::ContentType)?;
        }

        // check if we have a config for the url
        let config = self.get_grabber_config(&url)?;

        let mut article = Article {
            title: None,
            author: None,
            url: url.clone(),
            date: None,
            html: None,
        };

        // Build an empty document with a single <article> root that the
        // extracted pages get appended to.
        let mut document = Document::new().map_err(|()| ScraperErrorKind::Xml)?;
        let mut root =
            Node::new("article", None, &document).map_err(|()| ScraperErrorKind::Xml)?;
        document.set_root_element(&root);

        ArticleScraper::generate_head(&mut root, &document)?;

        self.parse_pages(&mut article, &url, &mut root, &config, client)
            .await?;

        let context = Context::new(&document).map_err(|()| {
            error!("Failed to create xpath context for extracted article");
            ScraperErrorKind::Xml
        })?;

        if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) {
            error!("Preventing self closing tags failed - '{}'", error);
            return Err(error);
        }

        // if let Err(error) = ArticleScraper::eliminate_noscript_tag(&context) {
        //     error!("Eliminating noscript tags failed - '{}'", error);
        //     return Err(error)
        // }
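
        // NOTE: a minimal usage sketch of this API, kept as a comment so the
        // truncated file above stays intact. The config path and URL are
        // hypothetical, and an async runtime (e.g. tokio) plus a
        // `reqwest::Client` are assumed; this is not part of the original
        // source:
        //
        //     let scraper = ArticleScraper::new(PathBuf::from("./ftr-site-config"));
        //     let client = Client::new();
        //     let article = scraper
        //         .parse(
        //             url::Url::parse("https://example.com/post").expect("valid url"),
        //             true,
        //             &client,
        //         )
        //         .await?;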