#[macro_use]
mod macros;
mod config;
mod error;
mod article;
pub mod images;

use reqwest;
use url;
use regex;
use log::{
    error,
    debug,
    info,
    warn,
};
use crate::article::Article;
use libxml::parser::Parser;
use libxml::xpath::Context;
use libxml::tree::{
    Document,
    Node,
};
use std::path::PathBuf;
use std::ops::Index;
use failure::ResultExt;
use std::error::Error;
use crate::config::{
    GrabberConfig,
    ConfigCollection,
};
use encoding_rs::{
    Encoding,
};
use chrono::NaiveDateTime;
use std::str::FromStr;
use crate::images::ImageDownloader;
use self::error::{
    ScraperError,
    ScraperErrorKind,
};

pub struct ArticleScraper {
    pub image_downloader: ImageDownloader,
    config_files: ConfigCollection,
    client: reqwest::Client,
}

impl ArticleScraper {
    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
        let config_files = GrabberConfig::parse_directory(&config_path)
            .context(ScraperErrorKind::Config)?;

        Ok(ArticleScraper {
            image_downloader: ImageDownloader::new((2048, 2048)),
            config_files,
            client: reqwest::Client::new(),
        })
    }

    pub fn parse(&self, url: url::Url, download_images: bool) -> Result<Article, ScraperError> {
        info!("Scraping article: {}", url.as_str());

        let response = self.client.head(url.clone()).send()
            .map_err(|err| {
                error!("Failed head request to: {} - {}", url.as_str(), err.description());
                err
            })
            .context(ScraperErrorKind::Http)?;

        // check if url redirects and we need to pick up the new url
        let mut url = url;
        if let Some(new_url) = ArticleScraper::check_redirect(&response) {
            debug!("Url {} redirects to {}", url.as_str(), new_url.as_str());
            url = new_url;
        }

        // check if we are dealing with text/html
        if !ArticleScraper::check_content_type(&response)? {
            return Err(ScraperErrorKind::ContentType)?
        }

        // check if we have a config for the url
        let config = self.get_grabber_config(&url)?;

        let mut article = Article {
            title: None,
            author: None,
            url: url.clone(),
            date: None,
            html: None,
        };

        // build an empty document with an <article> root node to collect the extracted content
        let mut document = Document::new().map_err(|()| {
            ScraperErrorKind::Xml
        })?;

        let mut root = Node::new("article", None, &document).map_err(|()| {
            ScraperErrorKind::Xml
        })?;

        document.set_root_element(&root);

        ArticleScraper::generate_head(&mut root, &document)?;

        self.parse_first_page(&mut article, &url, &mut root, config)?;

        let context = Context::new(&document).map_err(|()| {
            error!("Failed to create xpath context for extracted article");
            ScraperErrorKind::Xml
        })?;

        if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) {
            error!("Preventing self closing tags failed - {}", error);
            return Err(error)
        }

        if let Err(error) = ArticleScraper::eliminate_noscrip_tag(&context) {
            error!("Eliminating