diff --git a/src/article.rs b/src/article.rs index 1c6c160..dbe2738 100644 --- a/src/article.rs +++ b/src/article.rs @@ -1,7 +1,7 @@ use chrono::{DateTime, Utc}; +use std::fs::File; use std::io::{Error, ErrorKind, Write}; use std::path::PathBuf; -use std::fs::File; use url::Url; pub struct Article { @@ -28,6 +28,9 @@ impl Article { } } - Err(Error::new(ErrorKind::NotFound, "Article does not contain HTML")) + Err(Error::new( + ErrorKind::NotFound, + "Article does not contain HTML", + )) } } diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index a2bcd54..b41ac73 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -1,13 +1,14 @@ -pub mod error; pub mod config; +pub mod error; mod fingerprints; #[cfg(test)] mod tests; +use self::config::{ConfigCollection, ConfigEntry}; use self::error::{FullTextParserError, FullTextParserErrorKind}; use crate::article::Article; -use self::config::{ConfigCollection, ConfigEntry}; +use crate::util::Util; use chrono::DateTime; use encoding_rs::Encoding; use failure::ResultExt; @@ -20,7 +21,6 @@ use reqwest::header::HeaderMap; use reqwest::Client; use std::path::Path; use std::str::FromStr; -use crate::util::Util; pub struct FullTextParser { config_files: ConfigCollection, @@ -29,9 +29,7 @@ pub struct FullTextParser { impl FullTextParser { pub async fn new(config_path: Option<&Path>) -> Self { let config_files = ConfigCollection::parse(config_path).await; - Self { - config_files, - } + Self { config_files } } pub async fn parse( @@ -83,7 +81,8 @@ impl FullTextParser { }; let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?; - let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?; + let mut root = + Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?; document.set_root_element(&root); Self::generate_head(&mut root, &document)?; @@ -263,7 +262,10 @@ impl FullTextParser { if response.status().is_success() { let headers = response.headers().clone(); - let text = response.text().await.context(FullTextParserErrorKind::Http)?; + let text = response + .text() + .await + .context(FullTextParserErrorKind::Http)?; { if let Some(decoded_html) = Self::decode_html(&text, Self::get_encoding_from_html(&text)) diff --git a/src/lib.rs b/src/lib.rs index a426958..f5a90b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,21 @@ -pub mod images; mod article; -mod full_text_parser; -mod util; mod error; +mod full_text_parser; +pub mod images; +mod readability; +mod util; -use std::path::Path; use article::Article; -use full_text_parser::FullTextParser; use error::{ScraperError, ScraperErrorKind}; +use full_text_parser::FullTextParser; use images::ImageDownloader; +use readability::Readability; use reqwest::Client; +use std::path::Path; pub struct ArticleScraper { full_text_parser: FullTextParser, + readability: Readability, image_downloader: ImageDownloader, } @@ -20,6 +23,7 @@ impl ArticleScraper { pub async fn new(user_configs: Option<&Path>) -> Self { Self { full_text_parser: FullTextParser::new(user_configs).await, + readability: Readability::new(), image_downloader: ImageDownloader::new((2048, 2048)), } } @@ -30,7 +34,6 @@ impl ArticleScraper { download_images: bool, client: &Client, ) -> Result { - let res = self.full_text_parser.parse(url, client).await; if download_images { @@ -45,4 +48,4 @@ impl ArticleScraper { unimplemented!() } -} \ No newline at end of file +} diff --git a/src/readability/mod.rs b/src/readability/mod.rs new file mode 100644 index 0000000..80c9fb3 --- /dev/null +++ b/src/readability/mod.rs @@ -0,0 +1,7 @@ +pub struct Readability; + +impl Readability { + pub fn new() -> Self { + unimplemented!() + } +} diff --git a/src/util.rs b/src/util.rs index c29a7bd..c40e958 100644 --- a/src/util.rs +++ b/src/util.rs @@ -65,8 +65,8 @@ impl Util { } for header in &global_rule.header { - let name = - HeaderName::from_bytes(header.name.as_bytes()).context(FullTextParserErrorKind::Config)?; + let name = HeaderName::from_bytes(header.name.as_bytes()) + .context(FullTextParserErrorKind::Config)?; let value = header .value .parse::() @@ -158,7 +158,10 @@ impl Util { Err(FullTextParserErrorKind::Xml.into()) } - pub fn extract_value_merge(context: &Context, xpath: &str) -> Result { + pub fn extract_value_merge( + context: &Context, + xpath: &str, + ) -> Result { let node_vec = Util::evaluate_xpath(context, xpath, true)?; let mut val = String::new(); for node in node_vec { @@ -188,7 +191,10 @@ impl Util { Ok(()) } - pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), FullTextParserError> { + pub fn strip_id_or_class( + context: &Context, + id_or_class: &str, + ) -> Result<(), FullTextParserError> { let xpath = &format!( "//*[contains(@class, '{}') or contains(@id, '{}')]", id_or_class, id_or_class