From c08f5afa5dca76e304f842de6f32265d78927c81 Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Tue, 13 Dec 2022 08:54:57 +0100
Subject: [PATCH] move stuff around

---
 src/article.rs                | 25 +++++++++++--
 src/full_text_parser/mod.rs   | 67 +++++++++++++++++------------------
 src/full_text_parser/tests.rs |  2 +-
 src/images/mod.rs             | 45 +++++++++++------------
 src/lib.rs                    | 19 +++++-----
 src/readability/mod.rs        |  7 ----
 6 files changed, 84 insertions(+), 81 deletions(-)
 delete mode 100644 src/readability/mod.rs

diff --git a/src/article.rs b/src/article.rs
index 73bef18..4882562 100644
--- a/src/article.rs
+++ b/src/article.rs
@@ -1,4 +1,5 @@
 use chrono::{DateTime, Utc};
+use libxml::tree::{Document, SaveOptions};
 use std::fs::File;
 use std::io::{Error, ErrorKind, Write};
 use std::path::PathBuf;
@@ -9,13 +10,31 @@ pub struct Article {
     pub author: Option<String>,
     pub url: Url,
     pub date: Option<DateTime<Utc>>,
-    pub html: Option<String>,
     pub thumbnail_url: Option<String>,
+    pub document: Option<Document>,
 }
 
 impl Article {
-    pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
-        if let Some(ref html) = self.html {
+    pub fn get_content(&self) -> Option<String> {
+        // serialize content
+        let options = SaveOptions {
+            format: false,
+            no_declaration: false,
+            no_empty_tags: true,
+            no_xhtml: false,
+            xhtml: false,
+            as_xml: false,
+            as_html: true,
+            non_significant_whitespace: false,
+        };
+        self.document
+            .as_ref()
+            .map(|doc| doc.to_string_with_options(options))
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
+        if let Some(ref html) = self.get_content() {
             if let Ok(()) = std::fs::create_dir_all(path) {
                 let mut file_name = match self.title.clone() {
                     Some(file_name) => file_name.replace('/', "_"),
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
index d109c63..651b711 100644
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@@ -13,7 +13,7 @@ use chrono::DateTime;
 use encoding_rs::Encoding;
 use fingerprints::Fingerprints;
 use libxml::parser::Parser;
-use libxml::tree::{Document, Node, SaveOptions};
+use libxml::tree::{Document, Node};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use reqwest::header::HeaderMap;
@@ -75,8 +75,8 @@ impl FullTextParser {
             author: None,
             url: url.clone(),
             date: None,
-            html: None,
             thumbnail_url: None,
+            document: None,
         };
 
         let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@@ -86,8 +86,30 @@ impl FullTextParser {
 
         Self::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
-            .await?;
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = Self::download(&url, client, headers).await?;
+
+        // check for fingerprints
+        let config = if config.is_none() {
+            if let Some(url) = Fingerprints::detect(&html) {
+                self.get_grabber_config(&url)
+            } else {
+                config
+            }
+        } else {
+            config
+        };
+
+        self.parse_pages(
+            &mut article,
+            &url,
+            &html,
+            &mut root,
+            config,
+            global_config,
+            client,
+        )
+        .await?;
 
         let context = Context::new(&document).map_err(|()| {
             error!("Failed to create xpath context for extracted article");
@@ -99,19 +121,7 @@ impl FullTextParser {
             return Err(error);
         }
 
-        // serialize content
-        let options = SaveOptions {
-            format: false,
-            no_declaration: false,
-            no_empty_tags: true,
-            no_xhtml: false,
-            xhtml: false,
-            as_xml: false,
-            as_html: true,
-            non_significant_whitespace: false,
-        };
-        let html = document.to_string_with_options(options);
-        article.html = Some(html);
+        article.document = Some(document);
 
         Ok(article)
     }
@@ -120,25 +130,12 @@ impl FullTextParser {
         &self,
         article: &mut Article,
         url: &url::Url,
+        html: &str,
         root: &mut Node,
         config: Option<&ConfigEntry>,
         global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), FullTextParserError> {
-        let headers = Util::generate_headers(config, global_config)?;
-        let html = Self::download(url, client, headers).await?;
-
-        // see if
-        let config = if config.is_none() {
-            if let Some(url) = Fingerprints::detect(&html) {
-                self.get_grabber_config(&url)
-            } else {
-                config
-            }
-        } else {
-            config
-        };
-
         let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -180,7 +177,7 @@ impl FullTextParser {
         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let headers = Util::generate_headers(config, global_config)?;
             let html = Self::download(&url, client, headers).await?;
-            document = Self::parse_html(html, config, global_config)?;
+            document = Self::parse_html(&html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
             Self::strip_junk(&xpath_ctx, config, global_config, &url);
             Self::extract_body(&xpath_ctx, root, config, global_config)?;
@@ -190,13 +187,13 @@ impl FullTextParser {
     }
 
     fn parse_html(
-        html: String,
+        html: &str,
         config: Option<&ConfigEntry>,
         global_config: &ConfigEntry,
     ) -> Result<Document, FullTextParserError> {
         // replace matches in raw html
-        let mut html = html;
+        let mut html = html.to_owned();
         if let Some(config) = config {
             for replace in &config.replace {
                 html = html.replace(&replace.to_replace, &replace.replace_with);
@@ -233,7 +230,7 @@ impl FullTextParser {
     ) -> Result<(), FullTextParserError> {
         let headers = Util::generate_headers(config, global_config)?;
         let html = Self::download(url, client, headers).await?;
-        let document = Self::parse_html(html, config, global_config)?;
+        let document = Self::parse_html(&html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         Self::extract_metadata(&xpath_ctx, config, global_config, article);
         Self::check_for_thumbnail(&xpath_ctx, article);
diff --git a/src/full_text_parser/tests.rs b/src/full_text_parser/tests.rs
index 6f5e962..3fbaedc 100644
--- a/src/full_text_parser/tests.rs
+++ b/src/full_text_parser/tests.rs
@@ -59,7 +59,7 @@ async fn youtube() {
         Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
     );
     assert!(article
-        .html
+        .get_content()
         .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
         .unwrap_or(false));
 }
diff --git a/src/images/mod.rs b/src/images/mod.rs
index 996e936..526c0d4 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -1,7 +1,7 @@
 pub use self::error::ImageDownloadError;
 use crate::util::Util;
 use libxml::parser::Parser;
-use libxml::tree::{Node, SaveOptions};
+use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
 use reqwest::{Client, Response};
@@ -28,34 +28,21 @@ impl ImageDownloader {
             .parse_string(html)
             .map_err(|_| ImageDownloadError::HtmlParse)?;
 
+        self.download_images_from_document(&doc, client).await
+    }
+
+    pub async fn download_images_from_document(
+        &self,
+        doc: &Document,
+        client: &Client,
+    ) -> Result<String, ImageDownloadError> {
         let xpath_ctx = Context::new(&doc).map_err(|()| {
             error!("Failed to create xpath context for document");
             ImageDownloadError::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx, client)
-            .await?;
-
-        let options = SaveOptions {
-            format: false,
-            no_declaration: false,
-            no_empty_tags: true,
-            no_xhtml: false,
-            xhtml: false,
-            as_xml: false,
-            as_html: true,
-            non_significant_whitespace: false,
-        };
-        Ok(doc.to_string_with_options(options))
-    }
-
-    pub async fn download_images_from_context(
-        &self,
-        context: &Context,
-        client: &Client,
-    ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
-        let node_vec = Util::evaluate_xpath(context, xpath, false)
+        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
             .map_err(|_| ImageDownloadError::HtmlParse)?;
         for mut node in node_vec {
             if let Some(url) = node.get_property("src") {
@@ -83,7 +70,17 @@ impl ImageDownloader {
             }
         }
 
-        Ok(())
+        let options = SaveOptions {
+            format: false,
+            no_declaration: false,
+            no_empty_tags: true,
+            no_xhtml: false,
+            xhtml: false,
+            as_xml: false,
+            as_html: true,
+            non_significant_whitespace: false,
+        };
+        Ok(doc.to_string_with_options(options))
     }
 
     async fn save_image(
diff --git a/src/lib.rs b/src/lib.rs
index 2f4b878..08f6e76 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,20 +2,17 @@ mod article;
 mod error;
 mod full_text_parser;
 pub mod images;
-mod readability;
 mod util;
 
 use article::Article;
 use error::ScraperError;
 use full_text_parser::FullTextParser;
 use images::ImageDownloader;
-use readability::Readability;
 use reqwest::Client;
 use std::path::Path;
 
 pub struct ArticleScraper {
     full_text_parser: FullTextParser,
-    readability: Readability,
     image_downloader: ImageDownloader,
 }
@@ -23,7 +20,6 @@ impl ArticleScraper {
     pub async fn new(user_configs: Option<&Path>) -> Self {
         Self {
             full_text_parser: FullTextParser::new(user_configs).await,
-            readability: Readability::new(),
             image_downloader: ImageDownloader::new((2048, 2048)),
         }
     }
@@ -37,13 +33,14 @@ impl ArticleScraper {
         let res = self.full_text_parser.parse(url, client).await;
 
         if download_images {
-            // if let Err(error) = self
-            //     .image_downloader
-            //     .download_images_from_context(&context, client)
-            //     .await
-            // {
-            //     log::error!("Downloading images failed: '{}'", error);
-            // }
+            if let Ok(res) = res {
+                if let Some(document) = res.document.as_ref() {
+                    let _image_res = self
+                        .image_downloader
+                        .download_images_from_document(document, client)
+                        .await;
+                }
+            }
         }
 
         unimplemented!()
diff --git a/src/readability/mod.rs b/src/readability/mod.rs
deleted file mode 100644
index 80c9fb3..0000000
--- a/src/readability/mod.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-pub struct Readability;
-
-impl Readability {
-    pub fn new() -> Self {
-        unimplemented!()
-    }
-}
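-- 
A minimal sketch, not part of the commit: with this patch, Article keeps the
parsed libxml Document and serialization is deferred to get_content() instead
of happening eagerly at the end of FullTextParser::parse(). The snippet below
reproduces that serialization step standalone, with the same SaveOptions as
the hunks above. The input HTML and the main() wrapper are illustrative
assumptions, not code from this repository.

    use libxml::parser::Parser;
    use libxml::tree::SaveOptions;

    fn main() {
        // Parse some HTML into a libxml Document, analogous to how the
        // full-text parser produces Article::document. (Example input only.)
        let doc = Parser::default_html()
            .parse_string("<html><body><p>Hello</p></body></html>")
            .expect("failed to parse example HTML");

        // The same flags Article::get_content() uses in the patch:
        // serialize as HTML, collapse empty tags, no pretty-printing.
        let options = SaveOptions {
            format: false,
            no_declaration: false,
            no_empty_tags: true,
            no_xhtml: false,
            xhtml: false,
            as_xml: false,
            as_html: true,
            non_significant_whitespace: false,
        };

        // Serialization now happens only when a caller asks for the content.
        let html: String = doc.to_string_with_options(options);
        println!("{html}");
    }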