diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml
index 6d07980..b46936d 100644
--- a/article_scraper/Cargo.toml
+++ b/article_scraper/Cargo.toml
@@ -22,6 +22,7 @@
 log = "0.4"
 rust-embed="6.6"
 once_cell = "1.17"
 escaper = "0.1"
+futures = "0.3"
 [dev-dependencies]
 env_logger = "0.10"
\ No newline at end of file
diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 6231057..2d3a65f 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -114,7 +114,7 @@ impl FullTextParser {
             .ok_or(FullTextParserError::Config)?;
         let headers = Util::generate_headers(config, global_config)?;
 
-        let response = Self::get_response(&url, &client, headers).await?;
+        let response = Self::get_response(url, client, headers).await?;
 
         // check if url redirects and we need to pick up the new url
         let url = if let Some(new_url) = Util::check_redirect(&response, url) {
diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs
index 96acd4f..994b0a4 100644
--- a/article_scraper/src/images/mod.rs
+++ b/article_scraper/src/images/mod.rs
@@ -4,8 +4,7 @@ use base64::Engine;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
-use log::{debug, error};
-use reqwest::{Client, Response};
+use reqwest::{Client, Response, Url};
 use std::io::Cursor;
 
 mod error;
@@ -29,47 +28,7 @@ impl ImageDownloader {
             .parse_string(html)
             .map_err(|_| ImageDownloadError::HtmlParse)?;
 
-        self.download_images_from_document(&doc, client).await
-    }
-
-    pub async fn download_images_from_document(
-        &self,
-        doc: &Document,
-        client: &Client,
-    ) -> Result<String, ImageDownloadError> {
-        let xpath_ctx = Context::new(doc).map_err(|()| {
-            error!("Failed to create xpath context for document");
-            ImageDownloadError::HtmlParse
-        })?;
-
-        let xpath = "//img";
-        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
-            .map_err(|_| ImageDownloadError::HtmlParse)?;
-        for mut node in node_vec {
-            if let Some(url) = node.get_property("src") {
-                if !url.starts_with("data:") {
-                    if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url, client).await {
-                            Ok(url) => Some(url),
-                            Err(_) => None,
-                        };
-
-                        if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url, client).await
-                        {
-                            if node.set_property("src", &small_image).is_err() {
-                                return Err(ImageDownloadError::HtmlParse);
-                            }
-                            if let Some(big_image) = big_image {
-                                if node.set_property("big-src", &big_image).is_err() {
-                                    return Err(ImageDownloadError::HtmlParse);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
+        self.download_images_from_document(&doc, client).await?;
 
         let options = SaveOptions {
             format: false,
@@ -84,6 +43,67 @@ impl ImageDownloader {
         Ok(doc.to_string_with_options(options))
     }
 
+    pub async fn download_images_from_document(
+        &self,
+        doc: &Document,
+        client: &Client,
+    ) -> Result<(), ImageDownloadError> {
+        let xpath_ctx = Context::new(doc).map_err(|()| {
+            log::error!("Failed to create xpath context for document");
+            ImageDownloadError::HtmlParse
+        })?;
+
+        let xpath = "//img";
+        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
+            .map_err(|_| ImageDownloadError::HtmlParse)?;
+
+        let mut image_urls = Vec::new();
+
+        for node in node_vec {
+            image_urls.push(Self::harvest_image_urls(node, client));
+        }
+
+        let res = futures::future::join_all(image_urls).await;
+
+        // if let Ok((small_image, big_image)) = self.save_image(&url, &parent_url, client).await {
+        //     if node.set_property("src", &small_image).is_err() {
+        //         return Err(ImageDownloadError::HtmlParse);
+        //     }
+        //     if let Some(big_image) = big_image {
+        //         if node.set_property("big-src", &big_image).is_err() {
+        //             return Err(ImageDownloadError::HtmlParse);
+        //         }
+        //     }
+        // }
+
+        Ok(())
+    }
+
+    async fn harvest_image_urls(
+        node: Node,
+        client: &Client,
+    ) -> Result<(Url, Option<Url>), ImageDownloadError> {
+        let src = match node.get_property("src") {
+            Some(src) => {
+                if src.starts_with("data:") {
+                    log::debug!("img src is a data url: nothing to download");
+                    return Err(ImageDownloadError::Unknown);
+                } else {
+                    src
+                }
+            }
+            None => {
+                log::debug!("img node has no src property");
+                return Err(ImageDownloadError::Unknown);
+            }
+        };
+
+        let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
+        let parent_url = Self::check_image_parent(&node, &url, client).await.ok();
+
+        Ok((url, parent_url))
+    }
+
     async fn save_image(
         &self,
         image_url: &url::Url,
@@ -91,7 +111,7 @@
         client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
         let response = client.get(image_url.clone()).send().await.map_err(|err| {
-            error!("GET {} failed - {}", image_url.as_str(), err);
+            log::error!("GET {} failed - {}", image_url.as_str(), err);
             ImageDownloadError::Http
         })?;
 
@@ -152,7 +172,7 @@ impl ImageDownloader {
         let big_image_string = match big_image_base64 {
             Some(big_image_base64) => {
                 let content_type_big = content_type_big.ok_or_else(|| {
-                    debug!("content_type_big should not be None when a big image exists");
+                    log::debug!("content_type_big should not be None when a big image exists");
                     ImageDownloadError::ParentDownload
                 })?;
                 Some(format!(
@@ -179,7 +199,7 @@ impl ImageDownloader {
                 }
             }
 
-            error!("{} is not an image", response.url());
+            log::warn!("{} is not an image", response.url());
             Err(ImageDownloadError::ContentType)
         } else {
             Err(ImageDownloadError::Http)
@@ -194,7 +214,7 @@ impl ImageDownloader {
         let mut resized_image: Option<Vec<u8>> = None;
 
         let mut image = image::load_from_memory(image_buffer).map_err(|err| {
-            error!("Failed to open image to resize: {}", err);
+            log::error!("Failed to open image to resize: {}", err);
             ImageDownloadError::ImageScale
         })?;
 
@@ -204,7 +224,7 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize: {}", err);
+                log::error!("Failed to save resized image to resize: {}", err);
                 ImageDownloadError::ImageScale
             })?;
 
@@ -222,7 +242,7 @@ impl ImageDownloader {
                     image::ImageOutputFormat::Png,
                 )
                 .map_err(|err| {
-                    error!("Failed to save resized image to resize: {}", err);
+                    log::error!("Failed to save resized image to resize: {}", err);
                     ImageDownloadError::ImageScale
                 })?;
             resized_image = Some(resized_buf);
         }
@@ -232,56 +252,71 @@
     }
 
     async fn check_image_parent(
-        &self,
         node: &Node,
-        child_url: &url::Url,
+        child_url: &Url,
         client: &Client,
-    ) -> Result<url::Url, ImageDownloadError> {
-        if let Some(parent) = node.get_parent() {
-            if parent.get_name() == "a" {
-                if let Some(url) = parent.get_property("href") {
-                    let parent_url = url::Url::parse(&url).map_err(|err| {
-                        error!("Failed to parse parent image url: {}", err);
-                        ImageDownloadError::InvalidUrl(err)
-                    })?;
-                    let parent_response = client
-                        .head(parent_url.clone())
-                        .send()
-                        .await
-                        .map_err(|_| ImageDownloadError::Http)?;
-                    let _ = ImageDownloader::check_image_content_type(&parent_response)?;
-                    let child_response = client
-                        .get(child_url.clone())
-                        .send()
-                        .await
-                        .map_err(|_| ImageDownloadError::Http)?;
-                    let parent_length = Self::get_content_lenght(&parent_response)?;
-                    let child_length = Self::get_content_lenght(&child_response)?;
-
-                    if parent_length > child_length {
-                        return Ok(parent_url);
-                    }
-
-                    return Ok(child_url.clone());
-                }
-            }
-        }
+    ) -> Result<Url, ImageDownloadError> {
+        let parent = match node.get_parent() {
+            Some(parent) => parent,
+            None => {
+                log::debug!("No parent node");
+                return Err(ImageDownloadError::ParentDownload);
+            }
+        };
 
-        debug!("Image parent element not relevant");
+        if parent.get_name().to_lowercase() != "a" {
+            log::debug!("parent is not an <a> node");
+            return Err(ImageDownloadError::ParentDownload);
+        }
+
+        let href = match parent.get_property("href") {
+            Some(href) => href,
+            None => {
+                log::debug!("Parent doesn't have href prop");
+                return Err(ImageDownloadError::ParentDownload);
+            }
+        };
+
+        let parent_url = Url::parse(&href).map_err(|err| {
+            log::debug!("Failed to parse parent image url: {}", err);
+            ImageDownloadError::InvalidUrl(err)
+        })?;
+        let parent_response = client
+            .head(parent_url.clone())
+            .send()
+            .await
+            .map_err(|_| ImageDownloadError::Http)?;
+        let _ = ImageDownloader::check_image_content_type(&parent_response)?;
+        let child_response = client
+            .head(child_url.clone())
+            .send()
+            .await
+            .map_err(|_| ImageDownloadError::Http)?;
+        let parent_length = Self::get_content_lenght(&parent_response)?;
+        let child_length = Self::get_content_lenght(&child_response)?;
+
+        if parent_length > child_length {
+            return Ok(parent_url);
+        }
+
+        log::debug!("Image parent element not relevant");
         Err(ImageDownloadError::ParentDownload)
     }
 
     fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
-        if response.status().is_success() {
-            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
-                if let Ok(content_length) = content_length.to_str() {
-                    if let Ok(content_length) = content_length.parse::<u64>() {
-                        return Ok(content_length);
-                    }
-                }
-            }
+        let status_code = response.status();
+
+        if !status_code.is_success() {
+            log::warn!("response: {status_code}");
+            return Err(ImageDownloadError::Http);
         }
-        Err(ImageDownloadError::ContentLenght)
+
+        response
+            .headers()
+            .get(reqwest::header::CONTENT_LENGTH)
+            .and_then(|content_length| content_length.to_str().ok())
+            .and_then(|content_length| content_length.parse::<u64>().ok())
+            .ok_or(ImageDownloadError::ContentLenght)
     }
 }
@@ -293,7 +328,7 @@ mod tests {
     use std::io::Write;
 
     #[tokio::test]
-    async fn close_tags() {
+    async fn fedora31() {
         let image_dowloader = ImageDownloader::new((2048, 2048));
         let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
            .expect("Failed to read HTML");
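Note: the new download_images_from_document above fans out one future per img node and awaits them all with futures::future::join_all, but writing the results back into the document is still the commented-out TODO, so `res` is currently unused. A minimal sketch of that follow-up, assuming harvest_image_urls were extended to also hand back the Node it was given (so each result can be matched to its element again); names and error handling are illustrative only, not part of the patch:

    // Hypothetical continuation inside download_images_from_document(), assuming
    // harvest_image_urls() returns (Node, Url, Option<Url>) per image.
    for result in res {
        let (mut node, url, parent_url) = match result {
            Ok(harvested) => harvested,
            Err(_) => continue, // skip images that could not be harvested
        };

        // Download the image (and the larger parent image, if any) and rewrite
        // the node's attributes, as the commented-out block intends.
        if let Ok((small_image, big_image)) = self.save_image(&url, &parent_url, client).await {
            let _ = node.set_property("src", &small_image);
            if let Some(big_image) = big_image {
                let _ = node.set_property("big-src", &big_image);
            }
        }
    }

As written, join_all only runs the check_image_parent HEAD requests concurrently; running the downloads themselves in parallel would additionally require moving save_image into the harvested futures.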
diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs
index 4344cd7..0818eba 100644
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@@ -33,19 +33,17 @@ impl ArticleScraper {
         download_images: bool,
         client: &Client,
     ) -> Result<Article, ScraperError> {
-        let res = self.full_text_parser.parse(url, client).await;
+        let res = self.full_text_parser.parse(url, client).await?;
 
         if download_images {
-            if let Ok(res) = res {
-                if let Some(document) = res.document.as_ref() {
-                    let _image_res = self
-                        .image_downloader
-                        .download_images_from_document(document, client)
-                        .await;
-                }
+            if let Some(document) = res.document.as_ref() {
+                let _image_res = self
+                    .image_downloader
+                    .download_images_from_document(document, client)
+                    .await;
             }
         }
 
-        unimplemented!()
+        Ok(res)
     }
 }
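Note: with the lib.rs hunk above, ArticleScraper::parse now returns the parsed article instead of ending in unimplemented!(). A minimal library-side sketch of the new call path (mirroring what the CLI change below does; the wrapper function and its error handling are illustrative, not part of this patch):

    use article_scraper::ArticleScraper;
    use reqwest::{Client, Url};

    async fn scrape_one(url: &Url, client: &Client) {
        // None: no user-provided full-text parser config directory.
        let scraper = ArticleScraper::new(None).await;
        match scraper.parse(url, true, client).await {
            Ok(article) => {
                if let Some(content) = article.get_content() {
                    println!("{content}");
                }
            }
            Err(error) => eprintln!("Failed to scrape article: {error}"),
        }
    }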
diff --git a/article_scraper_cli/src/args.rs b/article_scraper_cli/src/args.rs
index b647413..9a0e658 100644
--- a/article_scraper_cli/src/args.rs
+++ b/article_scraper_cli/src/args.rs
@@ -22,7 +22,11 @@ pub enum Commands {
     All {
         /// Source Url to download HTML from
         #[arg(long, value_name = "URL")]
-        source_url: Option<String>,
+        source_url: String,
+
+        /// Download images and embed them into the article
+        #[arg(short, long)]
+        download_images: bool,
     },
     /// Only use the Readability parser
     Readability {
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index 92ca257..ac595e1 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -2,9 +2,7 @@ use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
-use article_scraper::FtrConfigEntry;
-use article_scraper::FullTextParser;
-use article_scraper::Readability;
+use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
 use clap::Parser;
 use reqwest::header::HeaderMap;
 use reqwest::Client;
@@ -31,7 +29,10 @@ async fn main() {
         .unwrap();
 
     match args.command {
-        Commands::All { source_url: _ } => unimplemented!(),
+        Commands::All {
+            source_url,
+            download_images,
+        } => extract_full(source_url, download_images, args.output).await,
         Commands::Readability {
             html,
             base_url,
@@ -46,6 +47,51 @@
     }
 }
 
+async fn extract_full(source_url: String, download_images: bool, output: Option<PathBuf>) {
+    let scraper = ArticleScraper::new(None).await;
+
+    let source_url = match Url::parse(&source_url) {
+        Ok(url) => url,
+        Err(error) => {
+            log::error!("Failed to parse url {source_url}: {error}");
+            exit(0);
+        }
+    };
+
+    let res = scraper
+        .parse(&source_url, download_images, &Client::new())
+        .await;
+    let article = match res {
+        Ok(article) => article,
+        Err(error) => {
+            log::error!("Failed to grab article: {error}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    let content = match article.get_content() {
+        Some(content) => content,
+        None => {
+            log::error!("No Content");
+            exit(0);
+        }
+    };
+
+    match std::fs::write(&output, content) {
+        Ok(()) => log::info!("successfully wrote result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
+}
+
 async fn extract_ftr(
     html_file: Option<PathBuf>,
     source_url: Option<String>,
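Note: with the args.rs and main.rs changes above, the previously stubbed `all` subcommand is wired up end to end. Assuming clap's default kebab-case naming (and an `--output` option defined elsewhere in args.rs), an invocation would look roughly like `article_scraper_cli all --source-url https://example.com/some-post --download-images`, writing the extracted article HTML to the given output path, or to `result.html` by default.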