From 3ca59d7f023933a304cce57f5e0c44b2c035319e Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Wed, 6 Mar 2019 18:37:24 +0100
Subject: [PATCH] embed images as base64 inside article html

---
 Cargo.toml        |   5 +-
 src/images/mod.rs | 275 +++++++++++++++++++---------------------------
 src/lib.rs        |  21 ++--
 3 files changed, 125 insertions(+), 176 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 8c39acf..47f94c1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,5 @@ encoding_rs = "0.8"
 chrono = "0.4"
 htmlescape = "0.3"
 base64 = "0.10"
-image = "0.20"
-log = "0.4"
-mime_guess = "1.8"
\ No newline at end of file
+image = "0.21"
+log = "0.4"
\ No newline at end of file
diff --git a/src/images/mod.rs b/src/images/mod.rs
index 76960a2..5fc475e 100644
--- a/src/images/mod.rs
+++ b/src/images/mod.rs
@@ -1,4 +1,3 @@
-use std::path::PathBuf;
 use reqwest;
 use log::{
     error,
@@ -14,31 +13,25 @@ use self::error::{ImageDownloadError, ImageDownloadErrorKind};
 use base64;
 use std;
 use image;
-use mime_guess;
 use super::ScraperErrorKind;
 
 mod error;
 
 pub struct ImageDownloader {
-    save_image_path: PathBuf,
     client: reqwest::Client,
     max_size: (u32, u32),
-    scale_size: (u32, u32),
 }
 
 impl ImageDownloader {
-    pub fn new(save_image_path: PathBuf, max_size: (u32, u32), scale_size: (u32, u32)) -> ImageDownloader {
+    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
         ImageDownloader {
-            save_image_path: save_image_path,
             client: reqwest::Client::new(),
             max_size: max_size,
-            scale_size: scale_size,
         }
     }
 
-    pub fn download_images_from_string(&self, html: &str, article_url: &url::Url) -> Result<String, ImageDownloadError> {
-
+    pub fn download_images_from_string(&self, html: &str) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
             error!("Failed to parse HTML string");
@@ -50,52 +43,29 @@ impl ImageDownloader {
             ImageDownloadErrorKind::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx, article_url)?;
+        self.download_images_from_context(&xpath_ctx)?;
 
         Ok(doc.to_string(/*format:*/ false))
     }
 
-    pub fn download_images_from_context(&self, context: &Context, article_url: &url::Url) -> Result<(), ImageDownloadError> {
+    pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         evaluate_xpath!(context, xpath, node_vec);
         for mut node in node_vec {
             if let Some(url) = node.get_property("src") {
-                let url = url::Url::parse(&url).context(ImageDownloadErrorKind::InvalidUrl)?;
-                let parent_url_result = match self.check_image_parent(&node, &url) {
+                let url = url::Url::parse(&url)
+                    .context(ImageDownloadErrorKind::InvalidUrl)?;
+                let parent_url = match self.check_image_parent(&node, &url) {
                     Ok(url) => Some(url),
                     Err(_) => None,
                 };
 
-                if let Some(parent_url) = parent_url_result.clone() {
-                    if let Ok(path) = self.save_image(&parent_url, article_url) {
-                        if let Some(path) = path.to_str() {
-                            if let Err(_) = node.set_property("parent_img", path) {
-                                return Err(ImageDownloadErrorKind::HtmlParse)?;
-                            }
-                        }
-                    }
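+                // Download the image (and the bigger version its parent
+                // links to, if any) and embed it as a base64 data URI.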
+                let (small_image, big_image) = self.save_image(&url, &parent_url)?;
+                if let Err(_) = node.set_property("src", &small_image) {
+                    return Err(ImageDownloadErrorKind::HtmlParse)?;
                 }
 
-                let mut img_path = self.save_image(&url, article_url)?;
-
-                if let Some((width, height)) = ImageDownloader::get_image_dimensions(&node) {
-                    if width > self.max_size.0 || height > self.max_size.1 {
-                        if let Ok(small_img_path) = ImageDownloader::scale_image(&img_path, self.scale_size.0, self.scale_size.1) {
-                            if parent_url_result.is_none() {
-                                if let Some(img_path) = img_path.to_str() {
-                                    if let Err(_) = node.set_property("big_img", img_path) {
-                                        return Err(ImageDownloadErrorKind::HtmlParse)?;
-                                    }
-                                }
-
-                                img_path = small_img_path;
-                            }
-                        }
-                    }
-                }
-
-                if let Some(img_path) = img_path.to_str() {
-                    if let Err(_) = node.set_property("src", img_path) {
+                if let Some(big_image) = big_image {
+                    if let Err(_) = node.set_property("big-src", &big_image) {
                         return Err(ImageDownloadErrorKind::HtmlParse)?;
                     }
                 }
@@ -105,35 +75,69 @@ impl ImageDownloader {
         Ok(())
     }
 
-    fn save_image(&self, image_url: &url::Url, article_url: &url::Url) -> Result<PathBuf, ImageDownloadError> {
+    fn save_image(&self, image_url: &url::Url, parent_url: &Option<url::Url>) -> Result<(String, Option<String>), ImageDownloadError> {
         let mut response = self.client.get(image_url.clone()).send().map_err(|err| {
             error!("GET {} failed - {}", image_url.as_str(), err.description());
             err
         }).context(ImageDownloadErrorKind::Http)?;
 
-        let content_type = ImageDownloader::check_image_content_type(&response)?;
+        let content_type_small = ImageDownloader::check_image_content_type(&response)?;
+        let content_type_small = content_type_small.to_str()
+            .context(ImageDownloadErrorKind::ContentType)?;
+        let mut content_type_big : Option<String> = None;
+
+        let mut small_image : Vec<u8> = Vec::new();
+        let mut big_image : Option<Vec<u8>> = None;
+
+        response.copy_to(&mut small_image)
+            .context(ImageDownloadErrorKind::IO)?;
 
-        if let Some(host) = article_url.host_str() {
-            let folder_name = base64::encode(article_url.as_str()).replace("/", "_");
-            let path = self.save_image_path.join(host);
-            let path = path.join(folder_name);
-
-            if let Ok(()) = std::fs::create_dir_all(&path) {
-                let file_name = ImageDownloader::extract_image_name(image_url, content_type)?;
-                let path = path.join(file_name);
-                let mut image_buffer = std::fs::File::create(&path).map_err(|err| {
-                    error!("Failed to create file {}", path.display());
-                    err
-                }).context(ImageDownloadErrorKind::IO)?;
-
-                response.copy_to(&mut image_buffer).context(ImageDownloadErrorKind::IO)?;
-                let path = std::fs::canonicalize(&path).context(ImageDownloadErrorKind::IO)?;
-                return Ok(path)
-            }
+        if let Some(parent_url) = parent_url {
+            let mut response_big = self.client.get(parent_url.clone()).send()
+                .context(ImageDownloadErrorKind::Http)?;
+            content_type_big = Some(ImageDownloader::check_image_content_type(&response_big)?
+                .to_str()
+                .context(ImageDownloadErrorKind::ContentType)?
+                .to_owned());
+            let mut big_buffer : Vec<u8> = Vec::new();
+            response_big.copy_to(&mut big_buffer)
+                .context(ImageDownloadErrorKind::IO)?;
+            big_image = Some(big_buffer);
         }
 
-        Err(ImageDownloadErrorKind::InvalidUrl)?
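+        // The image crate cannot decode SVG, so an SVG image skips the
+        // scaling step below and is embedded unchanged.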
+        if content_type_small != "image/svg+xml" {
+            let (original_image, resized_image) = Self::scale_image(&small_image, self.max_size)?;
+            if let Some(resized_image) = resized_image {
+                small_image = resized_image;
+                if big_image.is_none() {
+                    big_image = Some(original_image);
+                    content_type_big = Some(content_type_small.to_owned());
+                }
+            }
+            else {
+                small_image = original_image;
+            }
+        }
+
+        let small_image_base64 = base64::encode(&small_image);
+        let big_image_base64 = match big_image {
+            Some(big_image) => Some(base64::encode(&big_image)),
+            None => None,
+        };
+        let small_image_string = format!("data:{};base64,{}", content_type_small, small_image_base64);
+        let big_image_string = match big_image_base64 {
+            Some(big_image_base64) => {
+                let content_type_big = content_type_big.ok_or(ImageDownloadErrorKind::ParentDownload)
+                    .map_err(|err| {
+                        debug!("content_type_big should not be None when a big image exists");
+                        err
+                    })?;
+                Some(format!("data:{};base64,{}", content_type_big, big_image_base64))
+            },
+            None => None,
+        };
+        Ok((small_image_string, big_image_string))
     }
 
     fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
@@ -152,91 +156,50 @@ impl ImageDownloader {
         Err(ImageDownloadErrorKind::Http)?
     }
 
-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Result<(Vec<u8>, Option<Vec<u8>>), ImageDownloadError> {
+        let mut original_image : Vec<u8> = Vec::new();
+        let mut resized_image : Option<Vec<u8>> = None;
 
-        if response.status().is_success() {
-            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
-                if let Ok(content_length) = content_length.to_str() {
-                    if let Ok(content_length) = content_length.parse::<u64>() {
-                        return Ok(content_length)
-                    }
-                }
-            }
+        let mut image = image::load_from_memory(image_buffer)
+            .map_err(|err| {
+                error!("Failed to open image to resize");
+                err
+            }).context(ImageDownloadErrorKind::ImageScale)?;
+
+        image.write_to(&mut original_image, image::ImageOutputFormat::PNG)
+            .map_err(|err| {
+                error!("Failed to write original image to buffer");
+                err
+            }).context(ImageDownloadErrorKind::ImageScale)?;
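+
+        // Only downscale when the image exceeds the configured maximum
+        // dimensions; otherwise the original is embedded directly.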
+        let dimensions = Self::get_image_dimensions(&image);
+        if dimensions.0 > max_dimensions.0
+        || dimensions.1 > max_dimensions.1 {
+            image = image.resize(max_dimensions.0, max_dimensions.1, image::FilterType::Lanczos3);
+            let mut resized_buf : Vec<u8> = Vec::new();
+            image.write_to(&mut resized_buf, image::ImageOutputFormat::PNG)
+                .map_err(|err| {
+                    error!("Failed to write resized image to buffer");
+                    err
+                }).context(ImageDownloadErrorKind::ImageScale)?;
+            resized_image = Some(resized_buf);
         }
 
-        Err(ImageDownloadErrorKind::ContentLenght)?
+        Ok((original_image, resized_image))
     }
 
-    fn get_image_dimensions(node: &Node) -> Option<(u32, u32)> {
-
-        if let Some(width) = node.get_property("width") {
-            if let Some(height) = node.get_property("height") {
-                if let Ok(width) = width.parse::<u32>() {
-                    if let Ok(height) = height.parse::<u32>() {
-                        if width > 1 && height > 1 {
-                            return Some((width, height))
-                        }
-                    }
-                }
-            }
+    fn get_image_dimensions(image: &image::DynamicImage) -> (u32, u32) {
+        match image {
+            image::DynamicImage::ImageLuma8(image) => (image.width(), image.height()),
+            image::DynamicImage::ImageLumaA8(image) => (image.width(), image.height()),
+            image::DynamicImage::ImageRgb8(image) => (image.width(), image.height()),
+            image::DynamicImage::ImageRgba8(image) => (image.width(), image.height()),
+            image::DynamicImage::ImageBgr8(image) => (image.width(), image.height()),
+            image::DynamicImage::ImageBgra8(image) => (image.width(), image.height()),
         }
-
-        debug!("Image dimensions not available");
-        None
     }
 
-    fn extract_image_name(url: &url::Url, content_type: reqwest::header::HeaderValue) -> Result<String, ImageDownloadError> {
-
-        if let Some(file_name) = url.path_segments().and_then(|segments| segments.last()) {
-            let mut image_name = file_name.to_owned();
-            if let Some(query) = url.query() {
-                image_name.push_str("_");
-                image_name.push_str(query);
-            }
-
-            let header = content_type.to_str().context(ImageDownloadErrorKind::ContentType)?;
-            let primary_type = match header.find("/") {
-                Some(end) => header[..end-1].to_string(),
-                None => "unknown".to_string(),
-            };
-            let mut sub_type = match header.find("/") {
-                None => "unknown".to_string(),
-                Some(start) => {
-                    match header.find("+") {
-                        None => "unknown".to_string(),
-                        Some(end) => header[start..end-1].to_string(),
-                    }
-                },
-            };
-            if let Some(start) = header.find("+") {
-                sub_type.push_str("+");
-                sub_type.push_str(&header[start..].to_string());
-            };
-
-            if let Some(extensions) = mime_guess::get_extensions(&primary_type, &sub_type) {
-                let mut extension_present = false;
-                for extension in extensions {
-                    if image_name.ends_with(extension) {
-                        extension_present = true;
-                        break;
-                    }
-                }
-
-                if !extension_present {
-                    image_name.push_str(".");
-                    image_name.push_str(extensions[0]);
-                }
-            }
-
-            return Ok(image_name)
-        }
-
-        error!("Could not generate image name for {}", url.as_str());
-        Err(ImageDownloadErrorKind::ImageName)?
-    }
 
     fn check_image_parent(&self, node: &Node, child_url: &url::Url) -> Result<url::Url, ImageDownloadError> {
-
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
                     let parent_url = url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
                     let parent_response = self.client.head(parent_url.clone()).send().context(ImageDownloadErrorKind::ParentDownload)?;
                     let _ = ImageDownloader::check_image_content_type(&parent_response).context(ImageDownloadErrorKind::ParentDownload)?;
                     let child_response = self.client.get(child_url.clone()).send().context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_length = ImageDownloader::get_content_lenght(&parent_response).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_length = ImageDownloader::get_content_lenght(&child_response).context(ImageDownloadErrorKind::ParentDownload)?;
+                    let parent_length = Self::get_content_lenght(&parent_response).context(ImageDownloadErrorKind::ParentDownload)?;
+                    let child_length = Self::get_content_lenght(&child_response).context(ImageDownloadErrorKind::ParentDownload)?;
 
                     if parent_length > child_length {
                         return Ok(parent_url)
                     }
                 }
             }
         }
 
         Err(ImageDownloadErrorKind::ParentDownload)?
     }
 
-    fn scale_image(image_path: &PathBuf, max_width: u32, max_height: u32) -> Result<PathBuf, ImageDownloadError> {
-
-        let image = image::open(image_path).map_err(|err| {
-            error!("Failed to open image to resize: {:?}", image_path);
-            err
-        }).context(ImageDownloadErrorKind::ImageScale)?;
-        let image = image.resize(max_width, max_height, image::FilterType::Lanczos3);
-
-        if let Some(file_name) = image_path.file_name() {
-
-            let mut file_name = file_name.to_os_string();
-            file_name.push("_resized");
-            let mut resized_path = image_path.clone();
-            resized_path.set_file_name(file_name);
-            if let Err(error) = image.save(&resized_path) {
-                error!("Failed to write resized image to disk.");
-                return Err(error).context(ImageDownloadErrorKind::ImageScale)?
-            }
-
-            return Ok(resized_path)
-        }
-
-        Err(ImageDownloadErrorKind::ImageScale)?
+    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+        if response.status().is_success() {
+            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
+                if let Ok(content_length) = content_length.to_str() {
+                    if let Ok(content_length) = content_length.parse::<u64>() {
+                        return Ok(content_length)
+                    }
+                }
+            }
+        }
+        Err(ImageDownloadErrorKind::ContentLenght)?
+    }
+
+
 }
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 9fd14b1..19b8730 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,9 +8,6 @@ pub mod images;
 use reqwest;
 use url;
 use regex;
-use base64;
-use image;
-use mime_guess;
 use log::{
     error,
     debug,
@@ -52,12 +49,12 @@ pub struct ArticleScraper {
 }
 
 impl ArticleScraper {
-    pub fn new(config_path: PathBuf, save_image_path: PathBuf, download_images: bool) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf, download_images: bool) -> Result<ArticleScraper, ScraperError> {
 
         let config_files = GrabberConfig::parse_directory(&config_path).context(ScraperErrorKind::Config)?;
 
         Ok(ArticleScraper {
-            image_downloader: ImageDownloader::new(save_image_path, (2000, 2000), (1000, 800)),
+            image_downloader: ImageDownloader::new((2000, 2000)),
             config_files: config_files,
             client: reqwest::Client::new(),
             download_images: download_images,
@@ -127,7 +124,7 @@ impl ArticleScraper {
         }
 
         if self.download_images {
-            if let Err(error) = self.image_downloader.download_images_from_context(&context, &url) {
+            if let Err(error) = self.image_downloader.download_images_from_context(&context) {
                 error!("Downloading images failed: {}", error);
             }
         }
@@ -689,12 +686,12 @@ mod tests {
     #[test]
     pub fn golem() {
         let config_path = PathBuf::from(r"./resources/tests/golem");
-        let image_path = PathBuf::from(r"./test_output");
+        let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-        let grabber = ArticleScraper::new(config_path, image_path.clone(), true).unwrap();
+        let grabber = ArticleScraper::new(config_path, true).unwrap();
         let article = grabber.parse(url).unwrap();
-        article.save_html(&image_path).unwrap();
+        article.save_html(&out_path).unwrap();
 
         assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
         assert_eq!(article.author, Some(String::from("Hauke Gierow")));
@@ -703,12 +700,12 @@ mod tests {
     }
 
     #[test]
     pub fn phoronix() {
         let config_path = PathBuf::from(r"./resources/tests/phoronix");
-        let image_path = PathBuf::from(r"./test_output");
+        let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1").unwrap();
 
-        let grabber = ArticleScraper::new(config_path, image_path.clone(), true).unwrap();
+        let grabber = ArticleScraper::new(config_path, true).unwrap();
         let article = grabber.parse(url).unwrap();
-        article.save_html(&image_path).unwrap();
+        article.save_html(&out_path).unwrap();
 
         assert_eq!(article.title, Some(String::from("Amazon EC2 Cloud Benchmarks Against Bare Metal Systems")));
     }
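
For reference, a minimal sketch of driving the API after this patch. It reuses
the config and output paths from the golem test above; treating the crate as
`article_scraper` and the unwrap()-based error handling are assumptions for
illustration, not part of the change itself:

    use std::path::PathBuf;
    use article_scraper::ArticleScraper;

    fn main() {
        let config_path = PathBuf::from(r"./resources/tests/golem");
        let out_path = PathBuf::from(r"./test_output");

        // No save_image_path anymore: new() only takes the grabber-config
        // directory and the download_images flag.
        let scraper = ArticleScraper::new(config_path, true).unwrap();

        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
        let article = scraper.parse(url).unwrap();

        // The saved HTML is now self-contained: every <img> src holds a
        // "data:<content-type>;base64,..." URI, and a "big-src" attribute
        // carries the larger version when one was found.
        article.save_html(&out_path).unwrap();
    }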