mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)
start improving image download

commit 35a14b0a5f
parent c198225012

6 changed files with 189 additions and 105 deletions

@@ -22,6 +22,7 @@ log = "0.4"
 rust-embed="6.6"
 once_cell = "1.17"
 escaper = "0.1"
+futures = "0.3"
 
 [dev-dependencies]
 env_logger = "0.10"
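
The new futures = "0.3" dependency exists for a single call used later in this diff, futures::future::join_all, which polls a collection of futures concurrently and returns their outputs in input order. A minimal sketch, independent of this crate's types:

    use futures::future::join_all;

    async fn double(x: u32) -> u32 {
        x * 2
    }

    // all three futures run concurrently; results keep their input order
    async fn demo() -> Vec<u32> {
        join_all((0..3).map(double)).await
    }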

@@ -114,7 +114,7 @@ impl FullTextParser {
             .ok_or(FullTextParserError::Config)?;
 
         let headers = Util::generate_headers(config, global_config)?;
-        let response = Self::get_response(&url, &client, headers).await?;
+        let response = Self::get_response(url, client, headers).await?;
 
         // check if url redirects and we need to pick up the new url
         let url = if let Some(new_url) = Util::check_redirect(&response, url) {

@@ -4,8 +4,7 @@ use base64::Engine;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
-use log::{debug, error};
-use reqwest::{Client, Response};
+use reqwest::{Client, Response, Url};
 use std::io::Cursor;
 
 mod error;

@@ -29,47 +28,7 @@ impl ImageDownloader {
             .parse_string(html)
             .map_err(|_| ImageDownloadError::HtmlParse)?;
 
-        self.download_images_from_document(&doc, client).await
-    }
-
-    pub async fn download_images_from_document(
-        &self,
-        doc: &Document,
-        client: &Client,
-    ) -> Result<String, ImageDownloadError> {
-        let xpath_ctx = Context::new(doc).map_err(|()| {
-            error!("Failed to create xpath context for document");
-            ImageDownloadError::HtmlParse
-        })?;
-
-        let xpath = "//img";
-        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
-            .map_err(|_| ImageDownloadError::HtmlParse)?;
-        for mut node in node_vec {
-            if let Some(url) = node.get_property("src") {
-                if !url.starts_with("data:") {
-                    if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url, client).await {
-                            Ok(url) => Some(url),
-                            Err(_) => None,
-                        };
-
-                        if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url, client).await
-                        {
-                            if node.set_property("src", &small_image).is_err() {
-                                return Err(ImageDownloadError::HtmlParse);
-                            }
-                            if let Some(big_image) = big_image {
-                                if node.set_property("big-src", &big_image).is_err() {
-                                    return Err(ImageDownloadError::HtmlParse);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
+        self.download_images_from_document(&doc, client).await?;
 
         let options = SaveOptions {
             format: false,
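
The removed loop both mutated the <img> nodes and awaited every network round trip one after another. After this hunk, download_images_from_string keeps the parse and the serialization (the SaveOptions context lines above) and delegates only the in-place mutation. A hypothetical outline with stand-in types, just to show the new shape:

    struct Doc(String);

    fn parse(html: &str) -> Result<Doc, ()> {
        Ok(Doc(html.to_owned()))
    }

    fn serialize(doc: &Doc) -> String {
        doc.0.clone()
    }

    // now returns no string: it only mutates the document in place
    async fn download_images_from_document(_doc: &Doc) -> Result<(), ()> {
        Ok(())
    }

    async fn download_images_from_string(html: &str) -> Result<String, ()> {
        let doc = parse(html)?;
        download_images_from_document(&doc).await?;
        Ok(serialize(&doc))
    }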

@@ -84,6 +43,67 @@ impl ImageDownloader {
         Ok(doc.to_string_with_options(options))
     }
 
+    pub async fn download_images_from_document(
+        &self,
+        doc: &Document,
+        client: &Client,
+    ) -> Result<(), ImageDownloadError> {
+        let xpath_ctx = Context::new(doc).map_err(|()| {
+            log::error!("Failed to create xpath context for document");
+            ImageDownloadError::HtmlParse
+        })?;
+
+        let xpath = "//img";
+        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
+            .map_err(|_| ImageDownloadError::HtmlParse)?;
+
+        let mut image_urls = Vec::new();
+
+        for node in node_vec {
+            image_urls.push(Self::harvest_image_urls(node, client));
+        }
+
+        let res = futures::future::join_all(image_urls).await;
+
+        // if let Ok((small_image, big_image)) = self.save_image(&url, &parent_url, client).await {
+        //     if node.set_property("src", &small_image).is_err() {
+        //         return Err(ImageDownloadError::HtmlParse);
+        //     }
+        //     if let Some(big_image) = big_image {
+        //         if node.set_property("big-src", &big_image).is_err() {
+        //             return Err(ImageDownloadError::HtmlParse);
+        //         }
+        //     }
+        // }
+
+        Ok(())
+    }
+
+    async fn harvest_image_urls(
+        node: Node,
+        client: &Client,
+    ) -> Result<(Url, Option<Url>), ImageDownloadError> {
+        let src = match node.get_property("src") {
+            Some(src) => {
+                if src.starts_with("data:") {
+                    log::debug!("");
+                    return Err(ImageDownloadError::Unknown);
+                } else {
+                    src
+                }
+            }
+            None => {
+                log::debug!("");
+                return Err(ImageDownloadError::Unknown);
+            }
+        };
+
+        let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
+        let parent_url = Self::check_image_parent(&node, &url, client).await.ok();
+
+        Ok((url, parent_url))
+    }
+
     async fn save_image(
         &self,
         image_url: &url::Url,
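
download_images_from_document now builds one harvest_image_urls future per <img> node and awaits them all at once; the actual download step is still stubbed out (the commented block, and the unused res binding), so at this point the commit only parallelizes the URL harvesting. A minimal sketch of that fan-out pattern, with a stand-in for harvest_image_urls:

    use futures::future::join_all;

    // stand-in for Self::harvest_image_urls(node, client)
    async fn harvest(src: &str) -> Result<String, ()> {
        if src.starts_with("data:") {
            Err(()) // inlined data URIs are rejected, as in the new code
        } else {
            Ok(src.to_owned())
        }
    }

    async fn collect(srcs: Vec<&str>) -> Vec<String> {
        let futures: Vec<_> = srcs.into_iter().map(harvest).collect();
        // each failed node becomes an Err entry; flatten keeps the successes
        join_all(futures).await.into_iter().flatten().collect()
    }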

@@ -91,7 +111,7 @@ impl ImageDownloader {
         client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
         let response = client.get(image_url.clone()).send().await.map_err(|err| {
-            error!("GET {} failed - {}", image_url.as_str(), err);
+            log::error!("GET {} failed - {}", image_url.as_str(), err);
             ImageDownloadError::Http
         })?;
 
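
This hunk and the ones that follow swap the imported error!/debug! macros for fully qualified log:: paths, matching the dropped use log::{debug, error}; in the imports hunk above. The qualified form needs no use statement at all; a self-contained sketch (env_logger is already a dev-dependency in the Cargo.toml hunk):

    fn main() {
        env_logger::init();
        log::error!("GET {} failed - {}", "https://example.com/img.png", "timeout");
        log::debug!("only printed when the debug level is enabled");
    }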

@@ -152,7 +172,7 @@ impl ImageDownloader {
         let big_image_string = match big_image_base64 {
             Some(big_image_base64) => {
                 let content_type_big = content_type_big.ok_or_else(|| {
-                    debug!("content_type_big should not be None when a big image exists");
+                    log::debug!("content_type_big should not be None when a big image exists");
                     ImageDownloadError::ParentDownload
                 })?;
                 Some(format!(

@@ -179,7 +199,7 @@ impl ImageDownloader {
             }
         }
 
-        error!("{} is not an image", response.url());
+        log::warn!("{} is not an image", response.url());
         Err(ImageDownloadError::ContentType)
     } else {
         Err(ImageDownloadError::Http)

@@ -194,7 +214,7 @@ impl ImageDownloader {
         let mut resized_image: Option<Vec<u8>> = None;
 
         let mut image = image::load_from_memory(image_buffer).map_err(|err| {
-            error!("Failed to open image to resize: {}", err);
+            log::error!("Failed to open image to resize: {}", err);
             ImageDownloadError::ImageScale
         })?;
 

@@ -204,7 +224,7 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize: {}", err);
+                log::error!("Failed to save resized image to resize: {}", err);
                 ImageDownloadError::ImageScale
             })?;
 

@@ -222,7 +242,7 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize: {}", err);
+                log::error!("Failed to save resized image to resize: {}", err);
                 ImageDownloadError::ImageScale
             })?;
             resized_image = Some(resized_buf);

@@ -232,56 +252,71 @@ impl ImageDownloader {
     }
 
     async fn check_image_parent(
-        &self,
         node: &Node,
-        child_url: &url::Url,
+        child_url: &Url,
         client: &Client,
-    ) -> Result<url::Url, ImageDownloadError> {
-        if let Some(parent) = node.get_parent() {
-            if parent.get_name() == "a" {
-                if let Some(url) = parent.get_property("href") {
-                    let parent_url = url::Url::parse(&url).map_err(|err| {
-                        error!("Failed to parse parent image url: {}", err);
-                        ImageDownloadError::InvalidUrl(err)
-                    })?;
-                    let parent_response = client
-                        .head(parent_url.clone())
-                        .send()
-                        .await
-                        .map_err(|_| ImageDownloadError::Http)?;
-                    let _ = ImageDownloader::check_image_content_type(&parent_response)?;
-                    let child_response = client
-                        .get(child_url.clone())
-                        .send()
-                        .await
-                        .map_err(|_| ImageDownloadError::Http)?;
-                    let parent_length = Self::get_content_lenght(&parent_response)?;
-                    let child_length = Self::get_content_lenght(&child_response)?;
-
-                    if parent_length > child_length {
-                        return Ok(parent_url);
-                    }
-
-                    return Ok(child_url.clone());
-                }
+    ) -> Result<Url, ImageDownloadError> {
+        let parent = match node.get_parent() {
+            Some(parent) => parent,
+            None => {
+                log::debug!("No parent node");
+                return Err(ImageDownloadError::ParentDownload);
             }
+        };
+
+        if parent.get_name().to_lowercase() != "a" {
+            log::debug!("parent is not an <a> node");
+            return Err(ImageDownloadError::ParentDownload);
         }
 
-        debug!("Image parent element not relevant");
+        let href = match parent.get_property("href") {
+            Some(href) => href,
+            None => {
+                log::debug!("Parent doesn't have href prop");
+                return Err(ImageDownloadError::ParentDownload);
+            }
+        };
+
+        let parent_url = Url::parse(&href).map_err(|err| {
+            log::debug!("Failed to parse parent image url: {}", err);
+            ImageDownloadError::InvalidUrl(err)
+        })?;
+        let parent_response = client
+            .head(parent_url.clone())
+            .send()
+            .await
+            .map_err(|_| ImageDownloadError::Http)?;
+        let _ = ImageDownloader::check_image_content_type(&parent_response)?;
+        let child_response = client
+            .head(child_url.clone())
+            .send()
+            .await
+            .map_err(|_| ImageDownloadError::Http)?;
+        let parent_length = Self::get_content_lenght(&parent_response)?;
+        let child_length = Self::get_content_lenght(&child_response)?;
+
+        if parent_length > child_length {
+            return Ok(parent_url);
+        }
+
+        log::debug!("Image parent element not relevant");
         Err(ImageDownloadError::ParentDownload)
     }
 
     fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
-        if response.status().is_success() {
-            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
-                if let Ok(content_length) = content_length.to_str() {
-                    if let Ok(content_length) = content_length.parse::<u64>() {
-                        return Ok(content_length);
-                    }
-                }
-            }
+        let status_code = response.status();
+
+        if !status_code.is_success() {
+            log::warn!("response: {status_code}");
+            return Err(ImageDownloadError::Http);
         }
-        Err(ImageDownloadError::ContentLenght)
+
+        response
+            .headers()
+            .get(reqwest::header::CONTENT_LENGTH)
+            .and_then(|content_length| content_length.to_str().ok())
+            .and_then(|content_length| content_length.parse::<u64>().ok())
+            .ok_or(ImageDownloadError::ContentLenght)
     }
 }
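
get_content_lenght keeps its (historically misspelled) name but flattens three nested if lets into a single Option chain, reports a non-success status as ImageDownloadError::Http instead of falling through to ContentLenght, and switches the child request from GET to HEAD, since only the Content-Length header is needed for the size comparison. The same chain on plain types, as a self-contained illustration:

    use std::collections::HashMap;

    fn content_length(headers: &HashMap<String, String>) -> Result<u64, &'static str> {
        headers
            .get("content-length")
            .and_then(|value| value.parse::<u64>().ok())
            .ok_or("missing or malformed Content-Length")
    }

    fn main() {
        let mut headers = HashMap::new();
        headers.insert("content-length".to_owned(), "1024".to_owned());
        assert_eq!(content_length(&headers), Ok(1024));
        assert!(content_length(&HashMap::new()).is_err());
    }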

@@ -293,7 +328,7 @@ mod tests {
     use std::io::Write;
 
     #[tokio::test]
-    async fn close_tags() {
+    async fn fedora31() {
         let image_dowloader = ImageDownloader::new((2048, 2048));
         let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");

@@ -33,19 +33,17 @@ impl ArticleScraper {
         download_images: bool,
         client: &Client,
     ) -> Result<Article, ScraperError> {
-        let res = self.full_text_parser.parse(url, client).await;
+        let res = self.full_text_parser.parse(url, client).await?;
 
         if download_images {
-            if let Ok(res) = res {
-                if let Some(document) = res.document.as_ref() {
-                    let _image_res = self
-                        .image_downloader
-                        .download_images_from_document(document, client)
-                        .await;
-                }
+            if let Some(document) = res.document.as_ref() {
+                let _image_res = self
+                    .image_downloader
+                    .download_images_from_document(document, client)
+                    .await;
             }
         }
 
-        unimplemented!()
+        Ok(res)
     }
 }
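
The ArticleScraper::parse hunk replaces the old unimplemented!() tail with real control flow: parser failures now bubble up through ?, while an image-download failure is deliberately discarded (let _image_res = ...), presumably because the scraped text is still usable without images. A hedged sketch of that split with stand-in types:

    async fn parse_article() -> Result<String, &'static str> {
        Ok(String::from("article"))
    }

    async fn download_images() -> Result<(), &'static str> {
        Err("http error")
    }

    async fn scrape(download_images_enabled: bool) -> Result<String, &'static str> {
        let article = parse_article().await?; // hard failure: bubble up
        if download_images_enabled {
            let _image_res = download_images().await; // soft failure: ignored
        }
        Ok(article)
    }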

@@ -22,7 +22,11 @@ pub enum Commands {
     All {
         /// Source Url to download HTML from
         #[arg(long, value_name = "URL")]
-        source_url: Option<String>,
+        source_url: String,
+
+        /// Source Url to download HTML from
+        #[arg(short, long)]
+        download_images: bool,
     },
     /// Only use the Readability parser
     Readability {
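
With clap's derive API the field name determines the flag: download_images under #[arg(short, long)] should surface as -d / --download-images, and source_url losing its Option makes --source-url required. A hypothetical invocation (binary name assumed):

    article_scraper_cli all --source-url "https://example.com/post" --download-images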

@@ -2,9 +2,7 @@ use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
-use article_scraper::FtrConfigEntry;
-use article_scraper::FullTextParser;
-use article_scraper::Readability;
+use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
 use clap::Parser;
 use reqwest::header::HeaderMap;
 use reqwest::Client;

@@ -31,7 +29,10 @@ async fn main() {
         .unwrap();
 
     match args.command {
-        Commands::All { source_url: _ } => unimplemented!(),
+        Commands::All {
+            source_url,
+            download_images,
+        } => extract_full(source_url, download_images, args.output).await,
         Commands::Readability {
             html,
             base_url,

@@ -46,6 +47,51 @@ async fn main() {
     }
 }
 
+async fn extract_full(source_url: String, download_images: bool, output: Option<PathBuf>) {
+    let scraper = ArticleScraper::new(None).await;
+
+    let source_url = match Url::parse(&source_url) {
+        Ok(url) => url,
+        Err(error) => {
+            log::error!("Failed to parse url {source_url}: {error}");
+            exit(0);
+        }
+    };
+
+    let res = scraper
+        .parse(&source_url, download_images, &Client::new())
+        .await;
+    let article = match res {
+        Ok(article) => article,
+        Err(error) => {
+            log::error!("Failed to grab article: {error}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    let content = match article.get_content() {
+        Some(content) => content,
+        None => {
+            log::error!("No Content");
+            exit(0);
+        }
+    };
+
+    match std::fs::write(&output, content) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
+}
+
 async fn extract_ftr(
     html_file: Option<PathBuf>,
     source_url: Option<String>,