mirror of https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00

start improving image download

This commit is contained in:
  parent c198225012
  commit 35a14b0a5f

6 changed files with 189 additions and 105 deletions

@@ -22,6 +22,7 @@ log = "0.4"
 rust-embed="6.6"
 once_cell = "1.17"
 escaper = "0.1"
+futures = "0.3"

 [dev-dependencies]
 env_logger = "0.10"
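
Note: the new futures dependency is what powers the concurrent image handling introduced further down. A minimal sketch of the pattern (toy functions, not the crate's code): build one future per item, then await them all at once with join_all, which runs them concurrently and returns the results in input order.

    use futures::future::join_all;

    // Toy stand-in for a per-image async task.
    async fn double(x: u32) -> u32 {
        x * 2
    }

    fn main() {
        let results = futures::executor::block_on(async {
            let tasks: Vec<_> = (1..=3).map(double).collect();
            join_all(tasks).await
        });
        assert_eq!(results, vec![2, 4, 6]);
    }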

@@ -114,7 +114,7 @@ impl FullTextParser {
             .ok_or(FullTextParserError::Config)?;

         let headers = Util::generate_headers(config, global_config)?;
-        let response = Self::get_response(&url, &client, headers).await?;
+        let response = Self::get_response(url, client, headers).await?;

         // check if url redirects and we need to pick up the new url
         let url = if let Some(new_url) = Util::check_redirect(&response, url) {
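
The only change here drops redundant borrows: url and client are already references at this point, so &url and &client produced double references that only compiled thanks to deref coercion. A toy illustration (stand-in types, not the parser's real signature):

    struct Url;

    fn get_response(_url: &Url) {}

    fn caller(url: &Url) {
        get_response(&url); // &&Url, works via deref coercion, but redundant
        get_response(url);  // the form the code uses after this change
    }

    fn main() {
        caller(&Url);
    }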

@@ -4,8 +4,7 @@ use base64::Engine;
 use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
-use log::{debug, error};
-use reqwest::{Client, Response};
+use reqwest::{Client, Response, Url};
 use std::io::Cursor;

 mod error;

@@ -29,47 +28,7 @@ impl ImageDownloader {
             .parse_string(html)
             .map_err(|_| ImageDownloadError::HtmlParse)?;

-        self.download_images_from_document(&doc, client).await
-    }
-
-    pub async fn download_images_from_document(
-        &self,
-        doc: &Document,
-        client: &Client,
-    ) -> Result<String, ImageDownloadError> {
-        let xpath_ctx = Context::new(doc).map_err(|()| {
-            error!("Failed to create xpath context for document");
-            ImageDownloadError::HtmlParse
-        })?;
-
-        let xpath = "//img";
-        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
-            .map_err(|_| ImageDownloadError::HtmlParse)?;
-        for mut node in node_vec {
-            if let Some(url) = node.get_property("src") {
-                if !url.starts_with("data:") {
-                    if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url, client).await {
-                            Ok(url) => Some(url),
-                            Err(_) => None,
-                        };
-
-                        if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url, client).await
-                        {
-                            if node.set_property("src", &small_image).is_err() {
-                                return Err(ImageDownloadError::HtmlParse);
-                            }
-                            if let Some(big_image) = big_image {
-                                if node.set_property("big-src", &big_image).is_err() {
-                                    return Err(ImageDownloadError::HtmlParse);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
+        self.download_images_from_document(&doc, client).await?;

         let options = SaveOptions {
             format: false,

@@ -84,6 +43,67 @@ impl ImageDownloader {
         Ok(doc.to_string_with_options(options))
     }

+    pub async fn download_images_from_document(
+        &self,
+        doc: &Document,
+        client: &Client,
+    ) -> Result<(), ImageDownloadError> {
+        let xpath_ctx = Context::new(doc).map_err(|()| {
+            log::error!("Failed to create xpath context for document");
+            ImageDownloadError::HtmlParse
+        })?;
+
+        let xpath = "//img";
+        let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
+            .map_err(|_| ImageDownloadError::HtmlParse)?;
+
+        let mut image_urls = Vec::new();
+
+        for node in node_vec {
+            image_urls.push(Self::harvest_image_urls(node, client));
+        }
+
+        let res = futures::future::join_all(image_urls).await;
+
+        // if let Ok((small_image, big_image)) = self.save_image(&url, &parent_url, client).await {
+        //     if node.set_property("src", &small_image).is_err() {
+        //         return Err(ImageDownloadError::HtmlParse);
+        //     }
+        //     if let Some(big_image) = big_image {
+        //         if node.set_property("big-src", &big_image).is_err() {
+        //             return Err(ImageDownloadError::HtmlParse);
+        //         }
+        //     }
+        // }
+
+        Ok(())
+    }
+
+    async fn harvest_image_urls(
+        node: Node,
+        client: &Client,
+    ) -> Result<(Url, Option<Url>), ImageDownloadError> {
+        let src = match node.get_property("src") {
+            Some(src) => {
+                if src.starts_with("data:") {
+                    log::debug!("image src is a data url, skipping");
+                    return Err(ImageDownloadError::Unknown);
+                } else {
+                    src
+                }
+            }
+            None => {
+                log::debug!("img node has no src property");
+                return Err(ImageDownloadError::Unknown);
+            }
+        };
+
+        let url = Url::parse(&src).map_err(ImageDownloadError::InvalidUrl)?;
+        let parent_url = Self::check_image_parent(&node, &url, client).await.ok();
+
+        Ok((url, parent_url))
+    }
+
     async fn save_image(
         &self,
         image_url: &url::Url,
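
The new flow harvests all image URLs concurrently via join_all; applying the results back to the img nodes (the commented-out block) is still to come. Since join_all preserves input order, one likely follow-up is to zip the results with the nodes they came from. A self-contained toy model of that pattern (stand-in types, not the crate's code):

    use futures::future::join_all;

    // Stand-in for harvest_image_urls: resolves one node to its image URL.
    async fn harvest(id: u32) -> Result<String, ()> {
        Ok(format!("https://example.com/img/{id}.png"))
    }

    fn main() {
        futures::executor::block_on(async {
            let nodes = vec![1_u32, 2, 3]; // stand-ins for the <img> nodes
            let tasks: Vec<_> = nodes.iter().map(|&n| harvest(n)).collect();
            let results = join_all(tasks).await;

            // join_all yields results in input order, so zipping is safe.
            for (node, res) in nodes.iter().zip(results) {
                if let Ok(url) = res {
                    println!("node {node}: src = {url}");
                }
            }
        });
    }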

@@ -91,7 +111,7 @@ impl ImageDownloader {
         client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
         let response = client.get(image_url.clone()).send().await.map_err(|err| {
-            error!("GET {} failed - {}", image_url.as_str(), err);
+            log::error!("GET {} failed - {}", image_url.as_str(), err);
             ImageDownloadError::Http
         })?;


@@ -152,7 +172,7 @@ impl ImageDownloader {
         let big_image_string = match big_image_base64 {
             Some(big_image_base64) => {
                 let content_type_big = content_type_big.ok_or_else(|| {
-                    debug!("content_type_big should not be None when a big image exists");
+                    log::debug!("content_type_big should not be None when a big image exists");
                     ImageDownloadError::ParentDownload
                 })?;
                 Some(format!(

@@ -179,7 +199,7 @@ impl ImageDownloader {
             }
         }

-        error!("{} is not an image", response.url());
+        log::warn!("{} is not an image", response.url());
         Err(ImageDownloadError::ContentType)
     } else {
         Err(ImageDownloadError::Http)

@@ -194,7 +214,7 @@ impl ImageDownloader {
         let mut resized_image: Option<Vec<u8>> = None;

         let mut image = image::load_from_memory(image_buffer).map_err(|err| {
-            error!("Failed to open image to resize: {}", err);
+            log::error!("Failed to open image to resize: {}", err);
             ImageDownloadError::ImageScale
         })?;


@@ -204,7 +224,7 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize: {}", err);
+                log::error!("Failed to save resized image to resize: {}", err);
                 ImageDownloadError::ImageScale
             })?;


@@ -222,7 +242,7 @@ impl ImageDownloader {
                 image::ImageOutputFormat::Png,
             )
             .map_err(|err| {
-                error!("Failed to save resized image to resize: {}", err);
+                log::error!("Failed to save resized image to resize: {}", err);
                 ImageDownloadError::ImageScale
             })?;
             resized_image = Some(resized_buf);

@@ -232,16 +252,33 @@ impl ImageDownloader {
     }

     async fn check_image_parent(
         &self,
         node: &Node,
-        child_url: &url::Url,
+        child_url: &Url,
         client: &Client,
-    ) -> Result<url::Url, ImageDownloadError> {
-        if let Some(parent) = node.get_parent() {
-            if parent.get_name() == "a" {
-                if let Some(url) = parent.get_property("href") {
-                    let parent_url = url::Url::parse(&url).map_err(|err| {
-                        error!("Failed to parse parent image url: {}", err);
+    ) -> Result<Url, ImageDownloadError> {
+        let parent = match node.get_parent() {
+            Some(parent) => parent,
+            None => {
+                log::debug!("No parent node");
+                return Err(ImageDownloadError::ParentDownload);
+            }
+        };
+
+        if parent.get_name().to_lowercase() != "a" {
+            log::debug!("parent is not an <a> node");
+            return Err(ImageDownloadError::ParentDownload);
+        }
+
+        let href = match parent.get_property("href") {
+            Some(href) => href,
+            None => {
+                log::debug!("Parent doesn't have href prop");
+                return Err(ImageDownloadError::ParentDownload);
+            }
+        };
+
+        let parent_url = Url::parse(&href).map_err(|err| {
+            log::debug!("Failed to parse parent image url: {}", err);
             ImageDownloadError::InvalidUrl(err)
         })?;
         let parent_response = client

@@ -251,7 +288,7 @@ impl ImageDownloader {
             .map_err(|_| ImageDownloadError::Http)?;
         let _ = ImageDownloader::check_image_content_type(&parent_response)?;
         let child_response = client
-            .get(child_url.clone())
+            .head(child_url.clone())
             .send()
             .await
             .map_err(|_| ImageDownloadError::Http)?;
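
Switching from get to head here means only the response headers come back, which is all check_image_parent needs in order to compare Content-Length values; the image bodies are never transferred. A minimal sketch of such a request (placeholder URL; assumes a tokio runtime, as the crate's tests already use):

    use reqwest::Client;

    #[tokio::main]
    async fn main() -> Result<(), reqwest::Error> {
        let client = Client::new();
        // HEAD: headers only, no body download.
        let response = client.head("https://example.com/image.png").send().await?;
        let content_length = response
            .headers()
            .get(reqwest::header::CONTENT_LENGTH)
            .and_then(|v| v.to_str().ok())
            .and_then(|s| s.parse::<u64>().ok());
        println!("Content-Length: {content_length:?}");
        Ok(())
    }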

@@ -262,26 +299,24 @@ impl ImageDownloader {
                 return Ok(parent_url);
             }

             return Ok(child_url.clone());
         }
-        }
-    }

-        debug!("Image parent element not relevant");
+        log::debug!("Image parent element not relevant");
         Err(ImageDownloadError::ParentDownload)
     }

     fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
-        if response.status().is_success() {
-            if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
-                if let Ok(content_length) = content_length.to_str() {
-                    if let Ok(content_length) = content_length.parse::<u64>() {
-                        return Ok(content_length);
-                    }
-                }
-            }
-        }
-        Err(ImageDownloadError::ContentLenght)
+        let status_code = response.status();
+
+        if !status_code.is_success() {
+            log::warn!("response: {status_code}");
+            return Err(ImageDownloadError::Http);
+        }
+
+        response
+            .headers()
+            .get(reqwest::header::CONTENT_LENGTH)
+            .and_then(|content_length| content_length.to_str().ok())
+            .and_then(|content_length| content_length.parse::<u64>().ok())
+            .ok_or(ImageDownloadError::ContentLenght)
     }
 }
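
The rewritten get_content_lenght replaces four nested if-lets with a flat Option combinator chain; behaviour should be unchanged apart from the new early Http error on a non-success status. A runnable demonstration of just the header-parsing chain:

    use reqwest::header::{HeaderMap, HeaderValue, CONTENT_LENGTH};

    // Same and_then/ok_or shape as above, isolated for clarity.
    fn content_length(headers: &HeaderMap) -> Option<u64> {
        headers
            .get(CONTENT_LENGTH)
            .and_then(|value| value.to_str().ok())
            .and_then(|value| value.parse::<u64>().ok())
    }

    fn main() {
        let mut headers = HeaderMap::new();
        headers.insert(CONTENT_LENGTH, HeaderValue::from_static("1024"));
        assert_eq!(content_length(&headers), Some(1024));
        assert_eq!(content_length(&HeaderMap::new()), None);
    }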

@@ -293,7 +328,7 @@ mod tests {
     use std::io::Write;

     #[tokio::test]
-    async fn close_tags() {
+    async fn fedora31() {
         let image_dowloader = ImageDownloader::new((2048, 2048));
         let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");

@@ -33,10 +33,9 @@ impl ArticleScraper {
         download_images: bool,
         client: &Client,
     ) -> Result<Article, ScraperError> {
-        let res = self.full_text_parser.parse(url, client).await;
+        let res = self.full_text_parser.parse(url, client).await?;

         if download_images {
-            if let Ok(res) = res {
             if let Some(document) = res.document.as_ref() {
                 let _image_res = self
                     .image_downloader

@@ -44,8 +43,7 @@ impl ArticleScraper {
                 .await;
             }
         }
-        }

-        unimplemented!()
+        Ok(res)
     }
 }

@@ -22,7 +22,11 @@ pub enum Commands {
     All {
         /// Source Url to download HTML from
         #[arg(long, value_name = "URL")]
-        source_url: Option<String>,
+        source_url: String,
+
+        /// Download images and embed them in the result
+        #[arg(short, long)]
+        download_images: bool,
     },
     /// Only use the Readability parser
     Readability {
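
With clap's derive syntax, the new field yields both a short and a long flag (-d / --download-images under clap 4's default kebab-case naming), while source_url becoming non-optional makes the URL a required argument. A minimal self-contained sketch of how these attributes parse (flat struct instead of the real Commands enum; assumes clap 4 with the derive feature):

    use clap::Parser;

    #[derive(Parser, Debug)]
    struct Cli {
        /// Source Url to download HTML from
        #[arg(long, value_name = "URL")]
        source_url: String,

        /// Download images and embed them in the result
        #[arg(short, long)]
        download_images: bool,
    }

    fn main() {
        // e.g.: my-cli --source-url https://example.com -d
        let cli = Cli::parse();
        println!("{} (download images: {})", cli.source_url, cli.download_images);
    }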

@@ -2,9 +2,7 @@ use std::path::Path;
 use std::{path::PathBuf, process::exit};

 use crate::args::{Args, Commands};
-use article_scraper::FtrConfigEntry;
-use article_scraper::FullTextParser;
-use article_scraper::Readability;
+use article_scraper::{ArticleScraper, FtrConfigEntry, FullTextParser, Readability};
 use clap::Parser;
 use reqwest::header::HeaderMap;
 use reqwest::Client;

@@ -31,7 +29,10 @@ async fn main() {
         .unwrap();

     match args.command {
-        Commands::All { source_url: _ } => unimplemented!(),
+        Commands::All {
+            source_url,
+            download_images,
+        } => extract_full(source_url, download_images, args.output).await,
         Commands::Readability {
             html,
             base_url,

@@ -46,6 +47,51 @@ async fn main() {
     }
 }

+async fn extract_full(source_url: String, download_images: bool, output: Option<PathBuf>) {
+    let scraper = ArticleScraper::new(None).await;
+
+    let source_url = match Url::parse(&source_url) {
+        Ok(url) => url,
+        Err(error) => {
+            log::error!("Failed to parse url {source_url}: {error}");
+            exit(0);
+        }
+    };
+
+    let res = scraper
+        .parse(&source_url, download_images, &Client::new())
+        .await;
+    let article = match res {
+        Ok(article) => article,
+        Err(error) => {
+            log::error!("Failed to grab article: {error}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    let content = match article.get_content() {
+        Some(content) => content,
+        None => {
+            log::error!("No Content");
+            exit(0);
+        }
+    };
+
+    match std::fs::write(&output, content) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
+}
+
 async fn extract_ftr(
     html_file: Option<PathBuf>,
     source_url: Option<String>,