Mirror of https://gitlab.com/news-flash/article_scraper.git
Synced 2025-07-07 16:15:32 +02:00

Commit 6f38c2bc4c: merge
4 changed files with 64 additions and 99 deletions
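The one recurring change in this commit: `match` blocks on `Result` are collapsed into `map_err` closures chained with `?` (and, where an underlying error exists, a `.context()` call in the failure crate's idiom, judging by the error kinds). A minimal, self-contained sketch of the before/after shape, using stand-in types rather than the crate's own:

```rust
// Before/after shape of the refactor, with stand-in types (String as the
// error type here; the real code uses failure-style error kinds).

// Before: bind via `match`, log in the Err arm, early-return.
fn parse_len_before(s: &str) -> Result<usize, String> {
    let n = match s.parse::<usize>() {
        Ok(n) => n,
        Err(e) => {
            eprintln!("parse failed: {}", e);
            return Err(format!("bad input: {}", e));
        }
    };
    Ok(n * 2)
}

// After: log inside `map_err`; the closure's return value becomes the Err,
// and `?` propagates it. Same behavior, no match boilerplate.
fn parse_len_after(s: &str) -> Result<usize, String> {
    let n = s.parse::<usize>().map_err(|e| {
        eprintln!("parse failed: {}", e);
        format!("bad input: {}", e)
    })?;
    Ok(n * 2)
}

fn main() {
    assert_eq!(parse_len_before("21"), parse_len_after("21"));
    assert!(parse_len_after("x").is_err());
}
```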
Cargo.toml:

@@ -15,4 +15,4 @@ htmlescape = "0.3"
 base64 = "0.10"
 image = "0.20"
 log = "0.4"
 mime_guess = "1.8"
@@ -36,21 +36,15 @@ impl ImageDownloader {
     pub fn download_images_from_string(&self, html: &str, article_url: &url::Url) -> Result<String, ImageDownloadError> {
 
         let parser = Parser::default_html();
-        let doc = match parser.parse_string(html) {
-            Ok(doc) => doc,
-            Err(_) => {
-                error!("Failed to parse HTML string");
-                return Err(ImageDownloadErrorKind::HtmlParse)?
-            }
-        };
+        let doc = parser.parse_string(html).map_err(|_| {
+            error!("Failed to parse HTML string");
+            ImageDownloadErrorKind::HtmlParse
+        })?;
 
-        let xpath_ctx = match Context::new(&doc) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Failed to create xpath context for document");
-                return Err(ImageDownloadErrorKind::HtmlParse)?
-            }
-        };
+        let xpath_ctx = Context::new(&doc).map_err(|()| {
+            error!("Failed to create xpath context for document");
+            ImageDownloadErrorKind::HtmlParse
+        })?;
 
         self.download_images_from_context(&xpath_ctx, article_url)?;
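Both the old `return Err(ImageDownloadErrorKind::HtmlParse)?` and the new `map_err(|_| ImageDownloadErrorKind::HtmlParse)?` lean on the same mechanism: `?` converts the kind into the function's error type through a `From` impl. A stand-in sketch of that plumbing (the names below are hypothetical; the crate's real `ImageDownloadError` appears to follow the failure crate's Error/ErrorKind pattern):

```rust
// Hypothetical stand-ins for the kind/error pair.
#[derive(Debug)]
enum ErrorKind { HtmlParse }

#[derive(Debug)]
struct Error { kind: ErrorKind }

impl From<ErrorKind> for Error {
    fn from(kind: ErrorKind) -> Error {
        Error { kind }
    }
}

// Stand-in for parser.parse_string(html), which can fail without a payload.
fn parse(html: &str) -> Result<String, ()> {
    if html.is_empty() { Err(()) } else { Ok(html.to_owned()) }
}

fn download_images(html: &str) -> Result<String, Error> {
    let doc = parse(html).map_err(|_| {
        eprintln!("Failed to parse HTML string");
        ErrorKind::HtmlParse // `?` lifts this into Error via From
    })?;
    Ok(doc)
}

fn main() {
    assert!(download_images("").is_err());
    assert!(download_images("<html/>").is_ok());
}
```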
@@ -109,13 +103,11 @@ impl ImageDownloader {
     fn save_image(&self, image_url: &url::Url, article_url: &url::Url) -> Result<PathBuf, ImageDownloadError> {
 
-        let mut response = match self.client.get(image_url.clone()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("GET {} failed - {}", image_url.as_str(), error.description());
-                Err(error).context(ImageDownloadErrorKind::Http)?
-            }
-        };
+        let mut response = self.client.get(image_url.clone()).send().map_err(|err| {
+            error!("GET {} failed - {}", image_url.as_str(), err.description());
+            err
+        }).context(ImageDownloadErrorKind::Http)?;
 
         let content_type = ImageDownloader::check_image_content_type(&response)?;
 
         if let Some(host) = article_url.host_str() {
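Note the log-and-rethrow shape here: the closure logs and returns `err` unchanged, and `.context(...)` then wraps the original reqwest error as the cause of the Http kind, so no information is lost. A sketch of the same shape, assuming the failure crate (which the `.context()` calls suggest; it would need `failure = "0.1"` in Cargo.toml):

```rust
use std::fmt;

use failure::{Context, ResultExt};

// Stand-in error kind; the repo's real kinds live in its error modules.
#[derive(Clone, Copy, Debug)]
enum DemoKind { Io }

impl fmt::Display for DemoKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "IO error")
    }
}

fn create_file(path: &str) -> Result<std::fs::File, Context<DemoKind>> {
    std::fs::File::create(path)
        .map_err(|err| {
            eprintln!("Failed to create file {}", path); // log only...
            err // ...and rethrow the original io::Error unchanged
        })
        .context(DemoKind::Io) // io::Error becomes the cause of DemoKind::Io
}

fn main() {
    assert!(create_file("/nonexistent-dir/f.txt").is_err());
}
```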
@@ -126,13 +118,10 @@ impl ImageDownloader {
         if let Ok(()) = std::fs::create_dir_all(&path) {
             let file_name = ImageDownloader::extract_image_name(image_url, content_type)?;
             let path = path.join(file_name);
-            let mut image_buffer = match std::fs::File::create(&path) {
-                Ok(buffer) => buffer,
-                Err(error) => {
-                    error!("Failed to create file {}", path.display());
-                    Err(error).context(ImageDownloadErrorKind::IO)?
-                }
-            };
+            let mut image_buffer = std::fs::File::create(&path).map_err(|err| {
+                error!("Failed to create file {}", path.display());
+                err
+            }).context(ImageDownloadErrorKind::IO)?;
 
             response.copy_to(&mut image_buffer).context(ImageDownloadErrorKind::IO)?;
             let path = std::fs::canonicalize(&path).context(ImageDownloadErrorKind::IO)?;
@@ -269,13 +258,10 @@ impl ImageDownloader {
     fn scale_image(image_path: &PathBuf, max_width: u32, max_height: u32) -> Result<PathBuf, ImageDownloadError> {
 
-        let image = match image::open(image_path) {
-            Ok(image) => image,
-            Err(error) => {
-                error!("Failed to open image to resize: {:?}", image_path);
-                return Err(error).context(ImageDownloadErrorKind::ImageScale)?
-            }
-        };
+        let image = image::open(image_path).map_err(|err| {
+            error!("Failed to open image to resize: {:?}", image_path);
+            err
+        }).context(ImageDownloadErrorKind::ImageScale)?;
 
         let image = image.resize(max_width, max_height, image::FilterType::Lanczos3);
 
         if let Some(file_name) = image_path.file_name() {
src/lib.rs (70 changed lines):
@@ -5,7 +5,6 @@ extern crate reqwest;
 extern crate url;
 extern crate regex;
 extern crate encoding_rs;
-extern crate htmlescape;
 extern crate base64;
 extern crate image;
 extern crate chrono;
@@ -35,7 +34,9 @@ use config::{
     GrabberConfig,
     ConfigCollection
 };
-use encoding_rs::*;
+use encoding_rs::{
+    Encoding,
+};
 use chrono::NaiveDateTime;
 use std::str::FromStr;
 use images::ImageDownloader;
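Tightening `use encoding_rs::*;` to an explicit import documents that only the `Encoding` type is needed; the glob also drags in the crate's many static encoding constants (UTF_8, WINDOWS_1252, and so on). A hypothetical use of the explicit form (this helper is illustrative, not from the repo):

```rust
use encoding_rs::Encoding;

// Look up an encoding by its WHATWG label, e.g. from an HTTP charset
// parameter; for_label returns None for unknown labels.
fn lookup(label: &str) -> Option<&'static Encoding> {
    Encoding::for_label(label.as_bytes())
}

fn main() {
    assert!(lookup("utf-8").is_some());
    assert!(lookup("not-a-charset").is_none());
}
```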
@@ -68,15 +69,12 @@ impl ArticleScraper {
     pub fn parse(&self, url: url::Url) -> Result<Article, ScraperError> {
 
         info!("Scraping article: {}", url.as_str());
 
-        // do a HEAD request to url
-        let response = match self.client.head(url.clone()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("Failed head request to: {} - {}", url.as_str(), error.description());
-                Err(error).context(ScraperErrorKind::Http)?
-            }
-        };
+        let response = self.client.head(url.clone()).send()
+            .map_err(|err| {
+                error!("Failed head request to: {} - {}", url.as_str(), err.description());
+                err
+            })
+            .context(ScraperErrorKind::Http)?;
 
         // check if url redirects and we need to pick up the new url
         let mut url = url;
@@ -101,16 +99,13 @@ impl ArticleScraper {
             html: None,
         };
 
         // create empty document to hold the content
-        let mut document = match Document::new() {
-            Ok(doc) => doc,
-            Err(()) => return Err(ScraperErrorKind::Xml)?
-        };
+        let mut document = Document::new().map_err(|()| {
+            ScraperErrorKind::Xml
+        })?;
 
-        let mut root = match Node::new("article", None, &document) {
-            Ok(root) => root,
-            Err(()) => return Err(ScraperErrorKind::Xml)?
-        };
+        let mut root = Node::new("article", None, &document).map_err(|()| {
+            ScraperErrorKind::Xml
+        })?;
 
         document.set_root_element(&root);
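Both constructors here return `Result<_, ()>`, which is why the closures are spelled `map_err(|()| …)`: matching on the unit pattern makes it explicit that the failure carries no payload, so there is nothing to log before mapping to the Xml kind.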
@@ -118,13 +113,10 @@ impl ArticleScraper {
         self.parse_first_page(&mut article, &url, &mut root, config)?;
 
-        let context = match Context::new(&document) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Failed to create xpath context for extracted article");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let context = Context::new(&document).map_err(|()| {
+            error!("Failed to create xpath context for extracted article");
+            ScraperErrorKind::Xml
+        })?;
 
         if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) {
             error!("Preventing self closing tags failed - {}", error);
@@ -196,13 +188,12 @@ impl ArticleScraper {
     fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
 
-        let mut response = match client.get(url.as_str()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("Downloading HTML failed: GET {} - {}", url.as_str(), error.description());
-                return Err(error).context(ScraperErrorKind::Http)?
-            }
-        };
+        let mut response = client.get(url.as_str()).send()
+            .map_err(|err| {
+                error!("Downloading HTML failed: GET {} - {}", url.as_str(), err.description());
+                err
+            })
+            .context(ScraperErrorKind::Http)?;
 
         if response.status().is_success() {
             let text = response.text().context(ScraperErrorKind::Http)?;
|
@ -392,13 +383,10 @@ impl ArticleScraper {
|
||||||
if let Ok(()) = node.set_property("width", "100%") {
|
if let Ok(()) = node.set_property("width", "100%") {
|
||||||
if let Ok(()) = node.remove_property("height") {
|
if let Ok(()) = node.remove_property("height") {
|
||||||
node.unlink();
|
node.unlink();
|
||||||
match video_wrapper.add_child(&mut node) {
|
video_wrapper.add_child(&mut node).map_err(|_| {
|
||||||
Ok(_) => continue,
|
error!("Failed to add iframe as child of video wrapper <div>");
|
||||||
Err(_) => {
|
ScraperErrorKind::Xml
|
||||||
error!("Failed to add iframe as child of video wrapper <div>");
|
})?;
|
||||||
return Err(ScraperErrorKind::Xml)?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
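One nuance in this hunk: the old `Ok(_) => continue` arm skipped to the next loop iteration explicitly, while the new code simply falls through once `add_child` succeeds. The two are equivalent as long as the `match` was the last statement in the loop body, which appears to be the case here.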
@@ -11,21 +11,15 @@ macro_rules! parse_html {
 
         // parse html
         let parser = Parser::default_html();
-        let doc = match parser.parse_string($html.as_str()) {
-            Ok(doc) => doc,
-            Err(_) => {
-                error!("Parsing HTML failed for downloaded HTML");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let doc = parser.parse_string($html.as_str()).map_err(|err| {
+            error!("Parsing HTML failed for downloaded HTML {:?}", err);
+            ScraperErrorKind::Xml
+        })?;
 
-        let $xpath_ctx = match Context::new(&doc) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Creating xpath context failed for downloaded HTML");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let $xpath_ctx = Context::new(&doc).map_err(|()| {
+            error!("Creating xpath context failed for downloaded HTML");
+            ScraperErrorKind::Xml
+        })?;
     };
 }
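Because the `?` now lives inside the macro body, `parse_html!` still early-returns from whichever function invokes it, so callers must return `Result<_, ScraperError>` exactly as before. A self-contained sketch of that property, with a toy macro and a toy error type:

```rust
// Toy macro mirroring parse_html!'s shape: bind an identifier, or have `?`
// early-return from the *calling* function.
macro_rules! parse_num {
    ($s:expr, $out:ident) => {
        let $out: i32 = $s.parse().map_err(|e| {
            eprintln!("parse failed: {:?}", e);
            "bad number" // stand-in for ScraperErrorKind::Xml
        })?;
    };
}

fn double(s: &str) -> Result<i32, &'static str> {
    parse_num!(s, n); // expands here; on failure, `?` returns from double()
    Ok(n * 2)
}

fn main() {
    assert_eq!(double("21"), Ok(42));
    assert!(double("x").is_err());
}
```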
@@ -35,13 +29,10 @@ macro_rules! evaluate_xpath {
         $xpath: ident,
         $node_vec: ident
     ) => {
-        let res = match $context.evaluate($xpath) {
-            Ok(result) => result,
-            Err(_) => {
-                error!("Evaluation of xpath {} yielded no results", $xpath);
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let res = $context.evaluate($xpath).map_err(|()| {
+            error!("Evaluation of xpath {} yielded no results", $xpath);
+            ScraperErrorKind::Xml
+        })?;
 
         let $node_vec = res.get_nodes_as_vec();
     };