1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

TIL: map_err

This commit is contained in:
Jan Lukas Gernert 2018-08-31 16:49:58 +02:00
parent 5beb25a575
commit fab4306ed9
3 changed files with 60 additions and 96 deletions

View file

@@ -36,21 +36,15 @@ impl ImageDownloader {
     pub fn download_images_from_string(&self, html: &str, article_url: &url::Url) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
-        let doc = match parser.parse_string(html) {
-            Ok(doc) => doc,
-            Err(_) => {
-                error!("Failed to parse HTML string");
-                return Err(ImageDownloadErrorKind::HtmlParse)?
-            }
-        };
+        let doc = parser.parse_string(html).map_err(|_| {
+            error!("Failed to parse HTML string");
+            ImageDownloadErrorKind::HtmlParse
+        })?;
 
-        let xpath_ctx = match Context::new(&doc) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Failed to create xpath context for document");
-                return Err(ImageDownloadErrorKind::HtmlParse)?
-            }
-        };
+        let xpath_ctx = Context::new(&doc).map_err(|()| {
+            error!("Failed to create xpath context for document");
+            ImageDownloadErrorKind::HtmlParse
+        })?;
 
         self.download_images_from_context(&xpath_ctx, article_url)?;
@@ -109,13 +103,11 @@ impl ImageDownloader {
     fn save_image(&self, image_url: &url::Url, article_url: &url::Url) -> Result<PathBuf, ImageDownloadError> {
-        let mut response = match self.client.get(image_url.clone()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("GET {} failed - {}", image_url.as_str(), error.description());
-                Err(error).context(ImageDownloadErrorKind::Http)?
-            }
-        };
+        let mut response = self.client.get(image_url.clone()).send().map_err(|err| {
+            error!("GET {} failed - {}", image_url.as_str(), err.description());
+            err
+        }).context(ImageDownloadErrorKind::Http)?;
 
         let content_type = ImageDownloader::check_image_content_type(&response)?;
         if let Some(host) = article_url.host_str() {
@@ -126,13 +118,10 @@ impl ImageDownloader {
         if let Ok(()) = std::fs::create_dir_all(&path) {
             let file_name = ImageDownloader::extract_image_name(image_url, content_type)?;
             let path = path.join(file_name);
-            let mut image_buffer = match std::fs::File::create(&path) {
-                Ok(buffer) => buffer,
-                Err(error) => {
-                    error!("Failed to create file {}", path.display());
-                    Err(error).context(ImageDownloadErrorKind::IO)?
-                }
-            };
+            let mut image_buffer = std::fs::File::create(&path).map_err(|err| {
+                error!("Failed to create file {}", path.display());
+                err
+            }).context(ImageDownloadErrorKind::IO)?;
 
             response.copy_to(&mut image_buffer).context(ImageDownloadErrorKind::IO)?;
             let path = std::fs::canonicalize(&path).context(ImageDownloadErrorKind::IO)?;
@@ -252,13 +241,10 @@ impl ImageDownloader {
     fn scale_image(image_path: &PathBuf, max_width: u32, max_height: u32) -> Result<PathBuf, ImageDownloadError> {
-        let image = match image::open(image_path) {
-            Ok(image) => image,
-            Err(error) => {
-                error!("Failed to open image to resize: {:?}", image_path);
-                return Err(error).context(ImageDownloadErrorKind::ImageScale)?
-            }
-        };
+        let image = image::open(image_path).map_err(|err| {
+            error!("Failed to open image to resize: {:?}", image_path);
+            err
+        }).context(ImageDownloadErrorKind::ImageScale)?;
 
         let image = image.resize(max_width, max_height, image::FilterType::Lanczos3);
         if let Some(file_name) = image_path.file_name() {

View file

@@ -69,15 +69,12 @@ impl ArticleScraper {
     pub fn parse(&self, url: url::Url) -> Result<Article, ScraperError> {
         info!("Scraping article: {}", url.as_str());
 
-        // do a HEAD request to url
-        let response = match self.client.head(url.clone()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("Failed head request to: {} - {}", url.as_str(), error.description());
-                Err(error).context(ScraperErrorKind::Http)?
-            }
-        };
+        let response = self.client.head(url.clone()).send()
+            .map_err(|err| {
+                error!("Failed head request to: {} - {}", url.as_str(), err.description());
+                err
+            })
+            .context(ScraperErrorKind::Http)?;
 
         // check if url redirects and we need to pick up the new url
         let mut url = url;
@@ -102,16 +99,13 @@ impl ArticleScraper {
             html: None,
         };
 
-        // create empty document to hold the content
-        let mut document = match Document::new() {
-            Ok(doc) => doc,
-            Err(()) => return Err(ScraperErrorKind::Xml)?
-        };
+        let mut document = Document::new().map_err(|()| {
+            ScraperErrorKind::Xml
+        })?;
 
-        let mut root = match Node::new("article", None, &document) {
-            Ok(root) => root,
-            Err(()) => return Err(ScraperErrorKind::Xml)?
-        };
+        let mut root = Node::new("article", None, &document).map_err(|()| {
+            ScraperErrorKind::Xml
+        })?;
 
         document.set_root_element(&root);
@@ -119,13 +113,10 @@ impl ArticleScraper {
         self.parse_first_page(&mut article, &url, &mut root, config)?;
 
-        let context = match Context::new(&document) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Failed to create xpath context for extracted article");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let context = Context::new(&document).map_err(|()| {
+            error!("Failed to create xpath context for extracted article");
+            ScraperErrorKind::Xml
+        })?;
 
         if let Err(error) = ArticleScraper::prevent_self_closing_tags(&context) {
             error!("Preventing self closing tags failed - {}", error);
@@ -197,13 +188,12 @@ impl ArticleScraper {
     fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
-        let mut response = match client.get(url.as_str()).send() {
-            Ok(response) => response,
-            Err(error) => {
-                error!("Downloading HTML failed: GET {} - {}", url.as_str(), error.description());
-                return Err(error).context(ScraperErrorKind::Http)?
-            }
-        };
+        let mut response = client.get(url.as_str()).send()
+            .map_err(|err| {
+                error!("Downloading HTML failed: GET {} - {}", url.as_str(), err.description());
+                err
+            })
+            .context(ScraperErrorKind::Http)?;
 
         if response.status().is_success() {
             let text = response.text().context(ScraperErrorKind::Http)?;
@@ -386,13 +376,10 @@ impl ArticleScraper {
                     if let Ok(()) = node.set_property("width", "100%") {
                         if let Ok(()) = node.remove_property("height") {
                             node.unlink();
-                            match video_wrapper.add_child(&mut node) {
-                                Ok(_) => continue,
-                                Err(_) => {
-                                    error!("Failed to add iframe as child of video wrapper <div>");
-                                    return Err(ScraperErrorKind::Xml)?
-                                }
-                            }
+                            video_wrapper.add_child(&mut node).map_err(|_| {
+                                error!("Failed to add iframe as child of video wrapper <div>");
+                                ScraperErrorKind::Xml
+                            })?;
                         }
                     }
                 }

View file

@@ -11,21 +11,15 @@ macro_rules! parse_html {
         // parse html
         let parser = Parser::default_html();
-        let doc = match parser.parse_string($html.as_str()) {
-            Ok(doc) => doc,
-            Err(_) => {
-                error!("Parsing HTML failed for downloaded HTML");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
-
-        let $xpath_ctx = match Context::new(&doc) {
-            Ok(context) => context,
-            Err(_) => {
-                error!("Creating xpath context failed for downloaded HTML");
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let doc = parser.parse_string($html.as_str()).map_err(|err| {
+            error!("Parsing HTML failed for downloaded HTML {:?}", err);
+            ScraperErrorKind::Xml
+        })?;
+
+        let $xpath_ctx = Context::new(&doc).map_err(|()| {
+            error!("Creating xpath context failed for downloaded HTML");
+            ScraperErrorKind::Xml
+        })?;
     };
 }
@@ -35,13 +29,10 @@ macro_rules! evaluate_xpath {
         $xpath: ident,
         $node_vec: ident
     ) => {
-        let res = match $context.evaluate($xpath) {
-            Ok(result) => result,
-            Err(_) => {
-                error!("Evaluation of xpath {} yielded no results", $xpath);
-                return Err(ScraperErrorKind::Xml)?
-            }
-        };
+        let res = $context.evaluate($xpath).map_err(|()| {
+            error!("Evaluation of xpath {} yielded no results", $xpath);
+            ScraperErrorKind::Xml
+        })?;
 
         let $node_vec = res.get_nodes_as_vec();
     };