mirror of https://gitlab.com/news-flash/article_scraper.git

special handling trying to find single page links: fixes youtube

Jan Lukas Gernert 2022-10-07 08:48:09 +02:00
parent 7b1b027c6d
commit 8c2af14871
6 changed files with 226 additions and 174 deletions
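The functional core of the commit is the new Util::find_page_url helper (added to util.rs below), which replaces the old xpath_ctx.findvalue() lookup in lib.rs: instead of taking the xpath result's string value, it walks the matched nodes and falls back to the href attribute whenever a node's text content is empty, which appears to be what fixes the YouTube case named in the commit message. A minimal sketch of that fallback, reusing the same libxml and url crate calls the diff itself uses (the HTML snippet and main wrapper are illustrative, not from the repo):

use libxml::parser::Parser;
use libxml::xpath::Context;

fn main() {
    // Illustrative markup: the matched node has no text content, only an href.
    let html = r#"<html><body><a id="single" href="https://example.com/full-article"></a></body></html>"#;

    let doc = Parser::default_html().parse_string(html).expect("invalid HTML");
    let ctx = Context::new(&doc).expect("failed to create xpath context");

    let res = ctx.evaluate("//a[@id='single']").expect("xpath evaluation failed");
    for node in res.get_nodes_as_vec() {
        // Same fallback as Util::find_page_url: empty text content -> use @href.
        let content = node.get_content();
        let url_str = if content.trim().is_empty() && node.has_attribute("href") {
            node.get_attribute("href").unwrap()
        } else {
            content
        };
        if let Ok(url) = url::Url::parse(&url_str) {
            println!("single page url: {}", url); // https://example.com/full-article
            break;
        }
    }
}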

View file

@@ -114,7 +114,7 @@ impl ConfigEntry {
             extract_option_single!(line, next_page, next_page_link);
 
             if line.starts_with(replace_single) {
-                let value = Util::extract_value(replace_single, line);
+                let value = Util::str_extract_value(replace_single, line);
                 let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
                 if value.len() != 2 {
                     continue;
@@ -133,7 +133,7 @@ impl ConfigEntry {
             }
 
             if line.starts_with(http_header) {
-                let value = Util::extract_value(http_header, line);
+                let value = Util::str_extract_value(http_header, line);
                 let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
                 if value.len() != 2 {
                     continue;
@@ -152,10 +152,10 @@ impl ConfigEntry {
             }
 
             if line.starts_with(find) {
-                let to_replace = Util::extract_value(find, line).into();
+                let to_replace = Util::str_extract_value(find, line).into();
 
                 if let Ok(Some(next_line)) = lines.next_line().await {
-                    let replace_with = Util::extract_value(replace, &next_line).into();
+                    let replace_with = Util::str_extract_value(replace, &next_line).into();
 
                     replace_vec.push(Replace {
                         to_replace,

View file

@@ -5,7 +5,7 @@ macro_rules! extract_vec_multi {
         $vector: ident
     ) => {
         if $line.starts_with($identifier) {
-            let value = Util::extract_value($identifier, $line);
+            let value = Util::str_extract_value($identifier, $line);
             let value = Util::split_values(value);
             let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect();
             $vector.extend(value);
@@ -21,7 +21,7 @@ macro_rules! extract_vec_single {
         $vector: ident
     ) => {
         if $line.starts_with($identifier) {
-            let value = Util::extract_value($identifier, $line);
+            let value = Util::str_extract_value($identifier, $line);
             $vector.push(value.to_string());
             continue;
         }
@@ -35,7 +35,7 @@ macro_rules! extract_option_single {
         $option: ident
     ) => {
         if $line.starts_with($identifier) {
-            let value = Util::extract_value($identifier, $line);
+            let value = Util::str_extract_value($identifier, $line);
            $option = Some(value.to_string());
             continue;
         }

View file

@@ -1,5 +1,5 @@
 use self::error::{ImageDownloadError, ImageDownloadErrorKind};
-use crate::ArticleScraper;
+use crate::util::Util;
 use failure::ResultExt;
 use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
@@ -57,7 +57,7 @@ impl ImageDownloader {
         client: &Client,
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
-        let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
+        let node_vec = Util::evaluate_xpath(context, xpath, false)
             .context(ImageDownloadErrorKind::HtmlParse)?;
         for mut node in node_vec {
             if let Some(url) = node.get_property("src") {

View file

@@ -20,7 +20,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use reqwest::header::HeaderMap;
-use reqwest::{Client, Response};
+use reqwest::Client;
 use std::path::Path;
 use std::str::FromStr;
 use util::Util;
@@ -75,7 +75,7 @@ impl ArticleScraper {
             .context(ScraperErrorKind::Http)?;
 
         // check if url redirects and we need to pick up the new url
-        let url = if let Some(new_url) = ArticleScraper::check_redirect(&response, &url) {
+        let url = if let Some(new_url) = Util::check_redirect(&response, &url) {
             debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str());
             new_url
         } else {
@@ -83,7 +83,7 @@ impl ArticleScraper {
         };
 
         // check if we are dealing with text/html
-        if !ArticleScraper::check_content_type(&response)? {
+        if !Util::check_content_type(&response)? {
             return Err(ScraperErrorKind::ContentType.into());
         }
@@ -167,23 +167,22 @@ impl ArticleScraper {
                 "Single page link xpath specified in config '{}'",
                 xpath_single_page_link
             );
-            if let Ok(result) = xpath_ctx.findvalue(&xpath_single_page_link, None) {
-                if !result.trim().is_empty() {
-                    // parse again with single page url
-                    debug!("Single page link found '{}'", result);
-                    let single_page_url =
-                        url::Url::parse(&result).context(ScraperErrorKind::Url)?;
-                    return self
-                        .parse_single_page(
-                            article,
-                            &single_page_url,
-                            root,
-                            config,
-                            global_config,
-                            client,
-                        )
-                        .await;
-                }
+
+            if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, &xpath_single_page_link)
+            {
+                // parse again with single page url
+                debug!("Single page link found '{}'", single_page_url);
+
+                return self
+                    .parse_single_page(
+                        article,
+                        &single_page_url,
+                        root,
+                        config,
+                        global_config,
+                        client,
+                    )
+                    .await;
             }
         }
@@ -236,28 +235,6 @@ impl ArticleScraper {
         })?)
     }
 
-    fn evaluate_xpath(
-        xpath_ctx: &Context,
-        xpath: &str,
-        thorw_if_empty: bool,
-    ) -> Result<Vec<Node>, ScraperError> {
-        let res = xpath_ctx.evaluate(xpath).map_err(|()| {
-            debug!("Evaluation of xpath '{}' yielded no results", xpath);
-            ScraperErrorKind::Xml
-        })?;
-
-        let node_vec = res.get_nodes_as_vec();
-
-        if node_vec.is_empty() {
-            debug!("Evaluation of xpath '{}' yielded no results", xpath);
-            if thorw_if_empty {
-                return Err(ScraperErrorKind::Xml.into());
-            }
-        }
-
-        Ok(node_vec)
-    }
-
     async fn parse_single_page(
         &self,
         article: &mut Article,
@@ -278,7 +255,11 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &Client, headers: HeaderMap) -> Result<String, ScraperError> {
+    async fn download(
+        url: &url::Url,
+        client: &Client,
+        headers: HeaderMap,
+    ) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
             .headers(headers)
@@ -389,96 +370,13 @@ impl ArticleScraper {
         conf
     }
 
-    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
-        if response.status().is_success() {
-            if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
-                if let Ok(content_type) = content_type.to_str() {
-                    if content_type.contains("text/html") {
-                        return Ok(true);
-                    }
-                }
-            }
-
-            error!("Content type is not text/HTML");
-            return Ok(false);
-        }
-
-        error!("Failed to determine content type");
-        Err(ScraperErrorKind::Http.into())
-    }
-
-    fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
-        if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
-            debug!("Article url redirects to '{}'", response.url().as_str());
-            return Some(response.url().clone());
-        } else if response.url() != original_url {
-            return Some(response.url().clone());
-        }
-
-        None
-    }
-
-    fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> {
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
-
-        if let Some(val) = node_vec.get(0) {
-            return Ok(val.get_content());
-        }
-
-        Err(ScraperErrorKind::Xml.into())
-    }
-
-    fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
-        let node_vec = Self::evaluate_xpath(context, xpath, true)?;
-        let mut val = String::new();
-        for node in node_vec {
-            let part = node.get_content().split_whitespace().map(|s| format!("{} ", s)).collect::<String>();
-            val.push_str(&part);
-            val.push_str(" ");
-        }
-
-        Ok(val.trim().to_string())
-    }
-
-    fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
-        let mut ancestor = xpath.to_string();
-        if ancestor.starts_with("//") {
-            ancestor = ancestor.chars().skip(2).collect();
-        }
-
-        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
-        let node_vec = Self::evaluate_xpath(context, query, false)?;
-        for mut node in node_vec {
-            node.unlink();
-        }
-
-        Ok(())
-    }
-
-    fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
-        let xpath = &format!(
-            "//*[contains(@class, '{}') or contains(@id, '{}')]",
-            id_or_class, id_or_class
-        );
-
-        let mut ancestor = xpath.clone();
-        if ancestor.starts_with("//") {
-            ancestor = ancestor.chars().skip(2).collect();
-        }
-
-        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
-        let node_vec = Self::evaluate_xpath(context, query, false)?;
-        for mut node in node_vec {
-            node.unlink();
-        }
-
-        Ok(())
-    }
-
     fn fix_lazy_images(
         context: &Context,
         class: &str,
         property_url: &str,
     ) -> Result<(), ScraperError> {
         let xpath = &format!("//img[contains(@class, '{}')]", class);
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Some(correct_url) = node.get_property(property_url) {
                 if node.set_property("src", &correct_url).is_err() {
@@ -491,13 +389,13 @@ impl ArticleScraper {
     fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), ScraperError> {
         let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Some(mut parent) = node.get_parent() {
                 if let Ok(mut video_wrapper) = parent.new_child(None, "div") {
                     if let Ok(()) = video_wrapper.set_property("class", "videoWrapper") {
                         if let Ok(()) = node.set_property("width", "100%") {
-                            if let Ok(()) = node.remove_property("height") {
+                            if let Ok(()) = node.set_property("height", "100%") {
                                 node.unlink();
                                 video_wrapper.add_child(&mut node).map_err(|_| {
                                     error!("Failed to add iframe as child of video wrapper <div>");
@@ -526,7 +424,7 @@ impl ArticleScraper {
         let xpath_tag = tag.unwrap_or("*");
         let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.remove_property(attribute).is_err() {
                 return Err(ScraperErrorKind::Xml.into());
@@ -544,7 +442,7 @@ impl ArticleScraper {
         let xpath_tag = tag.unwrap_or("*");
         let xpath = &format!("//{}", xpath_tag);
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.set_attribute(attribute, value).is_err() {
                 return Err(ScraperErrorKind::Xml.into());
@@ -558,7 +456,7 @@ impl ArticleScraper {
         xpath: &str,
         attribute: &str,
     ) -> Result<String, ScraperError> {
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for node in node_vec {
             if let Some(value) = node.get_attribute(attribute) {
                 return Ok(value);
@@ -574,7 +472,7 @@ impl ArticleScraper {
         attribute: &str,
         article_url: &url::Url,
     ) -> Result<(), ScraperError> {
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Some(val) = node.get_attribute(attribute) {
                 if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) {
@@ -623,29 +521,29 @@ impl ArticleScraper {
         // strip specified xpath
         if let Some(config) = config {
             for xpath_strip in &config.xpath_strip {
-                let _ = ArticleScraper::strip_node(&context, xpath_strip);
+                let _ = Util::strip_node(&context, xpath_strip);
             }
         }
 
         for xpath_strip in &global_config.xpath_strip {
-            let _ = ArticleScraper::strip_node(&context, xpath_strip);
+            let _ = Util::strip_node(&context, xpath_strip);
         }
 
         // strip everything with specified 'id' or 'class'
         if let Some(config) = config {
             for xpaht_strip_class in &config.strip_id_or_class {
-                let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+                let _ = Util::strip_id_or_class(&context, xpaht_strip_class);
             }
         }
 
         for xpaht_strip_class in &global_config.strip_id_or_class {
-            let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class);
+            let _ = Util::strip_id_or_class(&context, xpaht_strip_class);
         }
 
         // strip any <img> element where @src attribute contains this substring
         if let Some(config) = config {
             for xpath_strip_img_src in &config.strip_image_src {
-                let _ = ArticleScraper::strip_node(
+                let _ = Util::strip_node(
                     &context,
                     &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
                 );
@@ -653,7 +551,7 @@ impl ArticleScraper {
         }
 
         for xpath_strip_img_src in &global_config.strip_image_src {
-            let _ = ArticleScraper::strip_node(
+            let _ = Util::strip_node(
                 &context,
                 &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
@@ -676,23 +574,23 @@ impl ArticleScraper {
         // strip elements using Readability.com and Instapaper.com ignore class names
         // .entry-unrelated and .instapaper_ignore
         // See http://blog.instapaper.com/post/730281947
-        let _ = ArticleScraper::strip_node(&context, &String::from(
+        let _ = Util::strip_node(&context, &String::from(
             "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]"));
 
         // strip elements that contain style="display: none;"
-        let _ = ArticleScraper::strip_node(
+        let _ = Util::strip_node(
             &context,
             &String::from("//*[contains(@style,'display:none')]"),
         );
 
         // strip all comments
-        let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
+        let _ = Util::strip_node(&context, &String::from("//comment()"));
 
         // strip all empty url-tags <a/>
-        let _ = ArticleScraper::strip_node(&context, &String::from("//a[not(node())]"));
+        let _ = Util::strip_node(&context, &String::from("//a[not(node())]"));
 
         // strip all external css and fonts
-        let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']"));
+        let _ = Util::strip_node(&context, &String::from("//*[@type='text/css']"));
     }
 
     fn extract_metadata(
@@ -704,7 +602,7 @@ impl ArticleScraper {
         // try to get title
         if let Some(config) = config {
             for xpath_title in &config.xpath_title {
-                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                if let Ok(title) = Util::extract_value_merge(&context, xpath_title) {
                     debug!("Article title: '{}'", title);
                     article.title = Some(title);
                     break;
@@ -714,7 +612,7 @@ impl ArticleScraper {
 
         if article.title.is_none() {
             for xpath_title in &global_config.xpath_title {
-                if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) {
+                if let Ok(title) = Util::extract_value_merge(&context, xpath_title) {
                     debug!("Article title: '{}'", title);
                     article.title = Some(title);
                     break;
@@ -725,7 +623,7 @@ impl ArticleScraper {
         // try to get the author
         if let Some(config) = config {
             for xpath_author in &config.xpath_author {
-                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                if let Ok(author) = Util::extract_value(&context, xpath_author) {
                     debug!("Article author: '{}'", author);
                     article.author = Some(author);
                     break;
@@ -733,9 +631,9 @@ impl ArticleScraper {
             }
         }
 
-        if article.title.is_none() {
+        if article.author.is_none() {
             for xpath_author in &global_config.xpath_author {
-                if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) {
+                if let Ok(author) = Util::extract_value(&context, xpath_author) {
                     debug!("Article author: '{}'", author);
                     article.author = Some(author);
                     break;
@@ -746,7 +644,7 @@ impl ArticleScraper {
         // try to get the date
         if let Some(config) = config {
             for xpath_date in &config.xpath_date {
-                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                if let Ok(date_string) = Util::extract_value(&context, xpath_date) {
                     debug!("Article date: '{}'", date_string);
                     if let Ok(date) = DateTime::from_str(&date_string) {
                         article.date = Some(date);
@@ -758,9 +656,9 @@ impl ArticleScraper {
             }
         }
 
-        if article.title.is_none() {
+        if article.date.is_none() {
             for xpath_date in &global_config.xpath_date {
-                if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) {
+                if let Ok(date_string) = Util::extract_value(&context, xpath_date) {
                     debug!("Article date: '{}'", date_string);
                     if let Ok(date) = DateTime::from_str(&date_string) {
                         article.date = Some(date);
@@ -808,7 +706,7 @@ impl ArticleScraper {
     ) -> Result<bool, ScraperError> {
         let mut found_something = false;
         {
-            let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+            let node_vec = Util::evaluate_xpath(context, xpath, false)?;
             for mut node in node_vec {
                 if node.get_property("style").is_some() && node.remove_property("style").is_err() {
                     return Err(ScraperErrorKind::Xml.into());
@@ -876,7 +774,7 @@ impl ArticleScraper {
         // this prevents libxml from self closing non void elements such as iframe
         let xpath = "//*[not(node())]";
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.get_name() == "meta" {
                 continue;

View file

@@ -41,13 +41,19 @@ async fn phoronix() {
 #[tokio::test(flavor = "current_thread")]
 async fn youtube() {
-    let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap();
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap();
     let grabber = ArticleScraper::new(None).await;
     let article = grabber.parse(&url, false, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
 
     assert_eq!(
-        article.html,
-        Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
+        article.title.as_deref(),
+        Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
     );
+    assert!(article
+        .html
+        .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
+        .unwrap_or(false));
 }

View file

@@ -1,8 +1,15 @@
 use failure::ResultExt;
-use reqwest::header::{HeaderMap, HeaderValue, HeaderName};
+use libxml::{tree::Node, xpath::Context};
+use reqwest::{
+    header::{HeaderMap, HeaderName, HeaderValue},
+    Response,
+};
 use tokio::fs::DirEntry;
 
-use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}};
+use crate::{
+    config::ConfigEntry,
+    error::{ScraperError, ScraperErrorKind},
+};
 
 pub struct Util;
@@ -15,7 +22,7 @@ impl Util {
         }
     }
 
-    pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
+    pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
         let value = &line[identifier.len()..];
         let value = value.trim();
         match value.find('#') {
@@ -39,23 +46,164 @@ impl Util {
         }
     }
 
-    pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> {
+    pub fn generate_headers(
+        site_specific_rule: Option<&ConfigEntry>,
+        global_rule: &ConfigEntry,
+    ) -> Result<HeaderMap, ScraperError> {
         let mut headers = HeaderMap::new();
 
         if let Some(config) = site_specific_rule {
             for header in &config.header {
-                let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
-                let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+                let name = HeaderName::from_bytes(header.name.as_bytes())
+                    .context(ScraperErrorKind::Config)?;
+                let value = header
+                    .value
+                    .parse::<HeaderValue>()
+                    .context(ScraperErrorKind::Config)?;
                 headers.insert(name, value);
             }
         }
         for header in &global_rule.header {
-            let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
-            let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+            let name =
+                HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+            let value = header
+                .value
+                .parse::<HeaderValue>()
+                .context(ScraperErrorKind::Config)?;
             headers.insert(name, value);
         }
 
         Ok(headers)
     }
+
+    pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
+        let res = Self::evaluate_xpath(&xpath_ctx, &xpath_page_link, false).ok()?;
+        let mut url = None;
+        for node in res {
+            let content = node.get_content();
+            let url_str = if content.trim().is_empty() && node.has_attribute("href") {
+                node.get_attribute("href").unwrap()
+            } else {
+                content
+            };
+
+            if let Ok(parsed_url) = url::Url::parse(&url_str) {
+                url = Some(parsed_url);
+                break;
+            }
+        }
+
+        url
+    }
+
+    pub fn evaluate_xpath(
+        xpath_ctx: &Context,
+        xpath: &str,
+        thorw_if_empty: bool,
+    ) -> Result<Vec<Node>, ScraperError> {
+        let res = xpath_ctx.evaluate(xpath).map_err(|()| {
+            log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
+            ScraperErrorKind::Xml
+        })?;
+
+        let node_vec = res.get_nodes_as_vec();
+
+        if node_vec.is_empty() {
+            log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
+            if thorw_if_empty {
+                return Err(ScraperErrorKind::Xml.into());
+            }
+        }
+
+        Ok(node_vec)
+    }
+
+    pub fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
+        if response.status().is_success() {
+            if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
+                if let Ok(content_type) = content_type.to_str() {
+                    if content_type.contains("text/html") {
+                        return Ok(true);
+                    }
+                }
+            }
+
+            log::error!("Content type is not text/HTML");
+            return Ok(false);
+        }
+
+        log::error!("Failed to determine content type");
+        Err(ScraperErrorKind::Http.into())
+    }
+
+    pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
+        if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
+            log::debug!("Article url redirects to '{}'", response.url().as_str());
+            return Some(response.url().clone());
+        } else if response.url() != original_url {
+            return Some(response.url().clone());
+        }
+
+        None
+    }
+
+    pub fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
+
+        if let Some(val) = node_vec.get(0) {
+            return Ok(val.get_content());
+        }
+
+        Err(ScraperErrorKind::Xml.into())
+    }
+
+    pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+        let node_vec = Util::evaluate_xpath(context, xpath, true)?;
+        let mut val = String::new();
+        for node in node_vec {
+            let part = node
+                .get_content()
+                .split_whitespace()
+                .map(|s| format!("{} ", s))
+                .collect::<String>();
+            val.push_str(&part);
+            val.push_str(" ");
+        }
+
+        Ok(val.trim().to_string())
+    }
+
+    pub fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
+        let mut ancestor = xpath.to_string();
+        if ancestor.starts_with("//") {
+            ancestor = ancestor.chars().skip(2).collect();
+        }
+
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
+        let node_vec = Util::evaluate_xpath(context, query, false)?;
+        for mut node in node_vec {
+            node.unlink();
+        }
+
+        Ok(())
+    }
+
+    pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
+        let xpath = &format!(
+            "//*[contains(@class, '{}') or contains(@id, '{}')]",
+            id_or_class, id_or_class
+        );
+
+        let mut ancestor = xpath.clone();
+        if ancestor.starts_with("//") {
+            ancestor = ancestor.chars().skip(2).collect();
+        }
+
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
+        let node_vec = Util::evaluate_xpath(context, query, false)?;
+        for mut node in node_vec {
+            node.unlink();
+        }
+
+        Ok(())
+    }
 }
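A side note on the relocated strip_node: it wraps the caller's xpath in a [not(ancestor::…)] predicate so only the outermost matches are unlinked, which keeps nested matches from being touched after their parent is already gone. A small self-contained demonstration of that guard (the HTML snippet and main wrapper are illustrative, not from the repo):

use libxml::parser::Parser;
use libxml::xpath::Context;

fn main() {
    // Illustrative markup with a nested match.
    let html = r#"<html><body><div class="ad"><div class="ad">nested</div></div><p>keep me</p></body></html>"#;

    let doc = Parser::default_html().parse_string(html).expect("invalid HTML");
    let ctx = Context::new(&doc).expect("failed to create xpath context");

    // The ancestor guard strip_node builds from "//div[@class='ad']": the
    // predicate filters out any match that sits inside another match, so we
    // never unlink a node whose parent has already been removed.
    let xpath = "//div[@class='ad']";
    let ancestor = &xpath[2..]; // strip the leading "//", as strip_node does
    let query = format!("{}[not(ancestor::{})]", xpath, ancestor);

    for mut node in ctx.evaluate(&query).expect("xpath evaluation failed").get_nodes_as_vec() {
        node.unlink();
    }
    // Only the outer <div class="ad"> was matched and removed; <p>keep me</p> remains.
}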