From 8c2af148718f53a4d368210772c6fbcfd5476a4e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 7 Oct 2022 08:48:09 +0200 Subject: [PATCH] special handling when trying to find single page links: fixes youtube --- src/config/config_entry.rs | 8 +- src/config/macros.rs | 6 +- src/images/mod.rs | 4 +- src/lib.rs | 206 ++++++++++--------------------------- src/tests.rs | 12 ++- src/util.rs | 164 +++++++++++++++++++++++++++-- 6 files changed, 226 insertions(+), 174 deletions(-) diff --git a/src/config/config_entry.rs b/src/config/config_entry.rs index b6411e6..8478f2f 100644 --- a/src/config/config_entry.rs +++ b/src/config/config_entry.rs @@ -114,7 +114,7 @@ impl ConfigEntry { extract_option_single!(line, next_page, next_page_link); if line.starts_with(replace_single) { - let value = Util::extract_value(replace_single, line); + let value = Util::str_extract_value(replace_single, line); let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect(); if value.len() != 2 { continue; } @@ -133,7 +133,7 @@ impl ConfigEntry { } if line.starts_with(http_header) { - let value = Util::extract_value(http_header, line); + let value = Util::str_extract_value(http_header, line); let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect(); if value.len() != 2 { continue; } @@ -152,10 +152,10 @@ impl ConfigEntry { } if line.starts_with(find) { - let to_replace = Util::extract_value(find, line).into(); + let to_replace = Util::str_extract_value(find, line).into(); if let Ok(Some(next_line)) = lines.next_line().await { - let replace_with = Util::extract_value(replace, &next_line).into(); + let replace_with = Util::str_extract_value(replace, &next_line).into(); replace_vec.push(Replace { to_replace, diff --git a/src/config/macros.rs b/src/config/macros.rs index 1fe309a..b511d4b 100644 --- a/src/config/macros.rs +++ b/src/config/macros.rs @@ -5,7 +5,7 @@ macro_rules! extract_vec_multi { $vector: ident ) => { if $line.starts_with($identifier) { - let value = Util::extract_value($identifier, $line); + let value = Util::str_extract_value($identifier, $line); let value = Util::split_values(value); let value: Vec<String> = value.iter().map(|s| s.trim().to_string()).collect(); $vector.extend(value); @@ -21,7 +21,7 @@ macro_rules! extract_vec_single { $vector: ident ) => { if $line.starts_with($identifier) { - let value = Util::extract_value($identifier, $line); + let value = Util::str_extract_value($identifier, $line); $vector.push(value.to_string()); continue; } @@ -35,7 +35,7 @@ macro_rules! 
extract_option_single { $option: ident ) => { if $line.starts_with($identifier) { - let value = Util::extract_value($identifier, $line); + let value = Util::str_extract_value($identifier, $line); $option = Some(value.to_string()); continue; } diff --git a/src/images/mod.rs b/src/images/mod.rs index 48a1bba..dbcfaff 100644 --- a/src/images/mod.rs +++ b/src/images/mod.rs @@ -1,5 +1,5 @@ use self::error::{ImageDownloadError, ImageDownloadErrorKind}; -use crate::ArticleScraper; +use crate::util::Util; use failure::ResultExt; use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; @@ -57,7 +57,7 @@ impl ImageDownloader { client: &Client, ) -> Result<(), ImageDownloadError> { let xpath = "//img"; - let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false) + let node_vec = Util::evaluate_xpath(context, xpath, false) .context(ImageDownloadErrorKind::HtmlParse)?; for mut node in node_vec { if let Some(url) = node.get_property("src") { diff --git a/src/lib.rs b/src/lib.rs index 6eef491..ef76704 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,7 @@ use libxml::tree::{Document, Node, SaveOptions}; use libxml::xpath::Context; use log::{debug, error, info, warn}; use reqwest::header::HeaderMap; -use reqwest::{Client, Response}; +use reqwest::Client; use std::path::Path; use std::str::FromStr; use util::Util; @@ -75,7 +75,7 @@ impl ArticleScraper { .context(ScraperErrorKind::Http)?; // check if url redirects and we need to pick up the new url - let url = if let Some(new_url) = ArticleScraper::check_redirect(&response, &url) { + let url = if let Some(new_url) = Util::check_redirect(&response, &url) { debug!("Url '{}' redirects to '{}'", url.as_str(), new_url.as_str()); new_url } else { @@ -83,7 +83,7 @@ impl ArticleScraper { }; // check if we are dealing with text/html - if !ArticleScraper::check_content_type(&response)? { + if !Util::check_content_type(&response)? { return Err(ScraperErrorKind::ContentType.into()); } @@ -167,23 +167,22 @@ impl ArticleScraper { "Single page link xpath specified in config '{}'", xpath_single_page_link ); - if let Ok(result) = xpath_ctx.findvalue(&xpath_single_page_link, None) { - if !result.trim().is_empty() { - // parse again with single page url - debug!("Single page link found '{}'", result); - let single_page_url = - url::Url::parse(&result).context(ScraperErrorKind::Url)?; - return self - .parse_single_page( - article, - &single_page_url, - root, - config, - global_config, - client, - ) - .await; - } + + if let Some(single_page_url) = Util::find_page_url(&xpath_ctx, &xpath_single_page_link) + { + // parse again with single page url + debug!("Single page link found '{}'", single_page_url); + + return self + .parse_single_page( + article, + &single_page_url, + root, + config, + global_config, + client, + ) + .await; } } @@ -236,28 +235,6 @@ impl ArticleScraper { })?) 
} - fn evaluate_xpath( - xpath_ctx: &Context, - xpath: &str, - thorw_if_empty: bool, - ) -> Result<Vec<Node>, ScraperError> { - let res = xpath_ctx.evaluate(xpath).map_err(|()| { - debug!("Evaluation of xpath '{}' yielded no results", xpath); - ScraperErrorKind::Xml - })?; - - let node_vec = res.get_nodes_as_vec(); - - if node_vec.is_empty() { - debug!("Evaluation of xpath '{}' yielded no results", xpath); - if thorw_if_empty { - return Err(ScraperErrorKind::Xml.into()); - } - } - - Ok(node_vec) - } - async fn parse_single_page( &self, article: &mut Article, url: &url::Url, root: &mut Node, config: Option<&ConfigEntry>, global_config: &ConfigEntry, client: &Client, ) -> Result<(), ScraperError> { @@ -278,7 +255,11 @@ impl ArticleScraper { Ok(()) } - async fn download(url: &url::Url, client: &Client, headers: HeaderMap) -> Result<String, ScraperError> { + async fn download( + url: &url::Url, + client: &Client, + headers: HeaderMap, + ) -> Result<String, ScraperError> { let response = client .get(url.as_str()) .headers(headers) .send() @@ -389,96 +370,13 @@ impl ArticleScraper { conf } - fn check_content_type(response: &Response) -> Result<bool, ScraperError> { - if response.status().is_success() { - if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) { - if let Ok(content_type) = content_type.to_str() { - if content_type.contains("text/html") { - return Ok(true); - } - } - } - - error!("Content type is not text/HTML"); - return Ok(false); - } - - error!("Failed to determine content type"); - Err(ScraperErrorKind::Http.into()) - } - - fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> { - if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT { - debug!("Article url redirects to '{}'", response.url().as_str()); - return Some(response.url().clone()); - } else if response.url() != original_url { - return Some(response.url().clone()); - } - - None - } - - fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> { - let node_vec = Self::evaluate_xpath(context, xpath, false)?; - if let Some(val) = node_vec.get(0) { - return Ok(val.get_content()); - } - - Err(ScraperErrorKind::Xml.into()) - } - - fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> { - let node_vec = Self::evaluate_xpath(context, xpath, true)?; - let mut val = String::new(); - for node in node_vec { - let part = node.get_content().split_whitespace().map(|s| format!("{} ", s)).collect::<String>(); - val.push_str(&part); - val.push_str(" "); - } - - Ok(val.trim().to_string()) - } - - fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> { - let mut ancestor = xpath.to_string(); - if ancestor.starts_with("//") { - ancestor = ancestor.chars().skip(2).collect(); - } - - let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); - let node_vec = Self::evaluate_xpath(context, query, false)?; - for mut node in node_vec { - node.unlink(); - } - Ok(()) - } - - fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> { - let xpath = &format!( - "//*[contains(@class, '{}') or contains(@id, '{}')]", - id_or_class, id_or_class - ); - - let mut ancestor = xpath.clone(); - if ancestor.starts_with("//") { - ancestor = ancestor.chars().skip(2).collect(); - } - - let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); - let node_vec = Self::evaluate_xpath(context, query, false)?; - for mut node in node_vec { - node.unlink(); - } - Ok(()) - } - fn fix_lazy_images( context: &Context, class: &str, property_url: &str, ) -> Result<(), ScraperError> { let xpath = &format!("//img[contains(@class, '{}')]", class); - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut 
node in node_vec { if let Some(correct_url) = node.get_property(property_url) { if node.set_property("src", &correct_url).is_err() { @@ -491,13 +389,13 @@ impl ArticleScraper { fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), ScraperError> { let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Some(mut parent) = node.get_parent() { if let Ok(mut video_wrapper) = parent.new_child(None, "div") { if let Ok(()) = video_wrapper.set_property("class", "videoWrapper") { if let Ok(()) = node.set_property("width", "100%") { - if let Ok(()) = node.remove_property("height") { + if let Ok(()) = node.set_property("height", "100%") { node.unlink(); video_wrapper.add_child(&mut node).map_err(|_| { error!("Failed to add iframe as child of video wrapper
"); @@ -526,7 +424,7 @@ impl ArticleScraper { let xpath_tag = tag.unwrap_or("*"); let xpath = &format!("//{}[@{}]", xpath_tag, attribute); - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node.remove_property(attribute).is_err() { return Err(ScraperErrorKind::Xml.into()); @@ -544,7 +442,7 @@ impl ArticleScraper { let xpath_tag = tag.unwrap_or("*"); let xpath = &format!("//{}", xpath_tag); - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node.set_attribute(attribute, value).is_err() { return Err(ScraperErrorKind::Xml.into()); @@ -558,7 +456,7 @@ impl ArticleScraper { xpath: &str, attribute: &str, ) -> Result { - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for node in node_vec { if let Some(value) = node.get_attribute(attribute) { return Ok(value); @@ -574,7 +472,7 @@ impl ArticleScraper { attribute: &str, article_url: &url::Url, ) -> Result<(), ScraperError> { - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Some(val) = node.get_attribute(attribute) { if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) { @@ -623,29 +521,29 @@ impl ArticleScraper { // strip specified xpath if let Some(config) = config { for xpath_strip in &config.xpath_strip { - let _ = ArticleScraper::strip_node(&context, xpath_strip); + let _ = Util::strip_node(&context, xpath_strip); } } for xpath_strip in &global_config.xpath_strip { - let _ = ArticleScraper::strip_node(&context, xpath_strip); + let _ = Util::strip_node(&context, xpath_strip); } // strip everything with specified 'id' or 'class' if let Some(config) = config { for xpaht_strip_class in &config.strip_id_or_class { - let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class); + let _ = Util::strip_id_or_class(&context, xpaht_strip_class); } } for xpaht_strip_class in &global_config.strip_id_or_class { - let _ = ArticleScraper::strip_id_or_class(&context, xpaht_strip_class); + let _ = Util::strip_id_or_class(&context, xpaht_strip_class); } // strip any element where @src attribute contains this substring if let Some(config) = config { for xpath_strip_img_src in &config.strip_image_src { - let _ = ArticleScraper::strip_node( + let _ = Util::strip_node( &context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); @@ -653,7 +551,7 @@ impl ArticleScraper { } for xpath_strip_img_src in &global_config.strip_image_src { - let _ = ArticleScraper::strip_node( + let _ = Util::strip_node( &context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); @@ -676,23 +574,23 @@ impl ArticleScraper { // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See http://blog.instapaper.com/post/730281947 - let _ = ArticleScraper::strip_node(&context, &String::from( + let _ = Util::strip_node(&context, &String::from( "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]")); // strip elements that contain style="display: none;" - let _ = ArticleScraper::strip_node( + let _ = Util::strip_node( &context, &String::from("//*[contains(@style,'display:none')]"), ); // strip all comments - let _ = ArticleScraper::strip_node(&context, 
&String::from("//comment()")); + let _ = Util::strip_node(&context, &String::from("//comment()")); // strip all empty url-tags - let _ = ArticleScraper::strip_node(&context, &String::from("//a[not(node())]")); + let _ = Util::strip_node(&context, &String::from("//a[not(node())]")); // strip all external css and fonts - let _ = ArticleScraper::strip_node(&context, &String::from("//*[@type='text/css']")); + let _ = Util::strip_node(&context, &String::from("//*[@type='text/css']")); } fn extract_metadata( @@ -704,7 +602,7 @@ impl ArticleScraper { // try to get title if let Some(config) = config { for xpath_title in &config.xpath_title { - if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) { + if let Ok(title) = Util::extract_value_merge(&context, xpath_title) { debug!("Article title: '{}'", title); article.title = Some(title); break; } @@ -714,7 +612,7 @@ impl ArticleScraper { if article.title.is_none() { for xpath_title in &global_config.xpath_title { - if let Ok(title) = ArticleScraper::extract_value_merge(&context, xpath_title) { + if let Ok(title) = Util::extract_value_merge(&context, xpath_title) { debug!("Article title: '{}'", title); article.title = Some(title); break; } @@ -725,7 +623,7 @@ impl ArticleScraper { // try to get the author if let Some(config) = config { for xpath_author in &config.xpath_author { - if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) { + if let Ok(author) = Util::extract_value(&context, xpath_author) { debug!("Article author: '{}'", author); article.author = Some(author); break; } } } - if article.title.is_none() { + if article.author.is_none() { for xpath_author in &global_config.xpath_author { - if let Ok(author) = ArticleScraper::extract_value(&context, xpath_author) { + if let Ok(author) = Util::extract_value(&context, xpath_author) { debug!("Article author: '{}'", author); article.author = Some(author); break; } @@ -746,7 +644,7 @@ impl ArticleScraper { // try to get the date if let Some(config) = config { for xpath_date in &config.xpath_date { - if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) { + if let Ok(date_string) = Util::extract_value(&context, xpath_date) { debug!("Article date: '{}'", date_string); if let Ok(date) = DateTime::from_str(&date_string) { article.date = Some(date); break; } } } } - if article.title.is_none() { + if article.date.is_none() { for xpath_date in &global_config.xpath_date { - if let Ok(date_string) = ArticleScraper::extract_value(&context, xpath_date) { + if let Ok(date_string) = Util::extract_value(&context, xpath_date) { debug!("Article date: '{}'", date_string); if let Ok(date) = DateTime::from_str(&date_string) { article.date = Some(date); @@ -808,7 +706,7 @@ impl ArticleScraper { ) -> Result<bool, ScraperError> { let mut found_something = false; { - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node.get_property("style").is_some() && node.remove_property("style").is_err() { return Err(ScraperErrorKind::Xml.into()); } @@ -876,7 +774,7 @@ impl ArticleScraper { // this prevents libxml from self closing non void elements such as iframe let xpath = "//*[not(node())]"; - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node.get_name() == "meta" { continue; } diff --git a/src/tests.rs 
b/src/tests.rs index bd47f82..e82ad2d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -41,13 +41,19 @@ async fn phoronix() { #[tokio::test(flavor = "current_thread")] async fn youtube() { - let url = url::Url::parse("https://www.youtube.com/watch?v=lHRkYLcmFY8").unwrap(); + let out_path = PathBuf::from(r"./test_output"); + let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap(); let grabber = ArticleScraper::new(None).await; let article = grabber.parse(&url, false, &Client::new()).await.unwrap(); + article.save_html(&out_path).unwrap(); assert_eq!( - article.html, - Some("".into()) + article.title.as_deref(), + Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn") ); + assert!(article + .html + .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed")) + .unwrap_or(false)); } diff --git a/src/util.rs b/src/util.rs index baac1b1..4827fab 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,8 +1,15 @@ use failure::ResultExt; -use reqwest::header::{HeaderMap, HeaderValue, HeaderName}; +use libxml::{tree::Node, xpath::Context}; +use reqwest::{ + header::{HeaderMap, HeaderName, HeaderValue}, + Response, +}; use tokio::fs::DirEntry; -use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}}; +use crate::{ + config::ConfigEntry, + error::{ScraperError, ScraperErrorKind}, +}; pub struct Util; @@ -15,7 +22,7 @@ impl Util { } } - pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str { + pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str { let value = &line[identifier.len()..]; let value = value.trim(); match value.find('#') { @@ -39,23 +46,164 @@ impl Util { } } - pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> { + pub fn generate_headers( + site_specific_rule: Option<&ConfigEntry>, + global_rule: &ConfigEntry, + ) -> Result<HeaderMap, ScraperError> { let mut headers = HeaderMap::new(); if let Some(config) = site_specific_rule { for header in &config.header { - let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?; - let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?; + let name = HeaderName::from_bytes(header.name.as_bytes()) + .context(ScraperErrorKind::Config)?; + let value = header + .value + .parse::<HeaderValue>() + .context(ScraperErrorKind::Config)?; headers.insert(name, value); } } for header in &global_rule.header { - let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?; - let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?; + let name = + HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?; + let value = header + .value + .parse::<HeaderValue>() + .context(ScraperErrorKind::Config)?; headers.insert(name, value); } Ok(headers) } + + pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> { + let res = Self::evaluate_xpath(&xpath_ctx, &xpath_page_link, false).ok()?; + let mut url = None; + + for node in res { + let content = node.get_content(); + let url_str = if content.trim().is_empty() && node.has_attribute("href") { + node.get_attribute("href").unwrap() + } else { + content + }; + + if let Ok(parsed_url) = url::Url::parse(&url_str) { + url = Some(parsed_url); + break; + } + } + + url + } + + pub fn evaluate_xpath( + xpath_ctx: &Context, + xpath: &str, + throw_if_empty: bool, + ) -> Result<Vec<Node>, ScraperError> { + let res = xpath_ctx.evaluate(xpath).map_err(|()| { + log::debug!("Evaluation of xpath '{}' yielded no 
results", xpath); + ScraperErrorKind::Xml + })?; + + let node_vec = res.get_nodes_as_vec(); + + if node_vec.is_empty() { + log::debug!("Evaluation of xpath '{}' yielded no results", xpath); + if thorw_if_empty { + return Err(ScraperErrorKind::Xml.into()); + } + } + + Ok(node_vec) + } + + pub fn check_content_type(response: &Response) -> Result { + if response.status().is_success() { + if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) { + if let Ok(content_type) = content_type.to_str() { + if content_type.contains("text/html") { + return Ok(true); + } + } + } + + log::error!("Content type is not text/HTML"); + return Ok(false); + } + + log::error!("Failed to determine content type"); + Err(ScraperErrorKind::Http.into()) + } + + pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option { + if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT { + log::debug!("Article url redirects to '{}'", response.url().as_str()); + return Some(response.url().clone()); + } else if response.url() != original_url { + return Some(response.url().clone()); + } + + None + } + + pub fn extract_value(context: &Context, xpath: &str) -> Result { + let node_vec = Util::evaluate_xpath(context, xpath, false)?; + if let Some(val) = node_vec.get(0) { + return Ok(val.get_content()); + } + + Err(ScraperErrorKind::Xml.into()) + } + + pub fn extract_value_merge(context: &Context, xpath: &str) -> Result { + let node_vec = Util::evaluate_xpath(context, xpath, true)?; + let mut val = String::new(); + for node in node_vec { + let part = node + .get_content() + .split_whitespace() + .map(|s| format!("{} ", s)) + .collect::(); + val.push_str(&part); + val.push_str(" "); + } + + Ok(val.trim().to_string()) + } + + pub fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> { + let mut ancestor = xpath.to_string(); + if ancestor.starts_with("//") { + ancestor = ancestor.chars().skip(2).collect(); + } + + let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); + let node_vec = Util::evaluate_xpath(context, query, false)?; + for mut node in node_vec { + node.unlink(); + } + Ok(()) + } + + pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> { + let xpath = &format!( + "//*[contains(@class, '{}') or contains(@id, '{}')]", + id_or_class, id_or_class + ); + + let mut ancestor = xpath.clone(); + if ancestor.starts_with("//") { + ancestor = ancestor.chars().skip(2).collect(); + } + + let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); + let node_vec = Util::evaluate_xpath(context, query, false)?; + for mut node in node_vec { + node.unlink(); + } + Ok(()) + } }