Mirror of https://gitlab.com/news-flash/article_scraper.git

Special handling when trying to find single-page links; fixes YouTube

Jan Lukas Gernert, 2022-10-07 08:48:09 +02:00
parent 7b1b027c6d, commit 8c2af14871
6 changed files with 226 additions and 174 deletions


@@ -1,8 +1,15 @@
 use failure::ResultExt;
-use reqwest::header::{HeaderMap, HeaderValue, HeaderName};
+use libxml::{tree::Node, xpath::Context};
+use reqwest::{
+    header::{HeaderMap, HeaderName, HeaderValue},
+    Response,
+};
+use tokio::fs::DirEntry;
 
-use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}};
+use crate::{
+    config::ConfigEntry,
+    error::{ScraperError, ScraperErrorKind},
+};
 
 pub struct Util;
@@ -15,7 +22,7 @@ impl Util {
         }
     }
 
-    pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
+    pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
         let value = &line[identifier.len()..];
         let value = value.trim();
         match value.find('#') {
@@ -39,23 +46,164 @@ impl Util {
         }
     }
 
-    pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> {
+    pub fn generate_headers(
+        site_specific_rule: Option<&ConfigEntry>,
+        global_rule: &ConfigEntry,
+    ) -> Result<HeaderMap, ScraperError> {
         let mut headers = HeaderMap::new();
 
         if let Some(config) = site_specific_rule {
             for header in &config.header {
-                let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
-                let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+                let name = HeaderName::from_bytes(header.name.as_bytes())
+                    .context(ScraperErrorKind::Config)?;
+                let value = header
+                    .value
+                    .parse::<HeaderValue>()
+                    .context(ScraperErrorKind::Config)?;
                 headers.insert(name, value);
             }
         }
+
         for header in &global_rule.header {
-            let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
-            let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+            let name =
+                HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+            let value = header
+                .value
+                .parse::<HeaderValue>()
+                .context(ScraperErrorKind::Config)?;
             headers.insert(name, value);
         }
+
         Ok(headers)
     }
+
+    pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
+        let res = Self::evaluate_xpath(&xpath_ctx, &xpath_page_link, false).ok()?;
+        let mut url = None;
+        for node in res {
+            let content = node.get_content();
+            let url_str = if content.trim().is_empty() && node.has_attribute("href") {
+                node.get_attribute("href").unwrap()
+            } else {
+                content
+            };
+
+            if let Ok(parsed_url) = url::Url::parse(&url_str) {
+                url = Some(parsed_url);
+                break;
+            }
+        }
+
+        url
+    }
+
+    pub fn evaluate_xpath(
+        xpath_ctx: &Context,
+        xpath: &str,
+        throw_if_empty: bool,
+    ) -> Result<Vec<Node>, ScraperError> {
+        let res = xpath_ctx.evaluate(xpath).map_err(|()| {
+            log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
+            ScraperErrorKind::Xml
+        })?;
+
+        let node_vec = res.get_nodes_as_vec();
+
+        if node_vec.is_empty() {
+            log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
+            if throw_if_empty {
+                return Err(ScraperErrorKind::Xml.into());
+            }
+        }
+
+        Ok(node_vec)
+    }
+
+    pub fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
+        if response.status().is_success() {
+            if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
+                if let Ok(content_type) = content_type.to_str() {
+                    if content_type.contains("text/html") {
+                        return Ok(true);
+                    }
+                }
+            }
+
+            log::error!("Content type is not text/HTML");
+            return Ok(false);
+        }
+
+        log::error!("Failed to determine content type");
+        Err(ScraperErrorKind::Http.into())
+    }
+
+    pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
+        if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
+            log::debug!("Article url redirects to '{}'", response.url().as_str());
+            return Some(response.url().clone());
+        } else if response.url() != original_url {
+            return Some(response.url().clone());
+        }
+
+        None
+    }
+
+    pub fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
+        if let Some(val) = node_vec.get(0) {
+            return Ok(val.get_content());
+        }
+
+        Err(ScraperErrorKind::Xml.into())
+    }
+
+    pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
+        let node_vec = Util::evaluate_xpath(context, xpath, true)?;
+        let mut val = String::new();
+        for node in node_vec {
+            let part = node
+                .get_content()
+                .split_whitespace()
+                .map(|s| format!("{} ", s))
+                .collect::<String>();
+            val.push_str(&part);
+            val.push_str(" ");
+        }
+
+        Ok(val.trim().to_string())
+    }
+
+    pub fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
+        let mut ancestor = xpath.to_string();
+        if ancestor.starts_with("//") {
+            ancestor = ancestor.chars().skip(2).collect();
+        }
+
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
+        let node_vec = Util::evaluate_xpath(context, query, false)?;
+        for mut node in node_vec {
+            node.unlink();
+        }
+
+        Ok(())
+    }
+
+    pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
+        let xpath = &format!(
+            "//*[contains(@class, '{}') or contains(@id, '{}')]",
+            id_or_class, id_or_class
+        );
+
+        let mut ancestor = xpath.clone();
+        if ancestor.starts_with("//") {
+            ancestor = ancestor.chars().skip(2).collect();
+        }
+
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
+        let node_vec = Util::evaluate_xpath(context, query, false)?;
+        for mut node in node_vec {
+            node.unlink();
+        }
+
+        Ok(())
+    }
 }
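
For context, a rough usage sketch (not part of the commit) of how the new find_page_url helper might be driven. The caller function, the parser setup, and the xpath_page_link value are assumptions for illustration; libxml is the crate this file already imports.

use libxml::parser::Parser;
use libxml::xpath::Context;

// Hypothetical caller: parse an article page and try to resolve a
// "view the whole article on a single page" link via a config-supplied XPath.
fn resolve_single_page_link(html: &str, xpath_page_link: &str) -> Option<url::Url> {
    let parser = Parser::default_html();       // libxml's tolerant HTML parser
    let doc = parser.parse_string(html).ok()?; // give up on unparsable input
    let ctx = Context::new(&doc).ok()?;        // XPath evaluation context
    // find_page_url checks the node text first and falls back to the href
    // attribute; that fallback is what makes YouTube pages resolvable.
    Util::find_page_url(&ctx, xpath_page_link)
}

Returning Option rather than Result suits this call site: a missing single-page link just means there is nothing more to fetch, not an error.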