
only use builtin youtube parsing if no config is provided

Jan Lukas Gernert 2020-06-07 13:21:53 +02:00
parent 34eaf1eeb1
commit 6b6c52f315
4 changed files with 44 additions and 48 deletions


@@ -19,3 +19,4 @@ chrono = "0.4"
base64 = "0.12"
image = "0.23"
log = "0.4"
parking_lot = "0.10"
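(Aside: the new dependency matters for the lock-handling changes further down. Unlike std::sync::RwLock, parking_lot's RwLock cannot be poisoned, so write() returns the guard directly instead of a Result. A minimal runnable sketch, not taken from the diff:)

use parking_lot::RwLock;
use std::collections::HashMap;

fn main() {
    // parking_lot::RwLock::write() yields the guard directly (no poisoning,
    // no Result), which is why the .expect("Failed to lock config file cache")
    // calls can be dropped in the changes below.
    let cache: RwLock<Option<HashMap<String, String>>> = RwLock::new(None);
    cache.write().replace(HashMap::new());
    assert!(cache.read().is_some());
}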


@@ -1,7 +1,7 @@
use self::error::{ConfigError, ConfigErrorKind};
use failure::ResultExt;
use log::warn;
use std::collections;
use std::collections::HashMap;
use std::fs;
use std::io;
use std::io::BufRead;
@@ -11,7 +11,7 @@ use std::path::PathBuf;
mod macros;
mod error;
pub type ConfigCollection = collections::HashMap<String, GrabberConfig>;
pub type ConfigCollection = HashMap<String, GrabberConfig>;
#[derive(Clone)]
pub struct Replace {
@@ -43,8 +43,7 @@ impl GrabberConfig {
let paths = fs::read_dir(directory).context(ConfigErrorKind::IO)?;
let mut collection: collections::HashMap<String, GrabberConfig> =
collections::HashMap::new();
let mut collection: HashMap<String, GrabberConfig> = HashMap::new();
for path in paths {
if let Ok(path) = path {


@@ -15,11 +15,12 @@ use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context;
use log::{debug, error, info, warn};
use parking_lot::RwLock;
use reqwest::{Client, Response};
use std::collections;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::{Arc, RwLock};
use std::sync::Arc;
use std::thread;
pub struct ArticleScraper {
@@ -34,15 +35,9 @@ impl ArticleScraper {
let locked_config_files = config_files.clone();
thread::spawn(move || {
if let Ok(config_files) = GrabberConfig::parse_directory(&config_path) {
locked_config_files
.write()
.expect("Failed to lock config file cache")
.replace(config_files);
locked_config_files.write().replace(config_files);
} else {
locked_config_files
.write()
.expect("Failed to lock config file cache")
.replace(collections::HashMap::new());
locked_config_files.write().replace(HashMap::new());
}
});
@@ -60,9 +55,12 @@ impl ArticleScraper {
) -> Result<Article, ScraperError> {
info!("Scraping article: '{}'", url.as_str());
// custom youtube handling, but prefer config if exists
if !self.grabber_config_exists("youtube.com")? {
if let Some(article) = youtube::Youtube::handle(&url) {
return Ok(article);
}
}
let response = client
.head(url.clone())
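(As a standalone illustration of the new precedence rule above — the pick_handler helper is hypothetical, not part of the crate: built-in YouTube handling only runs when no youtube.com grabber config has been parsed.)

fn pick_handler(has_youtube_config: bool) -> &'static str {
    // Hypothetical helper mirroring the check above: a shipped youtube.com
    // config takes precedence over the built-in iframe-embed handling.
    if !has_youtube_config {
        "builtin-youtube"
    } else {
        "config-driven"
    }
}

fn main() {
    assert_eq!(pick_handler(false), "builtin-youtube");
    assert_eq!(pick_handler(true), "config-driven");
}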
@@ -119,11 +117,6 @@ impl ArticleScraper {
return Err(error);
}
// if let Err(error) = ArticleScraper::eliminate_noscript_tag(&context) {
// error!("Eliminating <noscript> tag failed - {}", error);
// return Err(error)
// }
if download_images {
if let Err(error) = self
.image_downloader
@@ -304,7 +297,8 @@ impl ArticleScraper {
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) {
if let Ok(content_type) = content_type.to_str() {
let regex = regex::Regex::new(r#"charset=([^"']+)"#).unwrap();
let regex =
regex::Regex::new(r#"charset=([^"']+)"#).expect("Failed to parse regex");
if let Some(captures) = regex.captures(content_type) {
if let Some(regex_match) = captures.get(1) {
return Some(regex_match.as_str());
@@ -316,7 +310,8 @@ impl ArticleScraper {
}
fn get_encoding_from_html(html: &str) -> Option<&str> {
let regex = regex::Regex::new(r#"<meta.*?charset=([^"']+)"#).unwrap();
let regex =
regex::Regex::new(r#"<meta.*?charset=([^"']+)"#).expect("Failed to parse regex");
if let Some(captures) = regex.captures(html) {
if let Some(regex_match) = captures.get(1) {
return Some(regex_match.as_str());
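(Both charset regexes behave the same way on typical inputs; a quick runnable check of the header variant, using a made-up Content-Type value:)

fn main() {
    // Same pattern as get_encoding_from_http_header() above, applied to a
    // made-up Content-Type value.
    let regex = regex::Regex::new(r#"charset=([^"']+)"#).expect("Failed to parse regex");
    let content_type = "text/html; charset=utf-8";
    let encoding = regex
        .captures(content_type)
        .and_then(|captures| captures.get(1))
        .map(|regex_match| regex_match.as_str());
    assert_eq!(encoding, Some("utf-8"));
}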
@@ -339,24 +334,26 @@ impl ArticleScraper {
None
}
fn get_grabber_config(&self, url: &url::Url) -> Result<GrabberConfig, ScraperError> {
let config_name = match url.host_str() {
fn get_host_name(url: &url::Url) -> Result<String, ScraperError> {
match url.host_str() {
Some(name) => {
let mut name = name;
if name.starts_with("www.") {
if name.starts_with("www.") && name.len() > 4 {
name = &name[4..]
}
name
Ok(name.into())
}
None => {
error!("Getting config failed due to bad Url");
return Err(ScraperErrorKind::Config.into());
}
};
}
}
let config_name = config_name.to_owned() + ".txt";
fn get_grabber_config(&self, url: &url::Url) -> Result<GrabberConfig, ScraperError> {
let config_name = Self::get_host_name(url)? + ".txt";
if let Some(config_files) = &*self.config_files.read().unwrap() {
if let Some(config_files) = self.config_files.read().as_ref() {
match config_files.get(&config_name) {
Some(config) => Ok(config.clone()),
None => {
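(The extracted get_host_name() now also guards the "www." strip with a length check; a runnable sketch of just that normalization — the strip_www function is illustrative, not the crate's API:)

fn strip_www(name: &str) -> &str {
    // Mirrors the refactored host handling: strip a leading "www." unless
    // that would leave the host empty.
    if name.starts_with("www.") && name.len() > 4 {
        &name[4..]
    } else {
        name
    }
}

fn main() {
    assert_eq!(strip_www("www.youtube.com"), "youtube.com");
    assert_eq!(strip_www("youtube.com"), "youtube.com");
    // The new len() > 4 guard keeps a pathological host of exactly "www."
    // from being reduced to an empty string.
    assert_eq!(strip_www("www."), "www.");
}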
@@ -370,6 +367,15 @@ impl ArticleScraper {
}
}
fn grabber_config_exists(&self, host: &str) -> Result<bool, ScraperError> {
if let Some(config_files) = self.config_files.read().as_ref() {
Ok(config_files.contains_key(&(host.to_owned() + ".txt")))
} else {
error!("Config files have not been parsed yet.");
Err(ScraperErrorKind::Config.into())
}
}
fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
if response.status().is_success() {
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
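(The new existence check is just a lookup against the same "<host>.txt" keys the config parser produces; illustrated with made-up data:)

use std::collections::HashMap;

fn main() {
    // Parsed grabber configs are keyed as "<host>.txt", so the new
    // grabber_config_exists() boils down to a map lookup (made-up data).
    let mut configs: HashMap<String, ()> = HashMap::new();
    configs.insert("youtube.com.txt".to_owned(), ());

    let host = "youtube.com";
    assert!(configs.contains_key(&(host.to_owned() + ".txt")));
}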
@@ -778,22 +784,6 @@ impl ArticleScraper {
Ok(())
}
// fn eliminate_noscript_tag(context: &Context) -> Result<(), ScraperError> {
// let xpath = "//noscript";
// let node_vec = Self::evaluate_xpath(context, xpath, false)?;
// for mut node in node_vec {
// if let Some(mut parent) = node.get_parent() {
// node.unlink();
// let children = node.get_child_nodes();
// for mut child in children {
// child.unlink();
// let _ = parent.add_child(&mut child);
// }
// }
// }
// Ok(())
// }
}
#[cfg(test)]


@@ -1,11 +1,17 @@
use crate::article::Article;
use crate::ArticleScraper;
pub struct Youtube;
impl Youtube {
pub fn handle(url: &url::Url) -> Option<Article> {
if url.host_str() == Some("youtube.com") || url.host_str() == Some("www.youtube.com") {
let regex = regex::Regex::new(r#"youtube\.com/watch\?v=(.*)"#).unwrap();
let host_name = match ArticleScraper::get_host_name(url) {
Ok(host_name) => host_name,
Err(_) => return None,
};
if &host_name == "youtube.com" {
let regex =
regex::Regex::new(r#"youtube\.com/watch\?v=(.*)"#).expect("Failed to parse regex");
if let Some(captures) = regex.captures(url.as_str()) {
if let Some(video_id) = captures.get(1) {
let html = format!("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/{}\" allowfullscreen></iframe>", video_id.as_str());