From 82a0a46323358da04a156bb036c4d11ea2efa85a Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Sun, 7 Jun 2020 12:39:44 +0200
Subject: [PATCH] special handling for youtube videos

---
 src/lib.rs     | 23 +++++++++++++++++++++++
 src/youtube.rs | 26 ++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 src/youtube.rs

diff --git a/src/lib.rs b/src/lib.rs
index d182428..a748e05 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 mod article;
 mod config;
 mod error;
+mod youtube;
 pub mod images;
 
 use self::error::{ScraperError, ScraperErrorKind};
@@ -58,6 +59,11 @@ impl ArticleScraper {
         client: &Client,
     ) -> Result<Article, ScraperError> {
         info!("Scraping article: '{}'", url.as_str());
+
+        if let Some(article) = youtube::Youtube::handle(&url) {
+            return Ok(article);
+        }
+
         let response = client
             .head(url.clone())
             .send()
@@ -834,4 +840,21 @@ mod tests {
             ))
         );
     }
+
+    #[tokio::test(basic_scheduler)]
+    async fn youtube() {
+        let config_path = PathBuf::from(r"./resources/tests/");
+        let url = url::Url::parse(
+            "https://www.youtube.com/watch?v=lHRkYLcmFY8",
+        )
+        .unwrap();
+
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, false, &Client::new()).await.unwrap();
+
+        assert_eq!(
+            article.html,
+            Some("".into())
+        );
+    }
 }
diff --git a/src/youtube.rs b/src/youtube.rs
new file mode 100644
index 0000000..0c62506
--- /dev/null
+++ b/src/youtube.rs
@@ -0,0 +1,26 @@
+use crate::article::Article;
+
+pub struct Youtube;
+
+impl Youtube {
+    pub fn handle(url: &url::Url) -> Option<Article> {
+        if url.host_str() == Some("youtube.com") || url.host_str() == Some("www.youtube.com") {
+            let regex = regex::Regex::new(r#"youtube\.com/watch\?v=(.*)"#).unwrap();
+            if let Some(captures) = regex.captures(url.as_str()) {
+                if let Some(video_id) = captures.get(1) {
+                    let html = format!("", video_id.as_str());
+
+                    return Some(Article {
+                        title: None,
+                        date: None,
+                        author: None,
+                        url: url.clone(),
+                        html: Some(html),
+                    })
+                }
+            }
+        }
+
+        None
+    }
+}
\ No newline at end of file