mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
special handling for youtube videos
This commit is contained in:
parent
a871d5b82e
commit
82a0a46323
2 changed files with 49 additions and 0 deletions
23
src/lib.rs
23
src/lib.rs
|
@ -1,6 +1,7 @@
|
||||||
mod article;
|
mod article;
|
||||||
mod config;
|
mod config;
|
||||||
mod error;
|
mod error;
|
||||||
|
mod youtube;
|
||||||
pub mod images;
|
pub mod images;
|
||||||
|
|
||||||
use self::error::{ScraperError, ScraperErrorKind};
|
use self::error::{ScraperError, ScraperErrorKind};
|
||||||
|
@ -58,6 +59,11 @@ impl ArticleScraper {
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<Article, ScraperError> {
|
) -> Result<Article, ScraperError> {
|
||||||
info!("Scraping article: '{}'", url.as_str());
|
info!("Scraping article: '{}'", url.as_str());
|
||||||
|
|
||||||
|
if let Some(article) = youtube::Youtube::handle(&url) {
|
||||||
|
return Ok(article);
|
||||||
|
}
|
||||||
|
|
||||||
let response = client
|
let response = client
|
||||||
.head(url.clone())
|
.head(url.clone())
|
||||||
.send()
|
.send()
|
||||||
|
@ -834,4 +840,21 @@ mod tests {
|
||||||
))
|
))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test(basic_scheduler)]
|
||||||
|
async fn youtube() {
|
||||||
|
let config_path = PathBuf::from(r"./resources/tests/");
|
||||||
|
let url = url::Url::parse(
|
||||||
|
"https://www.youtube.com/watch?v=lHRkYLcmFY8",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let grabber = ArticleScraper::new(config_path);
|
||||||
|
let article = grabber.parse(url, false, &Client::new()).await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
article.html,
|
||||||
|
Some("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/lHRkYLcmFY8\" allowfullscreen></iframe>".into())
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
26
src/youtube.rs
Normal file
26
src/youtube.rs
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
use crate::article::Article;
|
||||||
|
|
||||||
|
pub struct Youtube;
|
||||||
|
|
||||||
|
impl Youtube {
|
||||||
|
pub fn handle(url: &url::Url) -> Option<Article> {
|
||||||
|
if url.host_str() == Some("youtube.com") || url.host_str() == Some("www.youtube.com") {
|
||||||
|
let regex = regex::Regex::new(r#"youtube\.com/watch\?v=(.*)"#).unwrap();
|
||||||
|
if let Some(captures) = regex.captures(url.as_str()) {
|
||||||
|
if let Some(video_id) = captures.get(1) {
|
||||||
|
let html = format!("<iframe width=\"650\" height=\"350\" frameborder=\"0\" src=\"https://www.youtube-nocookie.com/embed/{}\" allowfullscreen></iframe>", video_id.as_str());
|
||||||
|
|
||||||
|
return Some(Article {
|
||||||
|
title: None,
|
||||||
|
date: None,
|
||||||
|
author: None,
|
||||||
|
url: url.clone(),
|
||||||
|
html: Some(html),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue