1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-04-25 19:12:15 +02:00
parent a0161e92d4
commit bd413a795c
2 changed files with 10 additions and 6 deletions

View file

@@ -71,8 +71,8 @@ impl ConfigCollection {
 #[cfg(test)]
 mod tests {
-use std::path::Path;
 use super::ConfigCollection;
+use std::path::Path;
 #[tokio::test]
 async fn read_dir() {

View file

@@ -71,7 +71,8 @@ impl FullTextParser {
 let old_document = Self::parse_html(html, config, global_config)?;
 let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
-if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
+if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url)
+{
 log::info!("Next page url: {url}");
 }
@@ -240,7 +241,8 @@ impl FullTextParser {
 }
 metadata::extract(&xpath_ctx, config, Some(global_config), article);
-let mut next_page_url = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
+let mut next_page_url =
+self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
 if article.thumbnail_url.is_none() {
 Self::check_for_thumbnail(&xpath_ctx, article);
@@ -263,7 +265,9 @@ impl FullTextParser {
 let html = Self::download(&url, client, headers).await?;
 document = Self::parse_html(&html, config, global_config)?;
 xpath_ctx = Self::get_xpath_ctx(&document)?;
-if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
+if let Some(url) =
+self.check_for_next_page(&xpath_ctx, config, global_config, &article.url)
+{
 next_page_url.replace(url);
 }
 Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
@@ -1034,7 +1038,7 @@ impl FullTextParser {
 }
 fn parse_url(url: &str, article_url: &url::Url) -> Option<url::Url> {
-let is_relative_url = url::Url::parse(&url)
+let is_relative_url = url::Url::parse(url)
 .err()
 .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
 .unwrap_or(false);
@@ -1042,7 +1046,7 @@ impl FullTextParser {
 if is_relative_url {
 article_url.join(url.trim()).ok()
 } else {
-url::Url::parse(&url).ok()
+url::Url::parse(url).ok()
 }
 }