1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 08:05:31 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-04-25 19:12:15 +02:00
parent a0161e92d4
commit bd413a795c
2 changed files with 10 additions and 6 deletions

View file

@@ -71,8 +71,8 @@ impl ConfigCollection {
 #[cfg(test)]
 mod tests {
-use std::path::Path;
 use super::ConfigCollection;
+use std::path::Path;
 #[tokio::test]
 async fn read_dir() {

View file

@@ -71,7 +71,8 @@ impl FullTextParser {
 let old_document = Self::parse_html(html, config, global_config)?;
 let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
-if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
+if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url)
+{
 log::info!("Next page url: {url}");
 }
@@ -240,7 +241,8 @@ impl FullTextParser {
 }
 metadata::extract(&xpath_ctx, config, Some(global_config), article);
-let mut next_page_url = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
+let mut next_page_url =
+self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
 if article.thumbnail_url.is_none() {
 Self::check_for_thumbnail(&xpath_ctx, article);
@@ -263,7 +265,9 @@ impl FullTextParser {
 let html = Self::download(&url, client, headers).await?;
 document = Self::parse_html(&html, config, global_config)?;
 xpath_ctx = Self::get_xpath_ctx(&document)?;
-if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
+if let Some(url) =
+self.check_for_next_page(&xpath_ctx, config, global_config, &article.url)
+{
 next_page_url.replace(url);
 }
 Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
@@ -1034,7 +1038,7 @@ impl FullTextParser {
 }
 fn parse_url(url: &str, article_url: &url::Url) -> Option<url::Url> {
-let is_relative_url = url::Url::parse(&url)
+let is_relative_url = url::Url::parse(url)
 .err()
 .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
 .unwrap_or(false);
@@ -1042,7 +1046,7 @@ impl FullTextParser {
 if is_relative_url {
 article_url.join(url.trim()).ok()
 } else {
-url::Url::parse(&url).ok()
+url::Url::parse(url).ok()
 }
 }