mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
Only follow the single-page link if the XPath result isn't empty
This commit is contained in:
parent
e58acf828c
commit
afe661fe6c
1 changed file with 8 additions and 6 deletions
|
@@ -171,6 +171,7 @@ impl ArticleScraper {
|
||||||
xpath_single_page_link
|
xpath_single_page_link
|
||||||
);
|
);
|
||||||
if let Ok(result) = xpath_ctx.findvalue(&xpath_single_page_link, None) {
|
if let Ok(result) = xpath_ctx.findvalue(&xpath_single_page_link, None) {
|
||||||
|
if !result.trim().is_empty() {
|
||||||
// parse again with single page url
|
// parse again with single page url
|
||||||
debug!("Single page link found '{}'", result);
|
debug!("Single page link found '{}'", result);
|
||||||
let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
|
let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
|
||||||
|
@@ -179,6 +180,7 @@ impl ArticleScraper {
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ArticleScraper::extract_metadata(&xpath_ctx, config, article);
|
ArticleScraper::extract_metadata(&xpath_ctx, config, article);
|
||||||
ArticleScraper::strip_junk(&xpath_ctx, config, &url);
|
ArticleScraper::strip_junk(&xpath_ctx, config, &url);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue