1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

next page fixes

This commit is contained in:
Jan Lukas Gernert 2023-04-25 18:57:24 +02:00
parent 37d317ad86
commit a0161e92d4
3 changed files with 28 additions and 3268 deletions

File diff suppressed because one or more lines are too long

@ -1 +1 @@
Subproject commit f552f1d5178786e3bdbdbe88952244eac8e8838f Subproject commit 3ad5a9f8bb8222507ef01aca526d5ca42e309df1

View file

@ -71,6 +71,10 @@ impl FullTextParser {
let old_document = Self::parse_html(html, config, global_config)?; let old_document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&old_document)?; let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
log::info!("Next page url: {url}");
}
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article); metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
if article.thumbnail_url.is_none() { if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, &mut article); Self::check_for_thumbnail(&xpath_ctx, &mut article);
@ -88,10 +92,6 @@ impl FullTextParser {
return Err(FullTextParserError::Scrape); return Err(FullTextParserError::Scrape);
} }
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
log::info!("Next page url: {url}");
}
if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) { if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
log::error!("Preventing self closing tags failed - '{error}'"); log::error!("Preventing self closing tags failed - '{error}'");
return Err(error); return Err(error);
@ -240,6 +240,8 @@ impl FullTextParser {
} }
metadata::extract(&xpath_ctx, config, Some(global_config), article); metadata::extract(&xpath_ctx, config, Some(global_config), article);
let mut next_page_url = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
if article.thumbnail_url.is_none() { if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article); Self::check_for_thumbnail(&xpath_ctx, article);
} }
@ -254,14 +256,17 @@ impl FullTextParser {
} }
} }
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { while let Some(url) = next_page_url.take() {
log::debug!("next page"); log::debug!("next page");
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = Self::download(&url, client, headers).await?; let html = Self::download(&url, client, headers).await?;
document = Self::parse_html(&html, config, global_config)?; document = Self::parse_html(&html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?; xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::prep_content(&xpath_ctx, config, global_config, &url, &document); if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
next_page_url.replace(url);
}
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body { if !found_body {
@ -1005,19 +1010,20 @@ impl FullTextParser {
context: &Context, context: &Context,
config: Option<&ConfigEntry>, config: Option<&ConfigEntry>,
global_config: &ConfigEntry, global_config: &ConfigEntry,
article_url: &url::Url,
) -> Option<url::Url> { ) -> Option<url::Url> {
if let Some(config) = config { if let Some(config) = config {
if let Some(next_page_xpath) = config.next_page_link.as_deref() { if let Some(next_page_xpath) = config.next_page_link.as_deref() {
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
{ {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) { if let Some(next_page_url) = Self::parse_url(&next_page_string, article_url) {
return Some(next_page_url); return Some(next_page_url);
} }
} }
} }
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() { } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") { if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) { if let Some(next_page_url) = Self::parse_url(&next_page_string, article_url) {
return Some(next_page_url); return Some(next_page_url);
} }
} }
@ -1027,6 +1033,19 @@ impl FullTextParser {
None None
} }
/// Turns a link found in the article into a [`url::Url`].
///
/// The string is parsed once. If parsing fails specifically because the
/// link is relative (`url::ParseError::RelativeUrlWithoutBase`), the trimmed
/// link is joined onto `article_url` instead.
///
/// Returns `None` when the link fails to parse for any other reason, or when
/// joining the relative link onto `article_url` fails.
fn parse_url(url: &str, article_url: &url::Url) -> Option<url::Url> {
    match url::Url::parse(url) {
        Ok(parsed) => Some(parsed),
        // Relative link: resolve it against the article's own URL.
        Err(url::ParseError::RelativeUrlWithoutBase) => article_url.join(url.trim()).ok(),
        Err(_) => None,
    }
}
fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> { fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> {
if let Ok(mut head_node) = Node::new("head", None, document) { if let Ok(mut head_node) = Node::new("head", None, document) {
if let Ok(()) = root.add_prev_sibling(&mut head_node) { if let Ok(()) = root.add_prev_sibling(&mut head_node) {