mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
next page fixes
This commit is contained in:
parent
37d317ad86
commit
a0161e92d4
3 changed files with 28 additions and 3268 deletions
File diff suppressed because one or more lines are too long
|
@ -1 +1 @@
|
||||||
Subproject commit f552f1d5178786e3bdbdbe88952244eac8e8838f
|
Subproject commit 3ad5a9f8bb8222507ef01aca526d5ca42e309df1
|
|
@ -71,6 +71,10 @@ impl FullTextParser {
|
||||||
let old_document = Self::parse_html(html, config, global_config)?;
|
let old_document = Self::parse_html(html, config, global_config)?;
|
||||||
let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
|
let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
|
||||||
|
|
||||||
|
if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
|
||||||
|
log::info!("Next page url: {url}");
|
||||||
|
}
|
||||||
|
|
||||||
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
||||||
if article.thumbnail_url.is_none() {
|
if article.thumbnail_url.is_none() {
|
||||||
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
||||||
|
@ -88,10 +92,6 @@ impl FullTextParser {
|
||||||
return Err(FullTextParserError::Scrape);
|
return Err(FullTextParserError::Scrape);
|
||||||
}
|
}
|
||||||
|
|
||||||
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
|
||||||
log::info!("Next page url: {url}");
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
|
if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
|
||||||
log::error!("Preventing self closing tags failed - '{error}'");
|
log::error!("Preventing self closing tags failed - '{error}'");
|
||||||
return Err(error);
|
return Err(error);
|
||||||
|
@ -240,6 +240,8 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
||||||
|
let mut next_page_url = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url);
|
||||||
|
|
||||||
if article.thumbnail_url.is_none() {
|
if article.thumbnail_url.is_none() {
|
||||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
}
|
}
|
||||||
|
@ -254,14 +256,17 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
while let Some(url) = next_page_url.take() {
|
||||||
log::debug!("next page");
|
log::debug!("next page");
|
||||||
|
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
let html = Self::download(&url, client, headers).await?;
|
let html = Self::download(&url, client, headers).await?;
|
||||||
document = Self::parse_html(&html, config, global_config)?;
|
document = Self::parse_html(&html, config, global_config)?;
|
||||||
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
Self::prep_content(&xpath_ctx, config, global_config, &url, &document);
|
if let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config, &article.url) {
|
||||||
|
next_page_url.replace(url);
|
||||||
|
}
|
||||||
|
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
||||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
|
||||||
if !found_body {
|
if !found_body {
|
||||||
|
@ -1005,19 +1010,20 @@ impl FullTextParser {
|
||||||
context: &Context,
|
context: &Context,
|
||||||
config: Option<&ConfigEntry>,
|
config: Option<&ConfigEntry>,
|
||||||
global_config: &ConfigEntry,
|
global_config: &ConfigEntry,
|
||||||
|
article_url: &url::Url,
|
||||||
) -> Option<url::Url> {
|
) -> Option<url::Url> {
|
||||||
if let Some(config) = config {
|
if let Some(config) = config {
|
||||||
if let Some(next_page_xpath) = config.next_page_link.as_deref() {
|
if let Some(next_page_xpath) = config.next_page_link.as_deref() {
|
||||||
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
|
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
|
||||||
{
|
{
|
||||||
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
|
if let Some(next_page_url) = Self::parse_url(&next_page_string, article_url) {
|
||||||
return Some(next_page_url);
|
return Some(next_page_url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
|
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
|
||||||
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
|
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
|
||||||
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
|
if let Some(next_page_url) = Self::parse_url(&next_page_string, article_url) {
|
||||||
return Some(next_page_url);
|
return Some(next_page_url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1027,6 +1033,19 @@ impl FullTextParser {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Turns a link string taken from a page into an absolute URL.
///
/// `url` is first parsed as given. If parsing fails only because the string
/// is a relative reference (`url::ParseError::RelativeUrlWithoutBase`), the
/// trimmed string is resolved against `article_url` instead.
///
/// Returns `None` when the string fails to parse for any other reason, or
/// when joining it onto `article_url` fails.
fn parse_url(url: &str, article_url: &url::Url) -> Option<url::Url> {
    // Parse once and branch on the outcome instead of parsing a second time.
    match url::Url::parse(url) {
        Ok(absolute_url) => Some(absolute_url),
        // A relative link needs the article's own URL as its base.
        Err(url::ParseError::RelativeUrlWithoutBase) => article_url.join(url.trim()).ok(),
        Err(_) => None,
    }
}
|
||||||
|
|
||||||
fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> {
|
fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> {
|
||||||
if let Ok(mut head_node) = Node::new("head", None, document) {
|
if let Ok(mut head_node) = Node::new("head", None, document) {
|
||||||
if let Ok(()) = root.add_prev_sibling(&mut head_node) {
|
if let Ok(()) = root.add_prev_sibling(&mut head_node) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue