From c19822501277244845f6a34e6766d18bffd01c3e Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Tue, 11 Apr 2023 07:49:01 +0200
Subject: [PATCH] eliminate additional head request

---
 article_scraper/src/full_text_parser/mod.rs | 35 +++++++++++----------
 article_scraper_cli/src/args.rs             |  7 +++++
 article_scraper_cli/src/main.rs             |  1 +
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 8473b30..6231057 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -20,7 +20,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Document, Node, NodeType};
 use libxml::xpath::Context;
 use reqwest::header::HeaderMap;
-use reqwest::{Client, Url};
+use reqwest::{Client, Response, Url};
 use std::collections::HashSet;
 use std::path::Path;
 use std::str::from_utf8;
@@ -114,16 +114,7 @@ impl FullTextParser {
             .ok_or(FullTextParserError::Config)?;
 
         let headers = Util::generate_headers(config, global_config)?;
-
-        let response = client
-            .head(url.clone())
-            .headers(headers)
-            .send()
-            .await
-            .map_err(|error| {
-                log::error!("Failed head request to: '{url}' - '{error}'");
-                FullTextParserError::Http
-            })?;
+        let response = Self::get_response(&url, &client, headers).await?;
 
         // check if url redirects and we need to pick up the new url
         let url = if let Some(new_url) = Util::check_redirect(&response, url) {
@@ -154,8 +145,7 @@ impl FullTextParser {
 
         Self::generate_head(&mut root, &document)?;
 
-        let headers = Util::generate_headers(config, global_config)?;
-        let html = Self::download(&url, client, headers).await?;
+        let html = Self::get_body(response).await?;
 
         // check for fingerprints
         let config = if config.is_none() {
@@ -255,7 +245,7 @@ impl FullTextParser {
         }
 
         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
-            log::debug!("");
+            log::debug!("next page");
 
             let headers = Util::generate_headers(config, global_config)?;
             let html = Self::download(&url, client, headers).await?;
@@ -331,11 +321,11 @@ impl FullTextParser {
         Ok(())
     }
 
-    pub async fn download(
+    async fn get_response(
         url: &url::Url,
         client: &Client,
         headers: HeaderMap,
-    ) -> Result<String, FullTextParserError> {
+    ) -> Result<Response, FullTextParserError> {
         let response = client
             .get(url.as_str())
             .headers(headers)
@@ -349,7 +339,10 @@ impl FullTextParser {
                 );
                 FullTextParserError::Http
             })?;
+        Ok(response)
+    }
 
+    async fn get_body(response: Response) -> Result<String, FullTextParserError> {
         if response.status().is_success() {
             let headers = response.headers().clone();
             let bytes = response
@@ -388,6 +381,16 @@ impl FullTextParser {
         Err(FullTextParserError::Http)
     }
 
+    pub async fn download(
+        url: &url::Url,
+        client: &Client,
+        headers: HeaderMap,
+    ) -> Result<String, FullTextParserError> {
+        let response = Self::get_response(url, client, headers).await?;
+        let body = Self::get_body(response).await?;
+        Ok(body)
+    }
+
     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
         headers
             .get(reqwest::header::CONTENT_TYPE)
diff --git a/article_scraper_cli/src/args.rs b/article_scraper_cli/src/args.rs
index d80a15c..b647413 100644
--- a/article_scraper_cli/src/args.rs
+++ b/article_scraper_cli/src/args.rs
@@ -18,6 +18,12 @@ pub struct Args {
 
 #[derive(Subcommand)]
 pub enum Commands {
+    /// Use the complete pipeline
+    All {
+        /// Source Url to download HTML from
+        #[arg(long, value_name = "URL")]
+        source_url: Option<Url>,
+    },
     /// Only use the Readability parser
     Readability {
         /// Source HTML file
@@ -32,6 +38,7 @@ pub enum Commands {
         #[arg(long, value_name = "URL")]
         source_url: Option<Url>,
     },
+    /// Only use (a subset of) the Ftr parser
     Ftr {
         /// Source HTML file
         #[arg(long, value_name = "FILE")]
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index 5798bf9..92ca257 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -31,6 +31,7 @@ async fn main() {
         .unwrap();
 
     match args.command {
+        Commands::All { source_url: _ } => unimplemented!(),
        Commands::Readability {
             html,
             base_url,