1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

eliminate additional head request

This commit is contained in:
Jan Lukas Gernert 2023-04-11 07:49:01 +02:00
parent fa41633e11
commit c198225012
3 changed files with 27 additions and 16 deletions

View file

@ -20,7 +20,7 @@ use libxml::parser::Parser;
use libxml::tree::{Document, Node, NodeType}; use libxml::tree::{Document, Node, NodeType};
use libxml::xpath::Context; use libxml::xpath::Context;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Url}; use reqwest::{Client, Response, Url};
use std::collections::HashSet; use std::collections::HashSet;
use std::path::Path; use std::path::Path;
use std::str::from_utf8; use std::str::from_utf8;
@ -114,16 +114,7 @@ impl FullTextParser {
.ok_or(FullTextParserError::Config)?; .ok_or(FullTextParserError::Config)?;
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let response = Self::get_response(&url, &client, headers).await?;
let response = client
.head(url.clone())
.headers(headers)
.send()
.await
.map_err(|error| {
log::error!("Failed head request to: '{url}' - '{error}'");
FullTextParserError::Http
})?;
// check if url redirects and we need to pick up the new url // check if url redirects and we need to pick up the new url
let url = if let Some(new_url) = Util::check_redirect(&response, url) { let url = if let Some(new_url) = Util::check_redirect(&response, url) {
@ -154,8 +145,7 @@ impl FullTextParser {
Self::generate_head(&mut root, &document)?; Self::generate_head(&mut root, &document)?;
let headers = Util::generate_headers(config, global_config)?; let html = Self::get_body(response).await?;
let html = Self::download(&url, client, headers).await?;
// check for fingerprints // check for fingerprints
let config = if config.is_none() { let config = if config.is_none() {
@ -255,7 +245,7 @@ impl FullTextParser {
} }
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
log::debug!(""); log::debug!("next page");
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = Self::download(&url, client, headers).await?; let html = Self::download(&url, client, headers).await?;
@ -331,11 +321,11 @@ impl FullTextParser {
Ok(()) Ok(())
} }
pub async fn download( async fn get_response(
url: &url::Url, url: &url::Url,
client: &Client, client: &Client,
headers: HeaderMap, headers: HeaderMap,
) -> Result<String, FullTextParserError> { ) -> Result<Response, FullTextParserError> {
let response = client let response = client
.get(url.as_str()) .get(url.as_str())
.headers(headers) .headers(headers)
@ -349,7 +339,10 @@ impl FullTextParser {
); );
FullTextParserError::Http FullTextParserError::Http
})?; })?;
Ok(response)
}
async fn get_body(response: Response) -> Result<String, FullTextParserError> {
if response.status().is_success() { if response.status().is_success() {
let headers = response.headers().clone(); let headers = response.headers().clone();
let bytes = response let bytes = response
@ -388,6 +381,16 @@ impl FullTextParser {
Err(FullTextParserError::Http) Err(FullTextParserError::Http)
} }
/// Fetches `url` and returns the response body as a `String`.
///
/// This is a convenience wrapper that chains the two lower-level steps:
/// `get_response` issues the request with the supplied `headers`, and
/// `get_body` turns the resulting `Response` into text.
///
/// # Errors
///
/// Propagates any `FullTextParserError` returned by `get_response` or
/// `get_body`.
pub async fn download(
    url: &url::Url,
    client: &Client,
    headers: HeaderMap,
) -> Result<String, FullTextParserError> {
    // Request first, then hand the response straight to the body decoder;
    // a failed request short-circuits before any decoding is attempted.
    Self::get_body(Self::get_response(url, client, headers).await?).await
}
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
headers headers
.get(reqwest::header::CONTENT_TYPE) .get(reqwest::header::CONTENT_TYPE)

View file

@ -18,6 +18,12 @@ pub struct Args {
#[derive(Subcommand)] #[derive(Subcommand)]
pub enum Commands { pub enum Commands {
/// Use the complete pipeline
All {
/// Source Url to download HTML from
#[arg(long, value_name = "URL")]
source_url: Option<String>,
},
/// Only use the Readability parser /// Only use the Readability parser
Readability { Readability {
/// Source HTML file /// Source HTML file
@ -32,6 +38,7 @@ pub enum Commands {
#[arg(long, value_name = "URL")] #[arg(long, value_name = "URL")]
source_url: Option<String>, source_url: Option<String>,
}, },
/// Only use (a subset of) the Ftr parser
Ftr { Ftr {
/// Source HTML file /// Source HTML file
#[arg(long, value_name = "FILE")] #[arg(long, value_name = "FILE")]

View file

@ -31,6 +31,7 @@ async fn main() {
.unwrap(); .unwrap();
match args.command { match args.command {
Commands::All { source_url: _ } => unimplemented!(),
Commands::Readability { Commands::Readability {
html, html,
base_url, base_url,