mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
Eliminate additional HEAD request
This commit is contained in:
parent
fa41633e11
commit
c198225012
3 changed files with 27 additions and 16 deletions
|
@ -20,7 +20,7 @@ use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, NodeType};
|
use libxml::tree::{Document, Node, NodeType};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Response, Url};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::from_utf8;
|
use std::str::from_utf8;
|
||||||
|
@ -114,16 +114,7 @@ impl FullTextParser {
|
||||||
.ok_or(FullTextParserError::Config)?;
|
.ok_or(FullTextParserError::Config)?;
|
||||||
|
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
|
let response = Self::get_response(&url, &client, headers).await?;
|
||||||
let response = client
|
|
||||||
.head(url.clone())
|
|
||||||
.headers(headers)
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|error| {
|
|
||||||
log::error!("Failed head request to: '{url}' - '{error}'");
|
|
||||||
FullTextParserError::Http
|
|
||||||
})?;
|
|
||||||
|
|
||||||
// check if url redirects and we need to pick up the new url
|
// check if url redirects and we need to pick up the new url
|
||||||
let url = if let Some(new_url) = Util::check_redirect(&response, url) {
|
let url = if let Some(new_url) = Util::check_redirect(&response, url) {
|
||||||
|
@ -154,8 +145,7 @@ impl FullTextParser {
|
||||||
|
|
||||||
Self::generate_head(&mut root, &document)?;
|
Self::generate_head(&mut root, &document)?;
|
||||||
|
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let html = Self::get_body(response).await?;
|
||||||
let html = Self::download(&url, client, headers).await?;
|
|
||||||
|
|
||||||
// check for fingerprints
|
// check for fingerprints
|
||||||
let config = if config.is_none() {
|
let config = if config.is_none() {
|
||||||
|
@ -255,7 +245,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
||||||
log::debug!("");
|
log::debug!("next page");
|
||||||
|
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
let html = Self::download(&url, client, headers).await?;
|
let html = Self::download(&url, client, headers).await?;
|
||||||
|
@ -331,11 +321,11 @@ impl FullTextParser {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn download(
|
async fn get_response(
|
||||||
url: &url::Url,
|
url: &url::Url,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
) -> Result<String, FullTextParserError> {
|
) -> Result<Response, FullTextParserError> {
|
||||||
let response = client
|
let response = client
|
||||||
.get(url.as_str())
|
.get(url.as_str())
|
||||||
.headers(headers)
|
.headers(headers)
|
||||||
|
@ -349,7 +339,10 @@ impl FullTextParser {
|
||||||
);
|
);
|
||||||
FullTextParserError::Http
|
FullTextParserError::Http
|
||||||
})?;
|
})?;
|
||||||
|
Ok(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_body(response: Response) -> Result<String, FullTextParserError> {
|
||||||
if response.status().is_success() {
|
if response.status().is_success() {
|
||||||
let headers = response.headers().clone();
|
let headers = response.headers().clone();
|
||||||
let bytes = response
|
let bytes = response
|
||||||
|
@ -388,6 +381,16 @@ impl FullTextParser {
|
||||||
Err(FullTextParserError::Http)
|
Err(FullTextParserError::Http)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn download(
|
||||||
|
url: &url::Url,
|
||||||
|
client: &Client,
|
||||||
|
headers: HeaderMap,
|
||||||
|
) -> Result<String, FullTextParserError> {
|
||||||
|
let response = Self::get_response(url, client, headers).await?;
|
||||||
|
let body = Self::get_body(response).await?;
|
||||||
|
Ok(body)
|
||||||
|
}
|
||||||
|
|
||||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||||
headers
|
headers
|
||||||
.get(reqwest::header::CONTENT_TYPE)
|
.get(reqwest::header::CONTENT_TYPE)
|
||||||
|
|
|
@ -18,6 +18,12 @@ pub struct Args {
|
||||||
|
|
||||||
#[derive(Subcommand)]
|
#[derive(Subcommand)]
|
||||||
pub enum Commands {
|
pub enum Commands {
|
||||||
|
/// Use the complete pipeline
|
||||||
|
All {
|
||||||
|
/// Source Url to download HTML from
|
||||||
|
#[arg(long, value_name = "URL")]
|
||||||
|
source_url: Option<String>,
|
||||||
|
},
|
||||||
/// Only use the Readability parser
|
/// Only use the Readability parser
|
||||||
Readability {
|
Readability {
|
||||||
/// Source HTML file
|
/// Source HTML file
|
||||||
|
@ -32,6 +38,7 @@ pub enum Commands {
|
||||||
#[arg(long, value_name = "URL")]
|
#[arg(long, value_name = "URL")]
|
||||||
source_url: Option<String>,
|
source_url: Option<String>,
|
||||||
},
|
},
|
||||||
|
/// Only use (a subset of) the Ftr parser
|
||||||
Ftr {
|
Ftr {
|
||||||
/// Source HTML file
|
/// Source HTML file
|
||||||
#[arg(long, value_name = "FILE")]
|
#[arg(long, value_name = "FILE")]
|
||||||
|
|
|
@ -31,6 +31,7 @@ async fn main() {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
match args.command {
|
match args.command {
|
||||||
|
Commands::All { source_url: _ } => unimplemented!(),
|
||||||
Commands::Readability {
|
Commands::Readability {
|
||||||
html,
|
html,
|
||||||
base_url,
|
base_url,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue