From 7b1b027c6d2f66d8336012bfd35fca794a2988ac Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Fri, 7 Oct 2022 07:14:39 +0200
Subject: [PATCH] add support for header values: fixes golem test

---
 Cargo.toml                 |  2 +-
 src/config/config_entry.rs | 29 +++++++++++++++++++++++++++++
 src/lib.rs                 | 34 ++++++++++++++++++++++------------
 src/tests.rs               | 18 ++++++++++++++++++
 src/util.rs                | 24 ++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 349d0e8..baf4283 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
 [dependencies]
 failure = "0.1"
 libxml = "0.3"
-reqwest = { version = "0.11", features = ["json", "native-tls"] }
+reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
 tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
 url = "2.2"
 regex = "1.4"
diff --git a/src/config/config_entry.rs b/src/config/config_entry.rs
index f1de7ea..b6411e6 100644
--- a/src/config/config_entry.rs
+++ b/src/config/config_entry.rs
@@ -14,6 +14,12 @@ pub struct Replace {
     pub replace_with: String,
 }
 
+#[derive(Clone)]
+pub struct Header {
+    pub name: String,
+    pub value: String,
+}
+
 #[derive(Clone)]
 pub struct ConfigEntry {
     pub xpath_title: Vec<String>,
@@ -24,6 +30,7 @@ pub struct ConfigEntry {
     pub strip_id_or_class: Vec<String>,
     pub strip_image_src: Vec<String>,
     pub replace: Vec<Replace>,
+    pub header: Vec<Header>,
     pub single_page_link: Option<String>,
     pub next_page_link: Option<String>,
 }
@@ -55,6 +62,7 @@ impl ConfigEntry {
         let mut strip_id_or_class: Vec<String> = Vec::new();
         let mut strip_image_src: Vec<String> = Vec::new();
         let mut replace_vec: Vec<Replace> = Vec::new();
+        let mut header_vec: Vec<Header> = Vec::new();
         let mut next_page_link: Option<String> = None;
         let mut single_page_link: Option<String> = None;
 
@@ -71,6 +79,7 @@ impl ConfigEntry {
         let find = "find_string:";
         let replace = "replace_string:";
         let replace_single = "replace_string(";
+        let http_header = "http_header(";
 
         // ignore these
         let tidy = "tidy:";
@@ -123,6 +132,25 @@ impl ConfigEntry {
                 continue;
             }
 
+            if line.starts_with(http_header) {
+                let value = Util::extract_value(http_header, line);
+                let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
+                if value.len() != 2 {
+                    continue;
+                }
+
+                if let Some(name) = value.get(0) {
+                    if let Some(value) = value.get(1) {
+                        header_vec.push(Header {
+                            name: (*name).to_string(),
+                            value: (*value).to_string(),
+                        });
+                    }
+                }
+
+                continue;
+            }
+
             if line.starts_with(find) {
                 let to_replace = Util::extract_value(find, line).into();
 
@@ -148,6 +176,7 @@ impl ConfigEntry {
             strip_id_or_class,
             strip_image_src,
             replace: replace_vec,
+            header: header_vec,
             single_page_link,
             next_page_link,
         };
diff --git a/src/lib.rs b/src/lib.rs
index 813f9a7..6eef491 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,6 +19,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
+use reqwest::header::HeaderMap;
 use reqwest::{Client, Response};
 use std::path::Path;
 use std::str::FromStr;
@@ -53,8 +54,18 @@ impl ArticleScraper {
             }
         }
 
+        // check if we have a config for the url
+        let config = self.get_grabber_config(&url);
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or_else(|| ScraperErrorKind::Config)?;
+
+        let headers = Util::generate_headers(config, global_config)?;
+
         let response = client
             .head(url.clone())
+            .headers(headers)
             .send()
             .await
             .map_err(|err| {
@@ -76,13 +87,6 @@ impl ArticleScraper {
             return Err(ScraperErrorKind::ContentType.into());
         }
 
-        // check if we have a config for the url
-        let config = self.get_grabber_config(&url);
-        let global_config = self
-            .config_files
-            .get("global.txt")
-            .ok_or_else(|| ScraperErrorKind::Config)?;
-
         let mut article = Article {
             title: None,
             author: None,
@@ -148,7 +152,8 @@ impl ArticleScraper {
         global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, client).await?;
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = ArticleScraper::download(&url, client, headers).await?;
         let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -187,7 +192,8 @@ impl ArticleScraper {
         ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
 
         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
-            let html = ArticleScraper::download(&url, client).await?;
+            let headers = Util::generate_headers(config, global_config)?;
+            let html = ArticleScraper::download(&url, client, headers).await?;
             document = Self::parse_html(html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
             ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
@@ -261,7 +267,8 @@ impl ArticleScraper {
         global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, client).await?;
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = ArticleScraper::download(&url, client, headers).await?;
         let document = Self::parse_html(html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
@@ -271,9 +278,10 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client, headers: HeaderMap) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
+            .headers(headers)
             .send()
             .await
             .map_err(|err| {
@@ -423,7 +431,9 @@ impl ArticleScraper {
         let node_vec = Self::evaluate_xpath(context, xpath, true)?;
         let mut val = String::new();
         for node in node_vec {
-            val.push_str(&node.get_content());
+            let part = node.get_content().split_whitespace().map(|s| format!("{} ", s)).collect::<String>();
+            val.push_str(&part);
+            val.push_str(" ");
         }
 
         Ok(val.trim().to_string())
diff --git a/src/tests.rs b/src/tests.rs
index a612d3d..bd47f82 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -2,6 +2,24 @@ use crate::*;
 use reqwest::Client;
 use std::path::PathBuf;
 
+#[tokio::test(flavor = "current_thread")]
+async fn golem() {
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+
+    let grabber = ArticleScraper::new(None).await;
+    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
+
+    assert_eq!(
+        article.title,
+        Some(String::from(
+            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
+        ))
+    );
+    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+}
+
 #[tokio::test(flavor = "current_thread")]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
diff --git a/src/util.rs b/src/util.rs
index f6519a7..baac1b1 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,5 +1,9 @@
+use failure::ResultExt;
+use reqwest::header::{HeaderMap, HeaderValue, HeaderName};
 use tokio::fs::DirEntry;
 
+use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}};
+
 pub struct Util;
 
 impl Util {
@@ -34,4 +38,24 @@ impl Util {
             global_rule
         }
     }
+
+    pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> {
+        let mut headers = HeaderMap::new();
+
+        if let Some(config) = site_specific_rule {
+            for header in &config.header {
+                let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+                let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+                headers.insert(name, value);
+            }
+        }
+
+        for header in &global_rule.header {
+            let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+            let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+            headers.insert(name, value);
+        }
+
+        Ok(headers)
+    }
 }
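
Note on the new directive: ConfigEntry::parse now recognizes site-config lines of the
form http_header(name): value and collects them into Header { name, value } entries;
Util::generate_headers then builds a reqwest HeaderMap from the site-specific and
global entries and attaches it to the initial HEAD request as well as to every page
download. A minimal sketch of such lines, assuming a hypothetical site config (the
header names and values below are illustrative, not taken from a real config file):

    # hypothetical site config, e.g. golem.de.txt
    http_header(user-agent): Mozilla/5.0
    http_header(cookie): consent=accepted

After the http_header( prefix is stripped, the rest of the line is split on "): ",
so the part inside the parentheses becomes the header name and the remainder becomes
the header value; lines that do not yield exactly two parts are skipped.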