Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)
add support for header values: fixes golem test

parent 0e3553b647 · commit 7b1b027c6d
5 changed files with 94 additions and 13 deletions
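Summary of the diff below: site configs can now specify HTTP headers via http_header(...) lines. These are parsed into a new Header struct on ConfigEntry, combined into a reqwest HeaderMap by the new Util::generate_headers, and sent with the initial HEAD request as well as with every page download. reqwest additionally gains the gzip and brotli features, and xpath text extraction now collapses whitespace runs.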
Cargo.toml (2 changes)

@@ -10,7 +10,7 @@ repository = "https://gitlab.com/news-flash/article_scraper"
 [dependencies]
 failure = "0.1"
 libxml = "0.3"
-reqwest = { version = "0.11", features = ["json", "native-tls"] }
+reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
 tokio = { version = "1.21", features = ["macros", "fs", "io-util"] }
 url = "2.2"
 regex = "1.4"
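Enabling reqwest's gzip and brotli features makes the client advertise those encodings and transparently decompress response bodies. Presumably golem.de serves compressed HTML, which the scraper would otherwise have received as raw compressed bytes.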
src/config/mod.rs (29 changes)

@@ -14,6 +14,12 @@ pub struct Replace {
     pub replace_with: String,
 }
 
+#[derive(Clone)]
+pub struct Header {
+    pub name: String,
+    pub value: String,
+}
+
 #[derive(Clone)]
 pub struct ConfigEntry {
     pub xpath_title: Vec<String>,
@@ -24,6 +30,7 @@ pub struct ConfigEntry {
     pub strip_id_or_class: Vec<String>,
     pub strip_image_src: Vec<String>,
     pub replace: Vec<Replace>,
+    pub header: Vec<Header>,
     pub single_page_link: Option<String>,
     pub next_page_link: Option<String>,
 }
@@ -55,6 +62,7 @@ impl ConfigEntry {
         let mut strip_id_or_class: Vec<String> = Vec::new();
         let mut strip_image_src: Vec<String> = Vec::new();
         let mut replace_vec: Vec<Replace> = Vec::new();
+        let mut header_vec: Vec<Header> = Vec::new();
         let mut next_page_link: Option<String> = None;
         let mut single_page_link: Option<String> = None;
 
@@ -71,6 +79,7 @@ impl ConfigEntry {
         let find = "find_string:";
         let replace = "replace_string:";
         let replace_single = "replace_string(";
+        let http_header = "http_header(";
 
         // ignore these
         let tidy = "tidy:";
@@ -123,6 +132,25 @@ impl ConfigEntry {
                 continue;
             }
 
+            if line.starts_with(http_header) {
+                let value = Util::extract_value(http_header, line);
+                let value: Vec<&str> = value.split("): ").map(|s| s.trim()).collect();
+                if value.len() != 2 {
+                    continue;
+                }
+
+                if let Some(name) = value.get(0) {
+                    if let Some(value) = value.get(1) {
+                        header_vec.push(Header {
+                            name: (*name).to_string(),
+                            value: (*value).to_string(),
+                        });
+                    }
+                }
+
+                continue;
+            }
+
             if line.starts_with(find) {
                 let to_replace = Util::extract_value(find, line).into();
 
@@ -148,6 +176,7 @@ impl ConfigEntry {
             strip_id_or_class,
             strip_image_src,
             replace: replace_vec,
+            header: header_vec,
             single_page_link,
             next_page_link,
         };
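Given how the new branch parses these lines (strip the http_header( prefix, split on "): ", trim both halves), a header entry in a five-filters-style site config presumably looks like the following; the header value here is illustrative, not taken from the commit:

    http_header(user-agent): Mozilla/5.0 (X11; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0

Any line that does not split into exactly a name and a value is silently skipped.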
src/lib.rs (34 changes)

@@ -19,6 +19,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
+use reqwest::header::HeaderMap;
 use reqwest::{Client, Response};
 use std::path::Path;
 use std::str::FromStr;
@@ -53,8 +54,18 @@ impl ArticleScraper {
             }
         }
 
+        // check if we have a config for the url
+        let config = self.get_grabber_config(&url);
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or_else(|| ScraperErrorKind::Config)?;
+
+        let headers = Util::generate_headers(config, global_config)?;
+
         let response = client
             .head(url.clone())
+            .headers(headers)
             .send()
             .await
             .map_err(|err| {
@@ -76,13 +87,6 @@ impl ArticleScraper {
             return Err(ScraperErrorKind::ContentType.into());
         }
 
-        // check if we have a config for the url
-        let config = self.get_grabber_config(&url);
-        let global_config = self
-            .config_files
-            .get("global.txt")
-            .ok_or_else(|| ScraperErrorKind::Config)?;
-
         let mut article = Article {
             title: None,
             author: None,
@@ -148,7 +152,8 @@ impl ArticleScraper {
         global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, client).await?;
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = ArticleScraper::download(&url, client, headers).await?;
         let mut document = Self::parse_html(html, config, global_config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -187,7 +192,8 @@ impl ArticleScraper {
         ArticleScraper::extract_body(&xpath_ctx, root, config, global_config)?;
 
         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
-            let html = ArticleScraper::download(&url, client).await?;
+            let headers = Util::generate_headers(config, global_config)?;
+            let html = ArticleScraper::download(&url, client, headers).await?;
             document = Self::parse_html(html, config, global_config)?;
             xpath_ctx = Self::get_xpath_ctx(&document)?;
             ArticleScraper::strip_junk(&xpath_ctx, config, global_config, &url);
@@ -261,7 +267,8 @@ impl ArticleScraper {
         global_config: &ConfigEntry,
         client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, client).await?;
+        let headers = Util::generate_headers(config, global_config)?;
+        let html = ArticleScraper::download(&url, client, headers).await?;
         let document = Self::parse_html(html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, global_config, article);
@@ -271,9 +278,10 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client, headers: HeaderMap) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
+            .headers(headers)
             .send()
             .await
             .map_err(|err| {
@@ -423,7 +431,9 @@ impl ArticleScraper {
         let node_vec = Self::evaluate_xpath(context, xpath, true)?;
         let mut val = String::new();
         for node in node_vec {
-            val.push_str(&node.get_content());
+            let part = node.get_content().split_whitespace().map(|s| format!("{} ", s)).collect::<String>();
+            val.push_str(&part);
+            val.push_str(" ");
        }
 
         Ok(val.trim().to_string())
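The last hunk changes the xpath text accumulation so that runs of whitespace inside a node collapse to single spaces. A minimal standalone sketch of the new per-node logic, with made-up input:

    fn main() {
        // Each node's text is split on any whitespace and re-joined with
        // single spaces; one trailing space is appended per node, and the
        // accumulated string is trimmed at the end.
        let node_content = "HTTP Error 418:\n    Fehlercode";
        let part = node_content
            .split_whitespace()
            .map(|s| format!("{} ", s))
            .collect::<String>();
        let mut val = String::new();
        val.push_str(&part);
        val.push_str(" ");
        assert_eq!(val.trim(), "HTTP Error 418: Fehlercode");
    }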
src/tests.rs (18 changes)

@@ -2,6 +2,24 @@ use crate::*;
 use reqwest::Client;
 use std::path::PathBuf;
 
+#[tokio::test(flavor = "current_thread")]
+async fn golem() {
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+
+    let grabber = ArticleScraper::new(None).await;
+    let article = grabber.parse(&url, true, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
+
+    assert_eq!(
+        article.title,
+        Some(String::from(
+            "HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben"
+        ))
+    );
+    assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+}
+
 #[tokio::test(flavor = "current_thread")]
 async fn phoronix() {
     let out_path = PathBuf::from(r"./test_output");
src/util.rs (24 changes)

@@ -1,5 +1,9 @@
+use failure::ResultExt;
+use reqwest::header::{HeaderMap, HeaderValue, HeaderName};
 use tokio::fs::DirEntry;
 
+use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}};
+
 pub struct Util;
 
 impl Util {
@@ -34,4 +38,24 @@ impl Util {
             global_rule
         }
     }
+
+    pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> {
+        let mut headers = HeaderMap::new();
+
+        if let Some(config) = site_specific_rule {
+            for header in &config.header {
+                let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+                let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+                headers.insert(name, value);
+            }
+        }
+
+        for header in &global_rule.header {
+            let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
+            let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
+            headers.insert(name, value);
+        }
+
+        Ok(headers)
+    }
 }
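A standalone sketch of the HeaderMap semantics generate_headers relies on (the header name and values are illustrative, not from the commit). Note that the global-rule loop runs second and HeaderMap::insert replaces existing values, so a global header overrides a site-specific one of the same name:

    use reqwest::header::{HeaderMap, HeaderName, HeaderValue};

    fn main() {
        let mut headers = HeaderMap::new();

        // Site-specific header inserted first...
        let name = HeaderName::from_bytes("user-agent".as_bytes()).unwrap();
        let value = "site-specific/1.0".parse::<HeaderValue>().unwrap();
        headers.insert(name, value);

        // ...then a global header with the same name replaces it.
        let name = HeaderName::from_bytes("user-agent".as_bytes()).unwrap();
        let value = "global/1.0".parse::<HeaderValue>().unwrap();
        headers.insert(name, value);

        assert_eq!(headers.get("user-agent").unwrap().to_str().unwrap(), "global/1.0");
    }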