mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

Compare commits

master...article_scraper-v2.1.1

No commits in common. "master" and "article_scraper-v2.1.1" have entirely different histories.

11 changed files with 97 additions and 154 deletions

.gitlab-ci.yml

@@ -1,16 +1,15 @@
 stages:
   - build
 
 run-build:
   stage: build
-  image: rust:1.86
+  image: rust:1.79
   before_script:
     - rustup component add rustfmt
     - rustup component add clippy
-    - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
   script:
     - rustc --version && cargo --version
-    - echo $LIBXML2
     - cargo fmt -- --check
     - cargo clippy --all-targets --all-features -- -D warnings
     - cargo build --release

Cargo.toml

@@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"]
 resolver = "2"
 
 [workspace.package]
-version = "2.1.3"
+version = "2.1.1"
 authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
 edition = "2021"
 license = "GPL-3.0-or-later"

article_scraper/Cargo.toml

@@ -14,17 +14,17 @@ exclude = ["resources/tests"]
 [dependencies]
 thiserror = "2.0"
 libxml = "0.3"
-reqwest = { version = "0.12", features = ["stream"] }
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
 tokio = { version = "1", features = ["macros", "fs", "io-util"] }
 url = "2.5"
-regex = "1.11"
+regex = "1.10"
 encoding_rs = "0.8"
 chrono = "0.4"
 base64 = "0.22"
 image = "0.25"
 log = "0.4"
-rust-embed = "8.7"
-once_cell = "1.21"
+rust-embed="8.4"
+once_cell = "1.19"
 escaper = "0.1"
 futures = "0.3"
 unic-emoji-char = "0.9"
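Note: the reqwest feature trim is the functional change in this hunk. A minimal sketch of what the extra features on the v2.1.1 side provide; only bytes_stream() needs the "stream" feature that master keeps:

// Sketch: "gzip"/"brotli" enable transparent response decompression,
// "json" adds Response::json(), and "stream" exposes bytes_stream()
// for incremental downloads.
use futures::StreamExt;

async fn fetch(url: &str) -> Result<(), reqwest::Error> {
    let client = reqwest::Client::builder()
        .gzip(true)   // requires the "gzip" feature
        .brotli(true) // requires the "brotli" feature
        .build()?;

    let mut stream = client.get(url).send().await?.bytes_stream(); // "stream"
    while let Some(chunk) = stream.next().await {
        println!("read {} bytes", chunk?.len());
    }
    Ok(())
}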

@@ -1 +1 @@
-Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532
+Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024

File diff suppressed because one or more lines are too long


@@ -6,10 +6,10 @@ use thiserror::Error;
 
 #[derive(Error, Debug)]
 pub enum ScraperError {
-    #[error("Configerror {0}")]
+    #[error("")]
     Config(#[from] ConfigError),
-    #[error("ImageDownloadError {0}")]
+    #[error("")]
     Image(#[from] ImageDownloadError),
-    #[error("FullTextParserError {0}")]
+    #[error("")]
     Scrap(#[from] FullTextParserError),
 }
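Note: master's side restores non-empty Display messages for the wrapped errors. A minimal sketch of the difference, using a hypothetical stand-in for the crate's ConfigError:

use thiserror::Error;

// Hypothetical stand-in for the crate's ConfigError.
#[derive(Error, Debug)]
#[error("config file is malformed")]
struct ConfigError;

#[derive(Error, Debug)]
enum ScraperError {
    // master: "Configerror {0}" interpolates the wrapped error's message;
    // v2.1.1's #[error("")] renders an empty string instead.
    #[error("Configerror {0}")]
    Config(#[from] ConfigError),
}

fn main() {
    let err = ScraperError::from(ConfigError);
    assert_eq!(err.to_string(), "Configerror config file is malformed");
}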


@@ -158,7 +158,7 @@ fn extract_date(
 fn get_meta(context: &Context, name: &str) -> Option<String> {
     Util::get_attribute(
         context,
-        &format!("//meta[contains(@name, '{name}')]"),
+        &format!("//meta[contains(@name, '{}')]", name),
         "content",
     )
     .ok()
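Note: this and several hunks below are the same mechanical change: master switches format! calls to inline format arguments (stable since Rust 1.58). Both spellings produce identical strings:

fn main() {
    let name = "author";
    assert_eq!(
        format!("//meta[contains(@name, '{name}')]"),
        format!("//meta[contains(@name, '{}')]", name),
    );
}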


@@ -69,11 +69,6 @@ impl FullTextParser {
         let html = Self::get_body(response).await?;
 
-        if html.is_empty() {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
         // check for fingerprints
         let config = if config.is_none() {
             if let Some(url) = Fingerprints::detect(&html) {
@@ -269,17 +264,10 @@ impl FullTextParser {
     }
 
         // parse html
-        let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
+        Self::parse_html_string_patched(html.as_str()).map_err(|err| {
             log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
             FullTextParserError::Xml
-        })?;
-
-        if document.get_root_element().is_none() {
-            log::error!("document without root");
-            Err(FullTextParserError::Xml)
-        } else {
-            Ok(document)
-        }
+        })
     }
 
     /// FIXME: Here are some patched functions of libxml crate.
@@ -287,7 +275,7 @@ impl FullTextParser {
     /// See:
     /// - <https://github.com/KWARC/rust-libxml/issues/111>
     /// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
     /// These two functions should be removed when the issue is fixed in libxml crate.
     fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
         if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
             // Cannot safely use our value comparison, but the conversion if always safe.
@@ -366,73 +354,63 @@ impl FullTextParser {
             .send()
             .await
             .map_err(|err| {
-                log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
+                log::error!(
+                    "Downloading HTML failed: GET '{}' - '{}'",
+                    url.as_str(),
+                    err
+                );
                 FullTextParserError::Http
             })?;
         Ok(response)
     }
 
     async fn get_body(response: Response) -> Result<String, FullTextParserError> {
-        let status = response.status();
-        if !status.is_success() {
-            log::error!("status code: {status}");
-            return Err(FullTextParserError::Http);
-        }
-
-        let headers = response.headers().clone();
-
-        if headers
-            .get(reqwest::header::CONTENT_LENGTH)
-            .and_then(|hv| hv.to_str().ok())
-            .and_then(|str| str.parse::<i64>().ok())
-            .map(|content_length| content_length == 0)
-            .unwrap_or(false)
-        {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
-        let bytes = response
-            .bytes()
-            .await
-            .map_err(|_| FullTextParserError::Http)?;
-
-        match from_utf8(&bytes) {
-            Ok(utf8_str) => {
-                log::debug!("Valid utf-8 string");
-                Ok(utf8_str.into())
-            }
-            Err(error) => {
-                log::debug!("Invalid utf-8 string");
-                let lossy_string = std::string::String::from_utf8_lossy(&bytes);
-
-                if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
-                    log::debug!("Encoding extracted from HTML: '{encoding}'");
-                    if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
-                        let decoded_html = decoded_html.replacen(
-                            &format!("charset=\"{encoding}\""),
-                            "charset=\"utf-8\"",
-                            1,
-                        );
-                        return Ok(decoded_html);
-                    }
-                }
-
-                if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
-                    log::debug!("Encoding extracted from headers: '{encoding}'");
-                    if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
-                        let decoded_html = decoded_html.replacen(
-                            &format!("charset=\"{encoding}\""),
-                            "charset=\"utf-8\"",
-                            1,
-                        );
-                        return Ok(decoded_html);
-                    }
-                }
-
-                Err(FullTextParserError::Utf8(error))
-            }
-        }
+        if response.status().is_success() {
+            let headers = response.headers().clone();
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|_| FullTextParserError::Http)?;
+
+            match from_utf8(&bytes) {
+                Ok(utf8_str) => {
+                    log::debug!("Valid utf-8 string");
+                    return Ok(utf8_str.into());
+                }
+                Err(error) => {
+                    log::debug!("Invalid utf-8 string");
+                    let lossy_string = std::string::String::from_utf8_lossy(&bytes);
+
+                    if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
+                        log::debug!("Encoding extracted from HTML: '{}'", encoding);
+                        if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
+                            let decoded_html = decoded_html.replacen(
+                                &format!("charset=\"{encoding}\""),
+                                "charset=\"utf-8\"",
+                                1,
+                            );
+                            return Ok(decoded_html);
+                        }
+                    }
+
+                    if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
+                        log::debug!("Encoding extracted from headers: '{}'", encoding);
+                        if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
+                            let decoded_html = decoded_html.replacen(
+                                &format!("charset=\"{encoding}\""),
+                                "charset=\"utf-8\"",
+                                1,
+                            );
+                            return Ok(decoded_html);
+                        }
+                    }
+
+                    return Err(FullTextParserError::Utf8(error));
+                }
+            }
+        }
+
+        Err(FullTextParserError::Http)
     }
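Note: besides flattening the nested if into early returns, master adds an up-front guard against responses that declare an empty body. A sketch of just that guard, using the same reqwest types as the hunk above:

// Returns true when the response headers explicitly announce a zero-length
// body, which master treats as an HTTP error before reading any bytes.
fn declares_empty_body(headers: &reqwest::header::HeaderMap) -> bool {
    headers
        .get(reqwest::header::CONTENT_LENGTH)
        .and_then(|hv| hv.to_str().ok())
        .and_then(|s| s.parse::<i64>().ok())
        .map(|len| len == 0)
        .unwrap_or(false)
}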
@@ -444,12 +422,7 @@ pub async fn download(
         let headers = Util::generate_headers(config, global_config)?;
         let response = Self::get_response(url, client, headers).await?;
         let body = Self::get_body(response).await?;
-        if body.is_empty() {
-            log::error!("Empty response body");
-            Err(FullTextParserError::Http)
-        } else {
-            Ok(body)
-        }
+        Ok(body)
     }
 
     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
@@ -722,7 +695,7 @@ impl FullTextParser {
     }
 
     fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
-        let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
+        let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.is_null() {
@@ -761,7 +734,7 @@ impl FullTextParser {
     ) -> Result<(), FullTextParserError> {
         let xpath_tag = tag.unwrap_or("*");
-        let xpath = &format!("//{xpath_tag}[@{attribute}]");
+        let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Err(err) = node.remove_property(attribute) {
@@ -791,16 +764,17 @@ impl FullTextParser {
         if let Some(url) = node.get_attribute(attribute) {
             let trimmed_url = url.trim();
 
-            if url.starts_with('#') || url.starts_with("\\#") {
-                continue;
-            }
+            let is_hash_url = url.starts_with('#');
             let is_relative_url = url::Url::parse(&url)
                 .err()
                 .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
                 .unwrap_or(false);
             let is_javascript = trimmed_url.contains("javascript:");
 
+            if !is_hash_url && node.get_name().to_uppercase() == "A" {
+                _ = node.set_attribute("target", "_blank");
+            }
+
             if let Some(srcset) = node.get_attribute("srcset") {
                 let res = constants::SRC_SET_URL
                     .captures_iter(&srcset)
@@ -831,7 +805,9 @@ impl FullTextParser {
                 _ = node.set_attribute("srcset", res.as_str());
             }
 
-            if is_relative_url {
+            if is_hash_url {
+                _ = node.set_attribute(attribute, trimmed_url);
+            } else if is_relative_url {
                 let completed_url = match article_url.join(trimmed_url) {
                     Ok(joined_url) => joined_url,
                     Err(_) => continue,
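Note: the behavioral change in these two hunks: v2.1.1 rewrites fragment-only links and adds target="_blank" to anchors, while master skips pure fragment URLs (# and escaped \# forms) entirely. The relative-URL test both sides share relies on url::Url::parse failing with RelativeUrlWithoutBase; a minimal sketch:

// Fragment-only and path-only hrefs both fail absolute parsing with
// RelativeUrlWithoutBase, which is the signal for joining them onto the
// article's base URL.
fn is_relative(url: &str) -> bool {
    url::Url::parse(url)
        .err()
        .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
        .unwrap_or(false)
}

fn main() {
    assert!(is_relative("/images/figure-1.png"));
    assert!(is_relative("#section-2"));
    assert!(!is_relative("https://example.com/figure-1.png"));
}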
@@ -944,7 +920,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
     }
@@ -952,7 +928,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &global_config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
@@ -1267,7 +1243,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    pub(crate) fn remove_single_cell_tables(root: &mut Node) {
+    fn remove_single_cell_tables(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(node) = node_iter {
@@ -1305,7 +1281,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
+    fn remove_extra_p_and_div(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1327,7 +1303,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn remove_share_elements(root: &mut Node) {
+    fn remove_share_elements(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1347,7 +1323,7 @@ impl FullTextParser {
         }
     }
 
-    pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
+    fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {
@@ -1436,7 +1412,7 @@ impl FullTextParser {
         Ok(())
     }
 
-    pub(crate) fn remove_empty_nodes(root: &mut Node) {
+    fn remove_empty_nodes(root: &mut Node) {
         let mut node_iter = Some(root.clone());
 
         while let Some(mut node) = node_iter {


@@ -219,9 +219,7 @@ impl ImageDownloader {
         let mut image_urls = Vec::new();
         for node in node_vec {
-            if let Ok(url) = Self::harvest_image_urls_from_node(node) {
-                image_urls.push(url);
-            }
+            image_urls.push(Self::harvest_image_urls_from_node(node)?);
         }
 
         Ok(image_urls)
@@ -276,7 +274,7 @@ impl ImageDownloader {
         }
 
         let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
-        let image_string = format!("data:{content_type};base64,{image_base64}");
+        let image_string = format!("data:{};base64,{}", content_type, image_base64);
         let image_data_base64 = ImageDataBase64 {
             url: image.url,
             data: image_string,
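Note: aside from the inline format args, the data-URI assembly is unchanged on both sides. A self-contained sketch with illustrative bytes:

use base64::Engine;

fn main() {
    // Illustrative values; the real code uses the downloaded image's
    // content type and bytes.
    let content_type = "image/png";
    let data = [0x89u8, 0x50, 0x4e, 0x47];

    let image_base64 = base64::engine::general_purpose::STANDARD.encode(data);
    let image_string = format!("data:{content_type};base64,{image_base64}");
    assert_eq!(image_string, "data:image/png;base64,iVBORw==");
}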


@@ -264,15 +264,17 @@ impl Util {
         context: &Context,
         id_or_class: &str,
     ) -> Result<(), FullTextParserError> {
-        let xpath =
-            &format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");
+        let xpath = &format!(
+            "//*[contains(@class, '{}') or contains(@id, '{}')]",
+            id_or_class, id_or_class
+        );
 
         let mut ancestor = xpath.clone();
         if ancestor.starts_with("//") {
             ancestor = ancestor.chars().skip(2).collect();
         }
 
-        let query = &format!("{xpath}[not(ancestor::{ancestor})]");
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
         let node_vec = Util::evaluate_xpath(context, query, false)?;
         for mut node in node_vec {
             if node.is_null() {
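Note: both sides build the same two-step XPath; only the format! style differs. A sketch showing the query this produces, so the ancestor:: exclusion is visible:

fn main() {
    let id_or_class = "share";
    let xpath = format!(
        "//*[contains(@class, '{}') or contains(@id, '{}')]",
        id_or_class, id_or_class
    );
    // Drop the leading "//" so the expression can be reused as an ancestor test.
    let ancestor: String = xpath.chars().skip(2).collect();
    // Only strip the outermost matching nodes: anything whose ancestor
    // already matches is skipped.
    let query = format!("{}[not(ancestor::{})]", xpath, ancestor);
    assert_eq!(
        query,
        "//*[contains(@class, 'share') or contains(@id, 'share')]\
         [not(ancestor::*[contains(@class, 'share') or contains(@id, 'share')])]"
    );
}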

article_scraper_cli/Cargo.toml

@@ -9,22 +9,11 @@ repository.workspace = true
 
 [dependencies]
 article_scraper = { path = "../article_scraper/" }
-clap = { version = "4.5", features = ["derive"] }
+clap = { version = "4.5", features = [ "derive" ] }
 simplelog = "0.12"
 log = "0.4"
 url = "2.5"
-reqwest = { version = "0.12", features = [
-    "json",
-    "native-tls",
-    "gzip",
-    "brotli",
-    "stream",
-] }
-tokio = { version = "1", features = [
-    "macros",
-    "fs",
-    "io-util",
-    "rt-multi-thread",
-] }
-indicatif = "0.18"
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
+tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
+indicatif = "0.17"