From ca1cc47af1f7749ea7d10983e8e269afb0c57daf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:02:40 +0100 Subject: [PATCH 01/10] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a57616..7880e5d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.79 + image: rust:1.83 before_script: - rustup component add rustfmt - rustup component add clippy From 8cfcd6d9f3636a84336da4d882a9f6db5ce565b4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:05:55 +0100 Subject: [PATCH 02/10] clippy --- article_scraper/src/full_text_parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 98e9478..18fc682 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result<i32, FullTextParserError> { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. 
From 9f56ed03b8e384378d92e01c5bc38bf80525760c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Mar 2025 13:42:31 +0100 Subject: [PATCH 03/10] article_scraper: don't specify reqwest features --- article_scraper/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 100766c..e852be9 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,17 +14,17 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = "0.12" tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" -regex = "1.10" +regex = "1.11" encoding_rs = "0.8" chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.4" -once_cell = "1.19" +rust-embed="8.6" +once_cell = "1.20" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" From 0978335d3b73e8049c602713b76ca5e3f038d9ca Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Mar 2025 17:18:03 +0100 Subject: [PATCH 04/10] [f] ignore url harvest error --- article_scraper/src/images/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index de0f48f..4be98b8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -219,7 +219,9 @@ impl ImageDownloader { let mut image_urls = Vec::new(); for node in node_vec { - image_urls.push(Self::harvest_image_urls_from_node(node)?); + if let Ok(url) = Self::harvest_image_urls_from_node(node) { + image_urls.push(url); + } } Ok(image_urls) From b92500fca276535b40d1956e71a1cca226d92437 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:45:41 +0200 Subject: [PATCH 05/10] better error messages --- article_scraper/src/error.rs | 6 +- article_scraper/src/full_text_parser/mod.rs | 96 
++++++++++----------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("Configerror {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 18fc682..4bb8a30 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -354,63 +354,61 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - 
log::debug!("Encoding extracted from headers: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + } } pub async fn download( From 9b374a28c717e57db7341fac4967b9e0114ad455 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:47:08 +0200 Subject: [PATCH 06/10] update ftr-site-config --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config 
b/article_scraper/ftr-site-config index ccde390..69aa220 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 +Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532 From f361392c04376736ce9ce2d338c7363959135878 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:34:33 +0200 Subject: [PATCH 07/10] check for empty http response and parsed documents without root element --- article_scraper/src/full_text_parser/mod.rs | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4bb8a30..ac77bf6 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -69,6 +69,11 @@ impl FullTextParser { let html = Self::get_body(response).await?; + if html.is_empty() { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { @@ -264,10 +269,17 @@ impl FullTextParser { } // parse html - Self::parse_html_string_patched(html.as_str()).map_err(|err| { + let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml - }) + })?; + + if document.get_root_element().is_none() { + log::error!("document without root"); + Err(FullTextParserError::Xml) + } else { + Ok(document) + } } /// FIXME: Here are some patched functions of libxml crate. 
@@ -368,6 +380,18 @@ impl FullTextParser { } let headers = response.headers().clone(); + + if headers + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|hv| hv.to_str().ok()) + .and_then(|str| str.parse::<usize>().ok()) + .map(|content_length| content_length == 0) + .unwrap_or(false) + { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + let bytes = response .bytes() .await .map_err(|_| FullTextParserError::Http)?; @@ -420,7 +444,12 @@ impl FullTextParser { let headers = Util::generate_headers(config, global_config)?; let response = Self::get_response(url, client, headers).await?; let body = Self::get_body(response).await?; - Ok(body) + if body.is_empty() { + log::error!("Empty response body"); + Err(FullTextParserError::Http) + } else { + Ok(body) + } } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { From 06990acbc0d4cd55a44aeb20e95c1e6216074a16 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:38:46 +0200 Subject: [PATCH 08/10] fix libxml CI build --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7880e5d..159f07d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,12 +4,14 @@ stages: run-build: stage: build - image: rust:1.83 + image: rust:1.86 before_script: - rustup component add rustfmt - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - rustc --version && cargo --version + - echo $LIBXML2 - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - cargo build --release From 498008f6307c3faabfd6ac40e820871752b75039 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:51:30 +0200 Subject: [PATCH 09/10] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 99695c8..8569ad0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["article_scraper", 
"article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.1" +version = "2.1.2" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" From 9f349f8c6f2a88b277a8d1552d3d84781bdc9363 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 18:00:59 +0200 Subject: [PATCH 10/10] need reqwest streams --- article_scraper/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index e852be9..eeed67c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = "0.12" +reqwest = { version = "0.12", features = ["stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" regex = "1.11"