diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a57616..159f07d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,12 +4,14 @@ stages: run-build: stage: build - image: rust:1.79 + image: rust:1.86 before_script: - rustup component add rustfmt - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - rustc --version && cargo --version + - echo $LIBXML2 - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - cargo build --release diff --git a/Cargo.toml b/Cargo.toml index 99695c8..8569ad0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.1" +version = "2.1.2" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 100766c..eeed67c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,17 +14,17 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = { version = "0.12", features = ["stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" -regex = "1.10" +regex = "1.11" encoding_rs = "0.8" chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.4" -once_cell = "1.19" +rust-embed="8.6" +once_cell = "1.20" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index ccde390..69aa220 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 +Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532 diff --git a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("Configerror {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 98e9478..ac77bf6 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -69,6 +69,11 @@ impl FullTextParser { let html = Self::get_body(response).await?; + if html.is_empty() { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { @@ -264,10 +269,17 @@ impl FullTextParser { } // parse html - Self::parse_html_string_patched(html.as_str()).map_err(|err| { + let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml - }) + })?; + + if document.get_root_element().is_none() { + log::error!("document without root"); + Err(FullTextParserError::Xml) + } else { + Ok(document) + } } /// FIXME: Here are some patched functions of libxml crate. @@ -275,7 +287,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. @@ -354,63 +366,73 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - log::debug!("Encoding extracted from headers: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + + if headers + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|hv| hv.to_str().ok()) + .and_then(|str| str.parse::().ok()) + .map(|content_length| content_length == 0) + .unwrap_or(false) + { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + } } pub async fn download( @@ -422,7 +444,12 @@ impl FullTextParser { let headers = Util::generate_headers(config, global_config)?; let response = Self::get_response(url, client, headers).await?; let body = Self::get_body(response).await?; - Ok(body) + if body.is_empty() { + log::error!("Empty response body"); + Err(FullTextParserError::Http) + } else { + Ok(body) + } } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index de0f48f..4be98b8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -219,7 +219,9 @@ impl ImageDownloader { let mut image_urls = Vec::new(); for node in node_vec { - image_urls.push(Self::harvest_image_urls_from_node(node)?); + if let Ok(url) = Self::harvest_image_urls_from_node(node) { + image_urls.push(url); + } } Ok(image_urls)