From 90383545e0fa0a9384352705f116d16ae206e36b Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 11 Dec 2022 17:38:42 +0100 Subject: [PATCH] extract & parse charsets other than utf8 --- src/full_text_parser/error.rs | 2 + src/full_text_parser/fingerprints.rs | 8 ++-- src/full_text_parser/mod.rs | 58 ++++++++++++++++------------ src/full_text_parser/tests.rs | 9 +++++ 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/full_text_parser/error.rs b/src/full_text_parser/error.rs index 6792e62..3aa0f0c 100644 --- a/src/full_text_parser/error.rs +++ b/src/full_text_parser/error.rs @@ -16,6 +16,8 @@ pub enum FullTextParserError { IO, #[error("Content-type suggest no html")] ContentType, + #[error("Invalid UTF8 Text")] + Utf8(#[from] std::str::Utf8Error), #[error("Unknown Error")] Unknown, } diff --git a/src/full_text_parser/fingerprints.rs b/src/full_text_parser/fingerprints.rs index 08c4bd2..a65db91 100644 --- a/src/full_text_parser/fingerprints.rs +++ b/src/full_text_parser/fingerprints.rs @@ -21,12 +21,14 @@ static FINGERPRINT_REGEXES: Lazy> = Lazy::new(|| { ); m.insert( "fingerprint.wordpress.com", - regex::Regex::new(r#"/\\/i"#) + regex::Regex::new(r#"/\\/i"#) .expect("failed to build static regex"), ); m diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 7f91996..d109c63 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -19,7 +19,7 @@ use log::{debug, error, info, warn}; use reqwest::header::HeaderMap; use reqwest::Client; use std::path::Path; -use std::str::FromStr; +use std::str::{from_utf8, FromStr}; pub struct FullTextParser { config_files: ConfigCollection, @@ -264,25 +264,37 @@ impl FullTextParser { if response.status().is_success() { let headers = response.headers().clone(); - let text = response - .text() + let bytes = response + .bytes() .await .map_err(|_| FullTextParserError::Http)?; - if let Some(decoded_html) = - Self::decode_html(&text, Self::get_encoding_from_html(&text)) - { - return Ok(decoded_html); - } + match from_utf8(&bytes) { + Ok(utf8_str) => { + debug!("Valid utf-8 string"); + return Ok(utf8_str.into()); + } + Err(error) => { + debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); - if let Some(decoded_html) = - Self::decode_html(&text, Self::get_encoding_from_http_header(&headers)) - { - return Ok(decoded_html); - } + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + debug!("Encoding extracted from HTML: '{}'", encoding); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + return Ok(decoded_html); + } + } - warn!("No encoding of HTML detected - assuming utf-8"); - return Ok(text); + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + debug!("Encoding extracted from headers: '{}'", encoding); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + return Ok(decoded_html); + } + } + + return Err(FullTextParserError::Utf8(error)); + } + } } Err(FullTextParserError::Http) @@ -303,7 +315,7 @@ impl FullTextParser { fn get_encoding_from_html(html: &str) -> Option<&str> { let regex = - regex::Regex::new(r#") -> Option { - if let Some(encoding) = encoding { - if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) { - let (decoded_html, _, invalid_chars) = encoding.decode(html.as_bytes()); + fn decode_html(bytes: &[u8], encoding: &str) -> Option { + if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) { + let (decoded_html, _, invalid_chars) = encoding.decode(bytes); - if !invalid_chars { - return Some(decoded_html.into_owned()); - } + if !invalid_chars { + return Some(decoded_html.into_owned()); } - warn!("Could not decode HTML. Encoding: '{}'", encoding); } + warn!("Could not decode HTML. Encoding: '{}'", encoding); None } diff --git a/src/full_text_parser/tests.rs b/src/full_text_parser/tests.rs index 896fa55..6f5e962 100644 --- a/src/full_text_parser/tests.rs +++ b/src/full_text_parser/tests.rs @@ -63,3 +63,12 @@ async fn youtube() { .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed")) .unwrap_or(false)); } + +#[tokio::test(flavor = "current_thread")] +async fn encoding_windows_1252() { + let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap(); + let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new()) + .await + .unwrap(); + assert!(html.contains("Bund-Länder-Konferenz")); +}