1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

extract & parse charsets other than utf8

This commit is contained in:
Jan Lukas Gernert 2022-12-11 17:38:42 +01:00
parent 97b194c9e8
commit 90383545e0
4 changed files with 50 additions and 27 deletions

View file

@ -16,6 +16,8 @@ pub enum FullTextParserError {
IO, IO,
#[error("Content-type suggest no html")] #[error("Content-type suggest no html")]
ContentType, ContentType,
#[error("Invalid UTF8 Text")]
Utf8(#[from] std::str::Utf8Error),
#[error("Unknown Error")] #[error("Unknown Error")]
Unknown, Unknown,
} }

View file

@ -21,12 +21,14 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
); );
m.insert( m.insert(
"fingerprint.wordpress.com", "fingerprint.wordpress.com",
regex::Regex::new(r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#) regex::Regex::new(
.expect("failed to build static regex"), r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
)
.expect("failed to build static regex"),
); );
m.insert( m.insert(
"fingerprint.ippen.media", "fingerprint.ippen.media",
regex::Regex::new(r#"/\\<div\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#) regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
.expect("failed to build static regex"), .expect("failed to build static regex"),
); );
m m

View file

@ -19,7 +19,7 @@ use log::{debug, error, info, warn};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::Client; use reqwest::Client;
use std::path::Path; use std::path::Path;
use std::str::FromStr; use std::str::{from_utf8, FromStr};
pub struct FullTextParser { pub struct FullTextParser {
config_files: ConfigCollection, config_files: ConfigCollection,
@ -264,25 +264,37 @@ impl FullTextParser {
if response.status().is_success() { if response.status().is_success() {
let headers = response.headers().clone(); let headers = response.headers().clone();
let text = response let bytes = response
.text() .bytes()
.await .await
.map_err(|_| FullTextParserError::Http)?; .map_err(|_| FullTextParserError::Http)?;
if let Some(decoded_html) = match from_utf8(&bytes) {
Self::decode_html(&text, Self::get_encoding_from_html(&text)) Ok(utf8_str) => {
{ debug!("Valid utf-8 string");
return Ok(decoded_html); return Ok(utf8_str.into());
} }
Err(error) => {
debug!("Invalid utf-8 string");
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
if let Some(decoded_html) = if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
Self::decode_html(&text, Self::get_encoding_from_http_header(&headers)) debug!("Encoding extracted from HTML: '{}'", encoding);
{ if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
return Ok(decoded_html); return Ok(decoded_html);
} }
}
warn!("No encoding of HTML detected - assuming utf-8"); if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
return Ok(text); debug!("Encoding extracted from headers: '{}'", encoding);
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
return Ok(decoded_html);
}
}
return Err(FullTextParserError::Utf8(error));
}
}
} }
Err(FullTextParserError::Http) Err(FullTextParserError::Http)
@ -303,7 +315,7 @@ impl FullTextParser {
fn get_encoding_from_html(html: &str) -> Option<&str> { fn get_encoding_from_html(html: &str) -> Option<&str> {
let regex = let regex =
regex::Regex::new(r#"<meta.*?charset=([^"']+)"#).expect("Failed to parse regex"); regex::Regex::new(r#"<meta.*?charset="*(.*?)""#).expect("Failed to parse regex");
if let Some(captures) = regex.captures(html) { if let Some(captures) = regex.captures(html) {
if let Some(regex_match) = captures.get(1) { if let Some(regex_match) = captures.get(1) {
return Some(regex_match.as_str()); return Some(regex_match.as_str());
@ -312,17 +324,15 @@ impl FullTextParser {
None None
} }
fn decode_html(html: &str, encoding: Option<&str>) -> Option<String> { fn decode_html(bytes: &[u8], encoding: &str) -> Option<String> {
if let Some(encoding) = encoding { if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) {
if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) { let (decoded_html, _, invalid_chars) = encoding.decode(bytes);
let (decoded_html, _, invalid_chars) = encoding.decode(html.as_bytes());
if !invalid_chars { if !invalid_chars {
return Some(decoded_html.into_owned()); return Some(decoded_html.into_owned());
}
} }
warn!("Could not decode HTML. Encoding: '{}'", encoding);
} }
warn!("Could not decode HTML. Encoding: '{}'", encoding);
None None
} }

View file

@ -63,3 +63,12 @@ async fn youtube() {
.map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed")) .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
.unwrap_or(false)); .unwrap_or(false));
} }
// Integration test: downloads a live article page through
// `FullTextParser::download` and checks that the returned HTML contains a
// German phrase with non-ASCII characters.
// NOTE(review): judging by the test name, the page is presumably served as
// windows-1252 — confirm the site still uses that charset. Needs network access.
#[tokio::test(flavor = "current_thread")]
async fn encoding_windows_1252() {
    let article_url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
    let http_client = Client::new();
    let extra_headers = reqwest::header::HeaderMap::new();

    let download = FullTextParser::download(&article_url, &http_client, extra_headers).await;
    let page = download.unwrap();

    // The expected phrase contains umlauts, so a match presumably indicates
    // the body was decoded with the right charset.
    assert!(page.contains("Bund-Länder-Konferenz"));
}