mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
extract & parse charsets other than utf8
This commit is contained in:
parent
97b194c9e8
commit
90383545e0
4 changed files with 50 additions and 27 deletions
|
@ -16,6 +16,8 @@ pub enum FullTextParserError {
|
|||
IO,
|
||||
#[error("Content-type suggest no html")]
|
||||
ContentType,
|
||||
#[error("Invalid UTF8 Text")]
|
||||
Utf8(#[from] std::str::Utf8Error),
|
||||
#[error("Unknown Error")]
|
||||
Unknown,
|
||||
}
|
||||
|
|
|
@ -21,12 +21,14 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
|
|||
);
|
||||
m.insert(
|
||||
"fingerprint.wordpress.com",
|
||||
regex::Regex::new(r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#)
|
||||
.expect("failed to build static regex"),
|
||||
regex::Regex::new(
|
||||
r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
|
||||
)
|
||||
.expect("failed to build static regex"),
|
||||
);
|
||||
m.insert(
|
||||
"fingerprint.ippen.media",
|
||||
regex::Regex::new(r#"/\\<div\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
|
||||
regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
|
||||
.expect("failed to build static regex"),
|
||||
);
|
||||
m
|
||||
|
|
|
@ -19,7 +19,7 @@ use log::{debug, error, info, warn};
|
|||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
use std::str::{from_utf8, FromStr};
|
||||
|
||||
pub struct FullTextParser {
|
||||
config_files: ConfigCollection,
|
||||
|
@ -264,25 +264,37 @@ impl FullTextParser {
|
|||
|
||||
if response.status().is_success() {
|
||||
let headers = response.headers().clone();
|
||||
let text = response
|
||||
.text()
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| FullTextParserError::Http)?;
|
||||
|
||||
if let Some(decoded_html) =
|
||||
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
||||
{
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
match from_utf8(&bytes) {
|
||||
Ok(utf8_str) => {
|
||||
debug!("Valid utf-8 string");
|
||||
return Ok(utf8_str.into());
|
||||
}
|
||||
Err(error) => {
|
||||
debug!("Invalid utf-8 string");
|
||||
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
|
||||
|
||||
if let Some(decoded_html) =
|
||||
Self::decode_html(&text, Self::get_encoding_from_http_header(&headers))
|
||||
{
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
|
||||
debug!("Encoding extracted from HTML: '{}'", encoding);
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
warn!("No encoding of HTML detected - assuming utf-8");
|
||||
return Ok(text);
|
||||
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
|
||||
debug!("Encoding extracted from headers: '{}'", encoding);
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
return Err(FullTextParserError::Utf8(error));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(FullTextParserError::Http)
|
||||
|
@ -303,7 +315,7 @@ impl FullTextParser {
|
|||
|
||||
fn get_encoding_from_html(html: &str) -> Option<&str> {
|
||||
let regex =
|
||||
regex::Regex::new(r#"<meta.*?charset=([^"']+)"#).expect("Failed to parse regex");
|
||||
regex::Regex::new(r#"<meta.*?charset="*(.*?)""#).expect("Failed to parse regex");
|
||||
if let Some(captures) = regex.captures(html) {
|
||||
if let Some(regex_match) = captures.get(1) {
|
||||
return Some(regex_match.as_str());
|
||||
|
@ -312,17 +324,15 @@ impl FullTextParser {
|
|||
None
|
||||
}
|
||||
|
||||
fn decode_html(html: &str, encoding: Option<&str>) -> Option<String> {
|
||||
if let Some(encoding) = encoding {
|
||||
if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) {
|
||||
let (decoded_html, _, invalid_chars) = encoding.decode(html.as_bytes());
|
||||
fn decode_html(bytes: &[u8], encoding: &str) -> Option<String> {
|
||||
if let Some(encoding) = Encoding::for_label(encoding.as_bytes()) {
|
||||
let (decoded_html, _, invalid_chars) = encoding.decode(bytes);
|
||||
|
||||
if !invalid_chars {
|
||||
return Some(decoded_html.into_owned());
|
||||
}
|
||||
if !invalid_chars {
|
||||
return Some(decoded_html.into_owned());
|
||||
}
|
||||
warn!("Could not decode HTML. Encoding: '{}'", encoding);
|
||||
}
|
||||
warn!("Could not decode HTML. Encoding: '{}'", encoding);
|
||||
None
|
||||
}
|
||||
|
||||
|
|
|
@ -63,3 +63,12 @@ async fn youtube() {
|
|||
.map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
|
||||
.unwrap_or(false));
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
async fn encoding_windows_1252() {
|
||||
let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
|
||||
let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new())
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(html.contains("Bund-Länder-Konferenz"));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue