From dd958fe30f5960a00107498f58ad2ced8446a036 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 26 Apr 2023 07:44:10 +0200 Subject: [PATCH] fix encoding --- article_scraper/src/full_text_parser/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index e130ba8..8250b02 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -381,6 +381,7 @@ impl FullTextParser { if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { log::debug!("Encoding extracted from HTML: '{}'", encoding); if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replace(&format!("charset=\"{encoding}\""), "charset=\"utf-8\""); return Ok(decoded_html); } } @@ -388,6 +389,7 @@ impl FullTextParser { if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { log::debug!("Encoding extracted from headers: '{}'", encoding); if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replace(&format!("charset=\"{encoding}\""), "charset=\"utf-8\""); return Ok(decoded_html); } }