diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 16868b2..37a5f32 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -264,8 +264,7 @@ impl FullTextParser { } // parse html - let parser = Parser::default_html(); - Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| { + Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) @@ -278,7 +277,7 @@ impl FullTextParser { /// - /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { - if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) { + if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. // Or, if the value can be safely represented as a 32-bit signed integer. Ok(value as i32) @@ -290,8 +289,12 @@ impl FullTextParser { pub(crate) fn parse_html_string_patched( input: &str, - parser: &Parser, ) -> Result { + unsafe { + // https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety + libxml::bindings::xmlInitParser(); + } + let parser = Parser::default_html(); let input_bytes: &[u8] = input.as_ref(); let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char; let input_len = Self::try_usize_to_i32(input_bytes.len())?; @@ -488,7 +491,7 @@ impl FullTextParser { } pub fn thumbnail_from_html(html: &str) -> Option { - if let Ok(doc) = Parser::default_html().parse_string(html) { + if let Ok(doc) = Self::parse_html_string_patched(html) { if let Ok(ctx) = Self::get_xpath_ctx(&doc) { return Self::check_for_thumbnail(&ctx); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 0f0370f..99a5235 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -1,5 +1,5 @@ use super::{config::ConfigEntry, FullTextParser}; -use libxml::{parser::Parser, tree::SaveOptions, xpath::Context}; +use libxml::{tree::SaveOptions, xpath::Context}; use reqwest::{Client, Url}; async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) { @@ -194,7 +194,7 @@ herausgebracht. (Fortschritt, Wissenschaft) "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); @@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav SalekĀ / IMAGO/CTK Photo "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); diff --git a/article_scraper/src/images/image_data.rs b/article_scraper/src/images/image_data.rs index 2095f27..b26cfec 100644 --- a/article_scraper/src/images/image_data.rs +++ b/article_scraper/src/images/image_data.rs @@ -2,7 +2,6 @@ pub struct ImageData { pub url: String, pub data: Vec, - pub content_length: usize, pub content_type: String, } diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 6d97fd8..de0f48f 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError; use self::image_data::ImageDataBase64; use self::pair::Pair; use self::request::ImageRequest; -use crate::constants; use crate::util::Util; +use crate::{constants, FullTextParser}; use base64::Engine; use futures::StreamExt; use image::ImageFormat; -use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; use libxml::xpath::Context; pub use progress::Progress; @@ -162,9 +161,7 @@ impl ImageDownloader { html: &str, downloaded_images: Vec>, ) -> Result { - let parser = Parser::default_html(); - let doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { @@ -207,9 +204,7 @@ impl ImageDownloader { } fn harvest_image_urls_from_html(html: &str) -> Result>, ImageDownloadError> { - let parser = Parser::default_html(); - let doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index b7086ce..fe9adf0 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -48,7 +48,6 @@ impl ImageRequest { Ok(ImageData { url: self.url, data: result, - content_length: self.content_length, content_type: self.content_type, }) } diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index cbd5370..df76ced 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1299,13 +1299,11 @@ impl Util { mod tests { use super::Util; use crate::FullTextParser; - use libxml::parser::Parser; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let div = body.get_first_child().unwrap(); @@ -1346,8 +1344,7 @@ mod tests { fn replace_emojis(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let p = body.get_first_child().unwrap();