From acb7d1d000b730ab3bdae5c7538a9550de2d1cb9 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 02:09:07 +0200 Subject: [PATCH] port libxml workaround from hurl --- article_scraper/src/clean.rs | 16 +----- article_scraper/src/full_text_parser/mod.rs | 64 ++++++++++++++++++++- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 790bf9d..81297a2 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -39,22 +39,8 @@ pub fn clean_html_fragment( ) -> Result { libxml::tree::node::set_node_rc_guard(10); - let html = format!( - r#" - - - - - - - {html_fragment} - - - "# - ); - let empty_config = FtrConfigEntry::default(); - let document = FullTextParser::parse_html(&html, None, &empty_config)?; + let document = FullTextParser::parse_html(html_fragment, None, &empty_config)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4f39017..325cd10 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -265,12 +265,74 @@ impl FullTextParser { // parse html let parser = Parser::default_html(); - parser.parse_string(html.as_str()).map_err(|err| { + Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) } + /// FIXME: Here are some patched functions of libxml crate. + /// Started from libxml 2.11.1+, we have some encoding issue. + /// See: + /// - + /// - + /// These two functions should be removed when the issue is fixed in libxml crate. 
+    ///
+    /// Converts a buffer length into the `i32` that libxml2's C API expects.
+    ///
+    /// # Errors
+    ///
+    /// Returns `XmlParseError::DocumentTooLarge` when `value` does not fit in an `i32`.
+    fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
+        // `i32::try_from` accepts every length up to and including `i32::MAX` on any
+        // pointer width, so no target-specific special case is required.
+        i32::try_from(value).map_err(|_| libxml::parser::XmlParseError::DocumentTooLarge)
+    }
+
+    /// Parses `input` in the format selected by `parser.format`, always passing
+    /// `"utf-8"` to libxml2 as the document encoding.
+    ///
+    /// # Errors
+    ///
+    /// * `XmlParseError::DocumentTooLarge` if the byte length of `input` does not fit in an
+    ///   `i32`.
+    /// * `XmlParseError::GotNullPointer` if libxml2 returns a null document pointer.
+    fn parse_html_string_patched(
+        input: &str,
+        parser: &Parser,
+    ) -> Result<Document, libxml::parser::XmlParseError> {
+        // libxml2 uses the same numeric values for the XML and HTML variants of these flags
+        // (`XML_PARSE_RECOVER` / `HTML_PARSE_RECOVER` and `XML_PARSE_NOERROR` /
+        // `HTML_PARSE_NOERROR`), so one option set serves both parser entry points.
+        const PARSE_RECOVER: std::os::raw::c_int = 1;
+        const PARSE_NOERROR: std::os::raw::c_int = 32;
+
+        let input_bytes = input.as_bytes();
+        let input_ptr = input_bytes.as_ptr().cast::<std::os::raw::c_char>();
+        let input_len = Self::try_usize_to_i32(input_bytes.len())?;
+        let encoding =
+            std::ffi::CString::new("utf-8").expect("encoding literal contains no NUL byte");
+        let encoding_ptr = encoding.as_ptr();
+        let url_ptr = std::ptr::null();
+        let options = PARSE_RECOVER | PARSE_NOERROR;
+
+        // SAFETY: `input_ptr` and `input_len` describe the bytes of `input`, which stays
+        // borrowed for the whole call. `encoding_ptr` points into `encoding`, a NUL-terminated
+        // string that lives until the end of this function. `url_ptr` is null, exactly as in
+        // the code this replaces. The returned pointer is checked for null before it is
+        // handed to `Document::new_ptr`.
+        unsafe {
+            let doc_ptr = match parser.format {
+                libxml::parser::ParseFormat::XML => libxml::bindings::xmlReadMemory(
+                    input_ptr,
+                    input_len,
+                    url_ptr,
+                    encoding_ptr,
+                    options,
+                ),
+                libxml::parser::ParseFormat::HTML => libxml::bindings::htmlReadMemory(
+                    input_ptr,
+                    input_len,
+                    url_ptr,
+                    encoding_ptr,
+                    options,
+                ),
+            };
+
+            if doc_ptr.is_null() {
+                Err(libxml::parser::XmlParseError::GotNullPointer)
+            } else {
+                Ok(Document::new_ptr(doc_ptr))
+            }
+        }
+    }
+
     pub(crate) fn get_xpath_ctx(doc: &Document) -> Result {
         Context::new(doc).map_err(|()| {
             log::error!("Creating xpath context failed for downloaded HTML");