1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

port libxml workaround from hurl

This commit is contained in:
Jan Lukas Gernert 2023-08-10 02:09:07 +02:00
parent 6116ba38ae
commit acb7d1d000
2 changed files with 64 additions and 16 deletions

View file

@ -39,22 +39,8 @@ pub fn clean_html_fragment(
) -> Result<CleanedHtml, FullTextParserError> {
libxml::tree::node::set_node_rc_guard(10);
let html = format!(
r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body>
{html_fragment}
</body>
</html>
"#
);
let empty_config = FtrConfigEntry::default();
let document = FullTextParser::parse_html(&html, None, &empty_config)?;
let document = FullTextParser::parse_html(html_fragment, None, &empty_config)?;
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);

View file

@ -265,12 +265,74 @@ impl FullTextParser {
// parse html
let parser = Parser::default_html();
parser.parse_string(html.as_str()).map_err(|err| {
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserError::Xml
})
}
/// FIXME: This function and `parse_html_string_patched` are patched copies of
/// functions from the libxml crate.
/// Starting with libxml 2.11.1+ there is an encoding issue.
/// See:
/// - <https://github.com/KWARC/rust-libxml/issues/111>
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
/// These two functions should be removed when the issue is fixed in the libxml crate.
///
/// Converts a buffer length into the `i32` that the libxml2 read functions take
/// as their size argument.
///
/// # Errors
///
/// Returns `XmlParseError::DocumentTooLarge` when `value` does not fit into an
/// `i32`.
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
    // A checked conversion accepts every length up to and including `i32::MAX`
    // and is also correct on targets where `usize` is narrower than `i32`, so no
    // pointer-width special case is needed. The trait is named by its full path
    // so this compiles regardless of whether `TryFrom` is in the prelude.
    <i32 as std::convert::TryFrom<usize>>::try_from(value)
        // Document too large, cannot parse using libxml2.
        .map_err(|_| libxml::parser::XmlParseError::DocumentTooLarge)
}
/// Parses `input` by calling the libxml2 memory readers directly, always passing
/// `"utf-8"` as the encoding name.
///
/// `parser.format` selects the reader: `xmlReadMemory` for XML and
/// `htmlReadMemory` for HTML. Both receive the same option flags and a null URL.
///
/// # Errors
///
/// - `XmlParseError::DocumentTooLarge` if the input length is rejected by
///   `try_usize_to_i32`.
/// - `XmlParseError::GotNullPointer` if the reader returns a null document.
fn parse_html_string_patched(
    input: &str,
    parser: &Parser,
) -> Result<Document, libxml::parser::XmlParseError> {
    // HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
    let options = 1 | 32;

    // Raw view of the input buffer together with its length as libxml2 wants it.
    let bytes = input.as_bytes();
    let len = Self::try_usize_to_i32(bytes.len())?;
    let buffer = bytes.as_ptr() as *const std::os::raw::c_char;

    // Keep the CString bound to a local so the pointer handed to libxml2 stays
    // valid for the whole call.
    let utf8 = std::ffi::CString::new("utf-8").unwrap();
    let no_url = std::ptr::null();

    // SAFETY: `buffer` points at `len` readable bytes borrowed from `input`, and
    // `utf8` is a NUL-terminated string that outlives the reader call.
    unsafe {
        let raw_doc = match parser.format {
            libxml::parser::ParseFormat::XML => {
                libxml::bindings::xmlReadMemory(buffer, len, no_url, utf8.as_ptr(), options)
            }
            libxml::parser::ParseFormat::HTML => {
                libxml::bindings::htmlReadMemory(buffer, len, no_url, utf8.as_ptr(), options)
            }
        };

        // Both readers signal failure the same way, so one check covers both.
        if raw_doc.is_null() {
            Err(libxml::parser::XmlParseError::GotNullPointer)
        } else {
            Ok(Document::new_ptr(raw_doc))
        }
    }
}
pub(crate) fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
Context::new(doc).map_err(|()| {
log::error!("Creating xpath context failed for downloaded HTML");