mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
port libxml workaround from hurl
This commit is contained in:
parent
6116ba38ae
commit
acb7d1d000
2 changed files with 64 additions and 16 deletions
|
@ -39,22 +39,8 @@ pub fn clean_html_fragment(
|
|||
) -> Result<CleanedHtml, FullTextParserError> {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
|
||||
let html = format!(
|
||||
r#"
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
</head>
|
||||
<body>
|
||||
{html_fragment}
|
||||
</body>
|
||||
</html>
|
||||
"#
|
||||
);
|
||||
|
||||
let empty_config = FtrConfigEntry::default();
|
||||
let document = FullTextParser::parse_html(&html, None, &empty_config)?;
|
||||
let document = FullTextParser::parse_html(html_fragment, None, &empty_config)?;
|
||||
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
||||
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
|
||||
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
|
||||
|
|
|
@ -265,12 +265,74 @@ impl FullTextParser {
|
|||
|
||||
// parse html
|
||||
let parser = Parser::default_html();
|
||||
parser.parse_string(html.as_str()).map_err(|err| {
|
||||
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
|
||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||
FullTextParserError::Xml
|
||||
})
|
||||
}
|
||||
|
||||
/// FIXME: Here are some patched functions of libxml crate.
|
||||
/// Started from libxml 2.11.1+, we have some encoding issue.
|
||||
/// See:
|
||||
/// - <https://github.com/KWARC/rust-libxml/issues/111>
|
||||
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
||||
/// These two functions should be removed when the issue is fixed in libxml crate.
|
||||
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
||||
if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) {
|
||||
// Cannot safely use our value comparison, but the conversion if always safe.
|
||||
// Or, if the value can be safely represented as a 32-bit signed integer.
|
||||
Ok(value as i32)
|
||||
} else {
|
||||
// Document too large, cannot parse using libxml2.
|
||||
Err(libxml::parser::XmlParseError::DocumentTooLarge)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_html_string_patched(
|
||||
input: &str,
|
||||
parser: &Parser,
|
||||
) -> Result<Document, libxml::parser::XmlParseError> {
|
||||
let input_bytes: &[u8] = input.as_ref();
|
||||
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
|
||||
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
|
||||
let encoding = std::ffi::CString::new("utf-8").unwrap();
|
||||
let encoding_ptr = encoding.as_ptr();
|
||||
let url_ptr = std::ptr::null();
|
||||
|
||||
// HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
|
||||
let options = 1 + 32;
|
||||
match parser.format {
|
||||
libxml::parser::ParseFormat::XML => unsafe {
|
||||
let doc_ptr = libxml::bindings::xmlReadMemory(
|
||||
input_ptr,
|
||||
input_len,
|
||||
url_ptr,
|
||||
encoding_ptr,
|
||||
options,
|
||||
);
|
||||
if doc_ptr.is_null() {
|
||||
Err(libxml::parser::XmlParseError::GotNullPointer)
|
||||
} else {
|
||||
Ok(Document::new_ptr(doc_ptr))
|
||||
}
|
||||
},
|
||||
libxml::parser::ParseFormat::HTML => unsafe {
|
||||
let docptr = libxml::bindings::htmlReadMemory(
|
||||
input_ptr,
|
||||
input_len,
|
||||
url_ptr,
|
||||
encoding_ptr,
|
||||
options,
|
||||
);
|
||||
if docptr.is_null() {
|
||||
Err(libxml::parser::XmlParseError::GotNullPointer)
|
||||
} else {
|
||||
Ok(Document::new_ptr(docptr))
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
|
||||
Context::new(doc).map_err(|()| {
|
||||
log::error!("Creating xpath context failed for downloaded HTML");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue