mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
port libxml workaround from hurl
This commit is contained in:
parent
6116ba38ae
commit
acb7d1d000
2 changed files with 64 additions and 16 deletions
|
@ -39,22 +39,8 @@ pub fn clean_html_fragment(
|
||||||
) -> Result<CleanedHtml, FullTextParserError> {
|
) -> Result<CleanedHtml, FullTextParserError> {
|
||||||
libxml::tree::node::set_node_rc_guard(10);
|
libxml::tree::node::set_node_rc_guard(10);
|
||||||
|
|
||||||
let html = format!(
|
|
||||||
r#"
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
{html_fragment}
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"#
|
|
||||||
);
|
|
||||||
|
|
||||||
let empty_config = FtrConfigEntry::default();
|
let empty_config = FtrConfigEntry::default();
|
||||||
let document = FullTextParser::parse_html(&html, None, &empty_config)?;
|
let document = FullTextParser::parse_html(html_fragment, None, &empty_config)?;
|
||||||
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
||||||
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
|
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
|
||||||
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
|
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
|
||||||
|
|
|
@ -265,12 +265,74 @@ impl FullTextParser {
|
||||||
|
|
||||||
// parse html
|
// parse html
|
||||||
let parser = Parser::default_html();
|
let parser = Parser::default_html();
|
||||||
parser.parse_string(html.as_str()).map_err(|err| {
|
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
|
||||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||||
FullTextParserError::Xml
|
FullTextParserError::Xml
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// FIXME: The functions below are patched versions of functions from the libxml crate.
|
||||||
|
/// Starting with libxml 2.11.1, there are encoding issues.
|
||||||
|
/// See:
|
||||||
|
/// - <https://github.com/KWARC/rust-libxml/issues/111>
|
||||||
|
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
||||||
|
/// These two functions should be removed once the issue is fixed in the libxml crate.
|
||||||
|
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
||||||
|
if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) {
|
||||||
|
// Cannot safely use our value comparison, but the conversion if always safe.
|
||||||
|
// Or, if the value can be safely represented as a 32-bit signed integer.
|
||||||
|
Ok(value as i32)
|
||||||
|
} else {
|
||||||
|
// Document too large, cannot parse using libxml2.
|
||||||
|
Err(libxml::parser::XmlParseError::DocumentTooLarge)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_html_string_patched(
|
||||||
|
input: &str,
|
||||||
|
parser: &Parser,
|
||||||
|
) -> Result<Document, libxml::parser::XmlParseError> {
|
||||||
|
let input_bytes: &[u8] = input.as_ref();
|
||||||
|
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
|
||||||
|
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
|
||||||
|
let encoding = std::ffi::CString::new("utf-8").unwrap();
|
||||||
|
let encoding_ptr = encoding.as_ptr();
|
||||||
|
let url_ptr = std::ptr::null();
|
||||||
|
|
||||||
|
// HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
|
||||||
|
let options = 1 + 32;
|
||||||
|
match parser.format {
|
||||||
|
libxml::parser::ParseFormat::XML => unsafe {
|
||||||
|
let doc_ptr = libxml::bindings::xmlReadMemory(
|
||||||
|
input_ptr,
|
||||||
|
input_len,
|
||||||
|
url_ptr,
|
||||||
|
encoding_ptr,
|
||||||
|
options,
|
||||||
|
);
|
||||||
|
if doc_ptr.is_null() {
|
||||||
|
Err(libxml::parser::XmlParseError::GotNullPointer)
|
||||||
|
} else {
|
||||||
|
Ok(Document::new_ptr(doc_ptr))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
libxml::parser::ParseFormat::HTML => unsafe {
|
||||||
|
let docptr = libxml::bindings::htmlReadMemory(
|
||||||
|
input_ptr,
|
||||||
|
input_len,
|
||||||
|
url_ptr,
|
||||||
|
encoding_ptr,
|
||||||
|
options,
|
||||||
|
);
|
||||||
|
if docptr.is_null() {
|
||||||
|
Err(libxml::parser::XmlParseError::GotNullPointer)
|
||||||
|
} else {
|
||||||
|
Ok(Document::new_ptr(docptr))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
|
pub(crate) fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
|
||||||
Context::new(doc).map_err(|()| {
|
Context::new(doc).map_err(|()| {
|
||||||
log::error!("Creating xpath context failed for downloaded HTML");
|
log::error!("Creating xpath context failed for downloaded HTML");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue