mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
Initialize the libxml2 parser according to the libxml2 thread-safety notes (https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety)
This commit is contained in:
parent
f4e4e64b9e
commit
c16e11fdda
6 changed files with 16 additions and 23 deletions
|
@ -264,8 +264,7 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
// parse html
|
||||
let parser = Parser::default_html();
|
||||
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
|
||||
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||
FullTextParserError::Xml
|
||||
})
|
||||
|
@ -278,7 +277,7 @@ impl FullTextParser {
|
|||
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
||||
/// These two functions should be removed once the issue is fixed in the libxml crate.
|
||||
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
||||
if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) {
|
||||
if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
|
||||
// Cannot safely use our value comparison, but the conversion is always safe.
|
||||
// Or, if the value can be safely represented as a 32-bit signed integer.
|
||||
Ok(value as i32)
|
||||
|
@ -290,8 +289,12 @@ impl FullTextParser {
|
|||
|
||||
pub(crate) fn parse_html_string_patched(
|
||||
input: &str,
|
||||
parser: &Parser,
|
||||
) -> Result<Document, libxml::parser::XmlParseError> {
|
||||
unsafe {
|
||||
// https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety
|
||||
libxml::bindings::xmlInitParser();
|
||||
}
|
||||
let parser = Parser::default_html();
|
||||
let input_bytes: &[u8] = input.as_ref();
|
||||
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
|
||||
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
|
||||
|
@ -488,7 +491,7 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
pub fn thumbnail_from_html(html: &str) -> Option<String> {
|
||||
if let Ok(doc) = Parser::default_html().parse_string(html) {
|
||||
if let Ok(doc) = Self::parse_html_string_patched(html) {
|
||||
if let Ok(ctx) = Self::get_xpath_ctx(&doc) {
|
||||
return Self::check_for_thumbnail(&ctx);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use super::{config::ConfigEntry, FullTextParser};
|
||||
use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
|
||||
use libxml::{tree::SaveOptions, xpath::Context};
|
||||
use reqwest::{Client, Url};
|
||||
|
||||
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
|
||||
|
@ -194,7 +194,7 @@ herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noope
|
|||
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
|
||||
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
|
||||
"#;
|
||||
let doc = Parser::default_html().parse_string(html).unwrap();
|
||||
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
|
||||
let ctx = Context::new(&doc).unwrap();
|
||||
|
||||
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
||||
|
@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
|
|||
</section></article>
|
||||
"#;
|
||||
|
||||
let doc = Parser::default_html().parse_string(html).unwrap();
|
||||
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
|
||||
let ctx = Context::new(&doc).unwrap();
|
||||
|
||||
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
pub struct ImageData {
|
||||
pub url: String,
|
||||
pub data: Vec<u8>,
|
||||
pub content_length: usize,
|
||||
pub content_type: String,
|
||||
}
|
||||
|
||||
|
|
|
@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError;
|
|||
use self::image_data::ImageDataBase64;
|
||||
use self::pair::Pair;
|
||||
use self::request::ImageRequest;
|
||||
use crate::constants;
|
||||
use crate::util::Util;
|
||||
use crate::{constants, FullTextParser};
|
||||
use base64::Engine;
|
||||
use futures::StreamExt;
|
||||
use image::ImageFormat;
|
||||
use libxml::parser::Parser;
|
||||
use libxml::tree::{Node, SaveOptions};
|
||||
use libxml::xpath::Context;
|
||||
pub use progress::Progress;
|
||||
|
@ -162,9 +161,7 @@ impl ImageDownloader {
|
|||
html: &str,
|
||||
downloaded_images: Vec<Pair<ImageDataBase64>>,
|
||||
) -> Result<String, ImageDownloadError> {
|
||||
let parser = Parser::default_html();
|
||||
let doc = parser
|
||||
.parse_string(html)
|
||||
let doc = FullTextParser::parse_html_string_patched(html)
|
||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||
|
||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||
|
@ -207,9 +204,7 @@ impl ImageDownloader {
|
|||
}
|
||||
|
||||
fn harvest_image_urls_from_html(html: &str) -> Result<Vec<Pair<String>>, ImageDownloadError> {
|
||||
let parser = Parser::default_html();
|
||||
let doc = parser
|
||||
.parse_string(html)
|
||||
let doc = FullTextParser::parse_html_string_patched(html)
|
||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||
|
||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||
|
|
|
@ -48,7 +48,6 @@ impl ImageRequest {
|
|||
Ok(ImageData {
|
||||
url: self.url,
|
||||
data: result,
|
||||
content_length: self.content_length,
|
||||
content_type: self.content_type,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1299,13 +1299,11 @@ impl Util {
|
|||
mod tests {
|
||||
use super::Util;
|
||||
use crate::FullTextParser;
|
||||
use libxml::parser::Parser;
|
||||
|
||||
fn replace_brs(source: &str, expected: &str) {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
|
||||
let parser = Parser::default_html();
|
||||
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
|
||||
let document = FullTextParser::parse_html_string_patched(source).unwrap();
|
||||
let root = document.get_root_element().unwrap();
|
||||
let body = root.get_first_child().unwrap();
|
||||
let div = body.get_first_child().unwrap();
|
||||
|
@ -1346,8 +1344,7 @@ mod tests {
|
|||
fn replace_emojis(source: &str, expected: &str) {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
|
||||
let parser = Parser::default_html();
|
||||
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
|
||||
let document = FullTextParser::parse_html_string_patched(source).unwrap();
|
||||
let root = document.get_root_element().unwrap();
|
||||
let body = root.get_first_child().unwrap();
|
||||
let p = body.get_first_child().unwrap();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue