mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
init parser according to (https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety)
This commit is contained in:
parent
f4e4e64b9e
commit
c16e11fdda
6 changed files with 16 additions and 23 deletions
|
@ -264,8 +264,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// parse html
|
// parse html
|
||||||
let parser = Parser::default_html();
|
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||||
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
|
|
||||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||||
FullTextParserError::Xml
|
FullTextParserError::Xml
|
||||||
})
|
})
|
||||||
|
@ -278,7 +277,7 @@ impl FullTextParser {
|
||||||
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
||||||
/// These two functions should be removed when the issue is fixed in libxml crate.
|
/// These two functions should be removed when the issue is fixed in libxml crate.
|
||||||
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
||||||
if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) {
|
if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
|
||||||
// Cannot safely use our value comparison, but the conversion is always safe.
|
// Cannot safely use our value comparison, but the conversion is always safe.
|
||||||
// Or, if the value can be safely represented as a 32-bit signed integer.
|
// Or, if the value can be safely represented as a 32-bit signed integer.
|
||||||
Ok(value as i32)
|
Ok(value as i32)
|
||||||
|
@ -290,8 +289,12 @@ impl FullTextParser {
|
||||||
|
|
||||||
pub(crate) fn parse_html_string_patched(
|
pub(crate) fn parse_html_string_patched(
|
||||||
input: &str,
|
input: &str,
|
||||||
parser: &Parser,
|
|
||||||
) -> Result<Document, libxml::parser::XmlParseError> {
|
) -> Result<Document, libxml::parser::XmlParseError> {
|
||||||
|
unsafe {
|
||||||
|
// https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety
|
||||||
|
libxml::bindings::xmlInitParser();
|
||||||
|
}
|
||||||
|
let parser = Parser::default_html();
|
||||||
let input_bytes: &[u8] = input.as_ref();
|
let input_bytes: &[u8] = input.as_ref();
|
||||||
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
|
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
|
||||||
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
|
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
|
||||||
|
@ -488,7 +491,7 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn thumbnail_from_html(html: &str) -> Option<String> {
|
pub fn thumbnail_from_html(html: &str) -> Option<String> {
|
||||||
if let Ok(doc) = Parser::default_html().parse_string(html) {
|
if let Ok(doc) = Self::parse_html_string_patched(html) {
|
||||||
if let Ok(ctx) = Self::get_xpath_ctx(&doc) {
|
if let Ok(ctx) = Self::get_xpath_ctx(&doc) {
|
||||||
return Self::check_for_thumbnail(&ctx);
|
return Self::check_for_thumbnail(&ctx);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use super::{config::ConfigEntry, FullTextParser};
|
use super::{config::ConfigEntry, FullTextParser};
|
||||||
use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
|
use libxml::{tree::SaveOptions, xpath::Context};
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
|
|
||||||
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
|
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
|
||||||
|
@ -194,7 +194,7 @@ herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noope
|
||||||
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
|
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
|
||||||
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
|
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
|
||||||
"#;
|
"#;
|
||||||
let doc = Parser::default_html().parse_string(html).unwrap();
|
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
|
||||||
let ctx = Context::new(&doc).unwrap();
|
let ctx = Context::new(&doc).unwrap();
|
||||||
|
|
||||||
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
||||||
|
@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
|
||||||
</section></article>
|
</section></article>
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
let doc = Parser::default_html().parse_string(html).unwrap();
|
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
|
||||||
let ctx = Context::new(&doc).unwrap();
|
let ctx = Context::new(&doc).unwrap();
|
||||||
|
|
||||||
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
pub struct ImageData {
|
pub struct ImageData {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub data: Vec<u8>,
|
pub data: Vec<u8>,
|
||||||
pub content_length: usize,
|
|
||||||
pub content_type: String,
|
pub content_type: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError;
|
||||||
use self::image_data::ImageDataBase64;
|
use self::image_data::ImageDataBase64;
|
||||||
use self::pair::Pair;
|
use self::pair::Pair;
|
||||||
use self::request::ImageRequest;
|
use self::request::ImageRequest;
|
||||||
use crate::constants;
|
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
|
use crate::{constants, FullTextParser};
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use image::ImageFormat;
|
use image::ImageFormat;
|
||||||
use libxml::parser::Parser;
|
|
||||||
use libxml::tree::{Node, SaveOptions};
|
use libxml::tree::{Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
pub use progress::Progress;
|
pub use progress::Progress;
|
||||||
|
@ -162,9 +161,7 @@ impl ImageDownloader {
|
||||||
html: &str,
|
html: &str,
|
||||||
downloaded_images: Vec<Pair<ImageDataBase64>>,
|
downloaded_images: Vec<Pair<ImageDataBase64>>,
|
||||||
) -> Result<String, ImageDownloadError> {
|
) -> Result<String, ImageDownloadError> {
|
||||||
let parser = Parser::default_html();
|
let doc = FullTextParser::parse_html_string_patched(html)
|
||||||
let doc = parser
|
|
||||||
.parse_string(html)
|
|
||||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
|
|
||||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||||
|
@ -207,9 +204,7 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn harvest_image_urls_from_html(html: &str) -> Result<Vec<Pair<String>>, ImageDownloadError> {
|
fn harvest_image_urls_from_html(html: &str) -> Result<Vec<Pair<String>>, ImageDownloadError> {
|
||||||
let parser = Parser::default_html();
|
let doc = FullTextParser::parse_html_string_patched(html)
|
||||||
let doc = parser
|
|
||||||
.parse_string(html)
|
|
||||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
|
|
||||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||||
|
|
|
@ -48,7 +48,6 @@ impl ImageRequest {
|
||||||
Ok(ImageData {
|
Ok(ImageData {
|
||||||
url: self.url,
|
url: self.url,
|
||||||
data: result,
|
data: result,
|
||||||
content_length: self.content_length,
|
|
||||||
content_type: self.content_type,
|
content_type: self.content_type,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1299,13 +1299,11 @@ impl Util {
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::Util;
|
use super::Util;
|
||||||
use crate::FullTextParser;
|
use crate::FullTextParser;
|
||||||
use libxml::parser::Parser;
|
|
||||||
|
|
||||||
fn replace_brs(source: &str, expected: &str) {
|
fn replace_brs(source: &str, expected: &str) {
|
||||||
libxml::tree::node::set_node_rc_guard(10);
|
libxml::tree::node::set_node_rc_guard(10);
|
||||||
|
|
||||||
let parser = Parser::default_html();
|
let document = FullTextParser::parse_html_string_patched(source).unwrap();
|
||||||
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
|
|
||||||
let root = document.get_root_element().unwrap();
|
let root = document.get_root_element().unwrap();
|
||||||
let body = root.get_first_child().unwrap();
|
let body = root.get_first_child().unwrap();
|
||||||
let div = body.get_first_child().unwrap();
|
let div = body.get_first_child().unwrap();
|
||||||
|
@ -1346,8 +1344,7 @@ mod tests {
|
||||||
fn replace_emojis(source: &str, expected: &str) {
|
fn replace_emojis(source: &str, expected: &str) {
|
||||||
libxml::tree::node::set_node_rc_guard(10);
|
libxml::tree::node::set_node_rc_guard(10);
|
||||||
|
|
||||||
let parser = Parser::default_html();
|
let document = FullTextParser::parse_html_string_patched(source).unwrap();
|
||||||
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
|
|
||||||
let root = document.get_root_element().unwrap();
|
let root = document.get_root_element().unwrap();
|
||||||
let body = root.get_first_child().unwrap();
|
let body = root.get_first_child().unwrap();
|
||||||
let p = body.get_first_child().unwrap();
|
let p = body.get_first_child().unwrap();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue