1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 08:05:31 +02:00
This commit is contained in:
Jan Lukas Gernert 2024-07-06 23:38:43 +02:00
parent f4e4e64b9e
commit c16e11fdda
6 changed files with 16 additions and 23 deletions

View file

@ -264,8 +264,7 @@ impl FullTextParser {
}
// parse html
let parser = Parser::default_html();
Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| {
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserError::Xml
})
@ -278,7 +277,7 @@ impl FullTextParser {
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
/// These two functions should be removed once the issue is fixed in the libxml crate.
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) {
if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
// Cannot safely use our value comparison, but the conversion is always safe.
// Or, if the value can be safely represented as a 32-bit signed integer.
Ok(value as i32)
@ -290,8 +289,12 @@ impl FullTextParser {
pub(crate) fn parse_html_string_patched(
input: &str,
parser: &Parser,
) -> Result<Document, libxml::parser::XmlParseError> {
unsafe {
// https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety
libxml::bindings::xmlInitParser();
}
let parser = Parser::default_html();
let input_bytes: &[u8] = input.as_ref();
let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char;
let input_len = Self::try_usize_to_i32(input_bytes.len())?;
@ -488,7 +491,7 @@ impl FullTextParser {
}
pub fn thumbnail_from_html(html: &str) -> Option<String> {
if let Ok(doc) = Parser::default_html().parse_string(html) {
if let Ok(doc) = Self::parse_html_string_patched(html) {
if let Ok(ctx) = Self::get_xpath_ctx(&doc) {
return Self::check_for_thumbnail(&ctx);
}

View file

@ -1,5 +1,5 @@
use super::{config::ConfigEntry, FullTextParser};
use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
use libxml::{tree::SaveOptions, xpath::Context};
use reqwest::{Client, Url};
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
@ -194,7 +194,7 @@ herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noope
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
"#;
let doc = Parser::default_html().parse_string(html).unwrap();
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
let ctx = Context::new(&doc).unwrap();
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();
@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
</section></article>
"#;
let doc = Parser::default_html().parse_string(html).unwrap();
let doc = FullTextParser::parse_html_string_patched(html).unwrap();
let ctx = Context::new(&doc).unwrap();
let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap();

View file

@ -2,7 +2,6 @@
pub struct ImageData {
pub url: String,
pub data: Vec<u8>,
pub content_length: usize,
pub content_type: String,
}

View file

@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError;
use self::image_data::ImageDataBase64;
use self::pair::Pair;
use self::request::ImageRequest;
use crate::constants;
use crate::util::Util;
use crate::{constants, FullTextParser};
use base64::Engine;
use futures::StreamExt;
use image::ImageFormat;
use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions};
use libxml::xpath::Context;
pub use progress::Progress;
@ -162,9 +161,7 @@ impl ImageDownloader {
html: &str,
downloaded_images: Vec<Pair<ImageDataBase64>>,
) -> Result<String, ImageDownloadError> {
let parser = Parser::default_html();
let doc = parser
.parse_string(html)
let doc = FullTextParser::parse_html_string_patched(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let xpath_ctx = Context::new(&doc).map_err(|()| {
@ -207,9 +204,7 @@ impl ImageDownloader {
}
fn harvest_image_urls_from_html(html: &str) -> Result<Vec<Pair<String>>, ImageDownloadError> {
let parser = Parser::default_html();
let doc = parser
.parse_string(html)
let doc = FullTextParser::parse_html_string_patched(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let xpath_ctx = Context::new(&doc).map_err(|()| {

View file

@ -48,7 +48,6 @@ impl ImageRequest {
Ok(ImageData {
url: self.url,
data: result,
content_length: self.content_length,
content_type: self.content_type,
})
}

View file

@ -1299,13 +1299,11 @@ impl Util {
mod tests {
use super::Util;
use crate::FullTextParser;
use libxml::parser::Parser;
fn replace_brs(source: &str, expected: &str) {
libxml::tree::node::set_node_rc_guard(10);
let parser = Parser::default_html();
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
let document = FullTextParser::parse_html_string_patched(source).unwrap();
let root = document.get_root_element().unwrap();
let body = root.get_first_child().unwrap();
let div = body.get_first_child().unwrap();
@ -1346,8 +1344,7 @@ mod tests {
fn replace_emojis(source: &str, expected: &str) {
libxml::tree::node::set_node_rc_guard(10);
let parser = Parser::default_html();
let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap();
let document = FullTextParser::parse_html_string_patched(source).unwrap();
let root = document.get_root_element().unwrap();
let body = root.get_first_child().unwrap();
let p = body.get_first_child().unwrap();