1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

Update to the new serialization API of libxml

This commit is contained in:
Jan Lukas Gernert 2019-09-26 21:28:05 +02:00
parent b489af74bd
commit 2137e84743
3 changed files with 29 additions and 5 deletions

View file

@ -6,7 +6,7 @@ edition = "2018"
[dependencies]
failure = "0.1"
libxml = "0.2"
libxml = { git = "https://github.com/KWARC/rust-libxml.git" }
reqwest = "0.9"
url = "1.7"
regex = "1.3"

View file

@ -5,7 +5,10 @@ use log::{
};
use libxml::parser::Parser;
use libxml::xpath::Context;
use libxml::tree::Node;
use libxml::tree::{
Node,
SaveOptions,
};
use url;
use failure::ResultExt;
use std::error::Error;
@ -45,7 +48,17 @@ impl ImageDownloader {
self.download_images_from_context(&xpath_ctx)?;
Ok(doc.to_string(/*format:*/ false))
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
Ok(doc.to_string_with_options(options))
}
pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {

View file

@ -19,7 +19,8 @@ use libxml::parser::Parser;
use libxml::xpath::Context;
use libxml::tree::{
Document,
Node
Node,
SaveOptions,
};
use std::path::PathBuf;
use std::ops::Index;
@ -128,7 +129,17 @@ impl ArticleScraper {
}
// serialize content
let html = document.to_string(/*format:*/ false);
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let html = document.to_string_with_options(options);
article.html = Some(html);
Ok(article)