From 2137e8474365b1346ac436422a4804e872e80d37 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 26 Sep 2019 21:28:05 +0200 Subject: [PATCH] update to new serialization api of libxml --- Cargo.toml | 2 +- src/images/mod.rs | 17 +++++++++++++++-- src/lib.rs | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 506acbc..f732ba8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ edition = "2018" [dependencies] failure = "0.1" -libxml = "0.2" +libxml = { git = "https://github.com/KWARC/rust-libxml.git" } reqwest = "0.9" url = "1.7" regex = "1.3" diff --git a/src/images/mod.rs b/src/images/mod.rs index 427054e..efe6e75 100644 --- a/src/images/mod.rs +++ b/src/images/mod.rs @@ -5,7 +5,10 @@ use log::{ }; use libxml::parser::Parser; use libxml::xpath::Context; -use libxml::tree::Node; +use libxml::tree::{ + Node, + SaveOptions, +}; use url; use failure::ResultExt; use std::error::Error; @@ -45,7 +48,17 @@ impl ImageDownloader { self.download_images_from_context(&xpath_ctx)?; - Ok(doc.to_string(/*format:*/ false)) + let options = SaveOptions { + format: false, + no_declaration: false, + no_empty_tags: true, + no_xhtml: false, + xhtml: false, + as_xml: false, + as_html: true, + non_significant_whitespace: false, + }; + Ok(doc.to_string_with_options(options)) } pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> { diff --git a/src/lib.rs b/src/lib.rs index 20597ef..963a69e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,8 @@ use libxml::parser::Parser; use libxml::xpath::Context; use libxml::tree::{ Document, - Node + Node, + SaveOptions, }; use std::path::PathBuf; use std::ops::Index; @@ -128,7 +129,17 @@ impl ArticleScraper { } // serialize content - let html = document.to_string(/*format:*/ false); + let options = SaveOptions { + format: false, + no_declaration: false, + no_empty_tags: true, + no_xhtml: false, + xhtml: false, + as_xml: false, + as_html: true, + non_significant_whitespace: false, + }; + let html = document.to_string_with_options(options); article.html = Some(html); Ok(article)