1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

Update to the new serialization API of libxml

This commit is contained in:
Jan Lukas Gernert 2019-09-26 21:28:05 +02:00
parent b489af74bd
commit 2137e84743
3 changed files with 29 additions and 5 deletions

View file

@@ -6,7 +6,7 @@ edition = "2018"
[dependencies] [dependencies]
failure = "0.1" failure = "0.1"
libxml = "0.2" libxml = { git = "https://github.com/KWARC/rust-libxml.git" }
reqwest = "0.9" reqwest = "0.9"
url = "1.7" url = "1.7"
regex = "1.3" regex = "1.3"

View file

@@ -5,7 +5,10 @@ use log::{
}; };
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::xpath::Context; use libxml::xpath::Context;
use libxml::tree::Node; use libxml::tree::{
Node,
SaveOptions,
};
use url; use url;
use failure::ResultExt; use failure::ResultExt;
use std::error::Error; use std::error::Error;
@@ -45,7 +48,17 @@ impl ImageDownloader {
self.download_images_from_context(&xpath_ctx)?; self.download_images_from_context(&xpath_ctx)?;
Ok(doc.to_string(/*format:*/ false)) let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
Ok(doc.to_string_with_options(options))
} }
pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> { pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {

View file

@@ -19,7 +19,8 @@ use libxml::parser::Parser;
use libxml::xpath::Context; use libxml::xpath::Context;
use libxml::tree::{ use libxml::tree::{
Document, Document,
Node Node,
SaveOptions,
}; };
use std::path::PathBuf; use std::path::PathBuf;
use std::ops::Index; use std::ops::Index;
@@ -128,7 +129,17 @@ impl ArticleScraper {
} }
// serialize content // serialize content
let html = document.to_string(/*format:*/ false); let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let html = document.to_string_with_options(options);
article.html = Some(html); article.html = Some(html);
Ok(article) Ok(article)