mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
Update to the new serialization API of libxml
This commit is contained in:
parent
b489af74bd
commit
2137e84743
3 changed files with 29 additions and 5 deletions
|
@ -6,7 +6,7 @@ edition = "2018"
|
|||
|
||||
[dependencies]
|
||||
failure = "0.1"
|
||||
libxml = "0.2"
|
||||
libxml = { git = "https://github.com/KWARC/rust-libxml.git" }
|
||||
reqwest = "0.9"
|
||||
url = "1.7"
|
||||
regex = "1.3"
|
||||
|
|
|
@ -5,7 +5,10 @@ use log::{
|
|||
};
|
||||
use libxml::parser::Parser;
|
||||
use libxml::xpath::Context;
|
||||
use libxml::tree::Node;
|
||||
use libxml::tree::{
|
||||
Node,
|
||||
SaveOptions,
|
||||
};
|
||||
use url;
|
||||
use failure::ResultExt;
|
||||
use std::error::Error;
|
||||
|
@ -45,7 +48,17 @@ impl ImageDownloader {
|
|||
|
||||
self.download_images_from_context(&xpath_ctx)?;
|
||||
|
||||
Ok(doc.to_string(/*format:*/ false))
|
||||
let options = SaveOptions {
|
||||
format: false,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
xhtml: false,
|
||||
as_xml: false,
|
||||
as_html: true,
|
||||
non_significant_whitespace: false,
|
||||
};
|
||||
Ok(doc.to_string_with_options(options))
|
||||
}
|
||||
|
||||
pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {
|
||||
|
|
15
src/lib.rs
15
src/lib.rs
|
@ -19,7 +19,8 @@ use libxml::parser::Parser;
|
|||
use libxml::xpath::Context;
|
||||
use libxml::tree::{
|
||||
Document,
|
||||
Node
|
||||
Node,
|
||||
SaveOptions,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
use std::ops::Index;
|
||||
|
@ -128,7 +129,17 @@ impl ArticleScraper {
|
|||
}
|
||||
|
||||
// serialize content
|
||||
let html = document.to_string(/*format:*/ false);
|
||||
let options = SaveOptions {
|
||||
format: false,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
xhtml: false,
|
||||
as_xml: false,
|
||||
as_html: true,
|
||||
non_significant_whitespace: false,
|
||||
};
|
||||
let html = document.to_string_with_options(options);
|
||||
article.html = Some(html);
|
||||
|
||||
Ok(article)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue