mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 00:19:59 +02:00
update to new serialization api of libxml
This commit is contained in:
parent
b489af74bd
commit
2137e84743
3 changed files with 29 additions and 5 deletions
|
@ -6,7 +6,7 @@ edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
failure = "0.1"
|
failure = "0.1"
|
||||||
libxml = "0.2"
|
libxml = { git = "https://github.com/KWARC/rust-libxml.git" }
|
||||||
reqwest = "0.9"
|
reqwest = "0.9"
|
||||||
url = "1.7"
|
url = "1.7"
|
||||||
regex = "1.3"
|
regex = "1.3"
|
||||||
|
|
|
@ -5,7 +5,10 @@ use log::{
|
||||||
};
|
};
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use libxml::tree::Node;
|
use libxml::tree::{
|
||||||
|
Node,
|
||||||
|
SaveOptions,
|
||||||
|
};
|
||||||
use url;
|
use url;
|
||||||
use failure::ResultExt;
|
use failure::ResultExt;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
@ -45,7 +48,17 @@ impl ImageDownloader {
|
||||||
|
|
||||||
self.download_images_from_context(&xpath_ctx)?;
|
self.download_images_from_context(&xpath_ctx)?;
|
||||||
|
|
||||||
Ok(doc.to_string(/*format:*/ false))
|
let options = SaveOptions {
|
||||||
|
format: false,
|
||||||
|
no_declaration: false,
|
||||||
|
no_empty_tags: true,
|
||||||
|
no_xhtml: false,
|
||||||
|
xhtml: false,
|
||||||
|
as_xml: false,
|
||||||
|
as_html: true,
|
||||||
|
non_significant_whitespace: false,
|
||||||
|
};
|
||||||
|
Ok(doc.to_string_with_options(options))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {
|
pub fn download_images_from_context(&self, context: &Context) -> Result<(), ImageDownloadError> {
|
||||||
|
|
15
src/lib.rs
15
src/lib.rs
|
@ -19,7 +19,8 @@ use libxml::parser::Parser;
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use libxml::tree::{
|
use libxml::tree::{
|
||||||
Document,
|
Document,
|
||||||
Node
|
Node,
|
||||||
|
SaveOptions,
|
||||||
};
|
};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::ops::Index;
|
use std::ops::Index;
|
||||||
|
@ -128,7 +129,17 @@ impl ArticleScraper {
|
||||||
}
|
}
|
||||||
|
|
||||||
// serialize content
|
// serialize content
|
||||||
let html = document.to_string(/*format:*/ false);
|
let options = SaveOptions {
|
||||||
|
format: false,
|
||||||
|
no_declaration: false,
|
||||||
|
no_empty_tags: true,
|
||||||
|
no_xhtml: false,
|
||||||
|
xhtml: false,
|
||||||
|
as_xml: false,
|
||||||
|
as_html: true,
|
||||||
|
non_significant_whitespace: false,
|
||||||
|
};
|
||||||
|
let html = document.to_string_with_options(options);
|
||||||
article.html = Some(html);
|
article.html = Some(html);
|
||||||
|
|
||||||
Ok(article)
|
Ok(article)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue