diff --git a/article_scraper/src/article.rs b/article_scraper/src/article.rs index 3408e15..994265f 100644 --- a/article_scraper/src/article.rs +++ b/article_scraper/src/article.rs @@ -1,5 +1,5 @@ use chrono::{DateTime, Utc}; -use libxml::tree::{Document, SaveOptions}; +use libxml::tree::{Document, SaveOptions, Node}; use std::fs::File; use std::io::{Error, ErrorKind, Write}; use std::path::PathBuf; @@ -12,10 +12,11 @@ pub struct Article { pub date: Option>, pub thumbnail_url: Option, pub document: Option, + pub root_node: Option, } impl Article { - pub fn get_content(&self) -> Option { + pub fn get_doc_content(&self) -> Option { // serialize content let options = SaveOptions { format: true, @@ -32,6 +33,14 @@ impl Article { .map(|doc| doc.to_string_with_options(options)) } + pub fn get_content(&self) -> Option { + if let (Some(document), Some(root)) = (self.document.as_ref(), self.root_node.as_ref()) { + Some(document.node_to_string(root)) + } else { + None + } + } + pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { if let Some(ref html) = self.get_content() { if let Ok(()) = std::fs::create_dir_all(path) { diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 37f4bf2..0b8519c 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -1,7 +1,8 @@ -use libxml::tree::SaveOptions; +use libxml::tree::Node; use reqwest::Url; use crate::full_text_parser::error::FullTextParserError; +use crate::util::Util; use crate::{FtrConfigEntry, FullTextParser}; pub fn clean_html(html: &str, base_url: &Url) -> Result { @@ -12,34 +13,26 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result