From b4b5d802c9469fabaf469f059f1f6f6480a949bc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 21 Apr 2023 08:46:10 +0200 Subject: [PATCH] only serialize root node --- article_scraper/src/article.rs | 13 +++++-- article_scraper/src/clean.rs | 34 +++++++------------ article_scraper/src/full_text_parser/mod.rs | 28 ++++++++++----- .../src/full_text_parser/readability/mod.rs | 2 ++ .../src/full_text_parser/readability/tests.rs | 2 ++ 5 files changed, 47 insertions(+), 32 deletions(-) diff --git a/article_scraper/src/article.rs b/article_scraper/src/article.rs index 3408e15..994265f 100644 --- a/article_scraper/src/article.rs +++ b/article_scraper/src/article.rs @@ -1,5 +1,5 @@ use chrono::{DateTime, Utc}; -use libxml::tree::{Document, SaveOptions}; +use libxml::tree::{Document, SaveOptions, Node}; use std::fs::File; use std::io::{Error, ErrorKind, Write}; use std::path::PathBuf; @@ -12,10 +12,11 @@ pub struct Article { pub date: Option>, pub thumbnail_url: Option, pub document: Option, + pub root_node: Option, } impl Article { - pub fn get_content(&self) -> Option { + pub fn get_doc_content(&self) -> Option { // serialize content let options = SaveOptions { format: true, @@ -32,6 +33,14 @@ impl Article { .map(|doc| doc.to_string_with_options(options)) } + pub fn get_content(&self) -> Option { + if let (Some(document), Some(root)) = (self.document.as_ref(), self.root_node.as_ref()) { + Some(document.node_to_string(root)) + } else { + None + } + } + pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { if let Some(ref html) = self.get_content() { if let Ok(()) = std::fs::create_dir_all(path) { diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 37f4bf2..0b8519c 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -1,7 +1,8 @@ -use libxml::tree::SaveOptions; +use libxml::tree::Node; use reqwest::Url; use crate::full_text_parser::error::FullTextParserError; +use crate::util::Util; use crate::{FtrConfigEntry, FullTextParser}; pub fn clean_html(html: &str, base_url: &Url) -> Result { @@ -12,34 +13,26 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result