From 7ae98904d42af025578a267162367ce29fc7b3d7 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 23 Feb 2023 01:53:42 +0100 Subject: [PATCH] unwrap noscript images --- src/article.rs | 2 +- src/full_text_parser/mod.rs | 170 ++++++++++++ src/full_text_parser/readability/mod.rs | 324 +++------------------- src/full_text_parser/readability/tests.rs | 12 +- src/full_text_parser/tests.rs | 49 +++- src/util.rs | 264 +++++++++++++++++- 6 files changed, 537 insertions(+), 284 deletions(-) diff --git a/src/article.rs b/src/article.rs index 9c8694c..d171947 100644 --- a/src/article.rs +++ b/src/article.rs @@ -18,7 +18,7 @@ impl Article { pub fn get_content(&self) -> Option { // serialize content let options = SaveOptions { - format: false, + format: true, no_declaration: false, no_empty_tags: true, no_xhtml: false, diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 3f69442..1fcf6e6 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -19,6 +19,7 @@ use libxml::parser::Parser; use libxml::tree::{Document, Node}; use libxml::xpath::Context; use log::{debug, error, info, warn}; +use regex::Regex; use reqwest::header::HeaderMap; use reqwest::Client; use std::path::Path; @@ -124,6 +125,10 @@ impl FullTextParser { return Err(error); } + if let Some(mut root) = document.get_root_element() { + Self::post_process_content(&mut root)?; + } + article.document = Some(document); Ok(article) @@ -179,6 +184,7 @@ impl FullTextParser { Self::check_for_thumbnail(&xpath_ctx, article); } Self::strip_junk(&xpath_ctx, config, global_config, url); + Self::unwrap_noscript_images(&xpath_ctx)?; let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; if !found_body { @@ -195,6 +201,7 @@ impl FullTextParser { document = Self::parse_html(&html, config, global_config)?; xpath_ctx = Self::get_xpath_ctx(&document)?; Self::strip_junk(&xpath_ctx, config, global_config, &url); + Self::unwrap_noscript_images(&xpath_ctx)?; Self::extract_body(&xpath_ctx, root, config, global_config)?; } @@ -609,6 +616,12 @@ impl FullTextParser { // strip all comments let _ = Util::strip_node(context, "//comment()"); + // strip all scripts + let _ = Util::strip_node(context, "//script"); + + // strip all styles + let _ = Util::strip_node(context, "//style"); + // strip all empty url-tags let _ = Util::strip_node(context, "//a[not(node())]"); @@ -616,6 +629,91 @@ impl FullTextParser { let _ = Util::strip_node(context, "//*[@type='text/css']"); } + /** + * Find all