From 9e995122c4cd233e9ed5d88d909b9f976ef021b7 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 19 Dec 2019 17:36:48 +0100 Subject: [PATCH] only strip topmost nodes in tree branches --- src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index a331c25..ce3f991 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -392,7 +392,14 @@ impl ArticleScraper { fn strip_id_or_class(context: &Context, id_or_class: &String) -> Result<(), ScraperError> { let xpath = &format!("//*[contains(@class, '{}') or contains(@id, '{}')]", id_or_class, id_or_class); - let node_vec = Self::evaluate_xpath(context, xpath, false)?; + + let mut ancestor = xpath.clone(); + if ancestor.starts_with("//") { + ancestor = ancestor.chars().skip(2).collect(); + } + + let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); + let node_vec = Self::evaluate_xpath(context, query, false)?; for mut node in node_vec { node.unlink(); }