From 7c9e5278279aaf02e31d8b5f7f44d10495dab32d Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert <jangernert@gmail.com>
Date: Wed, 1 Mar 2023 01:37:37 +0100
Subject: [PATCH] strip iframes but keep videos

---
 src/constants.rs            | 47 ++++++++++++++++--------
 src/full_text_parser/mod.rs | 73 +++++++++++++++++++------------------
 src/util.rs                 | 11 ++++++
 3 files changed, 80 insertions(+), 51 deletions(-)
diff --git a/src/constants.rs b/src/constants.rs
index 0200c37..25c0a1c 100644
--- a/src/constants.rs
+++ b/src/constants.rs
@@ -1,35 +1,45 @@
 use std::collections::HashSet;
 
 use once_cell::sync::Lazy;
-use regex::Regex;
+use regex::{Regex, RegexBuilder};
 
 pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
-pub static IS_IMAGE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).expect("IS_IMAGE regex"));
+pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
+    RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)
+        .case_insensitive(true)
+        .build()
+        .expect("IS_IMAGE regex")
+});
 pub static SIBLING_CONTENT: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
 pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
+    RegexBuilder::new(r#"byline|author|dateline|writtenby|p-author"#)
+        .case_insensitive(true)
+        .build()
+        .expect("BYLINE regex")
 });
 pub static NORMALIZE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
 pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
 pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
+    RegexBuilder::new(r#"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"#).case_insensitive(true).build().expect("UNLIELY_CANDIDATES regex")
 });
 pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/and|article|body|column|content|main|shadow/i"#)
+    RegexBuilder::new(r#"and|article|body|column|content|main|shadow"#)
+        .case_insensitive(true)
+        .build()
         .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
 });
 pub static HAS_CONTENT: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
 pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
-pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i"#,
-    )
+pub static POSITIVE: Lazy<Regex> =
+    Lazy::new(|| {
+        RegexBuilder::new(
+        r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"#,
+    ).case_insensitive(true).build()
     .expect("POSITIVE regex")
-});
+    });
 pub static NEGATIVE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
 
@@ -38,10 +48,14 @@ pub static TITLE_SEPARATOR: Lazy<Regex> =
 pub static TITLE_CUT_END: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
 pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
-pub static TITLE_CUT_FRONT: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/[^-|\\/>»]*[-|\\/>»](.*)/gi"#).expect("TITLE_CUT_FRONT regex"));
+pub static TITLE_CUT_FRONT: Lazy<Regex> = Lazy::new(|| {
+    RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#)
+        .case_insensitive(true)
+        .build()
+        .expect("TITLE_CUT_FRONT regex")
+});
 pub static VIDEOS: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"///(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i"#).expect("VIDEOS regex")
+    RegexBuilder::new(r#"(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)"#).case_insensitive(true).build().expect("VIDEOS regex")
 });
 pub const SCORE_ATTR: &str = "content_score";
 pub const DATA_TABLE_ATTR: &str = "is_data_table";
@@ -72,9 +86,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     ])
 });
 
-pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> =
+pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
     Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
 
+pub static EMBED_TAG_NAMES: Lazy<HashSet<&str>> =
+    Lazy::new(|| HashSet::from(["OBJECT", "EMBED", "IFRAME"]));
+
 pub const PHRASING_ELEMS: &[&str] = &[
     // "CANVAS", "IFRAME", "SVG", "VIDEO",
     "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs
index 3f72179..f810e4f 100644
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@@ -520,40 +520,40 @@ impl FullTextParser {
     }
 
     fn fix_urls(context: &Context, url: &Url) {
-        let _ = Self::repair_urls(context, "//img", "src", url);
-        let _ = Self::repair_urls(context, "//a", "src", url);
-        let _ = Self::repair_urls(context, "//a", "href", url);
-        let _ = Self::repair_urls(context, "//object", "data", url);
-        let _ = Self::repair_urls(context, "//iframe", "src", url);
+        _ = Self::repair_urls(context, "//img", "src", url);
+        _ = Self::repair_urls(context, "//a", "src", url);
+        _ = Self::repair_urls(context, "//a", "href", url);
+        _ = Self::repair_urls(context, "//object", "data", url);
+        _ = Self::repair_urls(context, "//iframe", "src", url);
     }
 
     fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) {
         // strip specified xpath
         if let Some(config) = config {
             for xpath_strip in &config.xpath_strip {
-                let _ = Util::strip_node(context, xpath_strip);
+                _ = Util::strip_node(context, xpath_strip);
             }
         }
 
         for xpath_strip in &global_config.xpath_strip {
-            let _ = Util::strip_node(context, xpath_strip);
+            _ = Util::strip_node(context, xpath_strip);
         }
 
         // strip everything with specified 'id' or 'class'
         if let Some(config) = config {
             for xpaht_strip_class in &config.strip_id_or_class {
-                let _ = Util::strip_id_or_class(context, xpaht_strip_class);
+                _ = Util::strip_id_or_class(context, xpaht_strip_class);
             }
         }
 
         for xpaht_strip_class in &global_config.strip_id_or_class {
-            let _ = Util::strip_id_or_class(context, xpaht_strip_class);
+            _ = Util::strip_id_or_class(context, xpaht_strip_class);
         }
 
         // strip any <img> element where @src attribute contains this substring
         if let Some(config) = config {
             for xpath_strip_img_src in &config.strip_image_src {
-                let _ = Util::strip_node(
+                _ = Util::strip_node(
                     context,
                     &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
                 );
@@ -561,59 +561,60 @@ impl FullTextParser {
         }
 
         for xpath_strip_img_src in &global_config.strip_image_src {
-            let _ = Util::strip_node(
+            _ = Util::strip_node(
                 context,
                 &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
 
-        let _ = Self::fix_lazy_images(context, "lazyload", "data-src");
-        let _ = Self::fix_iframe_size(context, "youtube.com");
-        let _ = Self::remove_attribute(context, Some("a"), "onclick");
-        let _ = Self::remove_attribute(context, Some("img"), "srcset");
-        let _ = Self::remove_attribute(context, Some("img"), "sizes");
-        let _ = Self::add_attribute(context, Some("a"), "target", "_blank");
+        _ = Self::fix_lazy_images(context, "lazyload", "data-src");
+        _ = Self::fix_iframe_size(context, "youtube.com");
+        _ = Self::remove_attribute(context, Some("a"), "onclick");
+        _ = Self::remove_attribute(context, Some("img"), "srcset");
+        _ = Self::remove_attribute(context, Some("img"), "sizes");
+        _ = Self::add_attribute(context, Some("a"), "target", "_blank");
 
         // strip elements using Readability.com and Instapaper.com ignore class names
         // .entry-unrelated and .instapaper_ignore
         // See http://blog.instapaper.com/post/730281947
-        let _ = Util::strip_node(
+        _ = Util::strip_node(
             context,
             "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]",
         );
 
         // strip elements that contain style="display: none;"
-        let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]");
-        let _ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
-        let _ = Self::remove_attribute(context, None, "style");
+        _ = Util::strip_node(context, "//*[contains(@style,'display:none')]");
+        _ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
+        _ = Self::remove_attribute(context, None, "style");
 
         // strip all comments
-        let _ = Util::strip_node(context, "//input");
-        let _ = Util::strip_node(context, "//textarea");
-        let _ = Util::strip_node(context, "//select");
-        let _ = Util::strip_node(context, "//button");
+        _ = Util::strip_node(context, "//input");
+        _ = Util::strip_node(context, "//textarea");
+        _ = Util::strip_node(context, "//select");
+        _ = Util::strip_node(context, "//button");
 
         // strip all input elements
-        let _ = Util::strip_node(context, "//comment()");
+        _ = Util::strip_node(context, "//comment()");
 
         // strip all scripts
-        let _ = Util::strip_node(context, "//script");
+        _ = Util::strip_node(context, "//script");
 
         // strip all styles
-        let _ = Util::strip_node(context, "//style");
+        _ = Util::strip_node(context, "//style");
 
         // strip all empty url-tags <a/>
-        let _ = Util::strip_node(context, "//a[not(node())]");
+        _ = Util::strip_node(context, "//a[not(node())]");
 
         // strip all external css and fonts
-        let _ = Util::strip_node(context, "//*[@type='text/css']");
+        _ = Util::strip_node(context, "//*[@type='text/css']");
 
         // other junk
-        let _ = Util::strip_node(context, "//object");
-        let _ = Util::strip_node(context, "//embed");
-        let _ = Util::strip_node(context, "//footer");
-        let _ = Util::strip_node(context, "//link");
-        let _ = Util::strip_node(context, "//aside");
+        _ = Util::strip_node(context, "//iframe");
+        _ = Util::strip_node(context, "//object");
+        _ = Util::strip_node(context, "//embed");
+        _ = Util::strip_node(context, "//footer");
+        _ = Util::strip_node(context, "//link");
+        _ = Util::strip_node(context, "//aside");
     }
 
     /**
@@ -818,7 +819,7 @@ impl FullTextParser {
                 continue;
             }
 
-            let _ = node.add_text_child(None, "empty", "");
+            _ = node.add_text_child(None, "empty", "");
         }
 
         Ok(())
diff --git a/src/util.rs b/src/util.rs
index 63de5f4..cff6456 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -199,7 +199,18 @@ impl Util {
 
         let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
         let node_vec = Util::evaluate_xpath(context, query, false)?;
+
         for mut node in node_vec {
+            let tag_name = node.get_name();
+            if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) {
+                if node
+                    .get_attributes()
+                    .iter()
+                    .any(|(_name, value)| constants::VIDEOS.is_match(value))
+                {
+                    continue;
+                }
+            }
             node.unlink();
         }
         Ok(())