From 7c9e5278279aaf02e31d8b5f7f44d10495dab32d Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 1 Mar 2023 01:37:37 +0100 Subject: [PATCH] strip iframes but keep vidoes --- src/constants.rs | 47 ++++++++++++++++-------- src/full_text_parser/mod.rs | 73 +++++++++++++++++++------------------ src/util.rs | 11 ++++++ 3 files changed, 80 insertions(+), 51 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index 0200c37..25c0a1c 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -1,35 +1,45 @@ use std::collections::HashSet; use once_cell::sync::Lazy; -use regex::Regex; +use regex::{Regex, RegexBuilder}; pub const DEFAULT_CHAR_THRESHOLD: usize = 500; -pub static IS_IMAGE: Lazy = - Lazy::new(|| Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).expect("IS_IMAGE regex")); +pub static IS_IMAGE: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#) + .case_insensitive(true) + .build() + .expect("IS_IMAGE regex") +}); pub static SIBLING_CONTENT: Lazy = Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex")); pub static BYLINE: Lazy = Lazy::new(|| { - Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex") + RegexBuilder::new(r#"byline|author|dateline|writtenby|p-author"#) + .case_insensitive(true) + .build() + .expect("BYLINE regex") }); pub static NORMALIZE: Lazy = Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")); pub static TOKENIZE: Lazy = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex")); pub static UNLIELY_CANDIDATES: Lazy = Lazy::new(|| { - Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex") + RegexBuilder::new(r#"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"#).case_insensitive(true).build().expect("UNLIELY_CANDIDATES regex") }); pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy = Lazy::new(|| { - Regex::new(r#"/and|article|body|column|content|main|shadow/i"#) + RegexBuilder::new(r#"and|article|body|column|content|main|shadow"#) + .case_insensitive(true) + .build() .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex") }); pub static HAS_CONTENT: Lazy = Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex")); pub static HASH_URL: Lazy = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex")); -pub static POSITIVE: Lazy = Lazy::new(|| { - Regex::new( - r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i"#, - ) +pub static POSITIVE: Lazy = + Lazy::new(|| { + RegexBuilder::new( + r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"#, + ).case_insensitive(true).build() .expect("POSITIVE regex") -}); + }); pub static NEGATIVE: Lazy = Lazy::new(|| Regex::new(r#"-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex")); @@ -38,10 +48,14 @@ pub static TITLE_SEPARATOR: Lazy = pub static TITLE_CUT_END: Lazy = Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex")); pub static WORD_COUNT: Lazy = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex")); -pub static TITLE_CUT_FRONT: Lazy = - Lazy::new(|| Regex::new(r#"/[^-|\\/>»]*[-|\\/>»](.*)/gi"#).expect("TITLE_CUT_FRONT regex")); +pub static TITLE_CUT_FRONT: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#) + .case_insensitive(true) + .build() + .expect("TITLE_CUT_FRONT regex") +}); pub static VIDEOS: Lazy = Lazy::new(|| { - Regex::new(r#"///(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i"#).expect("VIDEOS regex") + RegexBuilder::new(r#"(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)"#).case_insensitive(true).build().expect("VIDEOS regex") }); pub const SCORE_ATTR: &str = "content_score"; pub const DATA_TABLE_ATTR: &str = "is_data_table"; @@ -72,9 +86,12 @@ pub static DIV_TO_P_ELEMS: Lazy> = Lazy::new(|| { ]) }); -pub static ALTER_TO_DIV_EXCEPTIONS: Lazy> = +pub static ALTER_TO_DIV_EXCEPTIONS: Lazy> = Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"])); +pub static EMBED_TAG_NAMES: Lazy> = + Lazy::new(|| HashSet::from(["OBJECT", "EMBED", "IFRAME"])); + pub const PHRASING_ELEMS: &[&str] = &[ // "CANVAS", "IFRAME", "SVG", "VIDEO", "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM", diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 3f72179..f810e4f 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -520,40 +520,40 @@ impl FullTextParser { } fn fix_urls(context: &Context, url: &Url) { - let _ = Self::repair_urls(context, "//img", "src", url); - let _ = Self::repair_urls(context, "//a", "src", url); - let _ = Self::repair_urls(context, "//a", "href", url); - let _ = Self::repair_urls(context, "//object", "data", url); - let _ = Self::repair_urls(context, "//iframe", "src", url); + _ = Self::repair_urls(context, "//img", "src", url); + _ = Self::repair_urls(context, "//a", "src", url); + _ = Self::repair_urls(context, "//a", "href", url); + _ = Self::repair_urls(context, "//object", "data", url); + _ = Self::repair_urls(context, "//iframe", "src", url); } fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) { // strip specified xpath if let Some(config) = config { for xpath_strip in &config.xpath_strip { - let _ = Util::strip_node(context, xpath_strip); + _ = Util::strip_node(context, xpath_strip); } } for xpath_strip in &global_config.xpath_strip { - let _ = Util::strip_node(context, xpath_strip); + _ = Util::strip_node(context, xpath_strip); } // strip everything with specified 'id' or 'class' if let Some(config) = config { for xpaht_strip_class in &config.strip_id_or_class { - let _ = Util::strip_id_or_class(context, xpaht_strip_class); + _ = Util::strip_id_or_class(context, xpaht_strip_class); } } for xpaht_strip_class in &global_config.strip_id_or_class { - let _ = Util::strip_id_or_class(context, xpaht_strip_class); + _ = Util::strip_id_or_class(context, xpaht_strip_class); } // strip any element where @src attribute contains this substring if let Some(config) = config { for xpath_strip_img_src in &config.strip_image_src { - let _ = Util::strip_node( + _ = Util::strip_node( context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); @@ -561,59 +561,60 @@ impl FullTextParser { } for xpath_strip_img_src in &global_config.strip_image_src { - let _ = Util::strip_node( + _ = Util::strip_node( context, &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), ); } - let _ = Self::fix_lazy_images(context, "lazyload", "data-src"); - let _ = Self::fix_iframe_size(context, "youtube.com"); - let _ = Self::remove_attribute(context, Some("a"), "onclick"); - let _ = Self::remove_attribute(context, Some("img"), "srcset"); - let _ = Self::remove_attribute(context, Some("img"), "sizes"); - let _ = Self::add_attribute(context, Some("a"), "target", "_blank"); + _ = Self::fix_lazy_images(context, "lazyload", "data-src"); + _ = Self::fix_iframe_size(context, "youtube.com"); + _ = Self::remove_attribute(context, Some("a"), "onclick"); + _ = Self::remove_attribute(context, Some("img"), "srcset"); + _ = Self::remove_attribute(context, Some("img"), "sizes"); + _ = Self::add_attribute(context, Some("a"), "target", "_blank"); // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See http://blog.instapaper.com/post/730281947 - let _ = Util::strip_node( + _ = Util::strip_node( context, "//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]", ); // strip elements that contain style="display: none;" - let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); - let _ = Util::strip_node(context, "//*[contains(@style,'display: none')]"); - let _ = Self::remove_attribute(context, None, "style"); + _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); + _ = Util::strip_node(context, "//*[contains(@style,'display: none')]"); + _ = Self::remove_attribute(context, None, "style"); // strip all comments - let _ = Util::strip_node(context, "//input"); - let _ = Util::strip_node(context, "//textarea"); - let _ = Util::strip_node(context, "//select"); - let _ = Util::strip_node(context, "//button"); + _ = Util::strip_node(context, "//input"); + _ = Util::strip_node(context, "//textarea"); + _ = Util::strip_node(context, "//select"); + _ = Util::strip_node(context, "//button"); // strip all input elements - let _ = Util::strip_node(context, "//comment()"); + _ = Util::strip_node(context, "//comment()"); // strip all scripts - let _ = Util::strip_node(context, "//script"); + _ = Util::strip_node(context, "//script"); // strip all styles - let _ = Util::strip_node(context, "//style"); + _ = Util::strip_node(context, "//style"); // strip all empty url-tags - let _ = Util::strip_node(context, "//a[not(node())]"); + _ = Util::strip_node(context, "//a[not(node())]"); // strip all external css and fonts - let _ = Util::strip_node(context, "//*[@type='text/css']"); + _ = Util::strip_node(context, "//*[@type='text/css']"); // other junk - let _ = Util::strip_node(context, "//object"); - let _ = Util::strip_node(context, "//embed"); - let _ = Util::strip_node(context, "//footer"); - let _ = Util::strip_node(context, "//link"); - let _ = Util::strip_node(context, "//aside"); + _ = Util::strip_node(context, "//iframe"); + _ = Util::strip_node(context, "//object"); + _ = Util::strip_node(context, "//embed"); + _ = Util::strip_node(context, "//footer"); + _ = Util::strip_node(context, "//link"); + _ = Util::strip_node(context, "//aside"); } /** @@ -818,7 +819,7 @@ impl FullTextParser { continue; } - let _ = node.add_text_child(None, "empty", ""); + _ = node.add_text_child(None, "empty", ""); } Ok(()) diff --git a/src/util.rs b/src/util.rs index 63de5f4..cff6456 100644 --- a/src/util.rs +++ b/src/util.rs @@ -199,7 +199,18 @@ impl Util { let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let node_vec = Util::evaluate_xpath(context, query, false)?; + for mut node in node_vec { + let tag_name = node.get_name(); + if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) { + if node + .get_attributes() + .iter() + .any(|(_name, value)| constants::VIDEOS.is_match(value)) + { + continue; + } + } node.unlink(); } Ok(())