mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
strip iframes but keep videos
This commit is contained in:
parent
cea23f1638
commit
7c9e527827
3 changed files with 80 additions and 51 deletions
|
@ -1,35 +1,45 @@
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use regex::Regex;
|
use regex::{Regex, RegexBuilder};
|
||||||
|
|
||||||
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||||
pub static IS_IMAGE: Lazy<Regex> =
|
pub static IS_IMAGE: Lazy<Regex> = Lazy::new(|| {
|
||||||
Lazy::new(|| Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).expect("IS_IMAGE regex"));
|
RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#)
|
||||||
|
.case_insensitive(true)
|
||||||
|
.build()
|
||||||
|
.expect("IS_IMAGE regex")
|
||||||
|
});
|
||||||
pub static SIBLING_CONTENT: Lazy<Regex> =
|
pub static SIBLING_CONTENT: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
|
Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
|
||||||
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
|
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
|
RegexBuilder::new(r#"byline|author|dateline|writtenby|p-author"#)
|
||||||
|
.case_insensitive(true)
|
||||||
|
.build()
|
||||||
|
.expect("BYLINE regex")
|
||||||
});
|
});
|
||||||
pub static NORMALIZE: Lazy<Regex> =
|
pub static NORMALIZE: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
|
Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
|
||||||
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
|
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
|
||||||
pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
|
pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
|
RegexBuilder::new(r#"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"#).case_insensitive(true).build().expect("UNLIELY_CANDIDATES regex")
|
||||||
});
|
});
|
||||||
pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
|
pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(r#"/and|article|body|column|content|main|shadow/i"#)
|
RegexBuilder::new(r#"and|article|body|column|content|main|shadow"#)
|
||||||
|
.case_insensitive(true)
|
||||||
|
.build()
|
||||||
.expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
|
.expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
|
||||||
});
|
});
|
||||||
pub static HAS_CONTENT: Lazy<Regex> =
|
pub static HAS_CONTENT: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
|
Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
|
||||||
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
|
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
|
||||||
pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
|
pub static POSITIVE: Lazy<Regex> =
|
||||||
Regex::new(
|
Lazy::new(|| {
|
||||||
r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i"#,
|
RegexBuilder::new(
|
||||||
)
|
r#"article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"#,
|
||||||
|
).case_insensitive(true).build()
|
||||||
.expect("POSITIVE regex")
|
.expect("POSITIVE regex")
|
||||||
});
|
});
|
||||||
pub static NEGATIVE: Lazy<Regex> =
|
pub static NEGATIVE: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r#"-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
|
Lazy::new(|| Regex::new(r#"-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
|
||||||
|
|
||||||
|
@ -38,10 +48,14 @@ pub static TITLE_SEPARATOR: Lazy<Regex> =
|
||||||
pub static TITLE_CUT_END: Lazy<Regex> =
|
pub static TITLE_CUT_END: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
|
Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
|
||||||
pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
|
pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
|
||||||
pub static TITLE_CUT_FRONT: Lazy<Regex> =
|
pub static TITLE_CUT_FRONT: Lazy<Regex> = Lazy::new(|| {
|
||||||
Lazy::new(|| Regex::new(r#"/[^-|\\/>»]*[-|\\/>»](.*)/gi"#).expect("TITLE_CUT_FRONT regex"));
|
RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#)
|
||||||
|
.case_insensitive(true)
|
||||||
|
.build()
|
||||||
|
.expect("TITLE_CUT_FRONT regex")
|
||||||
|
});
|
||||||
pub static VIDEOS: Lazy<Regex> = Lazy::new(|| {
|
pub static VIDEOS: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(r#"///(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i"#).expect("VIDEOS regex")
|
RegexBuilder::new(r#"(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)"#).case_insensitive(true).build().expect("VIDEOS regex")
|
||||||
});
|
});
|
||||||
pub const SCORE_ATTR: &str = "content_score";
|
pub const SCORE_ATTR: &str = "content_score";
|
||||||
pub const DATA_TABLE_ATTR: &str = "is_data_table";
|
pub const DATA_TABLE_ATTR: &str = "is_data_table";
|
||||||
|
@ -72,9 +86,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
||||||
])
|
])
|
||||||
});
|
});
|
||||||
|
|
||||||
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> =
|
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
|
||||||
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
||||||
|
|
||||||
|
pub static EMBED_TAG_NAMES: Lazy<HashSet<&str>> =
|
||||||
|
Lazy::new(|| HashSet::from(["OBJECT", "EMBED", "IFRAME"]));
|
||||||
|
|
||||||
pub const PHRASING_ELEMS: &[&str] = &[
|
pub const PHRASING_ELEMS: &[&str] = &[
|
||||||
// "CANVAS", "IFRAME", "SVG", "VIDEO",
|
// "CANVAS", "IFRAME", "SVG", "VIDEO",
|
||||||
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
|
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
|
||||||
|
|
|
@ -520,40 +520,40 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fix_urls(context: &Context, url: &Url) {
|
fn fix_urls(context: &Context, url: &Url) {
|
||||||
let _ = Self::repair_urls(context, "//img", "src", url);
|
_ = Self::repair_urls(context, "//img", "src", url);
|
||||||
let _ = Self::repair_urls(context, "//a", "src", url);
|
_ = Self::repair_urls(context, "//a", "src", url);
|
||||||
let _ = Self::repair_urls(context, "//a", "href", url);
|
_ = Self::repair_urls(context, "//a", "href", url);
|
||||||
let _ = Self::repair_urls(context, "//object", "data", url);
|
_ = Self::repair_urls(context, "//object", "data", url);
|
||||||
let _ = Self::repair_urls(context, "//iframe", "src", url);
|
_ = Self::repair_urls(context, "//iframe", "src", url);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) {
|
fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) {
|
||||||
// strip specified xpath
|
// strip specified xpath
|
||||||
if let Some(config) = config {
|
if let Some(config) = config {
|
||||||
for xpath_strip in &config.xpath_strip {
|
for xpath_strip in &config.xpath_strip {
|
||||||
let _ = Util::strip_node(context, xpath_strip);
|
_ = Util::strip_node(context, xpath_strip);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for xpath_strip in &global_config.xpath_strip {
|
for xpath_strip in &global_config.xpath_strip {
|
||||||
let _ = Util::strip_node(context, xpath_strip);
|
_ = Util::strip_node(context, xpath_strip);
|
||||||
}
|
}
|
||||||
|
|
||||||
// strip everything with specified 'id' or 'class'
|
// strip everything with specified 'id' or 'class'
|
||||||
if let Some(config) = config {
|
if let Some(config) = config {
|
||||||
for xpaht_strip_class in &config.strip_id_or_class {
|
for xpaht_strip_class in &config.strip_id_or_class {
|
||||||
let _ = Util::strip_id_or_class(context, xpaht_strip_class);
|
_ = Util::strip_id_or_class(context, xpaht_strip_class);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for xpaht_strip_class in &global_config.strip_id_or_class {
|
for xpaht_strip_class in &global_config.strip_id_or_class {
|
||||||
let _ = Util::strip_id_or_class(context, xpaht_strip_class);
|
_ = Util::strip_id_or_class(context, xpaht_strip_class);
|
||||||
}
|
}
|
||||||
|
|
||||||
// strip any <img> element where @src attribute contains this substring
|
// strip any <img> element where @src attribute contains this substring
|
||||||
if let Some(config) = config {
|
if let Some(config) = config {
|
||||||
for xpath_strip_img_src in &config.strip_image_src {
|
for xpath_strip_img_src in &config.strip_image_src {
|
||||||
let _ = Util::strip_node(
|
_ = Util::strip_node(
|
||||||
context,
|
context,
|
||||||
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
||||||
);
|
);
|
||||||
|
@ -561,59 +561,60 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
for xpath_strip_img_src in &global_config.strip_image_src {
|
for xpath_strip_img_src in &global_config.strip_image_src {
|
||||||
let _ = Util::strip_node(
|
_ = Util::strip_node(
|
||||||
context,
|
context,
|
||||||
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let _ = Self::fix_lazy_images(context, "lazyload", "data-src");
|
_ = Self::fix_lazy_images(context, "lazyload", "data-src");
|
||||||
let _ = Self::fix_iframe_size(context, "youtube.com");
|
_ = Self::fix_iframe_size(context, "youtube.com");
|
||||||
let _ = Self::remove_attribute(context, Some("a"), "onclick");
|
_ = Self::remove_attribute(context, Some("a"), "onclick");
|
||||||
let _ = Self::remove_attribute(context, Some("img"), "srcset");
|
_ = Self::remove_attribute(context, Some("img"), "srcset");
|
||||||
let _ = Self::remove_attribute(context, Some("img"), "sizes");
|
_ = Self::remove_attribute(context, Some("img"), "sizes");
|
||||||
let _ = Self::add_attribute(context, Some("a"), "target", "_blank");
|
_ = Self::add_attribute(context, Some("a"), "target", "_blank");
|
||||||
|
|
||||||
// strip elements using Readability.com and Instapaper.com ignore class names
|
// strip elements using Readability.com and Instapaper.com ignore class names
|
||||||
// .entry-unrelated and .instapaper_ignore
|
// .entry-unrelated and .instapaper_ignore
|
||||||
// See http://blog.instapaper.com/post/730281947
|
// See http://blog.instapaper.com/post/730281947
|
||||||
let _ = Util::strip_node(
|
_ = Util::strip_node(
|
||||||
context,
|
context,
|
||||||
"//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]",
|
"//*[contains(@class,' entry-unrelated ') or contains(@class,' instapaper_ignore ')]",
|
||||||
);
|
);
|
||||||
|
|
||||||
// strip elements that contain style="display: none;"
|
// strip elements that contain style="display: none;"
|
||||||
let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]");
|
_ = Util::strip_node(context, "//*[contains(@style,'display:none')]");
|
||||||
let _ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
|
_ = Util::strip_node(context, "//*[contains(@style,'display: none')]");
|
||||||
let _ = Self::remove_attribute(context, None, "style");
|
_ = Self::remove_attribute(context, None, "style");
|
||||||
|
|
||||||
// strip all form controls (input, textarea, select, button)
|
// strip all form controls (input, textarea, select, button)
|
||||||
let _ = Util::strip_node(context, "//input");
|
_ = Util::strip_node(context, "//input");
|
||||||
let _ = Util::strip_node(context, "//textarea");
|
_ = Util::strip_node(context, "//textarea");
|
||||||
let _ = Util::strip_node(context, "//select");
|
_ = Util::strip_node(context, "//select");
|
||||||
let _ = Util::strip_node(context, "//button");
|
_ = Util::strip_node(context, "//button");
|
||||||
|
|
||||||
// strip all comments
|
// strip all comments
|
||||||
let _ = Util::strip_node(context, "//comment()");
|
_ = Util::strip_node(context, "//comment()");
|
||||||
|
|
||||||
// strip all scripts
|
// strip all scripts
|
||||||
let _ = Util::strip_node(context, "//script");
|
_ = Util::strip_node(context, "//script");
|
||||||
|
|
||||||
// strip all styles
|
// strip all styles
|
||||||
let _ = Util::strip_node(context, "//style");
|
_ = Util::strip_node(context, "//style");
|
||||||
|
|
||||||
// strip all empty url-tags <a/>
|
// strip all empty url-tags <a/>
|
||||||
let _ = Util::strip_node(context, "//a[not(node())]");
|
_ = Util::strip_node(context, "//a[not(node())]");
|
||||||
|
|
||||||
// strip all external css and fonts
|
// strip all external css and fonts
|
||||||
let _ = Util::strip_node(context, "//*[@type='text/css']");
|
_ = Util::strip_node(context, "//*[@type='text/css']");
|
||||||
|
|
||||||
// other junk
|
// other junk
|
||||||
let _ = Util::strip_node(context, "//object");
|
_ = Util::strip_node(context, "//iframe");
|
||||||
let _ = Util::strip_node(context, "//embed");
|
_ = Util::strip_node(context, "//object");
|
||||||
let _ = Util::strip_node(context, "//footer");
|
_ = Util::strip_node(context, "//embed");
|
||||||
let _ = Util::strip_node(context, "//link");
|
_ = Util::strip_node(context, "//footer");
|
||||||
let _ = Util::strip_node(context, "//aside");
|
_ = Util::strip_node(context, "//link");
|
||||||
|
_ = Util::strip_node(context, "//aside");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -818,7 +819,7 @@ impl FullTextParser {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let _ = node.add_text_child(None, "empty", "");
|
_ = node.add_text_child(None, "empty", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
11
src/util.rs
11
src/util.rs
|
@ -199,7 +199,18 @@ impl Util {
|
||||||
|
|
||||||
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
||||||
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
||||||
|
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
|
let tag_name = node.get_name();
|
||||||
|
if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) {
|
||||||
|
if node
|
||||||
|
.get_attributes()
|
||||||
|
.iter()
|
||||||
|
.any(|(_name, value)| constants::VIDEOS.is_match(value))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
node.unlink();
|
node.unlink();
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue