1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix positive/negative class weight regex

This commit is contained in:
Jan Lukas Gernert 2023-02-28 18:27:36 +01:00
parent aea57d0cf3
commit 58721efa35

View file

@ -26,12 +26,12 @@ pub static HAS_CONTENT: Lazy<Regex> =
/// Matches text that starts with `#` followed by at least one more character
/// (presumably a fragment-only URL such as `#section` — confirm against callers).
///
/// The pattern must not be wrapped in JavaScript-style `/…/` delimiters: the
/// `regex` crate treats `/` as a literal character, so `/^#.+/` would require a
/// slash *before* the start-of-text anchor and could never match anything.
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
/// Keywords that count in favour of an element when weighting its class
/// (NOTE(review): presumably also applied to the `id` attribute — confirm
/// against the caller).
///
/// Matching is case-insensitive via the inline `(?i)` flag. A JavaScript-style
/// trailing `/i` must not be used here: the `regex` crate reads it as literal
/// text, which would turn the last alternative into `story/i` and leave the
/// whole pattern case-sensitive.
pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"#,
    )
    .expect("POSITIVE regex")
});
pub static NEGATIVE: Lazy<Regex> = pub static NEGATIVE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"/-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex")); Lazy::new(|| Regex::new(r#"-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
pub static TITLE_SEPARATOR: Lazy<Regex> = pub static TITLE_SEPARATOR: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex")); Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex"));