Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-07 16:15:32 +02:00
fmt

This commit is contained in:
parent ded7cf5adb
commit 92b4427a9f

2 changed files with 13 additions and 6 deletions
@@ -63,11 +63,12 @@ pub static NEGATIVE: Lazy<Regex> = Lazy::new(|| {
 pub static TITLE_SEPARATOR: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#" [-|—\\/>»] "#).expect("TITLE_SEPARATOR regex"));
-pub static TITLE_CUT_END: Lazy<Regex> = Lazy::new(||
+pub static TITLE_CUT_END: Lazy<Regex> = Lazy::new(|| {
     RegexBuilder::new(r#"(.*)[-|—\\/>»] .*"#)
         .case_insensitive(true)
         .build()
-        .expect("TITLE_CUT_END regex"));
+        .expect("TITLE_CUT_END regex")
+});
 pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
 pub static TITLE_CUT_FRONT: Lazy<Regex> = Lazy::new(|| {
     RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#)
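Not part of the commit, but for context: a minimal, self-contained sketch of how the TITLE_CUT_END regex touched by this formatting change might be applied to strip a trailing site name from a scraped title. The static definition is copied from the hunk above; the call site, the example title, and the main function are assumptions for illustration, not code from the repository.

use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder};

// Copied from the hunk above (new formatting).
pub static TITLE_CUT_END: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r#"(.*)[-|—\\/>»] .*"#)
        .case_insensitive(true)
        .build()
        .expect("TITLE_CUT_END regex")
});

fn main() {
    // Hypothetical scraped title; the repository's actual call site is not shown in this diff.
    let raw = "Rust 1.70 released - Example News";
    // Capture group 1 holds everything before the last separator ("- ", "| ", "» ", ...).
    let cleaned = TITLE_CUT_END
        .captures(raw)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().trim())
        .unwrap_or(raw);
    assert_eq!(cleaned, "Rust 1.70 released");
}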
src/util.rs (10 changed lines)
@@ -317,8 +317,14 @@ impl Util {
     pub fn text_similarity(a: &str, b: &str) -> f64 {
         let a = a.to_lowercase();
         let b = b.to_lowercase();
-        let tokens_a = constants::TOKENIZE.split(&a).filter(|token| !token.is_empty()).collect::<Vec<_>>();
-        let tokens_b = constants::TOKENIZE.split(&b).filter(|token| !token.is_empty()).collect::<Vec<_>>();
+        let tokens_a = constants::TOKENIZE
+            .split(&a)
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE
+            .split(&b)
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>();
         if tokens_a.is_empty() || tokens_b.is_empty() {
             return 0.0;
         }
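Also not part of the commit: a small sketch showing that the reformatted multi-line iterator chain in text_similarity() is the same expression as the removed one-liner, just split across lines for rustfmt. The definition of constants::TOKENIZE is not visible in this diff, so a plain non-word-character regex stands in for it here; the input string is made up.

use regex::Regex;

fn main() {
    // Stand-in for constants::TOKENIZE, whose actual pattern is not shown in this diff.
    let tokenize = Regex::new(r"\W+").expect("stand-in TOKENIZE regex");
    let a = "Some Article Title".to_lowercase();

    // Old formatting: the whole chain on one line ...
    let one_liner = tokenize.split(&a).filter(|token| !token.is_empty()).collect::<Vec<_>>();

    // ... and the new multi-line form produce exactly the same tokens.
    let multi_line = tokenize
        .split(&a)
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>();

    assert_eq!(one_liner, multi_line);
    assert_eq!(multi_line, vec!["some", "article", "title"]);
}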