Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-08 08:30:00 +02:00
more tests & title fixes
This commit is contained in: parent a649b93c03, commit ded7cf5adb
17 changed files with 8373 additions and 10 deletions
@@ -62,9 +62,12 @@ pub static NEGATIVE: Lazy<Regex> = Lazy::new(|| {
 });
 
 pub static TITLE_SEPARATOR: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex"));
-pub static TITLE_CUT_END: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
+    Lazy::new(|| Regex::new(r#" [-|—\\/>»] "#).expect("TITLE_SEPARATOR regex"));
+pub static TITLE_CUT_END: Lazy<Regex> = Lazy::new(||
+    RegexBuilder::new(r#"(.*)[-|—\\/>»] .*"#)
+        .case_insensitive(true)
+        .build()
+        .expect("TITLE_CUT_END regex"));
 pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
 pub static TITLE_CUT_FRONT: Lazy<Regex> = Lazy::new(|| {
     RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#)
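Note on the hunk above: TITLE_SEPARATOR now requires the separator character to stand alone between spaces and adds the em dash character to the class, and TITLE_CUT_END is built through RegexBuilder with case_insensitive(true). A minimal sketch of the behavioral difference, assuming only the regex crate (this snippet is illustration, not part of the commit):

    use regex::Regex;

    fn main() {
        // Old pattern: any separator character, anywhere in the string.
        let old = Regex::new(r"[-|\\/>»]").expect("old separator regex");
        // New pattern: the separator must stand alone between spaces; em dash added.
        let new = Regex::new(r" [-|—\\/>»] ").expect("new separator regex");

        let title = "Quick-start guide — Example Site";

        assert!(old.is_match(title)); // also matches the in-word hyphen in "Quick-start"
        assert!(new.is_match(title)); // matches only the stand-alone " — "
        assert!(!new.is_match("Quick-start guide")); // no stand-alone separator, no match

        println!("separator checks passed");
    }

Requiring surrounding spaces keeps hyphenated words from being mistaken for a site-name separator when the title is split.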
@@ -683,7 +683,7 @@ impl Readability {
             let heading = Util::get_inner_text(node, false);
 
             if let Some(title) = title {
-                Util::text_similarity(&heading, title) > 0.75
+                Util::text_similarity(title, &heading) > 0.75
             } else {
                 false
             }
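The one-line change above swaps the arguments of Util::text_similarity, so the page title becomes the reference text and each candidate heading is measured against it. A similarity of this kind need not be symmetric, which is why the order matters. The following is a hedged sketch of one asymmetric token-overlap measure (an assumption for illustration, not the crate's actual implementation):

    // Asymmetric similarity: the share of `b` that is covered by tokens of `a`.
    fn text_similarity(a: &str, b: &str) -> f64 {
        let a = a.to_lowercase();
        let b = b.to_lowercase();
        let tokens_a: Vec<&str> = a.split_whitespace().collect();
        let tokens_b: Vec<&str> = b.split_whitespace().collect();
        if tokens_a.is_empty() || tokens_b.is_empty() {
            return 0.0;
        }
        // Tokens of `b` that never appear in `a`.
        let unmatched: Vec<&str> = tokens_b
            .iter()
            .copied()
            .filter(|token| !tokens_a.contains(token))
            .collect();
        // Distance = share of `b` (by length) made up of unmatched tokens.
        let distance = unmatched.join(" ").len() as f64 / b.len() as f64;
        1.0 - distance
    }

    fn main() {
        let title = "Example Site | A long descriptive article title";
        let heading = "descriptive article title";
        // Every heading token occurs in the title, so this direction scores high...
        println!("{:.2}", text_similarity(title, heading));
        // ...while the reverse direction scores noticeably lower.
        println!("{:.2}", text_similarity(heading, title));
    }

With the title as the first argument, a heading that is fully contained in the title scores near 1.0 even when the title carries extra site-name tokens.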
@@ -307,6 +307,36 @@ async fn lifehacker_working() {
     run_test("lifehacker-working").await
 }
 
+#[tokio::test]
+async fn links_in_tables() {
+    run_test("links-in-tables").await
+}
+
+#[tokio::test]
+async fn lwn_1() {
+    run_test("lwn-1").await
+}
+
+// #[tokio::test]
+// async fn medicalnewstoday() {
+//     run_test("medicalnewstoday").await
+// }
+
+#[tokio::test]
+async fn medium_1() {
+    run_test("medium-1").await
+}
+
+#[tokio::test]
+async fn medium_2() {
+    run_test("medium-2").await
+}
+
+#[tokio::test]
+async fn medium_3() {
+    run_test("medium-3").await
+}
+
 #[tokio::test]
 async fn webmd_1() {
     run_test("webmd-1").await
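All of the new tests above delegate to the existing run_test helper with a fixture name. As a rough illustration only (the helper name, fixture layout, and file names below are assumptions, not taken from the repository), such a harness typically reads a stored input document and compares the extraction output against an expected snapshot:

    use std::path::PathBuf;

    // Hypothetical fixture-driven helper; directory layout and file names are assumed.
    async fn run_fixture<F>(name: &str, extract: F)
    where
        F: Fn(&str) -> String,
    {
        let dir = PathBuf::from("tests/fixtures").join(name);
        let source = tokio::fs::read_to_string(dir.join("source.html"))
            .await
            .expect("read source.html");
        let expected = tokio::fs::read_to_string(dir.join("expected.html"))
            .await
            .expect("read expected.html");
        // Run the extractor over the raw HTML and compare with the stored snapshot.
        let actual = extract(&source);
        assert_eq!(actual.trim(), expected.trim(), "fixture `{name}` diverged");
    }

    #[tokio::test]
    async fn links_in_tables_sketch() {
        // Identity "extractor" keeps the sketch self-contained; the real tests
        // would run the article scraper here instead.
        run_fixture("links-in-tables", |html| html.to_string()).await;
    }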
@@ -317,8 +317,8 @@ impl Util {
     pub fn text_similarity(a: &str, b: &str) -> f64 {
         let a = a.to_lowercase();
         let b = b.to_lowercase();
-        let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
-        let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
+        let tokens_a = constants::TOKENIZE.split(&a).filter(|token| !token.is_empty()).collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE.split(&b).filter(|token| !token.is_empty()).collect::<Vec<_>>();
         if tokens_a.is_empty() || tokens_b.is_empty() {
             return 0.0;
         }
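The change above filters out empty tokens before the similarity is computed. A minimal sketch of why that matters, assuming TOKENIZE is a non-word-character pattern such as \W+ (the actual pattern is defined elsewhere and not shown in this diff): splitting a string that begins or ends with punctuation produces empty fragments that would otherwise pad the token lists.

    use regex::Regex;

    fn main() {
        // Stand-in for constants::TOKENIZE; the real pattern is an assumption here.
        let tokenize = Regex::new(r"\W+").expect("tokenize regex");

        let raw: Vec<&str> = tokenize.split("— breaking news —").collect();
        let filtered: Vec<&str> = tokenize
            .split("— breaking news —")
            .filter(|token| !token.is_empty())
            .collect();

        assert_eq!(raw, vec!["", "breaking", "news", ""]);
        assert_eq!(filtered, vec!["breaking", "news"]);
        println!("empty-token filtering verified");
    }

Without the filter, a string consisting only of separator characters still yields non-empty token lists (of empty strings), so the is_empty() guard in the hunk above would never trigger for it.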