more tests & title fixes

2025-07-08 08:30:00 +02:00 · 2023-03-29 08:35:36 +02:00 · 2023-03-29 08:35:36 +02:00 · ded7cf5adb
commit ded7cf5adb
parent a649b93c03
17 changed files with 8373 additions and 10 deletions
--- a/src/constants.rs
+++ b/src/constants.rs
@@ -62,9 +62,12 @@ pub static NEGATIVE: Lazy<Regex> = Lazy::new(|| {
 });

 pub static TITLE_SEPARATOR: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex"));
-pub static TITLE_CUT_END: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
+    Lazy::new(|| Regex::new(r#" [-|—\\/>»] "#).expect("TITLE_SEPARATOR regex"));
+pub static TITLE_CUT_END: Lazy<Regex> =  Lazy::new(||
+    RegexBuilder::new(r#"(.*)[-|—\\/>»] .*"#)
+    .case_insensitive(true)
+    .build()
+    .expect("TITLE_CUT_END regex"));
 pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
 pub static TITLE_CUT_FRONT: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r#"[^-|\\/>»]*[-|\\/>»](.*)"#)
--- a/src/full_text_parser/readability/mod.rs
+++ b/src/full_text_parser/readability/mod.rs
@@ -683,7 +683,7 @@ impl Readability {
        let heading = Util::get_inner_text(node, false);

        if let Some(title) = title {
-            Util::text_similarity(&heading, title) > 0.75
+            Util::text_similarity(title, &heading) > 0.75
        } else {
            false
        }
--- a/src/full_text_parser/readability/tests.rs
+++ b/src/full_text_parser/readability/tests.rs
@@ -307,6 +307,36 @@ async fn lifehacker_working() {
    run_test("lifehacker-working").await
 }

+#[tokio::test]
+async fn links_in_tables() {
+    run_test("links-in-tables").await
+}
+
+#[tokio::test]
+async fn lwn_1() {
+    run_test("lwn-1").await
+}
+
+// #[tokio::test]
+// async fn medicalnewstoday() {
+//     run_test("medicalnewstoday").await
+// }
+
+#[tokio::test]
+async fn medium_1() {
+    run_test("medium-1").await
+}
+
+#[tokio::test]
+async fn medium_2() {
+    run_test("medium-2").await
+}
+
+#[tokio::test]
+async fn medium_3() {
+    run_test("medium-3").await
+}
+
 #[tokio::test]
 async fn webmd_1() {
    run_test("webmd-1").await
--- a/src/util.rs
+++ b/src/util.rs
@@ -317,8 +317,8 @@ impl Util {
    pub fn text_similarity(a: &str, b: &str) -> f64 {
        let a = a.to_lowercase();
        let b = b.to_lowercase();
-        let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
-        let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
+        let tokens_a = constants::TOKENIZE.split(&a).filter(|token| !token.is_empty()).collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE.split(&b).filter(|token| !token.is_empty()).collect::<Vec<_>>();
        if tokens_a.is_empty() || tokens_b.is_empty() {
            return 0.0;
        }