From 40317509564ce90d7251190a528058090eb39501 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 1 Mar 2023 01:37:44 +0100 Subject: [PATCH] tag cleaning test --- expected.html | 20 +++++++++++ .../basic-tags-cleaning/expected.html | 20 +++++++++++ .../basic-tags-cleaning/source.html | 36 +++++++++++++++++++ src/full_text_parser/readability/tests.rs | 5 +++ 4 files changed, 81 insertions(+) create mode 100644 expected.html create mode 100644 resources/tests/readability/basic-tags-cleaning/expected.html create mode 100644 resources/tests/readability/basic-tags-cleaning/source.html diff --git a/expected.html b/expected.html new file mode 100644 index 0000000..c5d7e2f --- /dev/null +++ b/expected.html @@ -0,0 +1,20 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
diff --git a/resources/tests/readability/basic-tags-cleaning/expected.html b/resources/tests/readability/basic-tags-cleaning/expected.html new file mode 100644 index 0000000..c5d7e2f --- /dev/null +++ b/resources/tests/readability/basic-tags-cleaning/expected.html @@ -0,0 +1,20 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
diff --git a/resources/tests/readability/basic-tags-cleaning/source.html b/resources/tests/readability/basic-tags-cleaning/source.html new file mode 100644 index 0000000..1d8809e --- /dev/null +++ b/resources/tests/readability/basic-tags-cleaning/source.html @@ -0,0 +1,36 @@ + + + + + Basic tag cleaning test + + +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ + + + +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+ + \ No newline at end of file diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index 7a6820d..159a57d 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -91,6 +91,11 @@ async fn base_url_base_element_relative() { run_test("base-url-base-element-relative").await } +#[tokio::test] +async fn basic_tags_cleaning() { + run_test("basic-tags-cleaning").await +} + #[tokio::test] async fn webmd_1() { run_test("webmd-1").await