From 80de6d177cc4ea55025d6397a522ffcf4e188a97 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 1 Mar 2023 00:42:44 +0100 Subject: [PATCH] url completion test --- .../expected.html | 32 ++++++++++++++ .../source.html | 44 +++++++++++++++++++ src/full_text_parser/readability/tests.rs | 30 ++++++++----- 3 files changed, 96 insertions(+), 10 deletions(-) create mode 100644 resources/tests/readability/base-url-base-element-relative/expected.html create mode 100644 resources/tests/readability/base-url-base-element-relative/source.html diff --git a/resources/tests/readability/base-url-base-element-relative/expected.html b/resources/tests/readability/base-url-base-element-relative/expected.html new file mode 100644 index 0000000..21a19cc --- /dev/null +++ b/resources/tests/readability/base-url-base-element-relative/expected.html @@ -0,0 +1,32 @@ +

Lorem

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

Links

+

link

+

link

+

link

+

link

+

link

+

link

+

link

+

link

+

Images

+

+

+

+

+

+

Foo

+

+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

diff --git a/resources/tests/readability/base-url-base-element-relative/source.html b/resources/tests/readability/base-url-base-element-relative/source.html new file mode 100644 index 0000000..bb0f7df --- /dev/null +++ b/resources/tests/readability/base-url-base-element-relative/source.html @@ -0,0 +1,44 @@ + + + + + + Base URL with base relative test + + +
+

Lorem

+
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+

Links

+

link

+

link

+

link

+

link

+

link

+

link

+

link

+

link

+

Images

+

+

+

+

+

+

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index 5f39de2..52f6664 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -6,13 +6,13 @@ use crate::{ full_text_parser::{config::ConfigEntry, metadata}, }; -async fn run_test(name: &str) { +async fn run_test(name: &str, url: Option) { libxml::tree::node::set_node_rc_guard(10); let _ = env_logger::builder().is_test(true).try_init(); let empty_config = ConfigEntry::default(); - let url = Url::parse("http://google.com").unwrap(); + let url = url.unwrap_or_else(|| Url::parse("http://google.com").unwrap()); let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html")) .expect("Failed to read source HTML"); let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap(); @@ -20,6 +20,7 @@ async fn run_test(name: &str) { crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config); crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap(); + crate::FullTextParser::fix_urls(&xpath_ctx, &url); let mut article = Article { title: None, author: None, @@ -52,40 +53,49 @@ async fn run_test(name: &str) { #[tokio::test] async fn test_001() { - run_test("001").await + run_test("001", None).await } #[tokio::test] async fn test_002() { - run_test("002").await + run_test("002", None).await } #[tokio::test] async fn test_003() { - run_test("003").await + run_test("003", None).await } #[tokio::test] async fn aclu() { - run_test("aclu").await + run_test("aclu", None).await } #[tokio::test] async fn aktualne() { - run_test("aktualne").await + run_test("aktualne", None).await } #[tokio::test] async fn archive_of_our_own() { - run_test("archive-of-our-own").await + run_test("archive-of-our-own", None).await } #[tokio::test] async fn ars_1() { - run_test("ars-1").await + run_test("ars-1", None).await +} + +#[tokio::test] +async fn base_url_base_element_relative() { + run_test( + "base-url-base-element-relative", + Some(Url::parse("http://fakehost/test/base/").unwrap()), + ) + .await } #[tokio::test] async fn webmd_1() { - run_test("webmd-1").await + run_test("webmd-1", None).await }