1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

url completion test

This commit is contained in:
Jan Lukas Gernert 2023-03-01 00:42:44 +01:00
parent 3a92585f4d
commit 80de6d177c
3 changed files with 96 additions and 10 deletions

View file

@ -0,0 +1,32 @@
<article><DIV id="readability-page-1"><article><h2>Lorem</h2>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>Links</p>
<p><a href="http://fakehost/test/base/foo/bar/baz.html" target="_blank">link</a></p>
<p><a href="http://fakehost/test/base/foo/bar/baz.html" target="_blank">link</a></p>
<p><a href="http://fakehost/foo/bar/baz.html" target="_blank">link</a></p>
<p><a href="http://fakehost/test/base/#foo" target="_blank">link</a></p>
<p><a href="http://fakehost/test/base/baz.html#foo" target="_blank">link</a></p>
<p><a href="http://fakehost/foo/bar/baz.html#foo" target="_blank">link</a></p>
<p><a href="http://test/foo/bar/baz.html" target="_blank">link</a></p>
<p><a href="https://test/foo/bar/baz.html" target="_blank">link</a></p>
<p>Images</p>
<p><img src="http://fakehost/test/base/foo/bar/baz.png"></p>
<p><img src="http://fakehost/test/base/foo/bar/baz.png"></p>
<p><img src="http://fakehost/foo/bar/baz.png"></p>
<p><img src="http://test/foo/bar/baz.png"></p>
<p><img src="https://test/foo/bar/baz.png"></p>
<h2>Foo</h2>
<p>
Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p></article></DIV></article>

View file

@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<base href="base/"/>
<title>Base URL with base relative test</title>
</head>
<body>
<article>
<h1>Lorem</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
<p>Links</p>
<p><a href="foo/bar/baz.html">link</a></p>
<p><a href="./foo/bar/baz.html">link</a></p>
<p><a href="/foo/bar/baz.html">link</a></p>
<p><a href="#foo">link</a></p>
<p><a href="baz.html#foo">link</a></p>
<p><a href="/foo/bar/baz.html#foo">link</a></p>
<p><a href="http://test/foo/bar/baz.html">link</a></p>
<p><a href="https://test/foo/bar/baz.html">link</a></p>
<p>Images</p>
<p><img src="foo/bar/baz.png"/></p>
<p><img src="./foo/bar/baz.png"/></p>
<p><img src="/foo/bar/baz.png"/></p>
<p><img src="http://test/foo/bar/baz.png"/></p>
<p><img src="https://test/foo/bar/baz.png"/></p>
<h2>Foo</h2>
<div>
Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</article>
</body>
</html>

View file

@ -6,13 +6,13 @@ use crate::{
full_text_parser::{config::ConfigEntry, metadata}, full_text_parser::{config::ConfigEntry, metadata},
}; };
async fn run_test(name: &str) { async fn run_test(name: &str, url: Option<Url>) {
libxml::tree::node::set_node_rc_guard(10); libxml::tree::node::set_node_rc_guard(10);
let _ = env_logger::builder().is_test(true).try_init(); let _ = env_logger::builder().is_test(true).try_init();
let empty_config = ConfigEntry::default(); let empty_config = ConfigEntry::default();
let url = Url::parse("http://google.com").unwrap(); let url = url.unwrap_or_else(|| Url::parse("http://google.com").unwrap());
let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html")) let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html"))
.expect("Failed to read source HTML"); .expect("Failed to read source HTML");
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap(); let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
@ -20,6 +20,7 @@ async fn run_test(name: &str) {
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config); crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap(); crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
let mut article = Article { let mut article = Article {
title: None, title: None,
author: None, author: None,
@ -52,40 +53,49 @@ async fn run_test(name: &str) {
#[tokio::test] #[tokio::test]
async fn test_001() { async fn test_001() {
run_test("001").await run_test("001", None).await
} }
#[tokio::test] #[tokio::test]
async fn test_002() { async fn test_002() {
run_test("002").await run_test("002", None).await
} }
#[tokio::test] #[tokio::test]
async fn test_003() { async fn test_003() {
run_test("003").await run_test("003", None).await
} }
#[tokio::test] #[tokio::test]
async fn aclu() { async fn aclu() {
run_test("aclu").await run_test("aclu", None).await
} }
#[tokio::test] #[tokio::test]
async fn aktualne() { async fn aktualne() {
run_test("aktualne").await run_test("aktualne", None).await
} }
#[tokio::test] #[tokio::test]
async fn archive_of_our_own() { async fn archive_of_our_own() {
run_test("archive-of-our-own").await run_test("archive-of-our-own", None).await
} }
#[tokio::test] #[tokio::test]
async fn ars_1() { async fn ars_1() {
run_test("ars-1").await run_test("ars-1", None).await
}
#[tokio::test]
async fn base_url_base_element_relative() {
run_test(
"base-url-base-element-relative",
Some(Url::parse("http://fakehost/test/base/").unwrap()),
)
.await
} }
#[tokio::test] #[tokio::test]
async fn webmd_1() { async fn webmd_1() {
run_test("webmd-1").await run_test("webmd-1", None).await
} }