mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
url completion test
This commit is contained in:
parent
3a92585f4d
commit
80de6d177c
3 changed files with 96 additions and 10 deletions
|
@ -0,0 +1,32 @@
|
|||
<article><DIV id="readability-page-1"><article><h2>Lorem</h2>
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
</p>
|
||||
<p>Links</p>
|
||||
<p><a href="http://fakehost/test/base/foo/bar/baz.html" target="_blank">link</a></p>
|
||||
<p><a href="http://fakehost/test/base/foo/bar/baz.html" target="_blank">link</a></p>
|
||||
<p><a href="http://fakehost/foo/bar/baz.html" target="_blank">link</a></p>
|
||||
<p><a href="http://fakehost/test/base/#foo" target="_blank">link</a></p>
|
||||
<p><a href="http://fakehost/test/base/baz.html#foo" target="_blank">link</a></p>
|
||||
<p><a href="http://fakehost/foo/bar/baz.html#foo" target="_blank">link</a></p>
|
||||
<p><a href="http://test/foo/bar/baz.html" target="_blank">link</a></p>
|
||||
<p><a href="https://test/foo/bar/baz.html" target="_blank">link</a></p>
|
||||
<p>Images</p>
|
||||
<p><img src="http://fakehost/test/base/foo/bar/baz.png"></p>
|
||||
<p><img src="http://fakehost/test/base/foo/bar/baz.png"></p>
|
||||
<p><img src="http://fakehost/foo/bar/baz.png"></p>
|
||||
<p><img src="http://test/foo/bar/baz.png"></p>
|
||||
<p><img src="https://test/foo/bar/baz.png"></p>
|
||||
<h2>Foo</h2>
|
||||
<p>
|
||||
Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
</p></article></DIV></article>
|
|
@ -0,0 +1,44 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<base href="base/"/>
|
||||
<title>Base URL with base relative test</title>
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<h1>Lorem</h1>
|
||||
<div>
|
||||
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
</div>
|
||||
<p>Links</p>
|
||||
<p><a href="foo/bar/baz.html">link</a></p>
|
||||
<p><a href="./foo/bar/baz.html">link</a></p>
|
||||
<p><a href="/foo/bar/baz.html">link</a></p>
|
||||
<p><a href="#foo">link</a></p>
|
||||
<p><a href="baz.html#foo">link</a></p>
|
||||
<p><a href="/foo/bar/baz.html#foo">link</a></p>
|
||||
<p><a href="http://test/foo/bar/baz.html">link</a></p>
|
||||
<p><a href="https://test/foo/bar/baz.html">link</a></p>
|
||||
<p>Images</p>
|
||||
<p><img src="foo/bar/baz.png"/></p>
|
||||
<p><img src="./foo/bar/baz.png"/></p>
|
||||
<p><img src="/foo/bar/baz.png"/></p>
|
||||
<p><img src="http://test/foo/bar/baz.png"/></p>
|
||||
<p><img src="https://test/foo/bar/baz.png"/></p>
|
||||
<h2>Foo</h2>
|
||||
<div>
|
||||
Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
</div>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
|
@ -6,13 +6,13 @@ use crate::{
|
|||
full_text_parser::{config::ConfigEntry, metadata},
|
||||
};
|
||||
|
||||
async fn run_test(name: &str) {
|
||||
async fn run_test(name: &str, url: Option<Url>) {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let empty_config = ConfigEntry::default();
|
||||
|
||||
let url = Url::parse("http://google.com").unwrap();
|
||||
let url = url.unwrap_or_else(|| Url::parse("http://google.com").unwrap());
|
||||
let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html"))
|
||||
.expect("Failed to read source HTML");
|
||||
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
|
||||
|
@ -20,6 +20,7 @@ async fn run_test(name: &str) {
|
|||
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
|
||||
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
|
||||
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
|
||||
let mut article = Article {
|
||||
title: None,
|
||||
author: None,
|
||||
|
@ -52,40 +53,49 @@ async fn run_test(name: &str) {
|
|||
|
||||
#[tokio::test]
|
||||
async fn test_001() {
|
||||
run_test("001").await
|
||||
run_test("001", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_002() {
|
||||
run_test("002").await
|
||||
run_test("002", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_003() {
|
||||
run_test("003").await
|
||||
run_test("003", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aclu() {
|
||||
run_test("aclu").await
|
||||
run_test("aclu", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aktualne() {
|
||||
run_test("aktualne").await
|
||||
run_test("aktualne", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn archive_of_our_own() {
|
||||
run_test("archive-of-our-own").await
|
||||
run_test("archive-of-our-own", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ars_1() {
|
||||
run_test("ars-1").await
|
||||
run_test("ars-1", None).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn base_url_base_element_relative() {
|
||||
run_test(
|
||||
"base-url-base-element-relative",
|
||||
Some(Url::parse("http://fakehost/test/base/").unwrap()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
run_test("webmd-1", None).await
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue