1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-09 17:05:30 +02:00

clean js-links & add new test

This commit is contained in:
Jan Lukas Gernert 2023-03-26 11:31:59 +02:00
parent da12fcdab6
commit 873e081c33
7 changed files with 564 additions and 492 deletions

View file

@ -293,13 +293,32 @@ impl Readability {
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
let mut rt = document.get_root_element().expect("doc should have root");
Self::initialize_node(&mut rt, &state).expect("init should not fail");
let mut root = document.get_root_element().expect("doc should have root");
if let Some(body) = root
.get_child_elements()
.into_iter()
.find(|n| n.get_name().to_uppercase() == "BODY")
{
root = body;
}
let mut new_top_candidate =
Node::new("DIV", None, &document).expect("can't create new node");
for mut child in root.get_child_elements().drain(..) {
child.unlink();
new_top_candidate.add_child(&mut child).unwrap();
}
root.add_child(&mut new_top_candidate).unwrap();
Self::initialize_node(&mut new_top_candidate, &state)
.expect("init should not fail");
needed_to_create_top_candidate = true;
rt
new_top_candidate
});
//Util::serialize_node(&top_candidate, "top_candidate.html");
// Util::serialize_node(&top_candidate, "top_candidate.html");
let mut alternative_candidate_ancestors = Vec::new();
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array

View file

@ -21,7 +21,7 @@ async fn run_test(name: &str) {
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url);
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document);
let mut article = Article {
title: None,
author: None,
@ -252,6 +252,11 @@ async fn ietf_1() {
run_test("ietf-1").await
}
/// Runs the shared `run_test` driver for the test case named `js-link-replacement`.
// NOTE(review): presumably `run_test` resolves its fixture files from this name —
// confirm against the `run_test` body, which is not fully visible here.
#[tokio::test]
async fn js_link_replacement() {
run_test("js-link-replacement").await
}
#[tokio::test]
async fn webmd_1() {
run_test("webmd-1").await