1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-09 00:45:31 +02:00

move conditional cleaning right after parsing & port attribute cleaning from readability

This commit is contained in:
Jan Lukas Gernert 2023-03-19 22:43:26 +01:00
parent 47eed3a94f
commit 11e08ae505
10 changed files with 943 additions and 104 deletions

View file

@ -497,6 +497,11 @@ impl Readability {
}
}
crate::FullTextParser::post_process_content(
&mut article_content,
state.clean_conditionally,
)?;
if needed_to_create_top_candidate {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then

View file

@ -18,9 +18,7 @@ async fn run_test(name: &str) {
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url);
let mut article = Article {
title: None,
author: None,
@ -36,7 +34,9 @@ async fn run_test(name: &str) {
metadata::extract(&xpath_ctx, None, None, &mut article);
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
crate::FullTextParser::post_process_content(&article_document).unwrap();
if let Some(mut root) = article_document.get_root_element() {
crate::FullTextParser::post_process_content(&mut root, false).unwrap();
}
article.document = Some(article_document);
let html = article.get_content().unwrap();
@ -236,6 +236,11 @@ async fn hidden_nodes() {
run_test("hidden-nodes").await
}
/// Runs the shared `run_test` helper for the "hukumusume" test case.
#[tokio::test]
async fn hukumusume() {
    let case_name = "hukumusume";
    run_test(case_name).await
}
#[tokio::test]
async fn webmd_1() {
run_test("webmd-1").await