mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-09 17:05:30 +02:00
move conditional cleaning right after parsing & port attribute cleaning from readability
This commit is contained in:
parent
47eed3a94f
commit
11e08ae505
10 changed files with 943 additions and 104 deletions
|
@ -18,9 +18,7 @@ async fn run_test(name: &str) {
|
|||
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
|
||||
|
||||
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
|
||||
crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url);
|
||||
let mut article = Article {
|
||||
title: None,
|
||||
author: None,
|
||||
|
@ -36,7 +34,9 @@ async fn run_test(name: &str) {
|
|||
|
||||
metadata::extract(&xpath_ctx, None, None, &mut article);
|
||||
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
|
||||
crate::FullTextParser::post_process_content(&article_document).unwrap();
|
||||
if let Some(mut root) = article_document.get_root_element() {
|
||||
crate::FullTextParser::post_process_content(&mut root, false).unwrap();
|
||||
}
|
||||
|
||||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
@ -236,6 +236,11 @@ async fn hidden_nodes() {
|
|||
run_test("hidden-nodes").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn hukumusume() {
|
||||
run_test("hukumusume").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue