Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-10 01:15:31 +02:00)
refactor & more testing
This commit is contained in:
parent 7ae98904d4
commit e3246af28b
14 changed files with 1969 additions and 101 deletions
@@ -5,7 +5,7 @@ mod tests;
 
 use std::cmp::Ordering;
 
-use libxml::tree::{node, Document, Node, NodeType};
+use libxml::tree::{Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
@@ -19,8 +19,6 @@ impl Readability {
         root: &mut Node,
         title: Option<&str>,
     ) -> Result<bool, FullTextParserError> {
-        node::set_node_rc_guard(6);
-
         let mut state = State::default();
         let mut document = document;
         let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
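Note: the removed `node::set_node_rc_guard(6);` call configures the reference-count guard the libxml crate applies to `Node` handles. The refactor moves that setup out of the extraction routine and into the test harness, where it reappears as `libxml::tree::node::set_node_rc_guard(3)` in the test diff further down. A minimal sketch of the idea, using only calls already visible in this diff (the helper name is illustrative, not the crate's API):

    // Sketch only, not the crate's code: do the global setup once in the caller
    // (here a test helper) instead of inside the extraction routine.
    fn test_setup() {
        libxml::tree::node::set_node_rc_guard(3);
        let _ = env_logger::builder().is_test(true).try_init();
    }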
@@ -253,12 +251,11 @@ impl Readability {
         let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
             // If we still have no top candidate, just use the body as a last resort.
             // We also have to copy the body node so it is something we can modify.
-            let mut rt = document.get_root_element().unwrap();
-            Self::initialize_node(&mut rt, &state).unwrap();
+            let mut rt = document.get_root_element().expect("doc should have root");
+            Self::initialize_node(&mut rt, &state).expect("init should not fail");
             needed_to_create_top_candidate = true;
             rt
         });
-        let mut parent_of_top_candidate = None;
 
         let mut alternative_candidate_ancestors = Vec::new();
         // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
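Note: swapping `.unwrap()` for `.expect("...")` keeps the same panic-on-failure behaviour but attaches a message, so a failure points at the assumption that broke rather than just a line number. Illustrative example, not from this crate:

    // `expect` panics with the given message when the Option is None.
    fn first_char(s: &str) -> char {
        s.chars().next().expect("input should not be empty")
    }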
@@ -274,25 +271,21 @@ impl Readability {
         }
 
         if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
-            parent_of_top_candidate = top_candidate.get_parent();
+            let mut parent_of_top_candidate = top_candidate.get_parent();
 
-            loop {
-                if let Some(parent) = &parent_of_top_candidate {
-                    let mut lists_containing_this_ancestor = 0;
-                    let tmp = usize::min(
-                        alternative_candidate_ancestors.len(),
-                        constants::MINIMUM_TOPCANDIDATES,
-                    );
-                    for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
-                        lists_containing_this_ancestor +=
-                            if ancestor == parent { 1 } else { 0 };
-                    }
+            while let Some(parent) = &parent_of_top_candidate {
+                let mut lists_containing_this_ancestor = 0;
+                let tmp = usize::min(
+                    alternative_candidate_ancestors.len(),
+                    constants::MINIMUM_TOPCANDIDATES,
+                );
+                for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
+                    lists_containing_this_ancestor +=
+                        if ancestor == parent { 1 } else { 0 };
+                }
 
-                    if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
-                        top_candidate = parent.clone();
-                        break;
-                    }
-                } else {
+                if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
+                    top_candidate = parent.clone();
+                    break;
+                }
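Note: the old ancestor walk wrapped an `if let Some(..)` inside `loop`, with an `else` arm whose only job was to exit; the new code expresses the same walk with `while let`, which stops on its own when the `Option` runs out and drops one level of nesting. A self-contained sketch of the equivalence using a toy parent chain (the types here are illustrative, not the crate's):

    struct Item {
        value: u32,
        parent: Option<Box<Item>>,
    }

    // Before: explicit loop with an else arm just to break.
    fn sum_chain_before(mut node: Option<&Item>) -> u32 {
        let mut total = 0;
        loop {
            if let Some(item) = node {
                total += item.value;
                node = item.parent.as_deref();
            } else {
                break;
            }
        }
        total
    }

    // After: `while let` ends the walk as soon as there is no parent left.
    fn sum_chain_after(mut node: Option<&Item>) -> u32 {
        let mut total = 0;
        while let Some(item) = node {
            total += item.value;
            node = item.parent.as_deref();
        }
        total
    }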
@@ -311,7 +304,7 @@ impl Readability {
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
-        parent_of_top_candidate = top_candidate.get_parent();
+        let mut parent_of_top_candidate = top_candidate.get_parent();
         let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
 
         // The scores shouldn't get too low.
@@ -1,7 +1,4 @@
-use libxml::{
-    tree::{Document, Node},
-    xpath::Context,
-};
+use libxml::tree::{Document, Node};
 use reqwest::Url;
 
 use crate::{
@@ -9,13 +6,21 @@ use crate::{
     full_text_parser::{config::ConfigEntry, metadata},
 };
 
-async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
+async fn run_test(name: &str) {
+    libxml::tree::node::set_node_rc_guard(3);
+    let _ = env_logger::builder().is_test(true).try_init();
+
     let empty_config = ConfigEntry::default();
-    let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
+
+    let url = Url::parse("http://google.com").unwrap();
+    let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html"))
+        .expect("Failed to read source HTML");
+    let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
     let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
-    crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
+    crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
     crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
-    let article = Article {
+    let mut article = Article {
         title: None,
         author: None,
         url: url.clone(),
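Note: the helper is now parameterized by a fixture name: each case reads `./resources/tests/readability/{name}/source.html`, runs the parse/strip pipeline, and (as the last hunk shows) compares the result against `./resources/tests/readability/{name}/expected.html`. A minimal sketch of that golden-file shape, independent of the crate's internals (`check_fixture` and the `produce` closure are illustrative names, not part of the crate):

    // Golden-file pattern: run `produce` on the stored input and compare the
    // output against the stored expected file.
    fn check_fixture(name: &str, produce: impl Fn(&str) -> String) {
        let base = format!("./resources/tests/readability/{name}");
        let source = std::fs::read_to_string(format!("{base}/source.html"))
            .expect("Failed to read source HTML");
        let expected = std::fs::read_to_string(format!("{base}/expected.html"))
            .expect("Failed to read expected HTML");
        assert_eq!(expected, produce(&source));
    }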
@@ -23,17 +28,6 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
         thumbnail_url: None,
         document: None,
     };
-    (document, xpath_ctx, article)
-}
 
-#[tokio::test]
-async fn test_1() {
-    let _ = env_logger::builder().is_test(true).try_init();
-
-    let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
-        .expect("Failed to read HTML");
-    let url = Url::parse("http://google.com").unwrap();
-    let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
-
     let mut article_document = Document::new().unwrap();
     let mut root = Node::new("article", None, &document).unwrap();
@@ -48,5 +42,21 @@ async fn test_1() {
 
     article.document = Some(article_document);
     let html = article.get_content().unwrap();
-    std::fs::write("test.html", html).unwrap();
+
+    let expected = std::fs::read_to_string(format!("./resources/tests/readability/{name}/expected.html"))
+        .expect("Failed to read expected HTML");
+
+    //std::fs::write("expected.html", &html).unwrap();
+
+    assert_eq!(expected, html);
 }
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_001() {
+    run_test("001").await
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_002() {
+    run_test("002").await
+}
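Note: the per-fixture tests use `#[tokio::test(flavor = "current_thread")]`, which runs each test future on a single-threaded Tokio runtime; this is also the macro's default flavor, so the explicit attribute mainly makes the threading expectation visible. Minimal usage sketch (standalone, not tied to this crate):

    #[tokio::test(flavor = "current_thread")]
    async fn single_threaded_example() {
        // The async block runs on a current-thread runtime; no worker threads are spawned.
        let value = async { 21 * 2 }.await;
        assert_eq!(value, 42);
    }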