1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix alternate candidates

This commit is contained in:
Jan Lukas Gernert 2023-03-06 01:36:21 +01:00
parent 45b4141049
commit 881c2b90ac
2 changed files with 12 additions and 7 deletions

View file

@@ -205,7 +205,7 @@ impl Readability {
}
// Exclude nodes with no ancestor.
let ancestors = Util::get_node_ancestors(&element_to_score, 5);
let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
if ancestors.is_empty() {
continue;
}
@@ -293,9 +293,8 @@ impl Readability {
for candidate in top_candidates.iter().skip(1) {
let score = Self::get_content_score(candidate).unwrap_or(0.0);
if score / top_score >= 0.75 {
if let Some(ancestor) = top_candidate.get_parent() {
alternative_candidate_ancestors.push(ancestor);
}
alternative_candidate_ancestors
.push(Util::get_node_ancestors(candidate, None));
}
}
}
@@ -304,13 +303,18 @@ impl Readability {
let mut parent_of_top_candidate = top_candidate.get_parent();
while let Some(parent) = &parent_of_top_candidate {
if parent.get_name().to_uppercase() == "BODY" {
break;
}
let mut lists_containing_this_ancestor = 0;
let tmp = usize::min(
alternative_candidate_ancestors.len(),
constants::MINIMUM_TOPCANDIDATES,
);
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor +=
ancestors.into_iter().filter(|n| n == &parent).count();
}
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {

View file

@@ -465,9 +465,10 @@ impl Util {
})
}
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> {
let mut ancestors = Vec::new();
let mut node = node.clone();
let max_depth = max_depth.unwrap_or(u64::MAX);
for _ in 0..max_depth {
let parent = node.get_parent();