From 881c2b90ac6632afc0f47569b36907b59469ea9d Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 6 Mar 2023 01:36:21 +0100 Subject: [PATCH] fix alternate candidates --- src/full_text_parser/readability/mod.rs | 16 ++++++++++------ src/util.rs | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs index a3ffe27..df8890a 100644 --- a/src/full_text_parser/readability/mod.rs +++ b/src/full_text_parser/readability/mod.rs @@ -205,7 +205,7 @@ impl Readability { } // Exclude nodes with no ancestor. - let ancestors = Util::get_node_ancestors(&element_to_score, 5); + let ancestors = Util::get_node_ancestors(&element_to_score, Some(5)); if ancestors.is_empty() { continue; } @@ -293,9 +293,8 @@ impl Readability { for candidate in top_candidates.iter().skip(1) { let score = Self::get_content_score(candidate).unwrap_or(0.0); if score / top_score >= 0.75 { - if let Some(ancestor) = top_candidate.get_parent() { - alternative_candidate_ancestors.push(ancestor); - } + alternative_candidate_ancestors + .push(Util::get_node_ancestors(candidate, None)); } } } @@ -304,13 +303,18 @@ impl Readability { let mut parent_of_top_candidate = top_candidate.get_parent(); while let Some(parent) = &parent_of_top_candidate { + if parent.get_name().to_uppercase() == "BODY" { + break; + } + let mut lists_containing_this_ancestor = 0; let tmp = usize::min( alternative_candidate_ancestors.len(), constants::MINIMUM_TOPCANDIDATES, ); - for ancestor in alternative_candidate_ancestors.iter().take(tmp) { - lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 }; + for ancestors in alternative_candidate_ancestors.iter().take(tmp) { + lists_containing_this_ancestor += + ancestors.into_iter().filter(|n| n == &parent).count(); } if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES { diff --git a/src/util.rs b/src/util.rs index d5588bc..6c9bb7c 100644 --- a/src/util.rs +++ 
b/src/util.rs @@ -465,9 +465,10 @@ impl Util { }) } - pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> { + pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> { let mut ancestors = Vec::new(); let mut node = node.clone(); + let max_depth = max_depth.unwrap_or(u64::MAX); for _ in 0..max_depth { let parent = node.get_parent();