1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix alternate candidates

This commit is contained in:
Jan Lukas Gernert 2023-03-06 01:36:21 +01:00
parent 45b4141049
commit 881c2b90ac
2 changed files with 12 additions and 7 deletions

View file

@@ -205,7 +205,7 @@ impl Readability {
} }
// Exclude nodes with no ancestor. // Exclude nodes with no ancestor.
let ancestors = Util::get_node_ancestors(&element_to_score, 5); let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
if ancestors.is_empty() { if ancestors.is_empty() {
continue; continue;
} }
@@ -293,9 +293,8 @@ impl Readability {
for candidate in top_candidates.iter().skip(1) { for candidate in top_candidates.iter().skip(1) {
let score = Self::get_content_score(candidate).unwrap_or(0.0); let score = Self::get_content_score(candidate).unwrap_or(0.0);
if score / top_score >= 0.75 { if score / top_score >= 0.75 {
if let Some(ancestor) = top_candidate.get_parent() { alternative_candidate_ancestors
alternative_candidate_ancestors.push(ancestor); .push(Util::get_node_ancestors(candidate, None));
}
} }
} }
} }
@@ -304,13 +303,18 @@ impl Readability {
let mut parent_of_top_candidate = top_candidate.get_parent(); let mut parent_of_top_candidate = top_candidate.get_parent();
while let Some(parent) = &parent_of_top_candidate { while let Some(parent) = &parent_of_top_candidate {
if parent.get_name().to_uppercase() == "BODY" {
break;
}
let mut lists_containing_this_ancestor = 0; let mut lists_containing_this_ancestor = 0;
let tmp = usize::min( let tmp = usize::min(
alternative_candidate_ancestors.len(), alternative_candidate_ancestors.len(),
constants::MINIMUM_TOPCANDIDATES, constants::MINIMUM_TOPCANDIDATES,
); );
for ancestor in alternative_candidate_ancestors.iter().take(tmp) { for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 }; lists_containing_this_ancestor +=
ancestors.into_iter().filter(|n| n == &parent).count();
} }
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES { if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {

View file

@@ -465,9 +465,10 @@ impl Util {
}) })
} }
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> { pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> {
let mut ancestors = Vec::new(); let mut ancestors = Vec::new();
let mut node = node.clone(); let mut node = node.clone();
let max_depth = max_depth.unwrap_or(u64::MAX);
for _ in 0..max_depth { for _ in 0..max_depth {
let parent = node.get_parent(); let parent = node.get_parent();