mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix alternate candidates
This commit is contained in:
parent
45b4141049
commit
881c2b90ac
2 changed files with 12 additions and 7 deletions
|
@ -205,7 +205,7 @@ impl Readability {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exclude nodes with no ancestor.
|
// Exclude nodes with no ancestor.
|
||||||
let ancestors = Util::get_node_ancestors(&element_to_score, 5);
|
let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
|
||||||
if ancestors.is_empty() {
|
if ancestors.is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -293,9 +293,8 @@ impl Readability {
|
||||||
for candidate in top_candidates.iter().skip(1) {
|
for candidate in top_candidates.iter().skip(1) {
|
||||||
let score = Self::get_content_score(candidate).unwrap_or(0.0);
|
let score = Self::get_content_score(candidate).unwrap_or(0.0);
|
||||||
if score / top_score >= 0.75 {
|
if score / top_score >= 0.75 {
|
||||||
if let Some(ancestor) = top_candidate.get_parent() {
|
alternative_candidate_ancestors
|
||||||
alternative_candidate_ancestors.push(ancestor);
|
.push(Util::get_node_ancestors(candidate, None));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -304,13 +303,18 @@ impl Readability {
|
||||||
let mut parent_of_top_candidate = top_candidate.get_parent();
|
let mut parent_of_top_candidate = top_candidate.get_parent();
|
||||||
|
|
||||||
while let Some(parent) = &parent_of_top_candidate {
|
while let Some(parent) = &parent_of_top_candidate {
|
||||||
|
if parent.get_name().to_uppercase() == "BODY" {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
let mut lists_containing_this_ancestor = 0;
|
let mut lists_containing_this_ancestor = 0;
|
||||||
let tmp = usize::min(
|
let tmp = usize::min(
|
||||||
alternative_candidate_ancestors.len(),
|
alternative_candidate_ancestors.len(),
|
||||||
constants::MINIMUM_TOPCANDIDATES,
|
constants::MINIMUM_TOPCANDIDATES,
|
||||||
);
|
);
|
||||||
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
|
for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
|
||||||
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
|
lists_containing_this_ancestor +=
|
||||||
|
ancestors.into_iter().filter(|n| n == &parent).count();
|
||||||
}
|
}
|
||||||
|
|
||||||
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
||||||
|
|
|
@ -465,9 +465,10 @@ impl Util {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
|
pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> {
|
||||||
let mut ancestors = Vec::new();
|
let mut ancestors = Vec::new();
|
||||||
let mut node = node.clone();
|
let mut node = node.clone();
|
||||||
|
let max_depth = max_depth.unwrap_or(u64::MAX);
|
||||||
|
|
||||||
for _ in 0..max_depth {
|
for _ in 0..max_depth {
|
||||||
let parent = node.get_parent();
|
let parent = node.get_parent();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue