mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix alternative top candidate calcs
This commit is contained in:
parent
f4ccd22837
commit
a1c07d436f
1 changed file with 18 additions and 4 deletions
|
@ -279,15 +279,17 @@ impl Readability {
|
||||||
let mut alternative_candidate_ancestors = Vec::new();
|
let mut alternative_candidate_ancestors = Vec::new();
|
||||||
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
||||||
// and whose scores are quite close to the current `topCandidate` node.
|
// and whose scores are quite close to the current `topCandidate` node.
|
||||||
for top_candidate in &top_candidates {
|
if let Some(top_score) = Self::get_content_score(&top_candidate) {
|
||||||
if let Some(score) = Self::get_content_score(top_candidate) {
|
for candidate in top_candidates.iter().skip(1) {
|
||||||
if score >= 0.75 {
|
let score = Self::get_content_score(candidate).unwrap_or(0.0);
|
||||||
|
if score / top_score >= 0.75 {
|
||||||
if let Some(ancestor) = top_candidate.get_parent() {
|
if let Some(ancestor) = top_candidate.get_parent() {
|
||||||
alternative_candidate_ancestors.push(ancestor);
|
alternative_candidate_ancestors.push(ancestor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
|
if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
|
||||||
let mut parent_of_top_candidate = top_candidate.get_parent();
|
let mut parent_of_top_candidate = top_candidate.get_parent();
|
||||||
|
@ -325,10 +327,22 @@ impl Readability {
|
||||||
let mut parent_of_top_candidate = top_candidate.get_parent();
|
let mut parent_of_top_candidate = top_candidate.get_parent();
|
||||||
let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
|
let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
|
||||||
|
|
||||||
|
// let html = document.to_string_with_options(libxml::tree::SaveOptions {
|
||||||
|
// format: true,
|
||||||
|
// no_declaration: false,
|
||||||
|
// no_empty_tags: true,
|
||||||
|
// no_xhtml: false,
|
||||||
|
// xhtml: false,
|
||||||
|
// as_xml: false,
|
||||||
|
// as_html: true,
|
||||||
|
// non_significant_whitespace: false,
|
||||||
|
// });
|
||||||
|
// std::fs::write("doc.html", &html).unwrap();
|
||||||
|
|
||||||
// The scores shouldn't get too low.
|
// The scores shouldn't get too low.
|
||||||
let score_threshold = last_score / 3.0;
|
let score_threshold = last_score / 3.0;
|
||||||
|
|
||||||
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
|
while !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
|
||||||
if parent_of_top_candidate
|
if parent_of_top_candidate
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|n| Self::get_content_score(n).is_none())
|
.map(|n| Self::get_content_score(n).is_none())
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue