1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix scoring p tags twice

This commit is contained in:
Jan Lukas Gernert 2023-03-19 13:31:27 +01:00
parent 7737311a92
commit 3a56439ae8

View file

@ -173,7 +173,7 @@ impl Readability {
log::error!("{error}"); log::error!("{error}");
FullTextParserError::Readability FullTextParserError::Readability
})?; })?;
node = Some(new_node.clone()); node = Util::next_node(&new_node, false);
elements_to_score.push(new_node.clone()); elements_to_score.push(new_node.clone());
continue; continue;
} }
@ -210,9 +210,10 @@ impl Readability {
} }
let inner_text = Util::get_inner_text(&element_to_score, true); let inner_text = Util::get_inner_text(&element_to_score, true);
let inner_text_len = inner_text.len();
// If this paragraph is less than 25 characters, don't even count it. // If this paragraph is less than 25 characters, don't even count it.
if inner_text.len() < 25 { if inner_text_len < 25 {
continue; continue;
} }
@ -235,7 +236,9 @@ impl Readability {
// Initialize and score ancestors. // Initialize and score ancestors.
for (level, mut ancestor) in ancestors.into_iter().enumerate() { for (level, mut ancestor) in ancestors.into_iter().enumerate() {
if ancestor.get_parent().is_none() { let tag_name = ancestor.get_name().to_uppercase();
if ancestor.get_parent().is_none() || tag_name == "HTML" {
continue; continue;
} }
@ -256,9 +259,10 @@ impl Readability {
level as f64 * 3.0 level as f64 * 3.0
}; };
if let Some(mut score) = Self::get_content_score(&ancestor) { if let Some(score) = Self::get_content_score(&ancestor) {
score += content_score / score_divider; let add_score = content_score / score_divider;
Self::set_content_score(&mut ancestor, score)?; let new_score = score + add_score;
Self::set_content_score(&mut ancestor, new_score)?;
} }
} }
} }