1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

fixes, more sanitation & 1 more failing test

This commit is contained in:
Jan Lukas Gernert 2023-02-28 01:50:13 +01:00
parent 56c08c501a
commit 31a8033844
8 changed files with 1993 additions and 162 deletions

View file

@ -69,8 +69,18 @@ impl Readability {
if state.strip_unlikely {
if constants::UNLIELY_CANDIDATES.is_match(&match_string)
&& !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
&& !Util::has_ancestor_tag(node_ref, "table", None)
&& !Util::has_ancestor_tag(node_ref, "code", None)
&& !Util::has_ancestor_tag(
node_ref,
"table",
None,
None::<fn(&Node) -> bool>,
)
&& !Util::has_ancestor_tag(
node_ref,
"code",
None,
None::<fn(&Node) -> bool>,
)
&& tag_name != "BODY"
&& tag_name != "A"
{
@ -123,6 +133,10 @@ impl Readability {
log::error!("{error}");
FullTextParserError::Readability
})?;
node_ref.add_child(&mut new_node).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
p.replace(new_node);
}
} else if let Some(p) = p.as_mut() {
@ -638,40 +652,13 @@ impl Readability {
"H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
_ => 0,
};
let score = score + Self::get_class_weight(node, state);
let class_weight = if state.weigh_classes {
Util::get_class_weight(node)
} else {
0
};
let score = score + class_weight;
Self::set_content_score(node, score as f64)?;
Ok(())
}
/// Computes a score adjustment for `node` from its `class` and `id` attributes.
///
/// Returns `0` straight away when `state.weigh_classes` is disabled. Otherwise
/// every one of the two attributes that is present on the node contributes
/// `-25` if its value matches `constants::NEGATIVE` and `+25` if it matches
/// `constants::POSITIVE`. A value matching both patterns therefore nets to
/// zero for that attribute.
fn get_class_weight(node: &Node, state: &State) -> i64 {
    if !state.weigh_classes {
        return 0;
    }

    // `class` is inspected before `id`; both are scored by the same rule.
    ["class", "id"]
        .iter()
        .filter_map(|attribute| node.get_property(attribute))
        .map(|value| {
            let penalty: i64 = if constants::NEGATIVE.is_match(&value) {
                -25
            } else {
                0
            };
            let bonus: i64 = if constants::POSITIVE.is_match(&value) {
                25
            } else {
                0
            };
            penalty + bonus
        })
        .sum()
}
}