1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

add exception to conditional cleaning for lists with images

This commit is contained in:
Jan Lukas Gernert 2023-03-12 13:39:10 +01:00
parent c19525f8cd
commit c8bc583864

View file

@ -590,7 +590,7 @@ impl Util {
let content = Self::get_inner_text(node, false); let content = Self::get_inner_text(node, false);
let content_length = content.len(); let content_length = content.len();
(img > 1 let have_to_remove = (img > 1
&& (p as f64 / img as f64) < 0.5 && (p as f64 / img as f64) < 0.5
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>)) && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|| (!is_list && li > p as i64) || (!is_list && li > p as i64)
@ -602,7 +602,25 @@ impl Util {
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>)) && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|| (!is_list && weight < 25 && link_density > 0.2) || (!is_list && weight < 25 && link_density > 0.2)
|| (weight >= 25 && link_density > 0.5) || (weight >= 25 && link_density > 0.5)
|| ((embed_count == 1 && content_length < 75) || embed_count > 1) || ((embed_count == 1 && content_length < 75) || embed_count > 1);
// Allow simple lists of images to remain in pages
if is_list && have_to_remove {
for child in node.get_child_elements() {
// Don't filter in lists with li's that contain more than one child
if child.get_child_elements().len() > 1 {
return have_to_remove;
}
}
let li_count = Util::get_elements_by_tag_name(node, "li").len();
// Only allow the list to remain if every li contains an image
if img == li_count {
return false;
}
}
have_to_remove
} else { } else {
false false
} }