mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
add exception to conditional cleaning for list with images
This commit is contained in:
parent
c19525f8cd
commit
c8bc583864
1 changed file with 20 additions and 2 deletions
22
src/util.rs
22
src/util.rs
|
@ -590,7 +590,7 @@ impl Util {
|
|||
let content = Self::get_inner_text(node, false);
|
||||
let content_length = content.len();
|
||||
|
||||
(img > 1
|
||||
let have_to_remove = (img > 1
|
||||
&& (p as f64 / img as f64) < 0.5
|
||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||
|| (!is_list && li > p as i64)
|
||||
|
@ -602,7 +602,25 @@ impl Util {
|
|||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||
|| (!is_list && weight < 25 && link_density > 0.2)
|
||||
|| (weight >= 25 && link_density > 0.5)
|
||||
|| ((embed_count == 1 && content_length < 75) || embed_count > 1)
|
||||
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
|
||||
|
||||
// Allow simple lists of images to remain in pages
|
||||
if is_list && have_to_remove {
|
||||
for child in node.get_child_elements() {
|
||||
// Don't filter in lists with li's that contain more than one child
|
||||
if child.get_child_elements().len() > 1 {
|
||||
return have_to_remove;
|
||||
}
|
||||
}
|
||||
|
||||
let li_count = Util::get_elements_by_tag_name(node, "li").len();
|
||||
// Only allow the list to remain if every li contains an image
|
||||
if img == li_count {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
have_to_remove
|
||||
} else {
|
||||
false
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue