mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
add exception to conditional cleaning for list with images
This commit is contained in:
parent
c19525f8cd
commit
c8bc583864
1 changed file with 20 additions and 2 deletions
22
src/util.rs
22
src/util.rs
|
@ -590,7 +590,7 @@ impl Util {
|
||||||
let content = Self::get_inner_text(node, false);
|
let content = Self::get_inner_text(node, false);
|
||||||
let content_length = content.len();
|
let content_length = content.len();
|
||||||
|
|
||||||
(img > 1
|
let have_to_remove = (img > 1
|
||||||
&& (p as f64 / img as f64) < 0.5
|
&& (p as f64 / img as f64) < 0.5
|
||||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||||
|| (!is_list && li > p as i64)
|
|| (!is_list && li > p as i64)
|
||||||
|
@ -602,7 +602,25 @@ impl Util {
|
||||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||||
|| (!is_list && weight < 25 && link_density > 0.2)
|
|| (!is_list && weight < 25 && link_density > 0.2)
|
||||||
|| (weight >= 25 && link_density > 0.5)
|
|| (weight >= 25 && link_density > 0.5)
|
||||||
|| ((embed_count == 1 && content_length < 75) || embed_count > 1)
|
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
|
||||||
|
|
||||||
|
// Allow simple lists of images to remain in pages
|
||||||
|
if is_list && have_to_remove {
|
||||||
|
for child in node.get_child_elements() {
|
||||||
|
// Don't filter in lists with li's that contain more than one child
|
||||||
|
if child.get_child_elements().len() > 1 {
|
||||||
|
return have_to_remove;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let li_count = Util::get_elements_by_tag_name(node, "li").len();
|
||||||
|
// Only allow the list to remain if every li contains an image
|
||||||
|
if img == li_count {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
have_to_remove
|
||||||
} else {
|
} else {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue