1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 08:05:31 +02:00

Apply a negative score to thumbnail candidates whose alt text is an emoji

This commit is contained in:
Jan Lukas Gernert 2024-06-10 20:40:19 +02:00
parent 06018d98d4
commit e01c8e9d34
3 changed files with 49 additions and 14 deletions

View file

@ -533,6 +533,7 @@ impl FullTextParser {
let score = score + Util::score_by_sibling(&img_node); let score = score + Util::score_by_sibling(&img_node);
let score = score + Util::score_by_dimensions(&img_node); let score = score + Util::score_by_dimensions(&img_node);
let score = score + Util::score_by_position(len, index); let score = score + Util::score_by_position(len, index);
let score = score + Util::score_by_alt(&img_node);
scores.insert(src, score); scores.insert(src, score);
} }

View file

@ -278,3 +278,23 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
"https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg"
) )
} }
/// Regression test: in this article the only `<img>` elements are WordPress
/// smiley images whose `alt` text is a single emoji, so the thumbnail lookup
/// is expected to come back empty.
#[test]
fn extract_thumbnail_no_emoji() {
    // NOTE: the raw string below is reproduced verbatim; its closing delimiter
    // stays at column 0 so no trailing whitespace sneaks into the markup.
    let markup = r#"
<p>I recently went on Brodie Robertson&#8217;s Tech Over Tea channel for a second time. I guess I didn&#8217;t succeed at pissing him off enough on the first go-around, because he invited me back! Let&#8217;s see if I did a better job of it this time by telling him he was using Arch wrong. <img src="https://s0.wp.com/wp-content/mu-plugins/wpcom-smileys/twemoji/2/72x72/1f600.png" alt="😀" class="wp-smiley" style="height: 1em; max-height: 1em;" /></p>
<p>Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE&#8217;s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.</p>
<p>I hope you enjoy it!</p>
<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper">
<iframe class="youtube-player" width="1100" height="619" src="https://www.youtube.com/embed/qJZ2V5FmgO8?version=3&#038;rel=1&#038;showsearch=0&#038;showinfo=1&#038;iv_load_policy=1&#038;fs=1&#038;hl=en&#038;autohide=2&#038;wmode=transparent" allowfullscreen="true" style="border:0;" sandbox="allow-scripts allow-same-origin allow-popups allow-presentation allow-popups-to-escape-sandbox"></iframe>
</div></figure>
<p>And here&#8217;s the link I mention at the end: <a href="https://kde.org/community/donations">https://kde.org/community/donations</a> <img src="https://s0.wp.com/wp-content/mu-plugins/wpcom-smileys/twemoji/2/72x72/1f642.png" alt="🙂" class="wp-smiley" style="height: 1em; max-height: 1em;" /> </p>
"#;

    // Parse the fragment and build the context the thumbnail lookup operates on.
    let html_parser = Parser::default_html();
    let document = FullTextParser::parse_html_string_patched(markup, &html_parser).unwrap();
    let xpath_ctx = Context::new(&document).unwrap();

    // Emoji-only images must not be selected as the article thumbnail.
    let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
    assert_eq!(thumbnail, None);
}

View file

@ -687,26 +687,28 @@ impl Util {
for img_node in img_nodes { for img_node in img_nodes {
if let Some(img_alt) = img_node.get_attribute("alt") { if let Some(img_alt) = img_node.get_attribute("alt") {
let mut alt_chars = img_alt.chars(); if Self::is_emoji(&img_alt) {
let first_char = alt_chars.next(); if let Some(mut parent) = img_node.get_parent() {
let second_char = alt_chars.next(); let emoji_text_node = Node::new_text(&img_alt, document).unwrap();
_ = parent.replace_child_node(emoji_text_node, img_node);
if let (Some(char), None) = (first_char, second_char) {
if unic_emoji_char::is_emoji(char) {
if let Some(mut parent) = img_node.get_parent() {
// if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) {
// _ = parent.replace_child_node(emoji_text_node, img_node);
// }
let emoji_text_node =
Node::new_text(&char.to_string(), document).unwrap();
_ = parent.replace_child_node(emoji_text_node, img_node);
}
} }
} }
} }
} }
} }
/// Returns `true` when `text` consists of exactly one character and that
/// character is a non-ASCII emoji.
///
/// The Unicode `Emoji` property checked by `unic_emoji_char::is_emoji` is also
/// set for the ASCII keycap bases (`0`-`9`, `#`, `*`). Those are ordinary text
/// when they appear on their own, so ASCII input is rejected up front;
/// otherwise an image with an alt text such as `"1"` would be classified as
/// an emoji.
///
/// Empty strings and strings with more than one `char` yield `false`. This
/// includes emoji that are encoded as multi-codepoint sequences.
pub fn is_emoji(text: &str) -> bool {
    let mut chars = text.chars();

    // Pull the first two characters: the second one must be absent for the
    // text to qualify as a single-character emoji.
    match (chars.next(), chars.next()) {
        (Some(ch), None) => !ch.is_ascii() && unic_emoji_char::is_emoji(ch),
        _ => false,
    }
}
// Clean an element of all tags of type "tag" if they look fishy. // Clean an element of all tags of type "tag" if they look fishy.
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
pub fn clean_conditionally(root: &mut Node, tag: &str) { pub fn clean_conditionally(root: &mut Node, tag: &str) {
@ -1248,6 +1250,18 @@ impl Util {
((len as f32 / 2.0) - index as f32) as i32 ((len as f32 / 2.0) - index as f32) as i32
} }
/// Scores an image node by its `alt` attribute.
///
/// Returns `-100` when the node has an `alt` attribute for which
/// `Self::is_emoji` reports `true`; returns `0` when the attribute is missing
/// or is not classified as an emoji.
pub fn score_by_alt(node: &Node) -> i32 {
    match node.get_attribute("alt") {
        // Strong penalty so emoji images lose against every regular candidate.
        Some(alt_text) if Self::is_emoji(&alt_text) => -100,
        _ => 0,
    }
}
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> { pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
let status_code = response.status(); let status_code = response.status();