mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
negative score for thumbnails with emoji alt
This commit is contained in:
parent
06018d98d4
commit
e01c8e9d34
3 changed files with 49 additions and 14 deletions
|
@ -533,6 +533,7 @@ impl FullTextParser {
|
|||
let score = score + Util::score_by_sibling(&img_node);
|
||||
let score = score + Util::score_by_dimensions(&img_node);
|
||||
let score = score + Util::score_by_position(len, index);
|
||||
let score = score + Util::score_by_alt(&img_node);
|
||||
|
||||
scores.insert(src, score);
|
||||
}
|
||||
|
|
|
@ -278,3 +278,23 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
|
|||
"https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg"
|
||||
)
|
||||
}
|
||||
|
||||
// Regression test: the only `<img>` elements in this snippet are `wp-smiley`
// images whose `alt` is a single emoji, so no thumbnail must be picked.
#[test]
|
||||
fn extract_thumbnail_no_emoji() {
|
||||
let html = r#"
|
||||
<p>I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. <img src="https://s0.wp.com/wp-content/mu-plugins/wpcom-smileys/twemoji/2/72x72/1f600.png" alt="😀" class="wp-smiley" style="height: 1em; max-height: 1em;" /></p>
|
||||
<p>Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.</p>
|
||||
<p>I hope you enjoy it!</p>
|
||||
<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper">
|
||||
<iframe class="youtube-player" width="1100" height="619" src="https://www.youtube.com/embed/qJZ2V5FmgO8?version=3&rel=1&showsearch=0&showinfo=1&iv_load_policy=1&fs=1&hl=en&autohide=2&wmode=transparent" allowfullscreen="true" style="border:0;" sandbox="allow-scripts allow-same-origin allow-popups allow-presentation allow-popups-to-escape-sandbox"></iframe>
|
||||
</div></figure>
|
||||
<p>And here’s the link I mention at the end: <a href="https://kde.org/community/donations">https://kde.org/community/donations</a> <img src="https://s0.wp.com/wp-content/mu-plugins/wpcom-smileys/twemoji/2/72x72/1f642.png" alt="🙂" class="wp-smiley" style="height: 1em; max-height: 1em;" /> </p>
|
||||
"#;
|
||||
|
||||
// Parse the snippet and wrap the resulting document in a `Context`,
// which is what `check_for_thumbnail` takes as input.
let parser = Parser::default_html();
|
||||
let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap();
|
||||
let ctx = Context::new(&doc).unwrap();
|
||||
|
||||
let thumb = FullTextParser::check_for_thumbnail(&ctx);
|
||||
// Both candidate images are emoji smileys, so the expected result is `None`.
assert_eq!(thumb, None)
|
||||
}
|
||||
|
|
|
@ -687,26 +687,28 @@ impl Util {
|
|||
|
||||
for img_node in img_nodes {
|
||||
if let Some(img_alt) = img_node.get_attribute("alt") {
|
||||
let mut alt_chars = img_alt.chars();
|
||||
let first_char = alt_chars.next();
|
||||
let second_char = alt_chars.next();
|
||||
|
||||
if let (Some(char), None) = (first_char, second_char) {
|
||||
if unic_emoji_char::is_emoji(char) {
|
||||
if let Some(mut parent) = img_node.get_parent() {
|
||||
// if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) {
|
||||
// _ = parent.replace_child_node(emoji_text_node, img_node);
|
||||
// }
|
||||
let emoji_text_node =
|
||||
Node::new_text(&char.to_string(), document).unwrap();
|
||||
_ = parent.replace_child_node(emoji_text_node, img_node);
|
||||
}
|
||||
if Self::is_emoji(&img_alt) {
|
||||
if let Some(mut parent) = img_node.get_parent() {
|
||||
let emoji_text_node = Node::new_text(&img_alt, document).unwrap();
|
||||
_ = parent.replace_child_node(emoji_text_node, img_node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_emoji(text: &str) -> bool {
|
||||
let mut alt_chars = text.chars();
|
||||
let first_char = alt_chars.next();
|
||||
let second_char = alt_chars.next();
|
||||
|
||||
if let (Some(char), None) = (first_char, second_char) {
|
||||
unic_emoji_char::is_emoji(char)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
// Clean an element of all tags of type "tag" if they look fishy.
|
||||
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||
pub fn clean_conditionally(root: &mut Node, tag: &str) {
|
||||
|
@ -1248,6 +1250,18 @@ impl Util {
|
|||
((len as f32 / 2.0) - index as f32) as i32
|
||||
}
|
||||
|
||||
pub fn score_by_alt(node: &Node) -> i32 {
|
||||
if let Some(alt) = node.get_attribute("alt") {
|
||||
if Self::is_emoji(&alt) {
|
||||
-100
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_content_length(response: &Response) -> Result<usize, ImageDownloadError> {
|
||||
let status_code = response.status();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue