diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 0e63850..2857f73 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -533,6 +533,7 @@ impl FullTextParser {
let score = score + Util::score_by_sibling(&img_node);
let score = score + Util::score_by_dimensions(&img_node);
let score = score + Util::score_by_position(len, index);
+ let score = score + Util::score_by_alt(&img_node);
scores.insert(src, score);
}
diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs
index 0f0370f..36ae0d0 100644
--- a/article_scraper/src/full_text_parser/tests.rs
+++ b/article_scraper/src/full_text_parser/tests.rs
@@ -278,3 +278,23 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo
"https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg"
)
}
+
+#[test]
+fn extract_thumbnail_no_emoji() {
+ let html = r#"
+
I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong.
+ Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.
+ I hope you enjoy it!
+
+ VIDEO
+
+ And here’s the link I mention at the end: https://kde.org/community/donations
+ "#;
+ // Parse the fixture above and wrap the document in a `Context` for the lookup below.
+ let parser = Parser::default_html();
+ let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap();
+ let ctx = Context::new(&doc).unwrap();
+ // For this fixture `check_for_thumbnail` is expected to report no thumbnail at all.
+ let thumb = FullTextParser::check_for_thumbnail(&ctx);
+ assert_eq!(thumb, None)
+}
diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs
index 429378c..cbd5370 100644
--- a/article_scraper/src/util.rs
+++ b/article_scraper/src/util.rs
@@ -687,26 +687,28 @@ impl Util {
for img_node in img_nodes {
if let Some(img_alt) = img_node.get_attribute("alt") {
- let mut alt_chars = img_alt.chars();
- let first_char = alt_chars.next();
- let second_char = alt_chars.next();
-
- if let (Some(char), None) = (first_char, second_char) {
- if unic_emoji_char::is_emoji(char) {
- if let Some(mut parent) = img_node.get_parent() {
- // if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) {
- // _ = parent.replace_child_node(emoji_text_node, img_node);
- // }
- let emoji_text_node =
- Node::new_text(&char.to_string(), document).unwrap();
- _ = parent.replace_child_node(emoji_text_node, img_node);
- }
+ if Self::is_emoji(&img_alt) {
+ if let Some(mut parent) = img_node.get_parent() {
+ let emoji_text_node = Node::new_text(&img_alt, document).unwrap();
+ _ = parent.replace_child_node(emoji_text_node, img_node);
}
}
}
}
}
+ /// Returns `true` only if `text` is exactly one non-ASCII `char` that has the
+ /// Unicode `Emoji` property. ASCII is rejected up front because `#`, `*` and the
+ /// digits carry that property too, yet on their own they are ordinary text.
+ pub fn is_emoji(text: &str) -> bool {
+     let mut chars = text.chars();
+     match (chars.next(), chars.next()) {
+         // Exactly one char: skip ASCII keycap bases, then consult the emoji table.
+         (Some(only), None) => !only.is_ascii() && unic_emoji_char::is_emoji(only),
+         _ => false, // empty or multi-char text never counts
+     }
+ }
+
// Clean an element of all tags of type "tag" if they look fishy.
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
pub fn clean_conditionally(root: &mut Node, tag: &str) {
@@ -1248,6 +1250,18 @@ impl Util {
((len as f32 / 2.0) - index as f32) as i32
}
+ /// Score contribution derived from the node's `alt` attribute:
+ /// `-100` when the alt text passes `Self::is_emoji`, `0` otherwise.
+ pub fn score_by_alt(node: &Node) -> i32 {
+     // A missing `alt` attribute falls through to the neutral score,
+     // exactly like an `alt` that `Self::is_emoji` rejects.
+     match node.get_attribute("alt") {
+         // Large penalty for an emoji-only alt text.
+         Some(alt) if Self::is_emoji(&alt) => -100,
+         _ => 0,
+     }
+ }
+
pub fn get_content_length(response: &Response) -> Result {
let status_code = response.status();