add mercury leading image heuristics

2025-07-07 16:15:32 +02:00 · 2023-06-26 22:25:57 +02:00 · 2023-06-26 22:25:57 +02:00 · e32015c1d0
commit e32015c1d0
parent e99a4b4f23
6 changed files with 315 additions and 21 deletions
--- a/article_scraper/src/clean.rs
+++ b/article_scraper/src/clean.rs
@ -5,6 +5,11 @@ use crate::full_text_parser::error::FullTextParserError;
 use crate::util::Util;
 use crate::{FtrConfigEntry, FullTextParser};

+/// Result of [`clean_html`]: the cleaned article markup together with the
+/// thumbnail URL detected in the document, if any.
+///
+/// The thumbnail lookup runs on the parsed document before the content is
+/// prepared and post-processed.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CleanedHtml {
+    /// Serialized HTML of the cleaned article node.
+    pub html: String,
+    /// Thumbnail URL returned by `FullTextParser::check_for_thumbnail`,
+    /// or `None` when no candidate was found.
+    pub thumbnail: Option<String>,
+}
+
 /// Re-use crate internals to clean HTML of articles before
 /// further processing:
 /// - replace H1 with H2
@ -29,12 +34,13 @@ use crate::{FtrConfigEntry, FullTextParser};
 /// * `html` - HTML content
 /// * `base_url` - URL used to complete relative URLs
 ///
-pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
+pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextParserError> {
    libxml::tree::node::set_node_rc_guard(10);

    let empty_config = FtrConfigEntry::default();
    let document = FullTextParser::parse_html(html, None, &empty_config)?;
    let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
+    let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
    FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
    if let Some(mut root) = document.get_root_element() {
        FullTextParser::post_process_page(&mut root)?;
@ -50,7 +56,10 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
        article_node.add_child(&mut node).unwrap();
    }

-    Ok(document.node_to_string(&article_node))
+    Ok(CleanedHtml {
+        html: document.node_to_string(&article_node),
+        thumbnail,
+    })
 }

 #[cfg(test)]
@ -64,6 +73,10 @@ mod tests {
        let url = Url::parse("https://finshots.in").unwrap();
        let res = clean_html(html, &url).unwrap();

-        assert_eq!(res.len(), 11965);
+        assert_eq!(res.html.len(), 11965);
+        assert_eq!(
+            res.thumbnail.as_deref(),
+            Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg")
+        )
    }
 }
--- a/article_scraper/src/constants.rs
+++ b/article_scraper/src/constants.rs
@ -156,3 +156,71 @@ pub const PHRASING_ELEMS: &[&str] = &[
    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
 ];
+
+/// XPath selecting `<link rel="image_src">` elements; used as the
+/// last-resort lookup for a lead image URL.
+pub const LEAD_IMAGE_URL_XPATH: &str = "//link[@rel='image_src']";
+
+/// URL fragments that raise the lead-image score of an image source.
+pub const POSITIVE_LEAD_IMAGE_URL_HINTS: &[&str] =
+    &["upload", "wp-content", "large", "photo", "wp-image"];
+
+/// Case-insensitive alternation of [`POSITIVE_LEAD_IMAGE_URL_HINTS`].
+///
+/// The hints are joined with `|` without escaping, so each hint matches
+/// anywhere inside the URL.
+pub static POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let pattern = POSITIVE_LEAD_IMAGE_URL_HINTS.join("|");
+    let mut builder = RegexBuilder::new(&pattern);
+    builder.case_insensitive(true);
+    builder
+        .build()
+        .expect("POSITIVE_LEAD_IMAGE_URL_HINTS regex")
+});
+
+/// URL fragments that lower the lead-image score of an image source
+/// (spacers, sprites, icons, social buttons, ads, ...).
+pub const NEGATIVE_LEAD_IMAGE_URL_HINTS: &[&str] = &[
+    "spacer",
+    "sprite",
+    "blank",
+    "throbber",
+    "gradient",
+    "tile",
+    "bg",
+    "background",
+    "icon",
+    "social",
+    "header",
+    "hdr",
+    "advert",
+    "spinner",
+    "loader",
+    "loading",
+    "default",
+    "rating",
+    "share",
+    "facebook",
+    "twitter",
+    "theme",
+    "promo",
+    "ads",
+    "wp-includes",
+];
+
+/// Case-insensitive alternation of [`NEGATIVE_LEAD_IMAGE_URL_HINTS`].
+///
+/// NOTE(review): the hints are joined with `|` without escaping or word
+/// boundaries, so short hints also match inside longer words (for example
+/// `ads` matches inside `uploads`).
+pub static NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let pattern = NEGATIVE_LEAD_IMAGE_URL_HINTS.join("|");
+    let mut builder = RegexBuilder::new(&pattern);
+    builder.case_insensitive(true);
+    builder
+        .build()
+        .expect("NEGATIVE_LEAD_IMAGE_URL_HINTS regex")
+});
+
+/// Fragments that, when found in the class names or id of a node next to an
+/// image, suggest the image is a real photo.
+pub const PHOTO_HINTS: &[&str] = &["figure", "photo", "image", "caption"];
+/// Case-insensitive alternation of [`PHOTO_HINTS`].
+pub static PHOTO_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let pattern = PHOTO_HINTS.join("|");
+    let mut builder = RegexBuilder::new(&pattern);
+    builder.case_insensitive(true);
+    builder.build().expect("PHOTO_HINTS_REGEX regex")
+});
+
+/// Matches text ending in `.gif`, optionally followed by a query string
+/// (case-insensitive).
+pub static GIF_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let mut builder = RegexBuilder::new(r#"\.gif(\?.*)?$"#);
+    builder.case_insensitive(true);
+    builder.build().expect("GIF_REGEX")
+});
+/// Matches text ending in `.jpg` or `.jpeg`, optionally followed by a query
+/// string (case-insensitive).
+pub static JPG_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let mut builder = RegexBuilder::new(r#"\.jpe?g(\?.*)?$"#);
+    builder.case_insensitive(true);
+    builder.build().expect("JPG_REGEX")
+});
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@ -23,7 +23,7 @@ use libxml::tree::{Document, Node, NodeType};
 use libxml::xpath::Context;
 use reqwest::header::HeaderMap;
 use reqwest::{Client, Response, Url};
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::path::Path;
 use std::str::from_utf8;

@ -224,7 +224,7 @@ impl FullTextParser {
        metadata::extract(&xpath_ctx, config, Some(global_config), article);

        if article.thumbnail_url.is_none() {
-            Self::check_for_thumbnail(&xpath_ctx, article);
+            article.thumbnail_url = Self::check_for_thumbnail(&xpath_ctx);
        }
        Self::prep_content(
            &xpath_ctx,
@ -427,28 +427,89 @@ impl FullTextParser {
        conf
    }

-    fn check_for_thumbnail(context: &Context, article: &mut Article) {
+    /// Tries to find a thumbnail (lead image) URL in the document behind `context`.
+    ///
+    /// Candidates are tried in this order and the first hit is returned:
+    /// 1. `content` of a `<meta>` whose `name` contains `twitter:image`
+    /// 2. `content` of a `<meta>` whose `name` contains `og:image`
+    /// 3. `href` of a `<link>` whose `rel` contains `image_src`
+    /// 4. the best-scoring `<img>` according to the `Util::score_*` heuristics,
+    ///    if its score is positive and its trimmed `src` is accepted by `Url::parse`
+    /// 5. the `src`, `href` or `value` attribute (in that order) of the first node
+    ///    matched by `constants::LEAD_IMAGE_URL_XPATH`, if accepted by `Url::parse`
+    ///
+    /// Returns `None` when none of the steps yields a URL.
+    pub fn check_for_thumbnail(context: &Context) -> Option<String> {
        if let Ok(thumb) = Util::get_attribute(
            context,
            "//meta[contains(@name, 'twitter:image')]",
            "content",
        ) {
-            article.thumbnail_url = Some(thumb);
-            return;
+            return Some(thumb);
        }

        if let Ok(thumb) =
            Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
        {
-            article.thumbnail_url = Some(thumb);
-            return;
+            return Some(thumb);
        }

        if let Ok(thumb) =
            Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
        {
-            article.thumbnail_url = Some(thumb);
+            return Some(thumb);
        }
+
+        if let Ok(img_nodes) = Util::evaluate_xpath(context, "//img", true) {
+            // Keyed by `src`: a repeated image keeps the score of its last
+            // occurrence. The document position is stored next to the score so
+            // that ties can be broken deterministically.
+            let mut scores: HashMap<String, (i32, usize)> = HashMap::new();
+            let len = img_nodes.len();
+            for (index, img_node) in img_nodes.into_iter().enumerate() {
+                let src = match img_node.get_attribute("src") {
+                    Some(src) => src,
+                    None => continue,
+                };
+
+                let score = Util::score_image_url(&src)
+                    + Util::score_img_attr(&img_node)
+                    + Util::score_by_parents(&img_node)
+                    + Util::score_by_sibling(&img_node)
+                    + Util::score_by_dimensions(&img_node)
+                    + Util::score_by_position(len, index);
+
+                scores.insert(src, (score, index));
+            }
+
+            // `HashMap` iteration order is randomized, so equally scored images
+            // are resolved explicitly in favour of the one that appears first.
+            let best = scores
+                .into_iter()
+                .max_by_key(|(_src, (score, index))| (*score, std::cmp::Reverse(*index)));
+
+            if let Some((top_src, (top_score, _index))) = best {
+                if top_score > 0 {
+                    let top_url = top_src.trim();
+                    if Url::parse(top_url).is_ok() {
+                        return Some(top_url.to_string());
+                    }
+                }
+            }
+        }
+
+        // If nothing else worked, check to see if there are any really
+        // probable nodes in the doc, like <link rel="image_src" />.
+        if let Ok(link_nodes) = Util::evaluate_xpath(context, constants::LEAD_IMAGE_URL_XPATH, true)
+        {
+            if let Some(first_link_node) = link_nodes.first() {
+                // The first attribute holding something `Url::parse` accepts wins.
+                for attribute in ["src", "href", "value"] {
+                    if let Some(value) = first_link_node.get_attribute(attribute) {
+                        let value = value.trim();
+                        if Url::parse(value).is_ok() {
+                            return Some(value.to_string());
+                        }
+                    }
+                }
+            }
+        }
+
+        None
    }

    fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> {
--- a/article_scraper/src/full_text_parser/readability/mod.rs
+++ b/article_scraper/src/full_text_parser/readability/mod.rs
@ -102,14 +102,7 @@ impl Readability {
                    continue;
                }

-                let match_string = node_ref
-                    .get_class_names()
-                    .iter()
-                    .fold(String::new(), |a, b| format!("{a} {b}"));
-                let match_string = match node_ref.get_property("id") {
-                    Some(id) => format!("{match_string} {id}"),
-                    None => match_string,
-                };
+                let match_string = Util::get_signature(node_ref);

                if !Util::is_probably_visible(node_ref) {
                    log::debug!("removing hidden node {match_string}");
--- a/article_scraper/src/full_text_parser/tests.rs
+++ b/article_scraper/src/full_text_parser/tests.rs
@ -1,5 +1,5 @@
 use super::{config::ConfigEntry, FullTextParser};
-use libxml::tree::SaveOptions;
+use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
 use reqwest::{Client, Url};

 async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
@ -180,3 +180,22 @@ async fn unwrap_noscript_images_2() {

    assert_eq!(res, expected);
 }
+
+/// The markup carries no thumbnail metadata, only a single `<img>`, so the
+/// result has to come from the image scoring heuristics.
+#[test]
+fn extract_thumbnail() {
+    let markup = r#"
+<img src="https://www.golem.de/2306/175204-387164-387163_rc.jpg" width="140" height="140" loading="lazy" />Im staubigen
+Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu
+herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noopener noreferrer" target="_blank"
+    referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
+    rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
+    "#;
+    let document = Parser::default_html().parse_string(markup).unwrap();
+    let xpath_ctx = Context::new(&document).unwrap();
+
+    let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx).unwrap();
+    assert_eq!(
+        thumbnail,
+        "https://www.golem.de/2306/175204-387164-387163_rc.jpg"
+    )
+}
--- a/article_scraper/src/util.rs
+++ b/article_scraper/src/util.rs
@ -11,7 +11,7 @@ use reqwest::{
 use tokio::fs::DirEntry;

 use crate::{
-    constants,
+    constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
    full_text_parser::{config::ConfigEntry, error::FullTextParserError},
    image_object::ImageObject,
    video_object::VideoObject,
@ -275,6 +275,17 @@ impl Util {
        Ok(())
    }

+    /// Builds a "signature" string for `node` from its class names and its `id`.
+    ///
+    /// Every class name is appended with a preceding space, followed by the
+    /// `id` property (also preceded by a space) when the node has one. The
+    /// result is meant for matching against hint regexes and for log output.
+    pub fn get_signature(node: &Node) -> String {
+        let mut signature = String::new();
+
+        for class_name in node.get_class_names().iter() {
+            signature.push(' ');
+            signature.push_str(class_name);
+        }
+
+        if let Some(id) = node.get_property("id") {
+            signature.push(' ');
+            signature.push_str(&id);
+        }
+
+        signature
+    }
+
    pub fn is_probably_visible(node: &Node) -> bool {
        let is_hidden = node.has_attribute("hidden");
        let aria_hidden = node
@ -1033,6 +1044,135 @@ impl Util {
            }
        }
    }
+
+    /// Scores an image URL by what its text suggests about the image.
+    ///
+    /// The URL is trimmed first; every rule that matches contributes:
+    /// * positive lead-image hint: +20
+    /// * negative lead-image hint: -20
+    /// * `.gif` extension: -10
+    /// * `.jpg` / `.jpeg` extension: +10
+    ///
+    /// PNGs are neutral.
+    pub fn score_image_url(url: &str) -> i32 {
+        let candidate = url.trim();
+        let weight_if = |matched: bool, weight: i32| if matched { weight } else { 0 };
+
+        let positive = weight_if(
+            constants::POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(candidate),
+            20,
+        );
+        let negative = weight_if(NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(candidate), -20);
+
+        // TODO: We might want to consider removing this as
+        // gifs are much more common/popular than they once were
+        let gif = weight_if(constants::GIF_REGEX.is_match(candidate), -10);
+        let jpg = weight_if(constants::JPG_REGEX.is_match(candidate), 10);
+
+        positive + negative + gif + jpg
+    }
+
+    /// Gives a small bonus (+5) to images that carry alternative text.
+    ///
+    /// A non-empty `alt` attribute usually means a non-presentational image.
+    /// An empty (or whitespace-only) `alt` is how markup flags an image as
+    /// purely decorative, so it earns no bonus, just like a missing attribute.
+    pub fn score_img_attr(img: &Node) -> i32 {
+        match img.get_attribute("alt") {
+            Some(alt) if !alt.trim().is_empty() => 5,
+            _ => 0,
+        }
+    }
+
+    /// Scores an image by the elements that contain it.
+    ///
+    /// Looks at the parent and the grandparent only:
+    /// * +25 if either of them is a `figure` element
+    /// * +15 for each of them whose signature (class names and id) matches
+    ///   the photo hints
+    pub fn score_by_parents(img: &Node) -> i32 {
+        let parent = img.get_parent();
+        let grand_parent = parent.as_ref().and_then(|node| node.get_parent());
+        let ancestors = [parent.as_ref(), grand_parent.as_ref()];
+
+        let inside_figure = ancestors
+            .iter()
+            .any(|ancestor| Self::has_tag_name(*ancestor, "figure"));
+        let mut score = if inside_figure { 25 } else { 0 };
+
+        // Missing ancestors are skipped by `flatten`.
+        for ancestor in ancestors.iter().flatten() {
+            let signature = Util::get_signature(ancestor);
+            if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
+                score += 15;
+            }
+        }
+
+        score
+    }
+
+    /// Scores an image by the element that directly follows it.
+    ///
+    /// * +25 if the next element sibling is a `figcaption`
+    /// * +15 if that sibling's signature (class names and id) matches the
+    ///   photo hints
+    ///
+    /// Returns 0 when the image has no next element sibling.
+    pub fn score_by_sibling(img: &Node) -> i32 {
+        let next = match img.get_next_element_sibling() {
+            Some(next) => next,
+            None => return 0,
+        };
+
+        let is_caption = next.get_name().to_lowercase() == "figcaption";
+        let looks_like_photo = constants::PHOTO_HINTS_REGEX.is_match(&Util::get_signature(&next));
+
+        let caption_bonus = if is_caption { 25 } else { 0 };
+        let hint_bonus = if looks_like_photo { 15 } else { 0 };
+
+        caption_bonus + hint_bonus
+    }
+
+    /// Scores an image by its `width` and `height` attributes.
+    ///
+    /// * width of at most 50: -50 (skinny image)
+    /// * height of at most 50: -50 (short image)
+    /// * both present and `src` does not contain `sprite`: -100 if the area is
+    ///   below 5000, otherwise +1 per 1000 units of area (rounded, capped at
+    ///   `MAX_AREA_BONUS`)
+    ///
+    /// Attributes that are missing or do not parse as a number are ignored.
+    pub fn score_by_dimensions(img: &Node) -> i32 {
+        // The attributes come from untrusted markup. Without a cap, a value such
+        // as `width="1e30"` saturates the bonus at `i32::MAX` and the caller's
+        // sum of all heuristics overflows. Real images stay far below the cap.
+        const MAX_AREA_BONUS: i32 = 1_000_000;
+
+        let parse_dimension = |name: &str| {
+            img.get_attribute(name)
+                .and_then(|value| value.parse::<f32>().ok())
+        };
+        let width = parse_dimension("width");
+        let height = parse_dimension("height");
+        let src = img.get_attribute("src").unwrap_or_default();
+
+        let mut score = 0;
+
+        // Penalty for skinny images
+        if let Some(width) = width {
+            if width <= 50.0 {
+                score -= 50;
+            }
+        }
+
+        // Penalty for short images
+        if let Some(height) = height {
+            if height <= 50.0 {
+                score -= 50;
+            }
+        }
+
+        if let (Some(width), Some(height)) = (width, height) {
+            if !src.contains("sprite") {
+                let area = width * height;
+                if area < 5000.0 {
+                    // Smaller than 50 x 100
+                    score -= 100;
+                } else {
+                    // The float-to-int cast saturates and maps NaN to 0; the cap
+                    // is applied afterwards so NaN keeps contributing nothing.
+                    let bonus = f32::round(area / 1000.0) as i32;
+                    score += bonus.min(MAX_AREA_BONUS);
+                }
+            }
+        }
+
+        score
+    }
+
+    /// Scores an image by where it sits among all `<img>` elements of the page.
+    ///
+    /// The result is `len / 2 - index`: images in the first half of the
+    /// document get a positive score, images past the midpoint a negative one.
+    ///
+    /// * `len` - total number of images
+    /// * `index` - zero-based position of the image being scored
+    pub fn score_by_position(len: usize, index: usize) -> i32 {
+        // Subtract in a signed type: in `usize` the subtraction underflows for
+        // every image past the midpoint, which panics when overflow checks are on.
+        let offset = (len / 2) as i64 - index as i64;
+        offset.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
+    }
 }

 #[cfg(test)]