diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 14cc208..aec3967 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -5,6 +5,11 @@ use crate::full_text_parser::error::FullTextParserError; use crate::util::Util; use crate::{FtrConfigEntry, FullTextParser}; +pub struct CleanedHtml { + pub html: String, + pub thumbnail: Option, +} + /// Re-use crate internals to clean HTML of articles before /// further processing: /// - replace H1 with H2 @@ -29,12 +34,13 @@ use crate::{FtrConfigEntry, FullTextParser}; /// * `html` - HTML content /// * `base_url` - URL used to complete relative URLs /// -pub fn clean_html(html: &str, base_url: &Url) -> Result { +pub fn clean_html(html: &str, base_url: &Url) -> Result { libxml::tree::node::set_node_rc_guard(10); let empty_config = FtrConfigEntry::default(); let document = FullTextParser::parse_html(html, None, &empty_config)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; + let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); if let Some(mut root) = document.get_root_element() { FullTextParser::post_process_page(&mut root)?; @@ -50,7 +56,10 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result = Lazy::new(|| { + RegexBuilder::new(&POSITIVE_LEAD_IMAGE_URL_HINTS.join("|")) + .case_insensitive(true) + .build() + .expect("POSITIVE_LEAD_IMAGE_URL_HINTS regex") +}); + +pub const NEGATIVE_LEAD_IMAGE_URL_HINTS: &[&str] = &[ + "spacer", + "sprite", + "blank", + "throbber", + "gradient", + "tile", + "bg", + "background", + "icon", + "social", + "header", + "hdr", + "advert", + "spinner", + "loader", + "loading", + "default", + "rating", + "share", + "facebook", + "twitter", + "theme", + "promo", + "ads", + "wp-includes", +]; + +pub static NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(&NEGATIVE_LEAD_IMAGE_URL_HINTS.join("|")) + 
.case_insensitive(true) + .build() + .expect("NEGATIVE_LEAD_IMAGE_URL_HINTS regex") +}); + +pub const PHOTO_HINTS: &[&str] = &["figure", "photo", "image", "caption"]; +pub static PHOTO_HINTS_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(&PHOTO_HINTS.join("|")) + .case_insensitive(true) + .build() + .expect("PHOTO_HINTS_REGEX regex") +}); + +pub static GIF_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"\.gif(\?.*)?$"#) + .case_insensitive(true) + .build() + .expect("GIF_REGEX") +}); +pub static JPG_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"\.jpe?g(\?.*)?$"#) + .case_insensitive(true) + .build() + .expect("JPG_REGEX") +}); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index ae3819d..c389a8d 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -23,7 +23,7 @@ use libxml::tree::{Document, Node, NodeType}; use libxml::xpath::Context; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::str::from_utf8; @@ -224,7 +224,7 @@ impl FullTextParser { metadata::extract(&xpath_ctx, config, Some(global_config), article); if article.thumbnail_url.is_none() { - Self::check_for_thumbnail(&xpath_ctx, article); + article.thumbnail_url = Self::check_for_thumbnail(&xpath_ctx); } Self::prep_content( &xpath_ctx, @@ -427,28 +427,89 @@ impl FullTextParser { conf } - fn check_for_thumbnail(context: &Context, article: &mut Article) { + pub fn check_for_thumbnail(context: &Context) -> Option { if let Ok(thumb) = Util::get_attribute( context, "//meta[contains(@name, 'twitter:image')]", "content", ) { - article.thumbnail_url = Some(thumb); - return; + return Some(thumb); } if let Ok(thumb) = Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content") { - article.thumbnail_url = Some(thumb); - return; + return Some(thumb); } if 
let Ok(thumb) = Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { - article.thumbnail_url = Some(thumb); + return Some(thumb); } + + if let Ok(img_nodes) = Util::evaluate_xpath(context, "//img", true) { + let mut scores: HashMap = HashMap::new(); + let len = img_nodes.len(); + for (index, img_node) in img_nodes.into_iter().enumerate() { + let src = if let Some(src) = img_node.get_attribute("src") { + src + } else { + continue; + }; + + let score = Util::score_image_url(&src); + let score = score + Util::score_img_attr(&img_node); + let score = score + Util::score_by_parents(&img_node); + let score = score + Util::score_by_sibling(&img_node); + let score = score + Util::score_by_dimensions(&img_node); + let score = score + Util::score_by_position(len, index); + + scores.insert(src, score); + } + + if let Some((top_src, top_score)) = + scores.into_iter().max_by_key(|(_src, score)| *score) + { + if top_score > 0 { + let top_url = top_src.trim().into(); + if Url::parse(top_src.trim()).is_ok() { + return Some(top_url); + } + } + } + } + + // If nothing else worked, check to see if there are any really + // probable nodes in the doc, like . 
+ // eslint-disable-next-line no-restricted-syntax + if let Ok(link_nodes) = Util::evaluate_xpath(context, constants::LEAD_IMAGE_URL_XPATH, true) + { + if let Some(first_link_node) = link_nodes.first() { + if let Some(src) = first_link_node.get_attribute("src") { + let src = src.trim().to_string(); + if Url::parse(&src).is_ok() { + return Some(src); + } + } + + if let Some(href) = first_link_node.get_attribute("href") { + let href = href.trim().to_string(); + if Url::parse(&href).is_ok() { + return Some(href); + } + } + + if let Some(val) = first_link_node.get_attribute("value") { + let val = val.trim().to_string(); + if Url::parse(&val).is_ok() { + return Some(val); + } + } + } + } + + None } fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> { diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs index 3b417de..a0b8c03 100644 --- a/article_scraper/src/full_text_parser/readability/mod.rs +++ b/article_scraper/src/full_text_parser/readability/mod.rs @@ -102,14 +102,7 @@ impl Readability { continue; } - let match_string = node_ref - .get_class_names() - .iter() - .fold(String::new(), |a, b| format!("{a} {b}")); - let match_string = match node_ref.get_property("id") { - Some(id) => format!("{match_string} {id}"), - None => match_string, - }; + let match_string = Util::get_signature(node_ref); if !Util::is_probably_visible(node_ref) { log::debug!("removing hidden node {match_string}"); diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 4ec0444..bc25795 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -1,5 +1,5 @@ use super::{config::ConfigEntry, FullTextParser}; -use libxml::tree::SaveOptions; +use libxml::{parser::Parser, tree::SaveOptions, xpath::Context}; use reqwest::{Client, Url}; async fn run_test(name: &str, url: &str, title: 
Option<&str>, author: Option<&str>) { @@ -180,3 +180,22 @@ async fn unwrap_noscript_images_2() { assert_eq!(res, expected); } + +#[test] +fn extract_thumbnail() { + let html = r#" +Im staubigen +Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu +herausgebracht. (Fortschritt, Wissenschaft) + "#; + let doc = Parser::default_html().parse_string(html).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); + assert_eq!( + thumb, + "https://www.golem.de/2306/175204-387164-387163_rc.jpg" + ) +} diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index b6705a0..8ac7711 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -11,7 +11,7 @@ use reqwest::{ use tokio::fs::DirEntry; use crate::{ - constants, + constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX}, full_text_parser::{config::ConfigEntry, error::FullTextParserError}, image_object::ImageObject, video_object::VideoObject, @@ -275,6 +275,17 @@ impl Util { Ok(()) } + pub fn get_signature(node: &Node) -> String { + let match_string = node + .get_class_names() + .iter() + .fold(String::new(), |a, b| format!("{a} {b}")); + match node.get_property("id") { + Some(id) => format!("{match_string} {id}"), + None => match_string, + } + } + pub fn is_probably_visible(node: &Node) -> bool { let is_hidden = node.has_attribute("hidden"); let aria_hidden = node @@ -1033,6 +1044,135 @@ impl Util { } } } + + pub fn score_image_url(url: &str) -> i32 { + let url = url.trim(); + let mut score = 0; + + if constants::POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) { + score += 20; + } + + if NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) { + score -= 20; + } + + // TODO: We might want to consider removing this as + // gifs are much more common/popular than they once were + if constants::GIF_REGEX.is_match(url) { + score -= 10; + } + + if 
constants::JPG_REGEX.is_match(url) { + score += 10; + } + + // PNGs are neutral. + + score + } + + // Alt attribute usually means non-presentational image. + pub fn score_img_attr(img: &Node) -> i32 { + if img.get_attribute("alt").is_some() { + 5 + } else { + 0 + } + } + + // Look through our parent and grandparent for figure-like + // container elements, give a bonus if we find them + pub fn score_by_parents(img: &Node) -> i32 { + let mut score = 0; + let parent = img.get_parent(); + let grand_parent = parent.as_ref().and_then(|n| n.get_parent()); + if Self::has_tag_name(parent.as_ref(), "figure") + || Self::has_tag_name(grand_parent.as_ref(), "figure") + { + score += 25; + } + + if let Some(parent) = parent.as_ref() { + let signature = Util::get_signature(parent); + if constants::PHOTO_HINTS_REGEX.is_match(&signature) { + score += 15; + } + } + + if let Some(grand_parent) = grand_parent.as_ref() { + let signature = Util::get_signature(grand_parent); + if constants::PHOTO_HINTS_REGEX.is_match(&signature) { + score += 15; + } + } + + score + } + + // Look at our immediate sibling and see if it looks like it's a + // caption. Bonus if so. 
+    /// Scores an `<img>` by the element that immediately follows it.
+    ///
+    /// Returns 25 when the next element sibling is named `figcaption`
+    /// (compared case-insensitively), plus 15 when that sibling's signature
+    /// (as built by `Util::get_signature`) matches `PHOTO_HINTS_REGEX`.
+    /// Returns 0 when there is no next element sibling.
+    pub fn score_by_sibling(img: &Node) -> i32 {
+        let mut score = 0;
+
+        if let Some(sibling) = img.get_next_element_sibling() {
+            // ASCII case-insensitive comparison; avoids allocating a
+            // lowercased copy of the tag name.
+            if sibling.get_name().eq_ignore_ascii_case("figcaption") {
+                score += 25;
+            }
+
+            let signature = Util::get_signature(&sibling);
+            if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
+                score += 15;
+            }
+        }
+
+        score
+    }
+
+    /// Scores an `<img>` by its declared `width` and `height` attributes.
+    ///
+    /// Attributes that are missing or do not parse as `f32` are ignored.
+    /// A width of at most 50 costs 50 points, and so does a height of at
+    /// most 50. When both dimensions are known and `src` does not contain
+    /// "sprite", an area below 5000 costs a further 100 points; otherwise
+    /// the image gains `round(area / 1000)` points.
+    pub fn score_by_dimensions(img: &Node) -> i32 {
+        let dimension = |name: &str| {
+            img.get_attribute(name)
+                .and_then(|raw| raw.parse::<f32>().ok())
+        };
+        let width = dimension("width");
+        let height = dimension("height");
+        let src = img.get_attribute("src").unwrap_or_default();
+
+        let mut score = 0;
+
+        // Penalty for skinny images
+        if let Some(width) = width {
+            if width <= 50.0 {
+                score -= 50;
+            }
+        }
+
+        // Penalty for short images
+        if let Some(height) = height {
+            if height <= 50.0 {
+                score -= 50;
+            }
+        }
+
+        if let (Some(width), Some(height)) = (width, height) {
+            if !src.contains("sprite") {
+                let area = width * height;
+                if area < 5000.0 {
+                    // Smaller than 50 x 100
+                    score -= 100;
+                } else {
+                    // NOTE(review): the bonus is unbounded. An absurd declared
+                    // size (e.g. width="1e20") makes this cast produce a value
+                    // near `i32::MAX`, which callers that sum several scores
+                    // could overflow - consider capping; confirm with callers.
+                    score += f32::round(area / 1000.0) as i32;
+                }
+            }
+        }
+
+        score
+    }
+
+    /// Scores an image by its position among all `len` images.
+    ///
+    /// Returns `len / 2 - index` as a signed value: positive for images
+    /// before the midpoint of the list, zero at the midpoint and negative
+    /// after it. The magnitude saturates at `i32::MAX`.
+    pub fn score_by_position(len: usize, index: usize) -> i32 {
+        // Subtracting directly in `usize` underflows as soon as
+        // `index > len / 2`, which panics when overflow checks are enabled.
+        // Work out the magnitude and the sign separately instead.
+        let half = len / 2;
+        let (distance, is_past_half) = if index > half {
+            (index - half, true)
+        } else {
+            (half - index, false)
+        };
+
+        // Saturate instead of silently truncating oversized distances.
+        let distance = distance.min(i32::MAX as usize) as i32;
+        if is_past_half {
+            -distance
+        } else {
+            distance
+        }
+    }
 }
 
 #[cfg(test)]