1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

add mercury leading image heuristics

This commit is contained in:
Jan Lukas Gernert 2023-06-26 22:25:57 +02:00
parent e99a4b4f23
commit e32015c1d0
6 changed files with 315 additions and 21 deletions

View file

@ -5,6 +5,11 @@ use crate::full_text_parser::error::FullTextParserError;
use crate::util::Util;
use crate::{FtrConfigEntry, FullTextParser};
/// Result of [`clean_html`]: the cleaned article markup plus the thumbnail
/// that was detected in the document before it was cleaned.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CleanedHtml {
    /// Serialized HTML of the cleaned article.
    pub html: String,
    /// URL of the thumbnail found in the document, if any.
    pub thumbnail: Option<String>,
}
/// Re-use crate internals to clean HTML of articles before
/// further processing:
/// - replace H1 with H2
@ -29,12 +34,13 @@ use crate::{FtrConfigEntry, FullTextParser};
/// * `html` - HTML content
/// * `base_url` - URL used to complete relative URLs
///
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextParserError> {
libxml::tree::node::set_node_rc_guard(10);
let empty_config = FtrConfigEntry::default();
let document = FullTextParser::parse_html(html, None, &empty_config)?;
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
if let Some(mut root) = document.get_root_element() {
FullTextParser::post_process_page(&mut root)?;
@ -50,7 +56,10 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
article_node.add_child(&mut node).unwrap();
}
Ok(document.node_to_string(&article_node))
Ok(CleanedHtml {
html: document.node_to_string(&article_node),
thumbnail,
})
}
#[cfg(test)]
@ -64,6 +73,10 @@ mod tests {
let url = Url::parse("https://finshots.in").unwrap();
let res = clean_html(html, &url).unwrap();
assert_eq!(res.len(), 11965);
assert_eq!(res.html.len(), 11965);
assert_eq!(
res.thumbnail.as_deref(),
Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg")
)
}
}

View file

@ -156,3 +156,71 @@ pub const PHRASING_ELEMS: &[&str] = &[
"OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
"SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
];
/// XPath selecting `<link rel="image_src">` elements.
pub const LEAD_IMAGE_URL_XPATH: &str = "//link[@rel='image_src']";

/// Fragments that speak in favour of an image URL being a lead image.
pub const POSITIVE_LEAD_IMAGE_URL_HINTS: &[&str] =
    &["upload", "wp-content", "large", "photo", "wp-image"];

/// Fragments that speak against an image URL being a lead image.
pub const NEGATIVE_LEAD_IMAGE_URL_HINTS: &[&str] = &[
    "spacer",
    "sprite",
    "blank",
    "throbber",
    "gradient",
    "tile",
    "bg",
    "background",
    "icon",
    "social",
    "header",
    "hdr",
    "advert",
    "spinner",
    "loader",
    "loading",
    "default",
    "rating",
    "share",
    "facebook",
    "twitter",
    "theme",
    "promo",
    "ads",
    "wp-includes",
];

/// Fragments hinting at a photo container.
pub const PHOTO_HINTS: &[&str] = &["figure", "photo", "image", "caption"];

/// Compiles `pattern` as a case-insensitive regex.
///
/// Panics with `what` as message if the pattern is invalid; all patterns
/// passed in here are fixed at compile time.
fn build_case_insensitive_regex(pattern: &str, what: &str) -> Regex {
    RegexBuilder::new(pattern)
        .case_insensitive(true)
        .build()
        .expect(what)
}

/// Case-insensitive alternation of all [`POSITIVE_LEAD_IMAGE_URL_HINTS`].
pub static POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
    build_case_insensitive_regex(
        &POSITIVE_LEAD_IMAGE_URL_HINTS.join("|"),
        "POSITIVE_LEAD_IMAGE_URL_HINTS regex",
    )
});

/// Case-insensitive alternation of all [`NEGATIVE_LEAD_IMAGE_URL_HINTS`].
pub static NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
    build_case_insensitive_regex(
        &NEGATIVE_LEAD_IMAGE_URL_HINTS.join("|"),
        "NEGATIVE_LEAD_IMAGE_URL_HINTS regex",
    )
});

/// Case-insensitive alternation of all [`PHOTO_HINTS`].
pub static PHOTO_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
    build_case_insensitive_regex(&PHOTO_HINTS.join("|"), "PHOTO_HINTS_REGEX regex")
});

/// Matches strings ending in `.gif`, optionally followed by a query string.
pub static GIF_REGEX: Lazy<Regex> =
    Lazy::new(|| build_case_insensitive_regex(r#"\.gif(\?.*)?$"#, "GIF_REGEX"));

/// Matches strings ending in `.jpg` / `.jpeg`, optionally followed by a query string.
pub static JPG_REGEX: Lazy<Regex> =
    Lazy::new(|| build_case_insensitive_regex(r#"\.jpe?g(\?.*)?$"#, "JPG_REGEX"));

View file

@ -23,7 +23,7 @@ use libxml::tree::{Document, Node, NodeType};
use libxml::xpath::Context;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::str::from_utf8;
@ -224,7 +224,7 @@ impl FullTextParser {
metadata::extract(&xpath_ctx, config, Some(global_config), article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
article.thumbnail_url = Self::check_for_thumbnail(&xpath_ctx);
}
Self::prep_content(
&xpath_ctx,
@ -427,28 +427,89 @@ impl FullTextParser {
conf
}
/// Searches the document behind `context` for a thumbnail URL.
///
/// Sources are tried in this order and the first usable one wins:
/// 1. the `content` attribute of a `<meta>` whose `name` contains `twitter:image`
/// 2. the `content` attribute of a `<meta>` whose `name` contains `og:image`
/// 3. the `href` attribute of a `<link>` whose `rel` contains `image_src`
/// 4. the `src` of the best-scoring `<img>`, provided its score is positive
///    and the trimmed value is accepted by `Url::parse`
/// 5. the `src`, `href` or `value` attribute of the first node matching
///    `constants::LEAD_IMAGE_URL_XPATH`, whichever first passes `Url::parse`
///
/// `None` is the result when none of these sources produces a thumbnail.
fn check_for_thumbnail(context: &Context, article: &mut Article) {
pub fn check_for_thumbnail(context: &Context) -> Option<String> {
// 1. Twitter card image.
if let Ok(thumb) = Util::get_attribute(
context,
"//meta[contains(@name, 'twitter:image')]",
"content",
) {
article.thumbnail_url = Some(thumb);
return;
return Some(thumb);
}
// 2. Open Graph image (looked up via the `name` attribute).
if let Ok(thumb) =
Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
{
article.thumbnail_url = Some(thumb);
return;
return Some(thumb);
}
// 3. `<link rel="image_src" href="...">`.
if let Ok(thumb) =
Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
{
article.thumbnail_url = Some(thumb);
return Some(thumb);
}
// 4. Score every `<img>` of the document and pick the best one.
if let Ok(img_nodes) = Util::evaluate_xpath(context, "//img", true) {
// NOTE(review): scores are keyed by `src`, so a repeated `src` keeps only
// the score of its last occurrence. `HashMap` iteration order is
// unspecified, so the pick between equally scored images is arbitrary.
let mut scores: HashMap<String, i32> = HashMap::new();
let len = img_nodes.len();
for (index, img_node) in img_nodes.into_iter().enumerate() {
// Images without a `src` attribute are not candidates.
let src = if let Some(src) = img_node.get_attribute("src") {
src
} else {
continue;
};
// The final score is the sum of all individual heuristics.
let score = Util::score_image_url(&src);
let score = score + Util::score_img_attr(&img_node);
let score = score + Util::score_by_parents(&img_node);
let score = score + Util::score_by_sibling(&img_node);
let score = score + Util::score_by_dimensions(&img_node);
let score = score + Util::score_by_position(len, index);
scores.insert(src, score);
}
if let Some((top_src, top_score)) =
scores.into_iter().max_by_key(|(_src, score)| *score)
{
// Only a strictly positive score counts as a plausible lead image.
if top_score > 0 {
let top_url = top_src.trim().into();
if Url::parse(top_src.trim()).is_ok() {
return Some(top_url);
}
}
}
}
// If nothing else worked, check to see if there are any really
// probable nodes in the doc, like <link rel="image_src" />.
if let Ok(link_nodes) = Util::evaluate_xpath(context, constants::LEAD_IMAGE_URL_XPATH, true)
{
// Only the first matching node is inspected; its attributes are tried
// in the order `src`, `href`, `value`.
if let Some(first_link_node) = link_nodes.first() {
if let Some(src) = first_link_node.get_attribute("src") {
let src = src.trim().to_string();
if Url::parse(&src).is_ok() {
return Some(src);
}
}
if let Some(href) = first_link_node.get_attribute("href") {
let href = href.trim().to_string();
if Url::parse(&href).is_ok() {
return Some(href);
}
}
if let Some(val) = first_link_node.get_attribute("value") {
let val = val.trim().to_string();
if Url::parse(&val).is_ok() {
return Some(val);
}
}
}
}
None
}
fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> {

View file

@ -102,14 +102,7 @@ impl Readability {
continue;
}
let match_string = node_ref
.get_class_names()
.iter()
.fold(String::new(), |a, b| format!("{a} {b}"));
let match_string = match node_ref.get_property("id") {
Some(id) => format!("{match_string} {id}"),
None => match_string,
};
let match_string = Util::get_signature(node_ref);
if !Util::is_probably_visible(node_ref) {
log::debug!("removing hidden node {match_string}");

View file

@ -1,5 +1,5 @@
use super::{config::ConfigEntry, FullTextParser};
use libxml::tree::SaveOptions;
use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
use reqwest::{Client, Url};
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
@ -180,3 +180,22 @@ async fn unwrap_noscript_images_2() {
assert_eq!(res, expected);
}
/// A lone `<img>` with explicit dimensions and a `.jpg` source must be
/// picked up as thumbnail by the image scoring fallback.
#[test]
fn extract_thumbnail() {
    let html = r#"
<img src="https://www.golem.de/2306/175204-387164-387163_rc.jpg" width="140" height="140" loading="lazy" />Im staubigen
Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu
herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noopener noreferrer" target="_blank"
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
"#;

    let document = Parser::default_html().parse_string(html).unwrap();
    let xpath_ctx = Context::new(&document).unwrap();
    let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);

    assert_eq!(
        thumbnail.as_deref(),
        Some("https://www.golem.de/2306/175204-387164-387163_rc.jpg")
    )
}

View file

@ -11,7 +11,7 @@ use reqwest::{
use tokio::fs::DirEntry;
use crate::{
constants,
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
image_object::ImageObject,
video_object::VideoObject,
@ -275,6 +275,17 @@ impl Util {
Ok(())
}
pub fn get_signature(node: &Node) -> String {
let match_string = node
.get_class_names()
.iter()
.fold(String::new(), |a, b| format!("{a} {b}"));
match node.get_property("id") {
Some(id) => format!("{match_string} {id}"),
None => match_string,
}
}
pub fn is_probably_visible(node: &Node) -> bool {
let is_hidden = node.has_attribute("hidden");
let aria_hidden = node
@ -1033,6 +1044,135 @@ impl Util {
}
}
}
pub fn score_image_url(url: &str) -> i32 {
let url = url.trim();
let mut score = 0;
if constants::POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) {
score += 20;
}
if NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) {
score -= 20;
}
// TODO: We might want to consider removing this as
// gifs are much more common/popular than they once were
if constants::GIF_REGEX.is_match(url) {
score -= 10;
}
if constants::JPG_REGEX.is_match(url) {
score += 10;
}
// PNGs are neutral.
score
}
/// Awards 5 points to images that carry an `alt` attribute, 0 otherwise.
///
/// An `alt` attribute usually means the image is not purely presentational.
pub fn score_img_attr(img: &Node) -> i32 {
    match img.get_attribute("alt") {
        Some(_) => 5,
        None => 0,
    }
}
/// Scores an image by its parent and grandparent elements.
///
/// * `+25` if the parent or the grandparent is a `figure`
/// * `+15` for each of parent / grandparent whose class/id signature
///   matches the photo hints
pub fn score_by_parents(img: &Node) -> i32 {
    let parent = img.get_parent();
    let grand_parent = parent.as_ref().and_then(|p| p.get_parent());

    let inside_figure = Self::has_tag_name(parent.as_ref(), "figure")
        || Self::has_tag_name(grand_parent.as_ref(), "figure");
    let figure_bonus = if inside_figure { 25 } else { 0 };

    // Bonus for a single ancestor that looks like a photo container.
    let photo_hint_bonus = |ancestor: Option<&Node>| -> i32 {
        match ancestor {
            Some(node) if constants::PHOTO_HINTS_REGEX.is_match(&Util::get_signature(node)) => 15,
            _ => 0,
        }
    };

    figure_bonus + photo_hint_bonus(parent.as_ref()) + photo_hint_bonus(grand_parent.as_ref())
}
/// Scores an image by the element that directly follows it.
///
/// * `+25` if the next element sibling is a `figcaption`
/// * `+15` if its class/id signature matches the photo hints
///
/// Images without a following element score 0.
pub fn score_by_sibling(img: &Node) -> i32 {
    let sibling = match img.get_next_element_sibling() {
        Some(sibling) => sibling,
        None => return 0,
    };

    let caption_bonus = if sibling.get_name().to_lowercase() == "figcaption" {
        25
    } else {
        0
    };

    let signature = Util::get_signature(&sibling);
    let photo_hint_bonus = if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
        15
    } else {
        0
    };

    caption_bonus + photo_hint_bonus
}
/// Scores an image by its declared `width` / `height` attributes.
///
/// * `-50` if the width is at most 50
/// * `-50` if the height is at most 50
/// * if both dimensions are known and `src` does not contain `"sprite"`:
///   `-100` for an area below 5000, otherwise `+1` per 1000 units of area
///   (rounded), capped at 1,000,000
///
/// Attributes that are missing, not numeric or not finite are ignored.
pub fn score_by_dimensions(img: &Node) -> i32 {
    // Upper bound for the area bonus. Without it an absurd area saturates the
    // float-to-int cast to `i32::MAX`, which lets a bogus image win and makes
    // the caller's sum of partial scores overflow.
    const MAX_AREA_BONUS: f32 = 1_000_000.0;

    // `f32::from_str` does not skip whitespace and accepts "inf" / "nan";
    // neither is a real dimension, so such values count as missing.
    let dimension = |name: &str| -> Option<f32> {
        img.get_attribute(name)
            .and_then(|raw| raw.trim().parse::<f32>().ok())
            .filter(|value| value.is_finite())
    };
    let width = dimension("width");
    let height = dimension("height");

    let mut score = 0;

    // Penalty for skinny images
    if width.map_or(false, |w| w <= 50.0) {
        score -= 50;
    }

    // Penalty for short images
    if height.map_or(false, |h| h <= 50.0) {
        score -= 50;
    }

    if let (Some(width), Some(height)) = (width, height) {
        let src = img.get_attribute("src").unwrap_or_default();
        if !src.contains("sprite") {
            let area = width * height;
            if area < 5000.0 {
                // Smaller than 50 x 100
                score -= 100;
            } else {
                score += (area / 1000.0).round().min(MAX_AREA_BONUS) as i32;
            }
        }
    }

    score
}
/// Scores an image by its position among all images of the document.
///
/// Returns `len / 2 - index`: images in the first half get a positive
/// score, images in the second half a negative one.
///
/// * `len` - total number of images
/// * `index` - zero-based position of the image
pub fn score_by_position(len: usize, index: usize) -> i32 {
    let midpoint = len / 2;
    let to_i32 = |distance: usize| distance.min(i32::MAX as usize) as i32;

    // Subtract in the direction that cannot underflow `usize`, then apply the
    // sign; `midpoint - index` directly panics in debug builds when
    // `index > midpoint`.
    if index <= midpoint {
        to_i32(midpoint - index)
    } else {
        -to_i32(index - midpoint)
    }
}
}
#[cfg(test)]