mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
add mercury leading image heuristics
This commit is contained in:
parent
e99a4b4f23
commit
e32015c1d0
6 changed files with 315 additions and 21 deletions
|
@ -5,6 +5,11 @@ use crate::full_text_parser::error::FullTextParserError;
|
|||
use crate::util::Util;
|
||||
use crate::{FtrConfigEntry, FullTextParser};
|
||||
|
||||
/// Result of [`clean_html`]: the cleaned article markup together with the
/// thumbnail that was detected in the source document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CleanedHtml {
    /// Serialized HTML of the cleaned article node.
    pub html: String,
    /// Thumbnail URL reported by `FullTextParser::check_for_thumbnail`,
    /// or `None` if no suitable image was found.
    pub thumbnail: Option<String>,
}
|
||||
|
||||
/// Re-use crate internals to clean HTML of articles before
|
||||
/// further processing:
|
||||
/// - replace H1 with H2
|
||||
|
@ -29,12 +34,13 @@ use crate::{FtrConfigEntry, FullTextParser};
|
|||
/// * `html` - HTML content
|
||||
/// * `base_url` - URL used to complete relative URLs
|
||||
///
|
||||
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
|
||||
pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextParserError> {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
|
||||
let empty_config = FtrConfigEntry::default();
|
||||
let document = FullTextParser::parse_html(html, None, &empty_config)?;
|
||||
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
||||
let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);
|
||||
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None);
|
||||
if let Some(mut root) = document.get_root_element() {
|
||||
FullTextParser::post_process_page(&mut root)?;
|
||||
|
@ -50,7 +56,10 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
|
|||
article_node.add_child(&mut node).unwrap();
|
||||
}
|
||||
|
||||
Ok(document.node_to_string(&article_node))
|
||||
Ok(CleanedHtml {
|
||||
html: document.node_to_string(&article_node),
|
||||
thumbnail,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -64,6 +73,10 @@ mod tests {
|
|||
let url = Url::parse("https://finshots.in").unwrap();
|
||||
let res = clean_html(html, &url).unwrap();
|
||||
|
||||
assert_eq!(res.len(), 11965);
|
||||
assert_eq!(res.html.len(), 11965);
|
||||
assert_eq!(
|
||||
res.thumbnail.as_deref(),
|
||||
Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -156,3 +156,71 @@ pub const PHRASING_ELEMS: &[&str] = &[
|
|||
"OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
|
||||
"SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
|
||||
];
|
||||
|
||||
pub const LEAD_IMAGE_URL_XPATH: &str = "//link[@rel='image_src']";
|
||||
|
||||
pub const POSITIVE_LEAD_IMAGE_URL_HINTS: &[&str] =
|
||||
&["upload", "wp-content", "large", "photo", "wp-image"];
|
||||
|
||||
pub static POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(&POSITIVE_LEAD_IMAGE_URL_HINTS.join("|"))
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.expect("POSITIVE_LEAD_IMAGE_URL_HINTS regex")
|
||||
});
|
||||
|
||||
pub const NEGATIVE_LEAD_IMAGE_URL_HINTS: &[&str] = &[
|
||||
"spacer",
|
||||
"sprite",
|
||||
"blank",
|
||||
"throbber",
|
||||
"gradient",
|
||||
"tile",
|
||||
"bg",
|
||||
"background",
|
||||
"icon",
|
||||
"social",
|
||||
"header",
|
||||
"hdr",
|
||||
"advert",
|
||||
"spinner",
|
||||
"loader",
|
||||
"loading",
|
||||
"default",
|
||||
"rating",
|
||||
"share",
|
||||
"facebook",
|
||||
"twitter",
|
||||
"theme",
|
||||
"promo",
|
||||
"ads",
|
||||
"wp-includes",
|
||||
];
|
||||
|
||||
pub static NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(&NEGATIVE_LEAD_IMAGE_URL_HINTS.join("|"))
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.expect("NEGATIVE_LEAD_IMAGE_URL_HINTS regex")
|
||||
});
|
||||
|
||||
pub const PHOTO_HINTS: &[&str] = &["figure", "photo", "image", "caption"];
|
||||
pub static PHOTO_HINTS_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(&PHOTO_HINTS.join("|"))
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.expect("PHOTO_HINTS_REGEX regex")
|
||||
});
|
||||
|
||||
pub static GIF_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(r#"\.gif(\?.*)?$"#)
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.expect("GIF_REGEX")
|
||||
});
|
||||
pub static JPG_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(r#"\.jpe?g(\?.*)?$"#)
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.expect("JPG_REGEX")
|
||||
});
|
||||
|
|
|
@ -23,7 +23,7 @@ use libxml::tree::{Document, Node, NodeType};
|
|||
use libxml::xpath::Context;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::Path;
|
||||
use std::str::from_utf8;
|
||||
|
||||
|
@ -224,7 +224,7 @@ impl FullTextParser {
|
|||
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
||||
|
||||
if article.thumbnail_url.is_none() {
|
||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
article.thumbnail_url = Self::check_for_thumbnail(&xpath_ctx);
|
||||
}
|
||||
Self::prep_content(
|
||||
&xpath_ctx,
|
||||
|
@ -427,28 +427,89 @@ impl FullTextParser {
|
|||
conf
|
||||
}
|
||||
|
||||
fn check_for_thumbnail(context: &Context, article: &mut Article) {
|
||||
pub fn check_for_thumbnail(context: &Context) -> Option<String> {
|
||||
if let Ok(thumb) = Util::get_attribute(
|
||||
context,
|
||||
"//meta[contains(@name, 'twitter:image')]",
|
||||
"content",
|
||||
) {
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return;
|
||||
return Some(thumb);
|
||||
}
|
||||
|
||||
if let Ok(thumb) =
|
||||
Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
|
||||
{
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return;
|
||||
return Some(thumb);
|
||||
}
|
||||
|
||||
if let Ok(thumb) =
|
||||
Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
|
||||
{
|
||||
article.thumbnail_url = Some(thumb);
|
||||
return Some(thumb);
|
||||
}
|
||||
|
||||
if let Ok(img_nodes) = Util::evaluate_xpath(context, "//img", true) {
|
||||
let mut scores: HashMap<String, i32> = HashMap::new();
|
||||
let len = img_nodes.len();
|
||||
for (index, img_node) in img_nodes.into_iter().enumerate() {
|
||||
let src = if let Some(src) = img_node.get_attribute("src") {
|
||||
src
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let score = Util::score_image_url(&src);
|
||||
let score = score + Util::score_img_attr(&img_node);
|
||||
let score = score + Util::score_by_parents(&img_node);
|
||||
let score = score + Util::score_by_sibling(&img_node);
|
||||
let score = score + Util::score_by_dimensions(&img_node);
|
||||
let score = score + Util::score_by_position(len, index);
|
||||
|
||||
scores.insert(src, score);
|
||||
}
|
||||
|
||||
if let Some((top_src, top_score)) =
|
||||
scores.into_iter().max_by_key(|(_src, score)| *score)
|
||||
{
|
||||
if top_score > 0 {
|
||||
let top_url = top_src.trim().into();
|
||||
if Url::parse(top_src.trim()).is_ok() {
|
||||
return Some(top_url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If nothing else worked, check to see if there are any really
|
||||
// probable nodes in the doc, like <link rel="image_src" />.
|
||||
// eslint-disable-next-line no-restricted-syntax
|
||||
if let Ok(link_nodes) = Util::evaluate_xpath(context, constants::LEAD_IMAGE_URL_XPATH, true)
|
||||
{
|
||||
if let Some(first_link_node) = link_nodes.first() {
|
||||
if let Some(src) = first_link_node.get_attribute("src") {
|
||||
let src = src.trim().to_string();
|
||||
if Url::parse(&src).is_ok() {
|
||||
return Some(src);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(href) = first_link_node.get_attribute("href") {
|
||||
let href = href.trim().to_string();
|
||||
if Url::parse(&href).is_ok() {
|
||||
return Some(href);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(val) = first_link_node.get_attribute("value") {
|
||||
let val = val.trim().to_string();
|
||||
if Url::parse(&val).is_ok() {
|
||||
return Some(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> {
|
||||
|
|
|
@ -102,14 +102,7 @@ impl Readability {
|
|||
continue;
|
||||
}
|
||||
|
||||
let match_string = node_ref
|
||||
.get_class_names()
|
||||
.iter()
|
||||
.fold(String::new(), |a, b| format!("{a} {b}"));
|
||||
let match_string = match node_ref.get_property("id") {
|
||||
Some(id) => format!("{match_string} {id}"),
|
||||
None => match_string,
|
||||
};
|
||||
let match_string = Util::get_signature(node_ref);
|
||||
|
||||
if !Util::is_probably_visible(node_ref) {
|
||||
log::debug!("removing hidden node {match_string}");
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use super::{config::ConfigEntry, FullTextParser};
|
||||
use libxml::tree::SaveOptions;
|
||||
use libxml::{parser::Parser, tree::SaveOptions, xpath::Context};
|
||||
use reqwest::{Client, Url};
|
||||
|
||||
async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) {
|
||||
|
@ -180,3 +180,22 @@ async fn unwrap_noscript_images_2() {
|
|||
|
||||
assert_eq!(res, expected);
|
||||
}
|
||||
|
||||
#[test]
fn extract_thumbnail() {
    // The snippet carries no <meta>/<link> tags, so the thumbnail has to be
    // picked from the single <img> element.
    let html = r#"
<img src="https://www.golem.de/2306/175204-387164-387163_rc.jpg" width="140" height="140" loading="lazy" />Im staubigen
Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu
herausgebracht. (<a href="https://www.golem.de/specials/fortschritt/" rel="noopener noreferrer" target="_blank"
referrerpolicy="no-referrer">Fortschritt</a>, <a href="https://www.golem.de/specials/wissenschaft/"
rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Wissenschaft</a>)
"#;
    let document = Parser::default_html().parse_string(html).unwrap();
    let xpath_ctx = Context::new(&document).unwrap();

    let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx);

    assert_eq!(
        thumbnail.as_deref(),
        Some("https://www.golem.de/2306/175204-387164-387163_rc.jpg")
    )
}
|
||||
|
|
|
@ -11,7 +11,7 @@ use reqwest::{
|
|||
use tokio::fs::DirEntry;
|
||||
|
||||
use crate::{
|
||||
constants,
|
||||
constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX},
|
||||
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
||||
image_object::ImageObject,
|
||||
video_object::VideoObject,
|
||||
|
@ -275,6 +275,17 @@ impl Util {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_signature(node: &Node) -> String {
|
||||
let match_string = node
|
||||
.get_class_names()
|
||||
.iter()
|
||||
.fold(String::new(), |a, b| format!("{a} {b}"));
|
||||
match node.get_property("id") {
|
||||
Some(id) => format!("{match_string} {id}"),
|
||||
None => match_string,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_probably_visible(node: &Node) -> bool {
|
||||
let is_hidden = node.has_attribute("hidden");
|
||||
let aria_hidden = node
|
||||
|
@ -1033,6 +1044,135 @@ impl Util {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn score_image_url(url: &str) -> i32 {
|
||||
let url = url.trim();
|
||||
let mut score = 0;
|
||||
|
||||
if constants::POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) {
|
||||
score += 20;
|
||||
}
|
||||
|
||||
if NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) {
|
||||
score -= 20;
|
||||
}
|
||||
|
||||
// TODO: We might want to consider removing this as
|
||||
// gifs are much more common/popular than they once were
|
||||
if constants::GIF_REGEX.is_match(url) {
|
||||
score -= 10;
|
||||
}
|
||||
|
||||
if constants::JPG_REGEX.is_match(url) {
|
||||
score += 10;
|
||||
}
|
||||
|
||||
// PNGs are neutral.
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
// Alt attribute usually means non-presentational image.
|
||||
pub fn score_img_attr(img: &Node) -> i32 {
|
||||
if img.get_attribute("alt").is_some() {
|
||||
5
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
// Look through our parent and grandparent for figure-like
|
||||
// container elements, give a bonus if we find them
|
||||
pub fn score_by_parents(img: &Node) -> i32 {
|
||||
let mut score = 0;
|
||||
let parent = img.get_parent();
|
||||
let grand_parent = parent.as_ref().and_then(|n| n.get_parent());
|
||||
if Self::has_tag_name(parent.as_ref(), "figure")
|
||||
|| Self::has_tag_name(grand_parent.as_ref(), "figure")
|
||||
{
|
||||
score += 25;
|
||||
}
|
||||
|
||||
if let Some(parent) = parent.as_ref() {
|
||||
let signature = Util::get_signature(parent);
|
||||
if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
|
||||
score += 15;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(grand_parent) = grand_parent.as_ref() {
|
||||
let signature = Util::get_signature(grand_parent);
|
||||
if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
|
||||
score += 15;
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
// Look at our immediate sibling and see if it looks like it's a
|
||||
// caption. Bonus if so.
|
||||
pub fn score_by_sibling(img: &Node) -> i32 {
|
||||
let mut score = 0;
|
||||
let sibling = img.get_next_element_sibling();
|
||||
|
||||
if let Some(sibling) = sibling.as_ref() {
|
||||
if sibling.get_name().to_lowercase() == "figcaption" {
|
||||
score += 25;
|
||||
}
|
||||
|
||||
let signature = Util::get_signature(sibling);
|
||||
if constants::PHOTO_HINTS_REGEX.is_match(&signature) {
|
||||
score += 15;
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
pub fn score_by_dimensions(img: &Node) -> i32 {
|
||||
let mut score = 0;
|
||||
|
||||
let width = img
|
||||
.get_attribute("width")
|
||||
.and_then(|w| w.parse::<f32>().ok());
|
||||
let height = img
|
||||
.get_attribute("height")
|
||||
.and_then(|w| w.parse::<f32>().ok());
|
||||
let src = img.get_attribute("src").unwrap_or_default();
|
||||
|
||||
// Penalty for skinny images
|
||||
if let Some(width) = width {
|
||||
if width <= 50.0 {
|
||||
score -= 50;
|
||||
}
|
||||
}
|
||||
|
||||
// Penalty for short images
|
||||
if let Some(height) = height {
|
||||
if height <= 50.0 {
|
||||
score -= 50;
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(width), Some(height)) = (width, height) {
|
||||
if !src.contains("sprite") {
|
||||
let area = width * height;
|
||||
if area < 5000.0 {
|
||||
// Smaller than 50 x 100
|
||||
score -= 100;
|
||||
} else {
|
||||
score += f32::round(area / 1000.0) as i32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
/// Scores an image by its position among all images of the document.
///
/// * `len` - total number of images
/// * `index` - zero-based position of the image
///
/// Returns `len / 2 - index` (integer division): images in the first half
/// get a positive score, images past the midpoint a negative one.
pub fn score_by_position(len: usize, index: usize) -> i32 {
    // Subtract in a signed type: `len / 2 - index` on `usize` underflows
    // (and panics in debug builds) for every image past the midpoint.
    let offset = (len / 2) as i64 - index as i64;
    offset.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue