From e32015c1d00d4888848f4e1e2df2e29b923d99ee Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 26 Jun 2023 22:25:57 +0200 Subject: [PATCH 01/56] add mercury leading image heuristics --- article_scraper/src/clean.rs | 19 ++- article_scraper/src/constants.rs | 68 +++++++++ article_scraper/src/full_text_parser/mod.rs | 77 +++++++++- .../src/full_text_parser/readability/mod.rs | 9 +- article_scraper/src/full_text_parser/tests.rs | 21 ++- article_scraper/src/util.rs | 142 +++++++++++++++++- 6 files changed, 315 insertions(+), 21 deletions(-) diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 14cc208..aec3967 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -5,6 +5,11 @@ use crate::full_text_parser::error::FullTextParserError; use crate::util::Util; use crate::{FtrConfigEntry, FullTextParser}; +pub struct CleanedHtml { + pub html: String, + pub thumbnail: Option, +} + /// Re-use crate internals to clean HTML of articles before /// further processing: /// - replace H1 with H2 @@ -29,12 +34,13 @@ use crate::{FtrConfigEntry, FullTextParser}; /// * `html` - HTML content /// * `base_url` - URL used to complete relative URLs /// -pub fn clean_html(html: &str, base_url: &Url) -> Result { +pub fn clean_html(html: &str, base_url: &Url) -> Result { libxml::tree::node::set_node_rc_guard(10); let empty_config = FtrConfigEntry::default(); let document = FullTextParser::parse_html(html, None, &empty_config)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; + let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); if let Some(mut root) = document.get_root_element() { FullTextParser::post_process_page(&mut root)?; @@ -50,7 +56,10 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result = Lazy::new(|| { + RegexBuilder::new(&POSITIVE_LEAD_IMAGE_URL_HINTS.join("|")) + .case_insensitive(true) + .build() + 
.expect("POSITIVE_LEAD_IMAGE_URL_HINTS regex") +}); + +pub const NEGATIVE_LEAD_IMAGE_URL_HINTS: &[&str] = &[ + "spacer", + "sprite", + "blank", + "throbber", + "gradient", + "tile", + "bg", + "background", + "icon", + "social", + "header", + "hdr", + "advert", + "spinner", + "loader", + "loading", + "default", + "rating", + "share", + "facebook", + "twitter", + "theme", + "promo", + "ads", + "wp-includes", +]; + +pub static NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(&NEGATIVE_LEAD_IMAGE_URL_HINTS.join("|")) + .case_insensitive(true) + .build() + .expect("NEGATIVE_LEAD_IMAGE_URL_HINTS regex") +}); + +pub const PHOTO_HINTS: &[&str] = &["figure", "photo", "image", "caption"]; +pub static PHOTO_HINTS_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(&PHOTO_HINTS.join("|")) + .case_insensitive(true) + .build() + .expect("PHOTO_HINTS_REGEX regex") +}); + +pub static GIF_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"\.gif(\?.*)?$"#) + .case_insensitive(true) + .build() + .expect("GIF_REGEX") +}); +pub static JPG_REGEX: Lazy = Lazy::new(|| { + RegexBuilder::new(r#"\.jpe?g(\?.*)?$"#) + .case_insensitive(true) + .build() + .expect("JPG_REGEX") +}); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index ae3819d..c389a8d 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -23,7 +23,7 @@ use libxml::tree::{Document, Node, NodeType}; use libxml::xpath::Context; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::str::from_utf8; @@ -224,7 +224,7 @@ impl FullTextParser { metadata::extract(&xpath_ctx, config, Some(global_config), article); if article.thumbnail_url.is_none() { - Self::check_for_thumbnail(&xpath_ctx, article); + article.thumbnail_url = Self::check_for_thumbnail(&xpath_ctx); } Self::prep_content( 
&xpath_ctx, @@ -427,28 +427,89 @@ impl FullTextParser { conf } - fn check_for_thumbnail(context: &Context, article: &mut Article) { + pub fn check_for_thumbnail(context: &Context) -> Option { if let Ok(thumb) = Util::get_attribute( context, "//meta[contains(@name, 'twitter:image')]", "content", ) { - article.thumbnail_url = Some(thumb); - return; + return Some(thumb); } if let Ok(thumb) = Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content") { - article.thumbnail_url = Some(thumb); - return; + return Some(thumb); } if let Ok(thumb) = Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { - article.thumbnail_url = Some(thumb); + return Some(thumb); } + + if let Ok(img_nodes) = Util::evaluate_xpath(context, "//img", true) { + let mut scores: HashMap = HashMap::new(); + let len = img_nodes.len(); + for (index, img_node) in img_nodes.into_iter().enumerate() { + let src = if let Some(src) = img_node.get_attribute("src") { + src + } else { + continue; + }; + + let score = Util::score_image_url(&src); + let score = score + Util::score_img_attr(&img_node); + let score = score + Util::score_by_parents(&img_node); + let score = score + Util::score_by_sibling(&img_node); + let score = score + Util::score_by_dimensions(&img_node); + let score = score + Util::score_by_position(len, index); + + scores.insert(src, score); + } + + if let Some((top_src, top_score)) = + scores.into_iter().max_by_key(|(_src, score)| *score) + { + if top_score > 0 { + let top_url = top_src.trim().into(); + if Url::parse(top_src.trim()).is_ok() { + return Some(top_url); + } + } + } + } + + // If nothing else worked, check to see if there are any really + // probable nodes in the doc, like . 
+ // eslint-disable-next-line no-restricted-syntax + if let Ok(link_nodes) = Util::evaluate_xpath(context, constants::LEAD_IMAGE_URL_XPATH, true) + { + if let Some(first_link_node) = link_nodes.first() { + if let Some(src) = first_link_node.get_attribute("src") { + let src = src.trim().to_string(); + if Url::parse(&src).is_ok() { + return Some(src); + } + } + + if let Some(href) = first_link_node.get_attribute("href") { + let href = href.trim().to_string(); + if Url::parse(&href).is_ok() { + return Some(href); + } + } + + if let Some(val) = first_link_node.get_attribute("value") { + let val = val.trim().to_string(); + if Url::parse(&val).is_ok() { + return Some(val); + } + } + } + } + + None } fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> { diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs index 3b417de..a0b8c03 100644 --- a/article_scraper/src/full_text_parser/readability/mod.rs +++ b/article_scraper/src/full_text_parser/readability/mod.rs @@ -102,14 +102,7 @@ impl Readability { continue; } - let match_string = node_ref - .get_class_names() - .iter() - .fold(String::new(), |a, b| format!("{a} {b}")); - let match_string = match node_ref.get_property("id") { - Some(id) => format!("{match_string} {id}"), - None => match_string, - }; + let match_string = Util::get_signature(node_ref); if !Util::is_probably_visible(node_ref) { log::debug!("removing hidden node {match_string}"); diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 4ec0444..bc25795 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -1,5 +1,5 @@ use super::{config::ConfigEntry, FullTextParser}; -use libxml::tree::SaveOptions; +use libxml::{parser::Parser, tree::SaveOptions, xpath::Context}; use reqwest::{Client, Url}; async fn run_test(name: &str, url: &str, title: 
Option<&str>, author: Option<&str>) { @@ -180,3 +180,22 @@ async fn unwrap_noscript_images_2() { assert_eq!(res, expected); } + +#[test] +fn extract_thumbnail() { + let html = r#" +Im staubigen +Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu +herausgebracht. (Fortschritt, Wissenschaft) + "#; + let doc = Parser::default_html().parse_string(html).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); + assert_eq!( + thumb, + "https://www.golem.de/2306/175204-387164-387163_rc.jpg" + ) +} diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index b6705a0..8ac7711 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -11,7 +11,7 @@ use reqwest::{ use tokio::fs::DirEntry; use crate::{ - constants, + constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX}, full_text_parser::{config::ConfigEntry, error::FullTextParserError}, image_object::ImageObject, video_object::VideoObject, @@ -275,6 +275,17 @@ impl Util { Ok(()) } + pub fn get_signature(node: &Node) -> String { + let match_string = node + .get_class_names() + .iter() + .fold(String::new(), |a, b| format!("{a} {b}")); + match node.get_property("id") { + Some(id) => format!("{match_string} {id}"), + None => match_string, + } + } + pub fn is_probably_visible(node: &Node) -> bool { let is_hidden = node.has_attribute("hidden"); let aria_hidden = node @@ -1033,6 +1044,135 @@ impl Util { } } } + + pub fn score_image_url(url: &str) -> i32 { + let url = url.trim(); + let mut score = 0; + + if constants::POSITIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) { + score += 20; + } + + if NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX.is_match(url) { + score -= 20; + } + + // TODO: We might want to consider removing this as + // gifs are much more common/popular than they once were + if constants::GIF_REGEX.is_match(url) { + score -= 10; + } + + if 
constants::JPG_REGEX.is_match(url) { + score += 10; + } + + // PNGs are neutral. + + score + } + + // Alt attribute usually means non-presentational image. + pub fn score_img_attr(img: &Node) -> i32 { + if img.get_attribute("alt").is_some() { + 5 + } else { + 0 + } + } + + // Look through our parent and grandparent for figure-like + // container elements, give a bonus if we find them + pub fn score_by_parents(img: &Node) -> i32 { + let mut score = 0; + let parent = img.get_parent(); + let grand_parent = parent.as_ref().and_then(|n| n.get_parent()); + if Self::has_tag_name(parent.as_ref(), "figure") + || Self::has_tag_name(grand_parent.as_ref(), "figure") + { + score += 25; + } + + if let Some(parent) = parent.as_ref() { + let signature = Util::get_signature(parent); + if constants::PHOTO_HINTS_REGEX.is_match(&signature) { + score += 15; + } + } + + if let Some(grand_parent) = grand_parent.as_ref() { + let signature = Util::get_signature(grand_parent); + if constants::PHOTO_HINTS_REGEX.is_match(&signature) { + score += 15; + } + } + + score + } + + // Look at our immediate sibling and see if it looks like it's a + // caption. Bonus if so. 
+ pub fn score_by_sibling(img: &Node) -> i32 { + let mut score = 0; + let sibling = img.get_next_element_sibling(); + + if let Some(sibling) = sibling.as_ref() { + if sibling.get_name().to_lowercase() == "figcaption" { + score += 25; + } + + let signature = Util::get_signature(sibling); + if constants::PHOTO_HINTS_REGEX.is_match(&signature) { + score += 15; + } + } + + score + } + + pub fn score_by_dimensions(img: &Node) -> i32 { + let mut score = 0; + + let width = img + .get_attribute("width") + .and_then(|w| w.parse::().ok()); + let height = img + .get_attribute("height") + .and_then(|w| w.parse::().ok()); + let src = img.get_attribute("src").unwrap_or_default(); + + // Penalty for skinny images + if let Some(width) = width { + if width <= 50.0 { + score -= 50; + } + } + + // Penalty for short images + if let Some(height) = height { + if height <= 50.0 { + score -= 50; + } + } + + if let (Some(width), Some(height)) = (width, height) { + if !src.contains("sprite") { + let area = width * height; + if area < 5000.0 { + // Smaller than 50 x 100 + score -= 100; + } else { + score += f32::round(area / 1000.0) as i32; + } + } + } + + score + } + + pub fn score_by_position(len: usize, index: usize) -> i32 { + (len / 2 - index) as i32 + } } #[cfg(test)] From 4fd41d98cc1c1a7e0a4f9c3d4ddf793aeebdd005 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 26 Jun 2023 23:22:08 +0200 Subject: [PATCH 02/56] add fn to parse thumbnail from html --- article_scraper/src/full_text_parser/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index c389a8d..b714827 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -427,6 +427,15 @@ impl FullTextParser { conf } + pub fn thumbnail_from_html(html: &str) -> Option { + if let Ok(doc) = Self::parse_html(html, None, &ConfigEntry::default()) { + if let Ok(ctx) = 
Self::get_xpath_ctx(&doc) { + return Self::check_for_thumbnail(&ctx); + } + } + None + } + pub fn check_for_thumbnail(context: &Context) -> Option { if let Ok(thumb) = Util::get_attribute( context, From fdb8d9a97e51c69cf780f7f31ec8ccde81edd125 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 27 Jun 2023 19:21:26 +0200 Subject: [PATCH 03/56] small fixes --- article_scraper/src/full_text_parser/mod.rs | 2 +- article_scraper/src/full_text_parser/tests.rs | 77 ++++++++++++++++++- article_scraper/src/util.rs | 2 +- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index b714827..3e1e5d0 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -428,7 +428,7 @@ impl FullTextParser { } pub fn thumbnail_from_html(html: &str) -> Option { - if let Ok(doc) = Self::parse_html(html, None, &ConfigEntry::default()) { + if let Ok(doc) = Parser::default_html().parse_string(html) { if let Ok(ctx) = Self::get_xpath_ctx(&doc) { return Self::check_for_thumbnail(&ctx); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index bc25795..2d0f858 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -182,7 +182,7 @@ async fn unwrap_noscript_images_2() { } #[test] -fn extract_thumbnail() { +fn extract_thumbnail_golem() { let html = r#" Im staubigen Utah sind die Fossilien eines urzeitlichen Meeresreptils entdeckt worden. Nun haben Forscher eine Studie dazu @@ -199,3 +199,78 @@ herausgebracht. (
+
+
+
+ + +Grünenpolitiker Hofreiter: »Unternehmen werden in großem Umfang erpresst, unter Wert ihre Betriebe zu verkaufen« + +
+
+

Grünenpolitiker Hofreiter: »Unternehmen werden in großem Umfang erpresst, unter Wert ihre Betriebe zu verkaufen«

+ +Foto: IMAGO / IMAGO/Political-Moments + +
+
+
+
+
+

Der Töne aus Berlin in Richtung Budapest werden giftiger. Der Grünen-Europapolitiker Anton Hofreiter wirft der ungarischen Regierung vor, deutsche Unternehmen mit »Mafiamethoden« zum Verkauf ihres Ungarn-Geschäfts zu bringen. »Ungarn bewegt sich von einer autoritären Herrschaft in Richtung eines Mafiastaats«, sagte Hofreiter in Brüssel. »Unternehmen werden in großem Umfang erpresst, unter Wert ihre Betriebe zu verkaufen.«

+
+ +
+

Aus der deutschen Wirtschaft gebe es Klagen über zahlreiche Fälle, in denen Firmen »mit illegalen Methoden« vom Markt gedrängt worden seien oder entsprechende Versuche stattgefunden hätten.

Während Ungarns Regierungschef Viktor Orbán deutsche Autohersteller weiterhin mit niedrigen Steuern und wenig Bürokratie verwöhne, bekämen andere Firmen die Folgen von Orbáns Strategie der Nationalisierung von als strategisch wichtig geltenden Branchen zu spüren. Selbst Großunternehmen wie Lidl oder die Telekom würden inzwischen »massiv unter Druck gesetzt«, so Hofreiter.

+
+ +
+
+
+ + +Ungarns Regierungschef Viktor Orbán + + +
+
+
+

Ungarns Regierungschef Viktor Orbán

+ +Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo + +
+
+

Die Masche des Systems Orbán ist die immer gleiche, wie Unternehmen und Politiker schon seit Monaten beklagen: Die Regierung macht die Unternehmen erst Schikanen mürbe und unterbreitet dann wieder und wieder Kaufangebote. Die Firmen würden so gedrängt, ihre ungarischen Aktivitäten an Günstlinge Orbáns zu verkaufen – zwar nicht zu ruinösen Schleuderpreisen, aber üblicherweise für nur etwa 70 bis 80 Prozent des Marktwerts, sagt Hofreiter.

+
+ +
+

In Ungarn gehe es nicht mehr nur um die bereits weit fortgeschrittene Zerstörung des Rechtsstaats – »sondern inzwischen auch eindeutig um das Funktionieren des Binnenmarkts« der EU. »Der klassische ökonomische Teil des Binnenmarkts wird angegriffen.«

Die Kommission hat wegen Ungarns Rechtsstaatsverstößen bereits Milliardenzahlungen an das Land eingefroren. Das aber genüge nicht mehr, sagt Hofreiter – und fordert von der Kommission deshalb, neue Sanktionsinstrumente zu entwickeln: »Man muss sich Mechanismen zum Schutz des Binnenmarkts überlegen.«

Ungarns Außenminister beklagt »politisch motivierte Kampagne«

Der grüne Europaabgeordnete Daniel Freund verlangt außerdem eine Beschleunigung laufender und künftiger Verfahren gegen Ungarn wegen der Verletzung der EU-Verträge. »Wenn eine Firma wegen eines Regierungsdekrets Monat für Monat Millionen an Steuern bezahlen muss, kann sie nicht Jahre warten, ehe ein solches Verfahren abgeschlossen ist.«

+
+ +
+

Ungarns Außen- und Handelsminister Péter Szijjártó bezeichnete  die Vorwürfe kürzlich als »politisch motivierte Kampagne« und »emotionale Erpressung«. Seit 2014 habe Budapest 183 deutsche Unternehmen gefördert. Insgesamt würden rund 6000 deutsche Firmen in Ungarn etwa 300.000 Menschen beschäftigen.

+
+ +
+

Orbáns Politik stößt nicht nur bei den Grünen auf Kritik, sondern auch bei den deutschen Unionsparteien. Bis März 2021 waren sie gemeinsam mit Orbáns Fidesz-Partei in der Europäischen Volkspartei; jahrelang hofierten sie den Autokraten aus Budapest.

Monika Hohlmeier (CSU) etwa, Vorsitzende des Haushaltskontrollausschusses im EU-Parlament, sieht in Orbán mittlerweile »einen Mann mit kleptokratischen Zügen«, in dessen System »rechtsstaatliche Prinzipien mit Füßen getreten werden«. Erfolgreiche ausländische Unternehmer müssten in Ungarn damit rechnen, »dass ein Oligarch auftaucht, der sich deine Firma unter den Nagel reißen will«.

+ +
+
+
+ "#; + + let doc = Parser::default_html().parse_string(html).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); + assert_eq!( + thumb, + "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" + ) +} diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 8ac7711..4eb4f24 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1171,7 +1171,7 @@ impl Util { } pub fn score_by_position(len: usize, index: usize) -> i32 { - (len / 2 - index) as i32 + ((len as f32 / 2.0) - index as f32) as i32 } } From fcec0d83ee754793ba78b74fa6c39ce07fc7505a Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 29 Jun 2023 19:47:49 +0200 Subject: [PATCH 04/56] don't move content nodes to
root node could fix potential crash? --- article_scraper/Cargo.toml | 2 +- article_scraper/src/clean.rs | 21 ++++++++++++--------- article_scraper/src/full_text_parser/mod.rs | 2 +- article_scraper/src/util.rs | 18 ++++++++++++++++++ 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 5ff0bac..948fe98 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] thiserror = "1.0" libxml = "0.3" reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1.27", features = ["macros", "fs", "io-util"] } +tokio = { version = "1.28", features = ["macros", "fs", "io-util"] } url = "2.3" regex = "1.8" encoding_rs = "0.8" diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index aec3967..c7913e3 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -1,4 +1,3 @@ -use libxml::tree::Node; use reqwest::Url; use crate::full_text_parser::error::FullTextParserError; @@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result"); + log::debug!("Failed to add iframe as child of video wrapper
"); } } else { log::warn!("Failed to get parent of iframe"); diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 4eb4f24..be04acf 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -547,6 +547,24 @@ impl Util { vec } + pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option { + let tag = tag.to_uppercase(); + + fn get_elems(node: &Node, tag: &str) -> Option { + for child in node.get_child_elements() { + if child.get_name().to_uppercase() == tag { + return Some(child.clone()); + } else { + return get_elems(&child, tag); + } + } + + None + } + + get_elems(node, &tag) + } + pub fn get_link_density(node: &Node) -> f64 { let text_length = Util::get_inner_text(node, true).len(); if text_length == 0 { From d62aa8c31a7d11824fa90666264bbf34e5bea857 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 29 Jun 2023 19:59:38 +0200 Subject: [PATCH 05/56] clippy fixes --- article_scraper/src/util.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index be04acf..57cd1b2 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -553,9 +553,9 @@ impl Util { fn get_elems(node: &Node, tag: &str) -> Option { for child in node.get_child_elements() { if child.get_name().to_uppercase() == tag { - return Some(child.clone()); - } else { - return get_elems(&child, tag); + return Some(child); + } else if let Some(node) = get_elems(&child, tag) { + return Some(node); } } From be40383b1a225c0cf08e78093bf07e315f5a92ba Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 16 Jul 2023 15:17:01 +0200 Subject: [PATCH 06/56] impl from reqwest error --- article_scraper/src/images/error.rs | 6 ++++++ article_scraper/src/images/request.rs | 8 ++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/article_scraper/src/images/error.rs b/article_scraper/src/images/error.rs index 7135f56..6c87710 100644 --- 
a/article_scraper/src/images/error.rs +++ b/article_scraper/src/images/error.rs @@ -23,3 +23,9 @@ pub enum ImageDownloadError { #[error("Unknown Error")] Unknown, } + +impl From for ImageDownloadError { + fn from(_value: reqwest::Error) -> Self { + Self::Http + } +} diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index ab64f41..c145598 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -14,11 +14,7 @@ pub struct ImageRequest { impl ImageRequest { pub async fn new(url: String, client: &Client) -> Result { - let response = client - .get(&url) - .send() - .await - .map_err(|_| ImageDownloadError::Http)?; + let response = client.get(&url).send().await?; let content_type = Self::get_content_type(&response)?; let content_length = Self::get_content_length(&response)?; @@ -40,7 +36,7 @@ impl ImageRequest { let mut result = Vec::with_capacity(self.content_length); while let Some(item) = stream.next().await { - let chunk = item.map_err(|_| ImageDownloadError::Http)?; + let chunk = item?; _ = tx.send(chunk.len()).await; for byte in chunk { result.push(byte); From d562d41b81908cb015414f8b3b30fb40b9298f53 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 16 Jul 2023 21:40:10 +0200 Subject: [PATCH 07/56] download single image --- article_scraper/src/images/mod.rs | 40 +++++++++++++++++++++++++++ article_scraper/src/images/request.rs | 37 ++++--------------------- article_scraper/src/util.rs | 32 ++++++++++++++++++++- 3 files changed, 76 insertions(+), 33 deletions(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 02933b5..14087c3 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -4,6 +4,7 @@ use self::pair::Pair; use self::request::ImageRequest; use crate::util::Util; use base64::Engine; +use futures::StreamExt; use image::ImageOutputFormat; use libxml::parser::Parser; use libxml::tree::{Node, 
SaveOptions}; @@ -28,6 +29,45 @@ impl ImageDownloader { ImageDownloader { max_size } } + pub async fn single_from_url( + url: &str, + client: &Client, + progress: Option>, + ) -> Result, ImageDownloadError> { + let response = client.get(url).send().await?; + + let content_type = Util::get_content_type(&response)?; + let content_length = Util::get_content_length(&response)?; + + if !content_type.contains("image") { + return Err(ImageDownloadError::ContentType); + } + + let mut stream = response.bytes_stream(); + let mut downloaded_bytes = 0; + + let mut result = Vec::with_capacity(content_length); + while let Some(item) = stream.next().await { + let chunk = item?; + downloaded_bytes += chunk.len(); + + if let Some(sender) = progress.as_ref() { + _ = sender + .send(Progress { + total_size: content_length, + downloaded: downloaded_bytes, + }) + .await; + } + + for byte in chunk { + result.push(byte); + } + } + + Ok(result) + } + pub async fn download_images_from_string( &self, html: &str, diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index c145598..b7086ce 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -1,7 +1,9 @@ use futures::StreamExt; -use reqwest::{header::CONTENT_TYPE, Client, Response}; +use reqwest::{Client, Response}; use tokio::sync::mpsc::Sender; +use crate::util::Util; + use super::{image_data::ImageData, ImageDownloadError}; #[derive(Debug)] @@ -16,8 +18,8 @@ impl ImageRequest { pub async fn new(url: String, client: &Client) -> Result { let response = client.get(&url).send().await?; - let content_type = Self::get_content_type(&response)?; - let content_length = Self::get_content_length(&response)?; + let content_type = Util::get_content_type(&response)?; + let content_length = Util::get_content_length(&response)?; if !content_type.contains("image") { return Err(ImageDownloadError::ContentType); @@ -58,33 +60,4 @@ impl ImageRequest { pub fn content_length(&self) 
-> usize { self.content_length } - - fn get_content_length(response: &Response) -> Result { - let status_code = response.status(); - - if !status_code.is_success() { - log::warn!("response: {status_code}"); - return Err(ImageDownloadError::Http); - } - - response - .headers() - .get(reqwest::header::CONTENT_LENGTH) - .and_then(|content_length| content_length.to_str().ok()) - .and_then(|content_length| content_length.parse::().ok()) - .ok_or(ImageDownloadError::ContentLength) - } - - fn get_content_type(response: &Response) -> Result { - if response.status().is_success() { - response - .headers() - .get(CONTENT_TYPE) - .and_then(|val| val.to_str().ok()) - .map(|val| val.to_string()) - .ok_or(ImageDownloadError::ContentType) - } else { - Err(ImageDownloadError::ContentType) - } - } } diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 57cd1b2..73adee8 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -5,7 +5,7 @@ use libxml::{ xpath::Context, }; use reqwest::{ - header::{HeaderMap, HeaderName, HeaderValue}, + header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE}, Response, }; use tokio::fs::DirEntry; @@ -14,6 +14,7 @@ use crate::{ constants::{self, NEGATIVE_LEAD_IMAGE_URL_HINTS_REGEX}, full_text_parser::{config::ConfigEntry, error::FullTextParserError}, image_object::ImageObject, + images::ImageDownloadError, video_object::VideoObject, }; @@ -1191,6 +1192,35 @@ impl Util { pub fn score_by_position(len: usize, index: usize) -> i32 { ((len as f32 / 2.0) - index as f32) as i32 } + + pub fn get_content_length(response: &Response) -> Result { + let status_code = response.status(); + + if !status_code.is_success() { + log::warn!("response: {status_code}"); + return Err(ImageDownloadError::Http); + } + + response + .headers() + .get(CONTENT_LENGTH) + .and_then(|content_length| content_length.to_str().ok()) + .and_then(|content_length| content_length.parse::().ok()) + 
.ok_or(ImageDownloadError::ContentLength) + } + + pub fn get_content_type(response: &Response) -> Result { + if response.status().is_success() { + response + .headers() + .get(CONTENT_TYPE) + .and_then(|val| val.to_str().ok()) + .map(|val| val.to_string()) + .ok_or(ImageDownloadError::ContentType) + } else { + Err(ImageDownloadError::ContentType) + } + } } #[cfg(test)] From 42eb9daf65ead8e0b8da8e6accc801659e564f18 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 22 Jul 2023 19:57:38 +0200 Subject: [PATCH 08/56] remove lazy loading attributes --- article_scraper/src/full_text_parser/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 10ea343..a21ec64 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -583,6 +583,9 @@ impl FullTextParser { continue; } + _ = node.remove_attribute("decoding"); + _ = node.remove_attribute("loading"); + for (name, val) in node.get_attributes() { if name == "src" || name == "srcset" || name == "alt" { continue; From 345518253ad87f630f1cc2c572513cdfca906f36 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 22 Jul 2023 20:03:32 +0200 Subject: [PATCH 09/56] even if img has src --- article_scraper/src/full_text_parser/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index a21ec64..166053d 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -583,9 +583,6 @@ impl FullTextParser { continue; } - _ = node.remove_attribute("decoding"); - _ = node.remove_attribute("loading"); - for (name, val) in node.get_attributes() { if name == "src" || name == "srcset" || name == "alt" { continue; @@ -848,6 +845,8 @@ impl FullTextParser { _ = Self::fix_lazy_images(context, document); _ = 
Self::fix_iframe_size(context, "youtube.com"); _ = Self::remove_attribute(context, Some("a"), "onclick"); + _ = Self::remove_attribute(context, Some("img"), "decoding"); + _ = Self::remove_attribute(context, Some("img"), "loading"); // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore From bf7a89fef7729eb2b0ebda962a8dd5cb53e09f6b Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 23 Jul 2023 15:39:24 +0200 Subject: [PATCH 10/56] don't fail because of lacking content length --- article_scraper/src/images/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 14087c3..56554c6 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -37,7 +37,7 @@ impl ImageDownloader { let response = client.get(url).send().await?; let content_type = Util::get_content_type(&response)?; - let content_length = Util::get_content_length(&response)?; + let content_length = Util::get_content_length(&response).unwrap_or(0); if !content_type.contains("image") { return Err(ImageDownloadError::ContentType); From db007f752cbcdb53de20706cbbdfbdabe485b2a6 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 27 Jul 2023 23:18:17 +0200 Subject: [PATCH 11/56] dont clean video tags --- article_scraper/src/clean.rs | 77 ++++++++++++++++++++++++++++++++++++ article_scraper/src/util.rs | 8 +--- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index c7913e3..6b0d698 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -70,6 +70,83 @@ mod tests { use super::clean_html; use reqwest::Url; + #[test] + fn rethinking_window_management() { + let html = r#"

Window management is one of those areas I’m fascinated with because even after 50 years, nobody’s fully cracked it yet. Ever since the dawn of time we’ve relied on the window metaphor as the primary way of multitasking on the desktop. In this metaphor, each app can spawn one or more rectangular windows, which are stacked by most recently used, and moved or resized manually.

+
Overlapping windows can get messy quickly
+

The traditional windowing system works well as long as you only have a handful of small windows, but issues emerge as soon the number and size of the windows grows. As new windows are opened, existing ones are obscured, sometimes completely hiding them from view. Or, when you open a maximized window, suddenly every other window is hidden.

+

Over the decades, different OSes have added different tools and workflows to deal with these issues, including workspaces, taskbars, and switchers. However, the basic primitives have not changed since the 70s and, as a result, the issues have never gone away.

+

While most of us are used to this system and its quirks, that doesn’t mean it’s without problems. This is especially apparent when you do user research with people who are new to computing, including children and older people. Manually placing and sizing windows can be fiddly work, and requires close attention and precise motor control. It’s also what we jokingly refer to as shit work: it is work that the user has to do, which is generated by the system itself, and has no other purpose.

+

Most of the time you don’t care about exact window sizes and positions and just want to see the windows that you need for your current task. Often that’s just a single, maximized window. Sometimes it’s two or three windows next to each other. It’s incredibly rare that you need a dozen different overlapping windows. Yet this is what you end up with by default today, when you simply use the computer, opening apps as you need them. Messy is the default, and it’s up to you to clean it up.

+

What about tiling?

+

Traditional tiling window managers solve the hidden window problem by preventing windows from overlapping. While this works well in some cases, it falls short as a general replacement for stacked, floating windows. The first reason for this is that tiling window managers size windows according to the amount of available screen space, yet most apps are designed to be used at a certain size and aspect ratio. For example, chat apps are inherently narrow and end up having large amounts of empty space at large sizes. Similarly, reading a PDF in a tiny window is not fun.

+
GNOME 44 with the “Forge” tiling extension. Just because windows can be tall and narrow doesn’t mean they should be :)
+

Another issue with tiling window manager is that they place new windows in seemingly arbitrary positions. This is a consequence of them not having knowledge about the content of a window or the context in which it is being used, and leads to having to manually move or resize windows after the fact, which is exactly the kind of fiddling we want to avoid in the first place.

+ +

More constrained tiling window managers such as on iPadOS are interesting in that they’re more purposeful (you always intentionally create the tiling groups). However, this approach only allows tiling two windows side-by-side, and does not scale well to larger screens.

+

History

+

This topic has been of interest to the design team for a very long time. I remember discussing it with Jakub at my first GUADEC in 2017, and there have been countless discussions, ideas, and concepts since. Some particular milestones in our thinking were the concept work leading up to GNOME 40 in 2019 and 2020, and the design sessions at the Berlin Mini GUADEC in 2022 and the Brno hackfest in 2023.

+
Tiling BoF in Brno during the HDR hackfest. Left to right: Robert Mader, Marco Trevisan, Georges Stavracase, Jakub Steiner and Allan Day (remote), Florian Müllner, Jonas Dreßler
+

I personally have a bit of a tradition working on this problem for at least a few weeks per year. For example, during the first lockdown in 2020 I spent quite a bit of time trying to envision a tiling-first version of GNOME Shell.

+
2020 mockup for a tiling-first GNOME Shell. More mockups in the OS mockups repo on Gitlab.
+

Problems with our current tiling

+

GNOME has had basic tiling functionality since early in the GNOME 3 series. While this is nice to have, it has obvious limitations:

+
    +
  • It’s completely manual
  • +
  • Only 2 windows are supported, and the current implementation is not extensible to more complex layouts
  • +
  • Tiled windows are not grouped in the window stack, so both windows are not raised simultaneously and other windows get in the way
  • +
  • Workspaces are manual, and not integrated into the workflow
  • +
+
Because tiled windows are currently mixed with overlapping floating windows they’re not really helping make things less messy in practice.
+

We’ve wanted more powerful tiling for years, but there has not been much progress due to the huge amount of work involved on the technical side and the lack of a clear design direction we were happy with. We now finally feel like the design is at a stage where we can take concrete next steps towards making it happen, which is very exciting!

+

Get out of my way

+

The key point we keep coming back to with this work is that, if we do add a new kind of window management to GNOME, it needs to be good enough to be the default. We don’t want to add yet another manual opt-in tool that doesn’t solve the problems the majority of people face.

+

To do this we landed on a number of high level ideas:

+
    +
  • Automatically do what people probably want, allow adjusting if needed
  • +
  • Make use of workspaces as a fully integrated part of the workflow
  • +
  • Richer metadata from apps to allow for better integration
  • +
+

Our current concept imagines windows having three potential layout states:

+
    +
  • Mosaic, a new window management mode which combines the best parts of tiling and floating
  • +
  • Edge Tiling, i.e. windows splitting the screen edge-to-edge
  • +
  • Floating, the classic stacked windows model
  • +
+ +

Mosaic is the default behavior. You open a window, it opens centered on the screen at a size that makes the most sense for the app. For a web browser that might be maximized, for a weather app maybe only 700×500 pixels.

+ +

As you open more windows, the existing windows move aside to make room for the new ones. If a new window doesn’t fit (e.g. because it wants to be maximized) it moves to its own workspace. If the window layout comes close to filling the screen, the windows are automatically tiled.

+ +

You can also manually tile windows. If there’s enough space, other windows are left in a mosaic layout. However, if there’s not enough space for this mosaic layout, you’re prompted to pick another window to tile alongside.

+ +

You’re not limited to tiling just two windows side by side. Any tile (or the remaining space) can be split by dragging another window over it, and freely resized as the window minimum sizes allow.

+ +

There are always going to be cases that require placing a window in a specific position on the screen. The new system allows windows to be used with the classic floating behavior, on a layer above the mosaic/tiling windows. However, we think that this floating behaviour is going to be a relatively uncommon, similar to the existing “always on top” behavior that we have today.

+

There’s of course much more to this, but hopefully this gives an idea of what we have in mind in terms of behavior.

+

New window metadata

+

As mentioned above, to avoid the pitfalls of traditional tiling window managers we need more information from windows about their content. Windows can already set a fixed size and they have an implicit minimum size, but to build a great tiling experience we need more.

+
Some apps should probably never be maximized/tiled on a 4K monitor…
+

One important missing piece is having information on the maximum desired size of a window. This is the size beyond which the window content stops looking good. Not having this information is one of the reasons that traditional tiling window managers have issues, especially on larger screens. This maximum size would not be a hard limit and manual resizing would still be possible. Instead, the system would use the maximum size as one factor when it calculates an optimal window layout. For example, when tiling to the side of the screen, a window would only grow as wide as its maximum width rather than filling exactly half of the screen.

+

In addition, it’d be helpful to know the range of ideal sizes where an app works best. While an app may technically work at mobile sizes that’s probably not the best way to use that app if you have a large display. To stay with our chat example, you probably want to avoid folding the sidebar if it can be avoided, so the range of ideal sizes would be between the point where it becomes single pane and its maximum usable size.

+

Ideally these properties could be set dynamically depending on the window content. For example, a spreadsheet with a lot of columns but few rows could have a wider ideal size than one with lots of rows.

+

Depending on apps using new system APIs can be challenging and slow — it’s not easy to move the entire ecosystem! However, we think there’s a good chance of success in this case, due to the simplicity and universal usefulness of the API.

+

Next steps

+

At the Brno hackfest in April we had an initial discussion with GNOME Shell developers about many of the technical details. There is tentative agreement that we want to move in the direction outlined in this post, but there’s still a lot of work ahead.

+

On the design side, the biggest uncertainty is the mosaic behavior — it’s a novel approach to window management without much prior art. That’s exciting, but also makes it a bit risky to jump head-first into implementation. We’d like to do user research to validate some of our assumptions on different aspects of this, but it’s the kind of project that’s very difficult to test outside of an actual prototype that’s usable day to day.

+

If you’d like to get involved with this initiative, one great way to help out would be to work on an extension that implements (parts of) the mosaic behavior for testing and refining the interactions. If you’re interested in this, please reach out :)

+

There’s no timeline or roadmap at this stage, but it’s definitely 46+ material and likely to take multiple cycles. There are individual parts of this that could be worked on independently ahead of the more contingent pieces, for example tiling groups or new window metadata. Help in any of these areas would be appreciated.

+

This post is summarizing collaborative work over the past years by the entire design team (Allan Day, Jakub Steiner, Sam Hewitt, et al). In particular, thanks to Jakub for the awesome animations bringing the behaviors to life!

"#; + + let url = + Url::parse("https://blogs.gnome.org/tbernard/2023/07/26/rethinking-window-management/") + .unwrap(); + let res = clean_html(html, &url).unwrap(); + + std::fs::write("/home/jeanluc/result.html", res.html).unwrap(); + } + #[test] fn finshots() { let html = r#"Amul, Cola and Atta???

In today’s Finshots, we discuss Amul’s pathway to becoming more than just a dairy brand.


The Story

The ₹61,000 crore Amul has a new leader — Jayen Mehta. And he says he wants to transform the dairy giant into a veritable FMCG behemoth. Think atta to compete with ITC’s Aashirvaad. Biscuits that creep into Britannia’s territory and even carbonated beverages to take on the might of Coca-Cola and Pepsi.

Now, you might have seen some of these products on your supermarket shelves already. Because they’re not exactly brand new launches. Amul has slowly been testing the waters over the past few years. And now, it just wants to double down on this diversification.

But before we get into why and how let’s rewind a bit to understand Amul’s history.

The story begins in 1945. The milk farmers at Anand in Gujarat’s Kaira (now Kheda) district were miserable. The entire market was controlled by one entity — Polson’s Dairy. See, the government had launched the Bombay Milk Scheme where milk had to be sent from Anand to Bombay. And since milk is perishable, it couldn’t be quickly transported across the country without getting spoilt. So the milk had to be pasteurised at Anand itself. And considering Polson had the factories, it emerged as the winner and it began to dictate prices to the farmers. They paid peanuts and Polson’s and the middlemen pocketed all the profits from the sales.

But then came Sardar Vallabhai Patel, the Iron Man of India, who rallied the farmers into setting up a cooperative. He wanted them to work together and pool their resources. A bigger unit meant that they could dictate their own terms. The farmers went on strike. Bombay ran out of milk. And finally, the Kaira District Co-operative Milk Producers’ Union or Amul was born. They kicked Polsons out of the game and started pasteurising milk for the Bombay Milk Scheme in 1948. Two villages, 250 litres of milk. That’s it.

But soon, there was another problem ― excess milk. See, because of a shortage of cow milk, the Union processed buffalo milk as well. But there came a point where Bombay wasn’t able to absorb this excess milk.

Enter Dr. Verghese Kurien, a government servant who was deputed to Anand’s experimental creamery. The man chalked out a billion-litre idea of reprocessing excess buffalo milk. And that’s when they decided to set up a factory to churn the raw milk into milk powder and butter. Products that had a longer shelf-life. In 1954, the first step towards the diversification of Amul’s products began.

Amul became a pan-India movement. And what started as a tiny union of a handful of farmers producing 250 litres of milk a day is now a 3.6 million-strong organisation producing an average of over 26 million litres of milk daily.

So yeah, you can see why consumers like you and me consider Amul synonymous with dairy. There’s a long history and there’s nothing else quite like it.

Now diversification is a natural strategy for any company, right? No one wants to be dependent on just one product. Also, milk is just a commodity. You can’t really earn too much margin on it. So Amul began to create milk-adjacent products that would add more value to the consumer. These products could be priced higher and make the cooperative more money — cheese, paneer, buttermilk, flavoured shakes, and ice creams were a perfect fit for a dairy company. And the strategy worked. In FY19–20, these value-added products actually contributed to 45% of its revenues.

Now if you think about it, Amul has all the ingredients to succeed with its diversification into non-dairy items like colas, atta, biscuits, and french fries too. It just needs to follow the same playbook, right?

It has a brand image that has been carefully cultivated over the years. In part due to the iconic Amul girl in the red polka-dotted dress. While other leading brands apportion 8–15% of their total spending on ads, Amul spends less than 1% on advertisements. And this brand image can come in handy for penetrating the rural markets which typically make up nearly 40% of an FMCG company’s sales. People trust Amul.

And most importantly, Amul has a massive distribution network it can tap — 10,000 distributors and over a million retailers. Its frozen products like french fries and aloo tikki can simply leverage its existing ice cream cold chain network. Amul really doesn’t need to build new distribution facilities from scratch.

But here’s the thing. Despite its decades of success selling dairy products, Amul hasn’t quite been able to crack the diversification code. It hasn’t been able to emerge as a true FMCG player yet.

Take chocolates for instance. Amul actually forayed into the industry way back in the 1970s itself. In fact, it tried the same playbook of setting up a cooperative society for cocoa farming. It wanted to fight Cadbury’s monopoly. It thought it could easily use its existing cold chain network for distribution. It even advertised heavily when colour televisions became popular in India in the 1980s. But nothing worked. Today, Amul has a measly 3% market share in India.

In 2006, it launched a sports drink called Stamina. It didn’t see any takers. It shut shop, re-launched the drink a decade later and failed again. Amul even launched a frozen pizza in the 2000s! And if you’re surprised at that bit of news, well, that’s because it failed too.

In 2019, it forayed into butter cookies. And it even took on rivals like Britannia’s Good Day. It thought, “Hey, we’re supplying all the butter to these FMCG companies. But they’re actually mixing a lot of palm oil into it. Why not make one of our own?”

Amul even went on the offensive and launched ad campaigns saying that it had ‘25% Amul butter.’ And that everyone else had less than 3%. It said that rivals simply used a flavouring. But despite that ad blitz, Amul hasn’t set the butter cookie segment on fire.

And in 2020, it launched the Amul Tru seltzer — a carbonated fizzy drink to take on the colas of India. But even this product hasn’t moved the needle.

Basically, almost everything other than the value-added dairy products hasn’t quite worked out for Amul. Its brand or distribution hasn’t helped it. So will it be different this time under new leadership? We don’t know.

Or maybe Amul should just do what it does best and focus on getting more of the dairy pie? After all, only 30% of the $110-billion dairy sector is organized even today.

Can Amul crack the code for non-dairy FMCG products? What do you think?

Until then…

Don't forget to share this article on WhatsApp, LinkedIn and Twitter


Ditto Insights: Why Millennials should buy a term plan

According to a survey, only 17% of Indian millennials (25–35 yrs) have bought term insurance. The actual numbers are likely even lower.

And the more worrying fact is that 55% hadn’t even heard of term insurance!

So why is this happening?

One common misconception is the dependent conundrum. Most millennials we spoke to want to buy a term policy because they want to cover their spouse and kids. And this makes perfect sense. After all, in your absence you want your term policy to pay out a large sum of money to cover your family’s needs for the future. But these very same people don’t think of their parents as dependents even though they support them extensively. I remember the moment it hit me. I routinely send money back home, but I had never considered my parents as my dependents. And when a colleague spoke about his experience, I immediately put two and two together. They were dependent on my income and my absence would most certainly affect them financially. So a term plan was a no-brainer for me.

There’s another reason why millennials should probably consider looking at a term plan — Debt. Most people we spoke to have home loans, education loans and other personal loans with a considerable interest burden. In their absence, this burden would shift to their dependents. It’s not something most people think of, but it happens all the time.

Finally, you actually get a pretty good bargain on term insurance prices when you’re younger. The idea is to pay a nominal sum every year (something that won’t burn your pocket) to protect your dependents in the event of your untimely demise. And this fee is lowest when you’re young.

So if you’re a millennial and you’re reading this, maybe you should reconsider buying a term plan. And don’t forget to talk to us at Ditto while you’re at it. We only have a limited number of slots everyday, so make sure you book your appointment at the earliest:

1. Just head to our website by clicking on the link here

2. Click on “Book a FREE call”

3. Select Term Insurance

4. Choose the date & time as per your convenience and RELAX!

"#; diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 73adee8..22299b6 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -738,11 +738,6 @@ impl Util { } } - // For embed with tag, check inner HTML as well. - // if embed_node.get_name().to_lowercase() == "object" && constants::VIDEOS.is_match(embed_node.innerHTML) { - // return false; - // } - embed_count += 1; } } @@ -755,8 +750,9 @@ impl Util { let image_obj_count = Util::get_elements_by_tag_name(node, "imageobject").len(); let video_obj_count = Util::get_elements_by_tag_name(node, "videoobject").len(); + let video_tag_count = Util::get_elements_by_tag_name(node, "video").len(); - if image_obj_count > 0 || video_obj_count > 0 { + if image_obj_count > 0 || video_obj_count > 0 || video_tag_count > 0 { return false; } From 40f065d9cd44bfcec9993cdea431c7b8f014c4c8 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Jul 2023 07:03:50 +0200 Subject: [PATCH 12/56] allow downloads without content type smaller than 5mb --- article_scraper/src/constants.rs | 1 + article_scraper/src/images/mod.rs | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/constants.rs b/article_scraper/src/constants.rs index 31bc213..79823f8 100644 --- a/article_scraper/src/constants.rs +++ b/article_scraper/src/constants.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use once_cell::sync::Lazy; use regex::{Regex, RegexBuilder}; +pub const UNKNOWN_CONTENT_SIZE_LIMIT: usize = 5 * 1024 * 1024; pub const DEFAULT_CHAR_THRESHOLD: usize = 500; pub static IS_IMAGE: Lazy = Lazy::new(|| { RegexBuilder::new(r#"\.(jpg|jpeg|png|webp)"#) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 56554c6..5a2d133 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -2,6 +2,7 @@ pub use self::error::ImageDownloadError; use self::image_data::ImageDataBase64; use self::pair::Pair; use 
self::request::ImageRequest; +use crate::constants; use crate::util::Util; use base64::Engine; use futures::StreamExt; @@ -36,13 +37,19 @@ impl ImageDownloader { ) -> Result, ImageDownloadError> { let response = client.get(url).send().await?; - let content_type = Util::get_content_type(&response)?; - let content_length = Util::get_content_length(&response).unwrap_or(0); + let content_type = Util::get_content_type(&response); + let content_length = Util::get_content_length(&response); - if !content_type.contains("image") { + if let (Err(_), Ok(content_length)) = (&content_type, &content_length) { + if *content_length > constants::UNKNOWN_CONTENT_SIZE_LIMIT { + return Err(ImageDownloadError::ContentType); + } + } else if !content_type?.contains("image") { return Err(ImageDownloadError::ContentType); } + let content_length = content_length.unwrap_or(0); + let mut stream = response.bytes_stream(); let mut downloaded_bytes = 0; From eb1bfdbca0258e6058096c15b45ab15105991986 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Jul 2023 07:09:50 +0200 Subject: [PATCH 13/56] print url --- article_scraper/src/util.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 22299b6..dcbbad9 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1193,7 +1193,7 @@ impl Util { let status_code = response.status(); if !status_code.is_success() { - log::warn!("response: {status_code}"); + log::warn!("response: {status_code} ({})", response.url()); return Err(ImageDownloadError::Http); } @@ -1206,16 +1206,19 @@ impl Util { } pub fn get_content_type(response: &Response) -> Result { - if response.status().is_success() { - response - .headers() - .get(CONTENT_TYPE) - .and_then(|val| val.to_str().ok()) - .map(|val| val.to_string()) - .ok_or(ImageDownloadError::ContentType) - } else { - Err(ImageDownloadError::ContentType) + let status_code = 
response.status(); + + if !status_code.is_success() { + log::warn!("response: {status_code} ({})", response.url()); + return Err(ImageDownloadError::Http); } + + response + .headers() + .get(CONTENT_TYPE) + .and_then(|val| val.to_str().ok()) + .map(|val| val.to_string()) + .ok_or(ImageDownloadError::ContentType) } } From a7e8661a098599184150757d315b0e891419161b Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 1 Aug 2023 18:37:55 +0200 Subject: [PATCH 14/56] update tests & defined youtube iframe height --- .../resources/tests/ftr/youtube/expected.html | 2 +- .../readability/bug-1255978/expected.html | 13 ++- .../firefox-nightly-blog/expected.html | 6 +- .../readability/wikipedia-2/expected.html | 88 +++++++++---------- .../readability/wikipedia-3/expected.html | 2 +- article_scraper/src/clean.rs | 2 +- article_scraper/src/full_text_parser/mod.rs | 2 +- 7 files changed, 63 insertions(+), 52 deletions(-) diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html index 1213034..1ec3c5a 100644 --- a/article_scraper/resources/tests/ftr/youtube/expected.html +++ b/article_scraper/resources/tests/ftr/youtube/expected.html @@ -1 +1 @@ -
\ No newline at end of file +
\ No newline at end of file diff --git a/article_scraper/resources/tests/readability/bug-1255978/expected.html b/article_scraper/resources/tests/readability/bug-1255978/expected.html index 12043fb..bcd0d9f 100644 --- a/article_scraper/resources/tests/readability/bug-1255978/expected.html +++ b/article_scraper/resources/tests/readability/bug-1255978/expected.html @@ -27,7 +27,18 @@

Forrest Jones said that anything that comes into contact with any of the previous guest’s skin should be taken out and washed every time the room is made, but that even the fanciest hotels don’t always do so. "Hotels are getting away from comforters. Blankets are here to stay, however. But some hotels are still hesitant about washing them every day if they think they can get out of it," he said.

- +
+ + + + +

Play Video

+ + + + + +

Video shows bed bug infestation at New York hotel

diff --git a/article_scraper/resources/tests/readability/firefox-nightly-blog/expected.html b/article_scraper/resources/tests/readability/firefox-nightly-blog/expected.html index 77e6e10..28cc0c9 100644 --- a/article_scraper/resources/tests/readability/firefox-nightly-blog/expected.html +++ b/article_scraper/resources/tests/readability/firefox-nightly-blog/expected.html @@ -16,7 +16,7 @@
  • -

    The about:restartrequired error page, saying "Sorry. We just need to do one small thing to keep going. Nightly has just been updated in the background. Click Restart Nightly to complete the update. We will restore all your pages, windows and tabs afterwards, so you can be on your way quickly.", followed by a button to restart Nightly.

    +

    The about:restartrequired error page, saying "Sorry. We just need to do one small thing to keep going. Nightly has just been updated in the background. Click Restart Nightly to complete the update. We will restore all your pages, windows and tabs afterwards, so you can be on your way quickly.", followed by a button to restart Nightly.

    Users who run multiple user profiles concurrently will probably see this less!

    @@ -118,7 +118,7 @@
    • -

      A table showing the total number of remaining bugs for the MVP to make the DevTools Fission-compatible.

      +

      A table showing the total number of remaining bugs for the MVP to make the DevTools Fission-compatible.

      Our DevTools are ready for Fission (out-of-process iframes)!

      @@ -130,7 +130,7 @@
      • -

        A table showing the total number of remaining bugs for the MVP to make Marionette Fission-compatible.

        +

        A table showing the total number of remaining bugs for the MVP to make Marionette Fission-compatible.

        Marionette, the framework that allows Firefox to be tested with automation, is now Fission compatible too!

        diff --git a/article_scraper/resources/tests/readability/wikipedia-2/expected.html b/article_scraper/resources/tests/readability/wikipedia-2/expected.html index c6a4864..3774bc5 100644 --- a/article_scraper/resources/tests/readability/wikipedia-2/expected.html +++ b/article_scraper/resources/tests/readability/wikipedia-2/expected.html @@ -22,14 +22,14 @@
        -

        Blue field with the Union Flag in the top right corner, and four red stars with white borders to the right. +

        Blue field with the Union Flag in the top right corner, and four red stars with white borders to the right.

        -

        A quartered shield, flanked by two figures, topped with a crown. +

        A quartered shield, flanked by two figures, topped with a crown.

        Coat of arms @@ -56,7 +56,7 @@ - A map of the hemisphere centred on New Zealand, using an orthographic projection. + A map of the hemisphere centred on New Zealand, using an orthographic projection.

        Location of New Zealand, including outlying islands, its territorial claim in the Antarctic, and Tokelau

        @@ -354,7 +354,7 @@ HDI (2017) - Increase 0.917[8]
        + Increase 0.917[8]
        very high · 16th @@ -548,7 +548,7 @@
        -

        Brown square paper with Dutch writing and a thick red, curved line

        +

        Brown square paper with Dutch writing and a thick red, curved line

        Detail from a 1657 map showing the western coastline of "Nova Zeelandia". (In this map, north is at the bottom.)

        @@ -564,7 +564,7 @@
        -

        One set of arrows point from Taiwan to Melanesia to Fiji/Samoa and then to the Marquesas Islands. The population then spread, some going south to New Zealand and others going north to Hawai'i. A second set start in southern Asia and end in Melanesia.

        +

        One set of arrows point from Taiwan to Melanesia to Fiji/Samoa and then to the Marquesas Islands. The population then spread, some going south to New Zealand and others going north to Hawai'i. A second set start in southern Asia and end in Melanesia.

        The Māori people are most likely descended from people who emigrated from Taiwan to Melanesia and then travelled east through to the Society Islands. After a pause of 70 to 265 years, a new wave of exploration led to the discovery and settlement of New Zealand.[22]

        @@ -572,7 +572,7 @@ New Zealand was one of the last major landmasses settled by humans. Radiocarbon dating, evidence of deforestation[23] and mitochondrial DNA variability within Māori populations[24] suggest New Zealand was first settled by Eastern Polynesians between 1250 and 1300,[19][25] concluding a long series of voyages through the southern Pacific islands.[26] Over the centuries that followed, these settlers developed a distinct culture now known as Māori. The population was divided into iwi (tribes) and hapū (subtribes) who would sometimes cooperate, sometimes compete and sometimes fight against each other.[27] At some point a group of Māori migrated to Rēkohu, now known as the Chatham Islands, where they developed their distinct Moriori culture.[28][29] The Moriori population was all but wiped out between 1835 and 1862, largely because of Taranaki Māori invasion and enslavement in the 1830s, although European diseases also contributed. In 1862 only 101 survived, and the last known full-blooded Moriori died in 1933.[30]

        -

        An engraving of a sketched coastline on white background

        +

        An engraving of a sketched coastline on white background

        Map of the New Zealand coastline as Cook charted it on his first visit in 1769–70. The track of the Endeavour is also shown.

        @@ -580,13 +580,13 @@ The first Europeans known to have reached New Zealand were Dutch explorer Abel Tasman and his crew in 1642.[31] In a hostile encounter, four crew members were killed and at least one Māori was hit by canister shot.[32] Europeans did not revisit New Zealand until 1769 when British explorer James Cook mapped almost the entire coastline.[31] Following Cook, New Zealand was visited by numerous European and North American whaling, sealing and trading ships. They traded European food, metal tools, weapons and other goods for timber, Māori food, artefacts and water.[33] The introduction of the potato and the musket transformed Māori agriculture and warfare. Potatoes provided a reliable food surplus, which enabled longer and more sustained military campaigns.[34] The resulting intertribal Musket Wars encompassed over 600 battles between 1801 and 1840, killing 30,000–40,000 Māori.[35] From the early 19th century, Christian missionaries began to settle New Zealand, eventually converting most of the Māori population.[36] The Māori population declined to around 40% of its pre-contact level during the 19th century; introduced diseases were the major factor.[37]

        -

        A torn sheet of paper

        +

        A torn sheet of paper

        In 1788 Captain Arthur Phillip assumed the position of Governor of the new British colony of New South Wales which according to his commission included New Zealand.[38] The British Government appointed James Busby as British Resident to New Zealand in 1832 following a petition from northern Māori.[39] In 1835, following an announcement of impending French settlement by Charles de Thierry, the nebulous United Tribes of New Zealand sent a Declaration of Independence to King William IV of the United Kingdom asking for protection.[39] Ongoing unrest, the proposed settlement of New Zealand by the New Zealand Company (which had already sent its first ship of surveyors to buy land from Māori) and the dubious legal standing of the Declaration of Independence prompted the Colonial Office to send Captain William Hobson to claim sovereignty for the United Kingdom and negotiate a treaty with the Māori.[40] The Treaty of Waitangi was first signed in the Bay of Islands on 6 February 1840.[41] In response to the New Zealand Company's attempts to establish an independent settlement in Wellington[42] and French settlers purchasing land in Akaroa,[43] Hobson declared British sovereignty over all of New Zealand on 21 May 1840, even though copies of the Treaty were still circulating throughout the country for Māori to sign.[44] With the signing of the Treaty and declaration of sovereignty the number of immigrants, particularly from the United Kingdom, began to increase.[45]

        -

        Black and white engraving depicting a crowd of people

        +

        Black and white engraving depicting a crowd of people

        New Zealand, still part of the colony of New South Wales, became a separate Colony of New Zealand on 1 July 1841.[46] Armed conflict began between the Colonial government and Māori in 1843 with the Wairau Affray over land and disagreements over sovereignty. These conflicts, mainly in the North Island, saw thousands of Imperial troops and the Royal Navy come to New Zealand and became known as the New Zealand Wars. Following these armed conflicts, large amounts of Māori land was confiscated by the government to meet settler demands.[47] @@ -609,12 +609,12 @@

        -

        The Queen wearing her New Zealand insignia +

        The Queen wearing her New Zealand insignia

        -

        A smiling woman wearing a black dress +

        A smiling woman wearing a black dress

        @@ -629,7 +629,7 @@ A parliamentary general election must be called no later than three years after the previous election.[76] Almost all general elections between 1853 and 1993 were held under the first-past-the-post voting system.[77] Since the 1996 election, a form of proportional representation called mixed-member proportional (MMP) has been used.[66] Under the MMP system, each person has two votes; one is for a candidate standing in the voter's electorate and the other is for a party. Since the 2014 election, there have been 71 electorates (which include seven Māori electorates in which only Māori can optionally vote),[78] and the remaining 49 of the 120 seats are assigned so that representation in parliament reflects the party vote, with the threshold that a party must win at least one electorate or 5% of the total party vote before it is eligible for a seat.[79]

        -

        A block of buildings fronted by a large statue.

        +

        A block of buildings fronted by a large statue.

        Elections since the 1930s have been dominated by two political parties, National and Labour.[77] Between March 2005 and August 2006, New Zealand became the first country in the world in which all the highest offices in the land—head of state, governor-general, prime minister, speaker and chief justice—were occupied simultaneously by women.[80] The current prime minister is Jacinda Ardern, who has been in office since 26 October 2017.[81] She is the country's third female prime minister.[82] @@ -646,7 +646,7 @@

        -

        A squad of men kneel in the desert sand while performing a war dance

        +

        A squad of men kneel in the desert sand while performing a war dance

        Early colonial New Zealand allowed the British Government to determine external trade and be responsible for foreign policy.[91] The 1923 and 1926 Imperial Conferences decided that New Zealand should be allowed to negotiate its own political treaties and the first commercial treaty was ratified in 1928 with Japan. On 3 September 1939 New Zealand allied itself with Britain and declared war on Germany with Prime Minister Michael Joseph Savage proclaiming, "Where she goes, we go; where she stands, we stand."[92] @@ -655,7 +655,7 @@ In 1951 the United Kingdom became increasingly focused on its European interests,[93] while New Zealand joined Australia and the United States in the ANZUS security treaty.[94] The influence of the United States on New Zealand weakened following protests over the Vietnam War,[95] the refusal of the United States to admonish France after the sinking of the Rainbow Warrior,[96] disagreements over environmental and agricultural trade issues and New Zealand's nuclear-free policy.[97][98] Despite the United States' suspension of ANZUS obligations the treaty remained in effect between New Zealand and Australia, whose foreign policy has followed a similar historical trend.[99] Close political contact is maintained between the two countries, with free trade agreements and travel arrangements that allow citizens to visit, live and work in both countries without restrictions.[100] In 2013 there were about 650,000 New Zealand citizens living in Australia, which is equivalent to 15% of the resident population of New Zealand.[101]

        -

        A soldier in a green army uniform faces forwards

        +

        A soldier in a green army uniform faces forwards

        Anzac Day service at the National War Memorial

        @@ -673,7 +673,7 @@
        -

        Map with the North, South, Stewart/Rakiura, Tokelau, Cook, Niue, Kermadec, Chatham, Bounty, Antipodes, Snare, Auckland and Campbell Islands highlighted. New Zealand's segment of Antarctica (the Ross Dependency) is also highlighted.

        +

        Map with the North, South, Stewart/Rakiura, Tokelau, Cook, Niue, Kermadec, Chatham, Bounty, Antipodes, Snare, Auckland and Campbell Islands highlighted. New Zealand's segment of Antarctica (the Ross Dependency) is also highlighted.

        The early European settlers divided New Zealand into provinces, which had a degree of autonomy.[121] Because of financial pressures and the desire to consolidate railways, education, land sales and other policies, government was centralised and the provinces were abolished in 1876.[122] The provinces are remembered in regional public holidays[123] and sporting rivalries.[124] @@ -711,15 +711,15 @@ Countries -  New Zealand +  New Zealand -  Cook Islands +  Cook Islands -  Niue +  Niue @@ -744,7 +744,7 @@ Ross Dependency -  Tokelau +  Tokelau 15 islands @@ -799,7 +799,7 @@

        -

        Islands of New Zealand as seen from satellite

        +

        Islands of New Zealand as seen from satellite

        New Zealand is located near the centre of the water hemisphere and is made up of two main islands and a number of smaller islands. The two main islands (the North Island, or Te Ika-a-Māui, and the South Island, or Te Waipounamu) are separated by Cook Strait, 22 kilometres (14 mi) wide at its narrowest point.[138] Besides the North and South Islands, the five largest inhabited islands are Stewart Island (across the Foveaux Strait), Chatham Island, Great Barrier Island (in the Hauraki Gulf),[139] D'Urville Island (in the Marlborough Sounds)[140] and Waiheke Island (about 22 km (14 mi) from central Auckland).[141] @@ -807,12 +807,12 @@

        -

        A large mountain with a lake in the foreground +

        A large mountain with a lake in the foreground

        -

        Snow-capped mountain range +

        Snow-capped mountain range

        The Southern Alps stretch for 500 kilometres down the South Island @@ -839,13 +839,13 @@

      • -

        +

      • -

        +

      • @@ -991,7 +991,7 @@
        -

        Kiwi amongst sticks

        +

        Kiwi amongst sticks

        The endemic flightless kiwi is a national icon.

        @@ -1002,7 +1002,7 @@ Before the arrival of humans, an estimated 80% of the land was covered in forest, with only high alpine, wet, infertile and volcanic areas without trees.[173] Massive deforestation occurred after humans arrived, with around half the forest cover lost to fire after Polynesian settlement.[174] Much of the remaining forest fell after European settlement, being logged or cleared to make room for pastoral farming, leaving forest occupying only 23% of the land.[175]

        -

        An artist's rendition of a Haast's eagle attacking two moa

        +

        An artist's rendition of a Haast's eagle attacking two moa

        The giant Haast's eagle died out when humans hunted its main prey, the moa, to extinction.

        @@ -1022,7 +1022,7 @@
        -

        Boats docked in blue-green water. Plate glass skyscrapers rising up in the background.

        +

        Boats docked in blue-green water. Plate glass skyscrapers rising up in the background.

        New Zealand has an advanced market economy,[193] ranked 16th in the 2018 Human Development Index[8] and third in the 2018 Index of Economic Freedom.[194] It is a high-income economy with a nominal gross domestic product (GDP) per capita of US$36,254.[6] The currency is the New Zealand dollar, informally known as the "Kiwi dollar"; it also circulates in the Cook Islands (see Cook Islands dollar), Niue, Tokelau, and the Pitcairn Islands.[195] @@ -1031,7 +1031,7 @@ Historically, extractive industries have contributed strongly to New Zealand's economy, focussing at different times on sealing, whaling, flax, gold, kauri gum, and native timber.[196] The first shipment of refrigerated meat on the Dunedin in 1882 led to the establishment of meat and dairy exports to Britain, a trade which provided the basis for strong economic growth in New Zealand.[197] High demand for agricultural products from the United Kingdom and the United States helped New Zealanders achieve higher living standards than both Australia and Western Europe in the 1950s and 1960s.[198] In 1973, New Zealand's export market was reduced when the United Kingdom joined the European Economic Community[199] and other compounding factors, such as the 1973 oil and 1979 energy crises, led to a severe economic depression.[200] Living standards in New Zealand fell behind those of Australia and Western Europe, and by 1982 New Zealand had the lowest per-capita income of all the developed nations surveyed by the World Bank.[201] In the mid-1980s New Zealand deregulated its agricultural sector by phasing out subsidies over a three-year period.[202][203] Since 1984, successive governments engaged in major macroeconomic restructuring (known first as Rogernomics and then Ruthanasia), rapidly transforming New Zealand from a protected and highly regulated economy to a liberalised free-trade economy.[204][205]

        -

        Blue water against a backdrop of snow-capped mountains

        +

        Blue water against a backdrop of snow-capped mountains

        Milford Sound is one of New Zealand's most famous tourist destinations.[206]

        @@ -1045,7 +1045,7 @@ New Zealand is heavily dependent on international trade,[217] particularly in agricultural products.[218] Exports account for 24% of its output,[143] making New Zealand vulnerable to international commodity prices and global economic slowdowns. Food products made up 55% of the value of all the country's exports in 2014; wood was the second largest earner (7%).[219] New Zealand's main trading partners, as at June 2018, are China (NZ$27.8b), Australia ($26.2b), the European Union ($22.9b), the United States ($17.6b), and Japan ($8.4b).[220] On 7 April 2008, New Zealand and China signed the New Zealand–China Free Trade Agreement, the first such agreement China has signed with a developed country.[221] The service sector is the largest sector in the economy, followed by manufacturing and construction and then farming and raw material extraction.[143] Tourism plays a significant role in the economy, contributing $12.9 billion (or 5.6%) to New Zealand's total GDP and supporting 7.5% of the total workforce in 2016.[222] International visitor arrivals are expected to increase at a rate of 5.4% annually up to 2022.[222]

        -

        A Romney ewe with her two lambs

        +

        A Romney ewe with her two lambs

        Wool has historically been one of New Zealand's major exports.

        @@ -1056,7 +1056,7 @@ Infrastructure
        -

        A mid-size jet airliner in flight. The plane livery is all-black and features a New Zealand silver fern mark.

        +

        A mid-size jet airliner in flight. The plane livery is all-black and features a New Zealand silver fern mark.

        In 2015, renewable energy, primarily geothermal and hydroelectric power, generated 40.1% of New Zealand's gross energy supply.[231] Geothermal power alone accounted for 22% of New Zealand's energy in 2015.[231] @@ -1075,7 +1075,7 @@

        -

        Stationary population pyramid broken down into 21 age ranges.

        +

        Stationary population pyramid broken down into 21 age ranges.

        The 2013 New Zealand census enumerated a resident population of 4,242,048, an increase of 5.3% over the 2006 figure.[245][n 8] As of September 2019, the total population has risen to an estimated 4,933,210.[5] @@ -1141,9 +1141,9 @@ - Auckland
        + Auckland
        Auckland
        - Wellington
        + Wellington
        Wellington @@ -1171,9 +1171,9 @@ 58,800 - Christchurch
        + Christchurch
        Christchurch
        - Hamilton
        + Hamilton
        Hamilton @@ -1419,7 +1419,7 @@

        -

        Pedestrians crossing a wide street which is flanked by storefronts

        +

        Pedestrians crossing a wide street which is flanked by storefronts

        Pedestrians on Queen Street in Auckland, an ethnically diverse city

        @@ -1437,7 +1437,7 @@
        -

        Map of New Zealand showing the percentage of people in each census area unit who speak Māori. Areas of the North Island exhibit the highest Māori proficiency.

        +

        Map of New Zealand showing the percentage of people in each census area unit who speak Māori. Areas of the North Island exhibit the highest Māori proficiency.

        Speakers of Māori according to the 2013 census[270]

         Less than 5%

         More than 5% @@ -1469,7 +1469,7 @@

        -

        Simple white building with two red domed towers

        +

        Simple white building with two red domed towers

        A Rātana church on a hill near Raetihi. The two-tower construction is characteristic of Rātana buildings.

        @@ -1489,7 +1489,7 @@
        -

        Tall wooden carving showing Kupe above two tentacled sea creatures +

        Tall wooden carving showing Kupe above two tentacled sea creatures

        Late 20th-century house-post depicting the navigator Kupe fighting two sea creatures @@ -1515,7 +1515,7 @@ Māori decorated the white wood of buildings, canoes and cenotaphs using red (a mixture of red ochre and shark fat) and black (made from soot) paint and painted pictures of birds, reptiles and other designs on cave walls.[310] Māori tattoos (moko) consisting of coloured soot mixed with gum were cut into the flesh with a bone chisel.[311] Since European arrival paintings and photographs have been dominated by landscapes, originally not as works of art but as factual portrayals of New Zealand.[312] Portraits of Māori were also common, with early painters often portraying them as "noble savages", exotic beauties or friendly natives.[312] The country's isolation delayed the influence of European artistic trends allowing local artists to develop their own distinctive style of regionalism.[313] During the 1960s and 1970s many artists combined traditional Māori and Western techniques, creating unique art forms.[314] New Zealand art and craft has gradually achieved an international audience, with exhibitions in the Venice Biennale in 2001 and the "Paradise Now" exhibition in New York in 2004.[306][315]

        -

        Refer to caption

        +

        Refer to caption

        Māori cloaks are made of fine flax fibre and patterned with black, red and white triangles, diamonds and other geometric shapes.[316] Greenstone was fashioned into earrings and necklaces, with the most well-known design being the hei-tiki, a distorted human figure sitting cross-legged with its head tilted to the side.[317] Europeans brought English fashion etiquette to New Zealand, and until the 1950s most people dressed up for social occasions.[318] Standards have since relaxed and New Zealand fashion has received a reputation for being casual, practical and lacklustre.[319][320] However, the local fashion industry has grown significantly since 2000, doubling exports and increasing from a handful to about 50 established labels, with some labels gaining international recognition.[320] @@ -1535,7 +1535,7 @@ New Zealand music has been influenced by blues, jazz, country, rock and roll and hip hop, with many of these genres given a unique New Zealand interpretation.[326] Māori developed traditional chants and songs from their ancient Southeast Asian origins, and after centuries of isolation created a unique "monotonous" and "doleful" sound.[327] Flutes and trumpets were used as musical instruments[328] or as signalling devices during war or special occasions.[329] Early settlers brought over their ethnic music, with brass bands and choral music being popular, and musicians began touring New Zealand in the 1860s.[330][331] Pipe bands became widespread during the early 20th century.[332] The New Zealand recording industry began to develop from 1940 onwards and many New Zealand musicians have obtained success in Britain and the United States.[326] Some artists release Māori language songs and the Māori tradition-based art of kapa haka (song and dance) has made a resurgence.[333] The New Zealand Music Awards are held annually by Recorded Music NZ; the awards were first held in 1965 by Reckitt & Colman as the Loxene Golden Disc awards.[334] Recorded Music NZ also 
publishes the country's official weekly record charts.[335]

        -

        Hills with inset, round doors. Reflected in water.

        +

        Hills with inset, round doors. Reflected in water.

        Public radio was introduced in New Zealand in 1922.[337] A state-owned television service began in 1960.[338] Deregulation in the 1980s saw a sudden increase in the numbers of radio and television stations.[339] New Zealand television primarily broadcasts American and British programming, along with a large number of Australian and local shows.[340] The number of New Zealand films significantly increased during the 1970s. In 1978 the New Zealand Film Commission started assisting local film-makers and many films attained a world audience, some receiving international acknowledgement.[339] The highest-grossing New Zealand films are Hunt for the Wilderpeople, Boy, The World's Fastest Indian, Once Were Warriors and Whale Rider.[341] The country's diverse scenery and compact size, plus government incentives,[342] have encouraged some producers to shoot big-budget productions in New Zealand, including Avatar, The Lord of the Rings, The Hobbit, The Chronicles of Narnia, King Kong and The Last Samurai.[343] The New Zealand media industry is dominated by a small number of companies, most of which are foreign-owned, although the state retains ownership of some television and radio stations.[344] Since 1994, Freedom House has consistently ranked New Zealand's press freedom in the top twenty, with the 19th freest media in 2015.[345] @@ -1546,7 +1546,7 @@

        -

        Rugby team wearing all black, facing the camera, knees bent, and facing toward a team wearing white

        +

        Rugby team wearing all black, facing the camera, knees bent, and facing toward a team wearing white

        Most of the major sporting codes played in New Zealand have British origins.[346] Rugby union is considered the national sport[347] and attracts the most spectators.[348] Golf, netball, tennis and cricket have the highest rates of adult participation, while netball, rugby union and football (soccer) are particularly popular among young people.[348][349] Around 54% of New Zealand adolescents participate in sports for their school.[349] Victorious rugby tours to Australia and the United Kingdom in the late 1880s and the early 1900s played an early role in instilling a national identity.[350] Horseracing was also a popular spectator sport and became part of the "Rugby, Racing and Beer" culture during the 1960s.[351] Māori participation in European sports was particularly evident in rugby and the country's team performs a haka, a traditional Māori challenge, before international matches.[352] New Zealand is known for its extreme sports, adventure tourism[353] and strong mountaineering tradition, as seen in the success of notable New Zealander Sir Edmund Hillary.[354][355] Other outdoor pursuits such as cycling, fishing, swimming, running, tramping, canoeing, hunting, snowsports, surfing and sailing are also popular.[356] The Polynesian sport of waka ama racing has experienced a resurgence of interest in New Zealand since the 1980s.[357] @@ -1559,7 +1559,7 @@

      diff --git a/article_scraper/resources/tests/readability/wikipedia-3/expected.html b/article_scraper/resources/tests/readability/wikipedia-3/expected.html index 4e0644b..7506300 100644 --- a/article_scraper/resources/tests/readability/wikipedia-3/expected.html +++ b/article_scraper/resources/tests/readability/wikipedia-3/expected.html @@ -95,7 +95,7 @@ -

      [icon] +

      [icon]

      diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 6b0d698..0e7138b 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -153,7 +153,7 @@ mod tests { let url = Url::parse("https://finshots.in").unwrap(); let res = clean_html(html, &url).unwrap(); - assert_eq!(res.html.len(), 11965); + assert_eq!(res.html.len(), 11959); assert_eq!( res.thumbnail.as_deref(), Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg") diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 166053d..d5803c9 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -626,7 +626,7 @@ impl FullTextParser { .set_property("class", "videoWrapper") .ok() .and_then(|()| node.set_property("width", "100%").ok()) - .and_then(|()| node.set_property("height", "100%").ok()) + .and_then(|()| node.set_property("height", "500").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) From 7a4f5c500d0e79f1ed0cf1c5ff7db5c3c9b0d511 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 1 Aug 2023 19:35:22 +0200 Subject: [PATCH 15/56] 400 --- article_scraper/src/full_text_parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index d5803c9..65b3912 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -626,7 +626,7 @@ impl FullTextParser { .set_property("class", "videoWrapper") .ok() .and_then(|()| node.set_property("width", "100%").ok()) - .and_then(|()| node.set_property("height", "500").ok()) + .and_then(|()| node.set_property("height", "400").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) From 3211b91badb71a5108ed9761d42890e9afd31bbd Mon Sep 17 00:00:00 2001 From: Leonardo Fedalto Date: Tue, 1 Aug 2023 21:39:48 +0200 
Subject: [PATCH 16/56] Make `Article` public --- article_scraper/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs index a1819a3..0517010 100644 --- a/article_scraper/src/lib.rs +++ b/article_scraper/src/lib.rs @@ -45,7 +45,7 @@ mod util; mod video_object; use crate::images::Progress; -use article::Article; +pub use article::Article; use error::ScraperError; #[doc(hidden)] pub use full_text_parser::config::ConfigEntry as FtrConfigEntry; From b91014c685d185a14077b72982c63b5733c49c4c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 3 Aug 2023 10:40:29 +0200 Subject: [PATCH 17/56] clean html fragments --- article_scraper/src/clean.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 0e7138b..a8e507e 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -9,7 +9,7 @@ pub struct CleanedHtml { pub thumbnail: Option, } -/// Re-use crate internals to clean HTML of articles before +/// Re-use crate internals to clean HTML fragments of articles before /// further processing: /// - replace H1 with H2 /// - rename all font nodes to span @@ -33,11 +33,28 @@ pub struct CleanedHtml { /// * `html` - HTML content /// * `base_url` - URL used to complete relative URLs /// -pub fn clean_html(html: &str, base_url: &Url) -> Result { +pub fn clean_html_fragment( + html_fragment: &str, + base_url: &Url, +) -> Result { libxml::tree::node::set_node_rc_guard(10); + let html = format!( + r#" + + + + + + + {html_fragment} + + + "# + ); + let empty_config = FtrConfigEntry::default(); - let document = FullTextParser::parse_html(html, None, &empty_config)?; + let document = FullTextParser::parse_html(&html, None, &empty_config)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); 
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); @@ -67,7 +84,7 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result

      In today’s Finshots, we discuss Amul’s pathway to becoming more than just a dairy brand.


      The Story

      The ₹61,000 crore Amul has a new leader — Jayen Mehta. And he says he wants to transform the dairy giant into a veritable FMCG behemoth. Think atta to compete with ITC’s Aashirvaad. Biscuits that creep into Britannia’s territory and even carbonated beverages to take on the might of Coca-Cola and Pepsi.

      Now, you might have seen some of these products on your supermarket shelves already. Because they’re not exactly brand new launches. Amul has slowly been testing the waters over the past few years. And now, it just wants to double down on this diversification.

      But before we get into why and how let’s rewind a bit to understand Amul’s history.

      The story begins in 1945. The milk farmers at Anand in Gujarat’s Kaira (now Kheda) district were miserable. The entire market was controlled by one entity — Polson’s Dairy. See, the government had launched the Bombay Milk Scheme where milk had to be sent from Anand to Bombay. And since milk is perishable, it couldn’t be quickly transported across the country without getting spoilt. So the milk had to be pasteurised at Anand itself. And considering Polson had the factories, it emerged as the winner and it began to dictate prices to the farmers. They paid peanuts and Polson’s and the middlemen pocketed all the profits from the sales.

      But then came Sardar Vallabhai Patel, the Iron Man of India, who rallied the farmers into setting up a cooperative. He wanted them to work together and pool their resources. A bigger unit meant that they could dictate their own terms. The farmers went on strike. Bombay ran out of milk. And finally, the Kaira District Co-operative Milk Producers’ Union or Amul was born. They kicked Polsons out of the game and started pasteurising milk for the Bombay Milk Scheme in 1948. Two villages, 250 litres of milk. That’s it.

      But soon, there was another problem ― excess milk. See, because of a shortage of cow milk, the Union processed buffalo milk as well. But there came a point where Bombay wasn’t able to absorb this excess milk.

      Enter Dr. Verghese Kurien, a government servant who was deputed to Anand’s experimental creamery. The man chalked out a billion-litre idea of reprocessing excess buffalo milk. And that’s when they decided to set up a factory to churn the raw milk into milk powder and butter. Products that had a longer shelf-life. In 1954, the first step towards the diversification of Amul’s products began.

      Amul became a pan-India movement. And what started as a tiny union of a handful of farmers producing 250 litres of milk a day is now a 3.6 million-strong organisation producing an average of over 26 million litres of milk daily.

      So yeah, you can see why consumers like you and me consider Amul synonymous with dairy. There’s a long history and there’s nothing else quite like it.

      Now diversification is a natural strategy for any company, right? No one wants to be dependent on just one product. Also, milk is just a commodity. You can’t really earn too much margin on it. So Amul began to create milk-adjacent products that would add more value to the consumer. These products could be priced higher and make the cooperative more money — cheese, paneer, buttermilk, flavoured shakes, and ice creams were a perfect fit for a dairy company. And the strategy worked. In FY19–20, these value-added products actually contributed to 45% of its revenues.

      Now if you think about it, Amul has all the ingredients to succeed with its diversification into non-dairy items like colas, atta, biscuits, and french fries too. It just needs to follow the same playbook, right?

      It has a brand image that has been carefully cultivated over the years. In part due to the iconic Amul girl in the red polka-dotted dress. While other leading brands apportion 8–15% of their total spending on ads, Amul spends less than 1% on advertisements. And this brand image can come in handy for penetrating the rural markets which typically make up nearly 40% of an FMCG company’s sales. People trust Amul.

      And most importantly, Amul has a massive distribution network it can tap — 10,000 distributors and over a million retailers. Its frozen products like french fries and aloo tikki can simply leverage its existing ice cream cold chain network. Amul really doesn’t need to build new distribution facilities from scratch.

      But here’s the thing. Despite its decades of success selling dairy products, Amul hasn’t quite been able to crack the diversification code. It hasn’t been able to emerge as a true FMCG player yet.

      Take chocolates for instance. Amul actually forayed into the industry way back in the 1970s itself. In fact, it tried the same playbook of setting up a cooperative society for cocoa farming. It wanted to fight Cadbury’s monopoly. It thought it could easily use its existing cold chain network for distribution. It even advertised heavily when colour televisions became popular in India in the 1980s. But nothing worked. Today, Amul has a measly 3% market share in India.

      In 2006, it launched a sports drink called Stamina. It didn’t see any takers. It shut shop, re-launched the drink a decade later and failed again. Amul even launched a frozen pizza in the 2000s! And if you’re surprised at that bit of news, well, that’s because it failed too.

      In 2019, it forayed into butter cookies. And it even took on rivals like Britannia’s Good Day. It thought, “Hey, we’re supplying all the butter to these FMCG companies. But they’re actually mixing a lot of palm oil into it. Why not make one of our own?”

      Amul even went on the offensive and launched ad campaigns saying that it had ‘25% Amul butter.’ And that everyone else had less than 3%. It said that rivals simply used a flavouring. But despite that ad blitz, Amul hasn’t set the butter cookie segment on fire.

      And in 2020, it launched the Amul Tru seltzer — a carbonated fizzy drink to take on the colas of India. But even this product hasn’t moved the needle.

      Basically, almost everything other than the value-added dairy products hasn’t quite worked out for Amul. Its brand or distribution hasn’t helped it. So will it be different this time under new leadership? We don’t know.

      Or maybe Amul should just do what it does best and focus on getting more of the dairy pie? After all, only 30% of the $110-billion dairy sector is organized even today.

      Can Amul crack the code for non-dairy FMCG products? What do you think?

      Until then…

      Don't forget to share this article on WhatsApp, LinkedIn and Twitter


      Ditto Insights: Why Millennials should buy a term plan

      According to a survey, only 17% of Indian millennials (25–35 yrs) have bought term insurance. The actual numbers are likely even lower.

      And the more worrying fact is that 55% hadn’t even heard of term insurance!

      So why is this happening?

      One common misconception is the dependent conundrum. Most millennials we spoke to want to buy a term policy because they want to cover their spouse and kids. And this makes perfect sense. After all, in your absence you want your term policy to pay out a large sum of money to cover your family’s needs for the future. But these very same people don’t think of their parents as dependents even though they support them extensively. I remember the moment it hit me. I routinely send money back home, but I had never considered my parents as my dependents. And when a colleague spoke about his experience, I immediately put two and two together. They were dependent on my income and my absence would most certainly affect them financially. So a term plan was a no-brainer for me.

      There’s another reason why millennials should probably consider looking at a term plan — Debt. Most people we spoke to have home loans, education loans and other personal loans with a considerable interest burden. In their absence, this burden would shift to their dependents. It’s not something most people think of, but it happens all the time.

      Finally, you actually get a pretty good bargain on term insurance prices when you’re younger. The idea is to pay a nominal sum every year (something that won’t burn your pocket) to protect your dependents in the event of your untimely demise. And this fee is lowest when you’re young.

      So if you’re a millennial and you’re reading this, maybe you should reconsider buying a term plan. And don’t forget to talk to us at Ditto while you’re at it. We only have a limited number of slots everyday, so make sure you book your appointment at the earliest:

      1. Just head to our website by clicking on the link here

      2. Click on “Book a FREE call”

      3. Select Term Insurance

      4. Choose the date & time as per your convenience and RELAX!

      "#; let url = Url::parse("https://finshots.in").unwrap(); - let res = clean_html(html, &url).unwrap(); + let res = clean_html_fragment(html, &url).unwrap(); assert_eq!(res.html.len(), 11959); assert_eq!( From 9aa6478e3ce648d7c40ee811b52f2d6c87253240 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 9 Aug 2023 23:25:07 +0200 Subject: [PATCH 18/56] update heise test --- article_scraper/resources/tests/ftr/heise-1/expected.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/resources/tests/ftr/heise-1/expected.html b/article_scraper/resources/tests/ftr/heise-1/expected.html index 46fb001..b5fed54 100644 --- a/article_scraper/resources/tests/ftr/heise-1/expected.html +++ b/article_scraper/resources/tests/ftr/heise-1/expected.html @@ -122,7 +122,7 @@ - + From 2c76a89f9d8e704e2bbc24f7b0bf6deca84378bc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 9 Aug 2023 23:57:25 +0200 Subject: [PATCH 19/56] add spiegel test --- .../tests/ftr/spiegel-1/expected.html | 52 + .../resources/tests/ftr/spiegel-1/source.html | 3988 +++++++++++++++++ article_scraper/src/full_text_parser/tests.rs | 6 +- 3 files changed, 4045 insertions(+), 1 deletion(-) create mode 100644 article_scraper/resources/tests/ftr/spiegel-1/expected.html create mode 100644 article_scraper/resources/tests/ftr/spiegel-1/source.html diff --git a/article_scraper/resources/tests/ftr/spiegel-1/expected.html b/article_scraper/resources/tests/ftr/spiegel-1/expected.html new file mode 100644 index 0000000..42eca1c --- /dev/null +++ b/article_scraper/resources/tests/ftr/spiegel-1/expected.html @@ -0,0 +1,52 @@ +
      +
      +
      +
      + + +»Barbie« soll im Libanon nicht gezeigt werden + +
      +
      +

      »Barbie« soll im Libanon nicht gezeigt werden

      + +Foto:

      - / dpa

      +
      +
      +
      +
      +
      +
      +

      Im Libanon soll der erfolgreiche Hollywood-Streifen »Barbie« verboten werden, weil dieser der Regierung zufolge »Werbung für Homosexualität und Geschlechtsumwandlung« macht. Der Film verstoße gegen die »moralischen und religiösen Werte« des Landes, erklärte der libanesische Kulturminister Mohammed Mourtada. Ursprünglich sollte der Blockbuster, der weltweit bereits mehr als eine Milliarde Dollar (rund 910 Millionen Euro) eingespielt hat, ab dem 31. August im Libanon gezeigt werden.

      +
      + +
      +

      Mourtada erklärte weiter, »Barbie« unterstütze die »Ablehnung der Vormundschaft des Vaters«, ziehe die Rolle der Mutter ins Lächerliche und stelle die Ehe und die Gründung einer Familie infrage.

      +
      +
      + + +
      + +An dieser Stelle finden Sie einen externen Inhalt von Twitter, +der den Artikel ergänzt und von der Redaktion empfohlen wird. Sie können ihn sich mit einem Klick anzeigen lassen +und wieder ausblenden. +
      + +
      +

      In dem Film von US-Regisseurin Greta Gerwig verlassen Barbie und Ken, gespielt von den Superstars Margot Robbie und Ryan Gosling, die pinkfarbene Plastikwelt Barbieland und lernen in Los Angeles das echte Leben kennen.

      +
      + +
      +

      Community unter Druck

      Der Libanon gilt in Bezug auf Homosexualität als toleranter als andere arabische Staaten. Allerdings haben religiöse Organisationen wie die radikalislamische Hisbollah einen großen Einfluss auf soziale und kulturelle Einrichtungen.

      Immer wieder wurden in den vergangenen Jahren Veranstaltungen der libanesischen LGBTQ+-Community abgesagt. Zudem steht Homosexualität im Libanon nach wie vor unter Strafe.

      +
      + + +
      +

      Die englische Abkürzung LGBTQ+ steht für lesbisch, schwul, bisexuell, transgender, queer und andere Geschlechtsidentitäten.

      + +
      +
      + +
      +
      \ No newline at end of file diff --git a/article_scraper/resources/tests/ftr/spiegel-1/source.html b/article_scraper/resources/tests/ftr/spiegel-1/source.html new file mode 100644 index 0000000..5cf86d8 --- /dev/null +++ b/article_scraper/resources/tests/ftr/spiegel-1/source.html @@ -0,0 +1,3988 @@ + + +Libanon: Regierung will »Barbie«-Ausstrahlung verbieten - DER SPIEGEL + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      + +
      +
      +
      + + +
      +
      +
      +
      +
      +
      +

      + +Absage an erfolgreichen Blockbuster + +Libanesische Regierung will »Barbie«-Ausstrahlung verbieten + +

      +
      +Kein pinkes Hollywood-Spektakel im Libanon – der geplante Kinostart von »Barbie« soll in dem Land abgesagt werden. Begründung: Der Film mache angeblich Werbung für Homosexualität und Geschlechtsumwandlung. +
      +
      + +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
        +
      • + +
      • +
      + +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + + +»Barbie« soll im Libanon nicht gezeigt werden + +
      + +
      +
      +

      »Barbie« soll im Libanon nicht gezeigt werden

      + +Foto:

      - / dpa

      +
      +
      +
      +
      +
      +
      +
      +
      +

      Im Libanon soll der erfolgreiche Hollywood-Streifen »Barbie« verboten werden, weil dieser der Regierung zufolge »Werbung für Homosexualität und Geschlechtsumwandlung« macht. Der Film verstoße gegen die »moralischen und religiösen Werte« des Landes, erklärte der libanesische Kulturminister Mohammed Mourtada. Ursprünglich sollte der Blockbuster, der weltweit bereits mehr als eine Milliarde Dollar (rund 910 Millionen Euro) eingespielt hat, ab dem 31. August im Libanon gezeigt werden.

      +
      +
      +
      +
      + +
      +
      +
      +
      +

      Mourtada erklärte weiter, »Barbie« unterstütze die »Ablehnung der Vormundschaft des Vaters«, ziehe die Rolle der Mutter ins Lächerliche und stelle die Ehe und die Gründung einer Familie infrage.

      +
      +
      +
      +
      +
      + +
      + +
      +
      +
      +
      +Empfohlener externer Inhalt +
      +
      +
      +
      +An dieser Stelle finden Sie einen externen Inhalt von Twitter, +der den Artikel ergänzt und von der Redaktion empfohlen wird. Sie können ihn sich mit einem Klick anzeigen lassen +und wieder ausblenden. +
      +
      + + +Externer Inhalt + +
      +

      +Ich bin damit einverstanden, dass mir externe Inhalte angezeigt werden. Damit können personenbezogene Daten an Drittplattformen übermittelt werden. + +Mehr dazu in unserer Datenschutzerklärung. + +

      +
      + +
      +
      +
      +
      +
      +
      +

      In dem Film von US-Regisseurin Greta Gerwig verlassen Barbie und Ken, gespielt von den Superstars Margot Robbie und Ryan Gosling, die pinkfarbene Plastikwelt Barbieland und lernen in Los Angeles das echte Leben kennen.

      +
      +
      +
      +
      + +
      +
      +
      +
      +

      Community unter Druck

      Der Libanon gilt in Bezug auf Homosexualität als toleranter als andere arabische Staaten. Allerdings haben religiöse Organisationen wie die radikalislamische Hisbollah einen großen Einfluss auf soziale und kulturelle Einrichtungen.

      Immer wieder wurden in den vergangenen Jahren Veranstaltungen der libanesischen LGBTQ+-Community abgesagt. Zudem steht Homosexualität im Libanon nach wie vor unter Strafe.

      +
      +
      +
      +
      + +
      +
      +
      + +
      +
      +
      +

      Die englische Abkürzung LGBTQ+ steht für lesbisch, schwul, bisexuell, transgender, queer und andere Geschlechtsidentitäten.

      + + +
      +
      +
      +
      +
      atb/AFP
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +

      Mehr lesen über

      + +
      +
      +

      Verwandte Artikel

      + +
      +
      + + +
      +
      +
      +
      + + +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +Die Wiedergabe wurde unterbrochen. +
      + +
      +
      +
      +
      + +
      + +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
      +
      + + + + +
      + + +
      +
      +
      +
      +
      +
      +
      +
      + + + +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +Playlist +
      + +
      +
      + + + +
      +
      +Speichern Sie Audioinhalte in Ihrer Playlist, um sie später zu hören oder offline abzuspielen. Zusätzlich können Sie Ihre Playlist über alle Geräte mit der SPIEGEL-App synchronisieren, auf denen Sie mit Ihrem Konto angemeldet sind. +
      + + +Anmelden oder Konto erstellen + + + + +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +Merkliste +
      + +
      +
      + + + +
      +
      +Speichern Sie Ihre Lieblingsartikel in der persönlichen Merkliste, um sie später zu lesen und einfach wiederzufinden. +
      + + +Anmelden oder Konto erstellen + + + + +
      +
      +
      +
      +
      +
      +
      +
      +
      +Mehrfachnutzung erkannt +
      + +
      +
      + + + +
      +
      +Bitte beachten Sie: Die zeitgleiche Nutzung von SPIEGEL+-Inhalten ist auf ein Gerät beschränkt. +Wir behalten uns vor, die Mehrfachnutzung zukünftig technisch zu unterbinden. +
      + +
      +Sie möchten SPIEGEL+ auf mehreren Geräten zeitgleich nutzen? + +Zu unseren Angeboten + +
      +
      +
      +
      +
      +
      + + +
      + + + + + + + + + + + \ No newline at end of file diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 2d0f858..2161a6e 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -74,11 +74,15 @@ async fn hardwareluxx() { } #[tokio::test] -#[ignore = "waiting on clarification for https://github.com/fivefilters/ftr-site-config/pull/1081"] async fn heise_1() { run_test("heise-1", "https://www.heise.de/", None, None).await } +#[tokio::test] +async fn spiegel_1() { + run_test("spiegel-1", "https://www.spiegel.de/", None, None).await +} + #[tokio::test] #[ignore = "downloads content from the web"] async fn encoding_windows_1252() { From 1584649eb4c0ca49b30fd8ea885cdb8829fba539 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 00:01:10 +0200 Subject: [PATCH 20/56] fix tests --- article_scraper/resources/tests/ftr/youtube/expected.html | 2 +- article_scraper/src/clean.rs | 2 +- article_scraper/src/full_text_parser/tests.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html index 1ec3c5a..570905a 100644 --- a/article_scraper/resources/tests/ftr/youtube/expected.html +++ b/article_scraper/resources/tests/ftr/youtube/expected.html @@ -1 +1 @@ -
      \ No newline at end of file +
      \ No newline at end of file diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index a8e507e..790bf9d 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -170,7 +170,7 @@ mod tests { let url = Url::parse("https://finshots.in").unwrap(); let res = clean_html_fragment(html, &url).unwrap(); - assert_eq!(res.html.len(), 11959); + assert_eq!(res.html.len(), 11989); assert_eq!( res.thumbnail.as_deref(), Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg") diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 2161a6e..0f0370f 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -81,7 +81,7 @@ async fn heise_1() { #[tokio::test] async fn spiegel_1() { run_test("spiegel-1", "https://www.spiegel.de/", None, None).await -} +} #[tokio::test] #[ignore = "downloads content from the web"] From 0133b20f065c2adabd71a9cac23f5da3d097fab4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 00:01:31 +0200 Subject: [PATCH 21/56] generate full html document --- article_scraper/src/full_text_parser/mod.rs | 48 ++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 65b3912..f5043ba 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -118,14 +118,52 @@ impl FullTextParser { libxml::tree::node::set_node_rc_guard(10); let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; - let mut root = + let mut html_node = + Node::new("html", None, &document).map_err(|()| FullTextParserError::Xml)?; + let mut head_node = + Node::new("head", None, &document).map_err(|()| FullTextParserError::Xml)?; + let mut charset_node = + Node::new("meta", None, &document).map_err(|()| FullTextParserError::Xml)?; + 
charset_node + .set_attribute("charset", "utf-8") + .map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + let mut body_node = + Node::new("body", None, &document).map_err(|()| FullTextParserError::Xml)?; + let mut article_root = Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?; - document.set_root_element(&root); - Self::generate_head(&mut root, &document)?; + html_node.add_child(&mut head_node).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + html_node.add_child(&mut body_node).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + head_node.add_child(&mut charset_node).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + body_node.add_child(&mut article_root).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + + document.set_root_element(&html_node); + + Self::generate_head(&mut article_root, &document)?; for page_html in pages { - self.parse_page(&mut article, &page_html, &mut root, config, global_config)?; + self.parse_page( + &mut article, + &page_html, + &mut article_root, + config, + global_config, + )?; } let context = Context::new(&document).map_err(|()| { @@ -139,7 +177,7 @@ impl FullTextParser { } Self::post_process_document(&document)?; - article.html = Some(Util::serialize_node(&document, &root)); + article.html = Some(Util::serialize_node(&document, &article_root)); Ok(article) } From 8c7cdacd26c19c7bff40dee6cc8713fa8c3e516b Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 02:06:08 +0200 Subject: [PATCH 22/56] Revert "generate full html document" This reverts commit 0133b20f065c2adabd71a9cac23f5da3d097fab4. 
--- article_scraper/src/full_text_parser/mod.rs | 48 +++------------------ 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index f5043ba..65b3912 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -118,52 +118,14 @@ impl FullTextParser { libxml::tree::node::set_node_rc_guard(10); let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; - let mut html_node = - Node::new("html", None, &document).map_err(|()| FullTextParserError::Xml)?; - let mut head_node = - Node::new("head", None, &document).map_err(|()| FullTextParserError::Xml)?; - let mut charset_node = - Node::new("meta", None, &document).map_err(|()| FullTextParserError::Xml)?; - charset_node - .set_attribute("charset", "utf-8") - .map_err(|e| { - log::error!("{e}"); - FullTextParserError::Xml - })?; - let mut body_node = - Node::new("body", None, &document).map_err(|()| FullTextParserError::Xml)?; - let mut article_root = + let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?; + document.set_root_element(&root); - html_node.add_child(&mut head_node).map_err(|e| { - log::error!("{e}"); - FullTextParserError::Xml - })?; - html_node.add_child(&mut body_node).map_err(|e| { - log::error!("{e}"); - FullTextParserError::Xml - })?; - head_node.add_child(&mut charset_node).map_err(|e| { - log::error!("{e}"); - FullTextParserError::Xml - })?; - body_node.add_child(&mut article_root).map_err(|e| { - log::error!("{e}"); - FullTextParserError::Xml - })?; - - document.set_root_element(&html_node); - - Self::generate_head(&mut article_root, &document)?; + Self::generate_head(&mut root, &document)?; for page_html in pages { - self.parse_page( - &mut article, - &page_html, - &mut article_root, - config, - global_config, - )?; + self.parse_page(&mut article, &page_html, &mut root, config, global_config)?; } 
let context = Context::new(&document).map_err(|()| { @@ -177,7 +139,7 @@ impl FullTextParser { } Self::post_process_document(&document)?; - article.html = Some(Util::serialize_node(&document, &article_root)); + article.html = Some(Util::serialize_node(&document, &root)); Ok(article) } From 6116ba38aeda31ff5e7fd32aa34f46494b10d3b8 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 02:06:52 +0200 Subject: [PATCH 23/56] no need for head --- article_scraper/src/full_text_parser/mod.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 65b3912..4f39017 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -122,8 +122,6 @@ impl FullTextParser { Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?; document.set_root_element(&root); - Self::generate_head(&mut root, &document)?; - for page_html in pages { self.parse_page(&mut article, &page_html, &mut root, config, global_config)?; } @@ -1091,20 +1089,6 @@ impl FullTextParser { } } - fn generate_head(root: &mut Node, document: &Document) -> Result<(), FullTextParserError> { - if let Ok(mut head_node) = Node::new("head", None, document) { - if let Ok(()) = root.add_prev_sibling(&mut head_node) { - if let Ok(mut meta) = head_node.new_child(None, "meta") { - if meta.set_property("charset", "utf-8").is_ok() { - return Ok(()); - } - } - } - } - - Err(FullTextParserError::Xml) - } - fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { // search document for empty tags and add a empty text node as child // this prevents libxml from self closing non void elements such as iframe From acb7d1d000b730ab3bdae5c7538a9550de2d1cb9 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 10 Aug 2023 02:09:07 +0200 Subject: [PATCH 24/56] port libxml workaround from hurl --- 
article_scraper/src/clean.rs | 16 +----- article_scraper/src/full_text_parser/mod.rs | 64 ++++++++++++++++++++- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 790bf9d..81297a2 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -39,22 +39,8 @@ pub fn clean_html_fragment( ) -> Result { libxml::tree::node::set_node_rc_guard(10); - let html = format!( - r#" - - - - - - - {html_fragment} - - - "# - ); - let empty_config = FtrConfigEntry::default(); - let document = FullTextParser::parse_html(&html, None, &empty_config)?; + let document = FullTextParser::parse_html(html_fragment, None, &empty_config)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4f39017..325cd10 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -265,12 +265,74 @@ impl FullTextParser { // parse html let parser = Parser::default_html(); - parser.parse_string(html.as_str()).map_err(|err| { + Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) } + /// FIXME: Here are some patched functions of libxml crate. + /// Started from libxml 2.11.1+, we have some encoding issue. + /// See: + /// - + /// - + /// These two functions should be removed when the issue is fixed in libxml crate. + fn try_usize_to_i32(value: usize) -> Result { + if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) { + // Cannot safely use our value comparison, but the conversion is always safe.
+ // Or, if the value can be safely represented as a 32-bit signed integer. + Ok(value as i32) + } else { + // Document too large, cannot parse using libxml2. + Err(libxml::parser::XmlParseError::DocumentTooLarge) + } + } + + fn parse_html_string_patched( + input: &str, + parser: &Parser, + ) -> Result { + let input_bytes: &[u8] = input.as_ref(); + let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char; + let input_len = Self::try_usize_to_i32(input_bytes.len())?; + let encoding = std::ffi::CString::new("utf-8").unwrap(); + let encoding_ptr = encoding.as_ptr(); + let url_ptr = std::ptr::null(); + + // HTML_PARSE_RECOVER | HTML_PARSE_NOERROR + let options = 1 + 32; + match parser.format { + libxml::parser::ParseFormat::XML => unsafe { + let doc_ptr = libxml::bindings::xmlReadMemory( + input_ptr, + input_len, + url_ptr, + encoding_ptr, + options, + ); + if doc_ptr.is_null() { + Err(libxml::parser::XmlParseError::GotNullPointer) + } else { + Ok(Document::new_ptr(doc_ptr)) + } + }, + libxml::parser::ParseFormat::HTML => unsafe { + let docptr = libxml::bindings::htmlReadMemory( + input_ptr, + input_len, + url_ptr, + encoding_ptr, + options, + ); + if docptr.is_null() { + Err(libxml::parser::XmlParseError::GotNullPointer) + } else { + Ok(Document::new_ptr(docptr)) + } + }, + } + } + pub(crate) fn get_xpath_ctx(doc: &Document) -> Result { Context::new(doc).map_err(|()| { log::error!("Creating xpath context failed for downloaded HTML"); From f9812b556c9cf05de13d936ea73f03c95de79bbc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 13 Aug 2023 16:43:38 +0200 Subject: [PATCH 25/56] update ftr config --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index 40796c9..1e23fc8 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit 40796c9a9db47189121e844abbf5d3fbce02c9f5 +Subproject 
commit 1e23fc8a040627eddb6abcdf3803d8186ba67124 From ed8a83708b387069f31c7b235fe036aedc8198e4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 13 Feb 2024 17:00:45 +0100 Subject: [PATCH 26/56] update deps & fix some flaky tests --- article_scraper/Cargo.toml | 12 ++++++------ .../tests/readability/hukumusume/source.html | 10 +++++----- article_scraper/src/clean.rs | 6 ++---- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 948fe98..1c7d2ac 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,18 +14,18 @@ exclude = ["resources/tests"] thiserror = "1.0" libxml = "0.3" reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1.28", features = ["macros", "fs", "io-util"] } -url = "2.3" -regex = "1.8" +tokio = { version = "1", features = ["macros", "fs", "io-util"] } +url = "2.5" +regex = "1.10" encoding_rs = "0.8" chrono = "0.4" base64 = "0.21" image = "0.24" log = "0.4" -rust-embed="6.6" -once_cell = "1.17" +rust-embed="8.2" +once_cell = "1.19" escaper = "0.1" futures = "0.3" [dev-dependencies] -env_logger = "0.10" +env_logger = "0.11" diff --git a/article_scraper/resources/tests/readability/hukumusume/source.html b/article_scraper/resources/tests/readability/hukumusume/source.html index 7034430..7f7afd4 100644 --- a/article_scraper/resources/tests/readability/hukumusume/source.html +++ b/article_scraper/resources/tests/readability/hukumusume/source.html @@ -175,7 +175,7 @@

      - + @@ -204,14 +204,14 @@ @@ -269,14 +269,14 @@ diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 81297a2..7ab49e8 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -145,9 +145,7 @@ mod tests { let url = Url::parse("https://blogs.gnome.org/tbernard/2023/07/26/rethinking-window-management/") .unwrap(); - let res = clean_html_fragment(html, &url).unwrap(); - - std::fs::write("/home/jeanluc/result.html", res.html).unwrap(); + _ = clean_html_fragment(html, &url).unwrap(); } #[test] @@ -156,7 +154,7 @@ mod tests { let url = Url::parse("https://finshots.in").unwrap(); let res = clean_html_fragment(html, &url).unwrap(); - assert_eq!(res.html.len(), 11989); + assert!(res.html.len().abs_diff(12_000) < 200); assert_eq!( res.thumbnail.as_deref(), Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg") From b13673ce3b94b8c2810573bf61dfc8f0724b63df Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 13 Feb 2024 19:06:05 +0100 Subject: [PATCH 27/56] do some null checks before unlinking nodes --- article_scraper/src/full_text_parser/mod.rs | 24 ++++++++++++++ .../src/full_text_parser/readability/mod.rs | 28 ++++++++++++++++ article_scraper/src/image_object.rs | 4 +++ article_scraper/src/util.rs | 32 +++++++++++++++++++ article_scraper/src/video_object.rs | 4 +++ 5 files changed, 92 insertions(+) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 325cd10..c7d8d9e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -678,6 +678,10 @@ impl FullTextParser { let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + let video_wrapper = node .get_parent() .and_then(|mut parent| parent.new_child(None, "div").ok()); @@ -732,6 +736,10 @@ impl FullTextParser { ) 
-> Result<(), FullTextParserError> { let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + if let Some(url) = node.get_attribute(attribute) { let trimmed_url = url.trim(); @@ -845,6 +853,10 @@ impl FullTextParser { if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) { for mut h2_node in h2_nodes { + if h2_node.is_null() { + continue; + } + if Util::header_duplicates_title(&h2_node, title) { h2_node.unlink(); } @@ -969,6 +981,10 @@ impl FullTextParser { // This is done to prevent a placeholder img is replaced by img from noscript in next step. let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?; for mut img_node in img_nodes { + if img_node.is_null() { + continue; + } + let attrs = img_node.get_attributes(); let keep = attrs.iter().any(|(name, value)| { @@ -986,6 +1002,10 @@ impl FullTextParser { // Next find noscript and try to extract its image let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?; for mut noscript_node in noscript_nodes { + if noscript_node.is_null() { + continue; + } + // Parse content of noscript and make sure it only contains image if !Util::is_single_image(&noscript_node) { continue; @@ -1091,6 +1111,10 @@ impl FullTextParser { { let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + if node.get_property("style").is_some() && node.remove_property("style").is_err() { return Err(FullTextParserError::Xml); } diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs index a0b8c03..ae5154d 100644 --- a/article_scraper/src/full_text_parser/readability/mod.rs +++ b/article_scraper/src/full_text_parser/readability/mod.rs @@ -179,6 +179,10 @@ impl Readability { // Put phrasing content into paragraphs. 
let mut p: Option = None; for mut child in node_ref.get_child_nodes().into_iter() { + if child.is_null() { + continue; + } + if Util::is_phrasing_content(&child) { if let Some(p) = p.as_mut() { child.unlink(); @@ -205,6 +209,10 @@ impl Readability { } else if p.is_some() { if let Some(p) = p.as_mut() { for mut r_node in p.get_child_nodes().into_iter().rev() { + if r_node.is_null() { + continue; + } + if Util::is_whitespace(&r_node) { r_node.unlink(); continue; @@ -366,6 +374,10 @@ impl Readability { Node::new("DIV", None, &document).expect("can't create new node"); for mut child in root.get_child_elements().drain(..) { + if child.is_null() { + continue; + } + child.unlink(); new_top_candidate.add_child(&mut child).unwrap(); } @@ -510,6 +522,10 @@ impl Readability { if let Some(mut siblings) = siblings { for mut sibling in siblings.drain(..) { + if sibling.is_null() { + continue; + } + let mut append = false; let score = Self::get_content_score(&sibling).unwrap_or(0.0); @@ -614,6 +630,10 @@ impl Readability { })?; for mut child in article_content.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); div.add_child(&mut child).map_err(|error| { log::error!("{error}"); @@ -657,6 +677,10 @@ impl Readability { // But first check if we actually have something if let Some((best_attempt, _len, _document)) = attempts.pop() { for mut child in best_attempt.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); root.add_child(&mut child).map_err(|error| { log::error!("{error}"); @@ -674,6 +698,10 @@ impl Readability { .map_err(|()| FullTextParserError::Readability)?; } else { for mut child in article_content.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); root.add_child(&mut child).map_err(|error| { log::error!("{error}"); diff --git a/article_scraper/src/image_object.rs b/article_scraper/src/image_object.rs index d6e7af0..e665414 100644 --- a/article_scraper/src/image_object.rs +++ 
b/article_scraper/src/image_object.rs @@ -69,6 +69,10 @@ impl ImageObject { } pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { + if node.is_null() { + return Err(FullTextParserError::Xml); + } + let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; if parent.get_name().to_uppercase() == "A" { diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index dcbbad9..874e696 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -221,6 +221,10 @@ impl Util { let node_vec_clone = node_vec.clone(); for mut node in node_vec { + if node.is_null() { + continue; + } + let tag_name = node.get_name(); if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) && node @@ -271,6 +275,10 @@ impl Util { let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let node_vec = Util::evaluate_xpath(context, query, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + node.unlink(); } Ok(()) @@ -318,6 +326,10 @@ impl Util { } pub fn remove_and_next(node: &mut Node) -> Option { + if node.is_null() { + return None; + } + let next_node = Self::next_node(node, true); node.unlink(); next_node @@ -641,6 +653,10 @@ impl Util { nodes.append(&mut Util::get_elements_by_tag_name(root, "h2")); for mut node in nodes.into_iter().rev() { + if node.is_null() { + continue; + } + if Util::get_class_weight(&node) < 0 { log::debug!( "Removing header with low class weight: {} {}", @@ -675,6 +691,10 @@ impl Util { let nodes = Util::get_elements_by_tag_name(root, tag); for mut node in nodes.into_iter().rev() { + if node.is_null() { + continue; + } + if Self::should_remove(&node, tag) { node.unlink(); } @@ -972,6 +992,10 @@ impl Util { // or non-whitespace. This leaves behind the first
      in the chain // (which will be replaced with a

      later). while let Some(mut n) = next { + if n.is_null() { + break; + } + let is_text_whitespace = n .get_type() .map(|t| t == NodeType::TextNode) @@ -1012,6 +1036,10 @@ impl Util { next = p.get_next_sibling(); while let Some(mut next_node) = next { + if next_node.is_null() { + break; + } + // If we've hit another

      , we're done adding children to this

      . if next_node.get_name().to_uppercase() == "BR" { if let Some(next_elem) = next_node.get_next_element_sibling() { @@ -1039,6 +1067,10 @@ impl Util { } while let Some(mut last_child) = p.get_last_child() { + if last_child.is_null() { + continue; + } + let is_text_node = last_child .get_type() .map(|t| t == NodeType::TextNode) diff --git a/article_scraper/src/video_object.rs b/article_scraper/src/video_object.rs index fee9c07..55023a2 100644 --- a/article_scraper/src/video_object.rs +++ b/article_scraper/src/video_object.rs @@ -87,6 +87,10 @@ impl VideoObject { } pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { + if node.is_null() { + return Err(FullTextParserError::Xml); + } + let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; node.unlink(); From a1ee3b22f98943bac5bfd8e1801c4c9bc6066d43 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 13 Feb 2024 19:35:29 +0100 Subject: [PATCH 28/56] clippy --- article_scraper/src/util.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 874e696..1aba153 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; - +use std::fmt::Write; use libxml::{ tree::{Document, Node, NodeType, SaveOptions}, xpath::Context, @@ -190,7 +190,7 @@ impl Util { pub fn extract_value(context: &Context, xpath: &str) -> Result { let node_vec = Util::evaluate_xpath(context, xpath, false)?; - if let Some(val) = node_vec.get(0) { + if let Some(val) = node_vec.first() { return Ok(val.get_content()); } @@ -207,8 +207,10 @@ impl Util { let part = node .get_content() .split_whitespace() - .map(|s| format!("{} ", s)) - .collect::(); + .fold(String::new(), |mut output, s| { + let _ = write!(output, " {s}"); + output + }); val.push_str(&part); val.push(' '); } From 0dcebe8b49b8d867810d0f7ff155e502f637bb96 Mon Sep 17 00:00:00 2001 From: Jan Lukas 
Gernert Date: Tue, 13 Feb 2024 19:36:58 +0100 Subject: [PATCH 29/56] fmt --- article_scraper/src/util.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 1aba153..e605015 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1,5 +1,3 @@ -use std::collections::HashSet; -use std::fmt::Write; use libxml::{ tree::{Document, Node, NodeType, SaveOptions}, xpath::Context, @@ -8,6 +6,8 @@ use reqwest::{ header::{HeaderMap, HeaderName, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE}, Response, }; +use std::collections::HashSet; +use std::fmt::Write; use tokio::fs::DirEntry; use crate::{ @@ -204,13 +204,13 @@ impl Util { let node_vec = Util::evaluate_xpath(context, xpath, true)?; let mut val = String::new(); for node in node_vec { - let part = node - .get_content() - .split_whitespace() - .fold(String::new(), |mut output, s| { - let _ = write!(output, " {s}"); - output - }); + let part = + node.get_content() + .split_whitespace() + .fold(String::new(), |mut output, s| { + let _ = write!(output, " {s}"); + output + }); val.push_str(&part); val.push(' '); } From 689a72e6cdd5d736219a19eaf6c010173a8ea77e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 24 Mar 2024 17:54:30 +0100 Subject: [PATCH 30/56] reqwest 0.12 --- .gitlab-ci.yml | 4 ++-- article_scraper/Cargo.toml | 8 ++++---- article_scraper/src/images/mod.rs | 5 ++--- article_scraper_cli/Cargo.toml | 8 ++++---- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 55d3464..c6a7149 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:latest + image: rust:1.77 before_script: - rustup component add rustfmt - rustup component add clippy @@ -12,4 +12,4 @@ run-build: - rustc --version && cargo --version - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - - cargo 
build --release --jobs 1 + - cargo build --release diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 1c7d2ac..9603c6b 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -13,16 +13,16 @@ exclude = ["resources/tests"] [dependencies] thiserror = "1.0" libxml = "0.3" -reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" regex = "1.10" encoding_rs = "0.8" chrono = "0.4" -base64 = "0.21" -image = "0.24" +base64 = "0.22" +image = "0.25" log = "0.4" -rust-embed="8.2" +rust-embed="8.3" once_cell = "1.19" escaper = "0.1" futures = "0.3" diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 5a2d133..6d97fd8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -6,7 +6,7 @@ use crate::constants; use crate::util::Util; use base64::Engine; use futures::StreamExt; -use image::ImageOutputFormat; +use image::ImageFormat; use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; use libxml::xpath::Context; @@ -324,8 +324,7 @@ impl ImageDownloader { image::imageops::FilterType::Lanczos3, ); let mut resized_buf: Vec = Vec::new(); - if let Err(error) = - image.write_to(&mut Cursor::new(&mut resized_buf), ImageOutputFormat::Png) + if let Err(error) = image.write_to(&mut Cursor::new(&mut resized_buf), ImageFormat::Png) { log::error!("Failed to save resized image to resize: {}", error); return None; diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index 1ef293e..7c24fd2 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -9,10 +9,10 @@ repository = "https://gitlab.com/news-flash/article_scraper" [dependencies] article_scraper = { path = "../article_scraper/" } -clap = { version = "4.2", 
features = [ "derive" ] } +clap = { version = "4.5", features = [ "derive" ] } simplelog = "0.12" log = "0.4" -url = "2.3" -reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1.27", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } +url = "2.5" +reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } indicatif = "0.17" \ No newline at end of file From eee7ffee05ffde68e014657e32e5486c5b4c6443 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 24 Mar 2024 22:00:44 +0100 Subject: [PATCH 31/56] update ftr config --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 01c5a43..fcfce90 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "article_scraper/ftr-site-config"] path = article_scraper/ftr-site-config - url = https://github.com/jangernert/ftr-site-config.git + url = https://github.com/fivefilters/ftr-site-config.git branch = news-flash From a80b8a82749db688d4b8453f6a4deb47b5c8242c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 24 Mar 2024 22:01:34 +0100 Subject: [PATCH 32/56] bump versions --- article_scraper/Cargo.toml | 2 +- article_scraper/ftr-site-config | 2 +- article_scraper_cli/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 9603c6b..8d92a79 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "article_scraper" -version = "2.0.0" +version = "2.1.0" authors = ["Jan Lukas Gernert "] edition = "2018" license = "GPL-3.0-or-later" diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index 1e23fc8..737398e 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ 
-Subproject commit 1e23fc8a040627eddb6abcdf3803d8186ba67124 +Subproject commit 737398ef6b121db2d72042b5406a95dfd497113f diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index 7c24fd2..b91abc5 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "article_scraper_cli" -version = "2.0.0-alpha.0" +version = "2.1.0" authors = ["Jan Lukas Gernert "] edition = "2018" license = "GPL-3.0-or-later" From 65b26370a2f6a85d12a632bb5a4f05d7d9c00440 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 24 Mar 2024 22:11:49 +0100 Subject: [PATCH 33/56] update ftr config --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index fcfce90..246181d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "article_scraper/ftr-site-config"] path = article_scraper/ftr-site-config url = https://github.com/fivefilters/ftr-site-config.git - branch = news-flash + branch = master From 3e5654e1973b1a2c3e7e50bf13bac68b44ab364c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 01:02:52 +0200 Subject: [PATCH 34/56] fix tests --- article_scraper/src/full_text_parser/mod.rs | 3 ++- article_scraper/src/util.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index c7d8d9e..a25df4e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -288,7 +288,7 @@ impl FullTextParser { } } - fn parse_html_string_patched( + pub(crate) fn parse_html_string_patched( input: &str, parser: &Parser, ) -> Result { @@ -691,6 +691,7 @@ impl FullTextParser { .ok() .and_then(|()| node.set_property("width", "100%").ok()) .and_then(|()| node.set_property("height", "400").ok()) + .and_then(|()| node.remove_attribute("aspect-ratio").ok()) .ok_or_else(|| { node.unlink(); 
video_wrapper.add_child(&mut node) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index e605015..fa86b5b 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1259,14 +1259,14 @@ impl Util { #[cfg(test)] mod tests { use libxml::parser::Parser; - + use crate::FullTextParser; use super::Util; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); let parser = Parser::default_html(); - let document = parser.parse_string(source).unwrap(); + let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let div = body.get_first_child().unwrap(); From 11e9261bf28a130262ca7e5b1254eee50e3adc46 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 01:03:00 +0200 Subject: [PATCH 35/56] fmt --- article_scraper/src/util.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index fa86b5b..2d24cdc 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1258,9 +1258,9 @@ impl Util { #[cfg(test)] mod tests { - use libxml::parser::Parser; - use crate::FullTextParser; use super::Util; + use crate::FullTextParser; + use libxml::parser::Parser; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); From 06018d98d4c743fda58eff1e41027476b5cf16ff Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 23:18:00 +0200 Subject: [PATCH 36/56] replace emoji images --- article_scraper/Cargo.toml | 3 +- article_scraper/src/full_text_parser/mod.rs | 1 + article_scraper/src/util.rs | 57 +++++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 8d92a79..6e4d003 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -22,10 +22,11 @@ 
chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.3" +rust-embed="8.4" once_cell = "1.19" escaper = "0.1" futures = "0.3" +unic-emoji-char = "0.9" [dev-dependencies] env_logger = "0.11" diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index a25df4e..0e63850 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -966,6 +966,7 @@ impl FullTextParser { if let Some(root) = document.get_root_element() { Util::replace_brs(&root, document); + Util::replace_emoji_images(&root, document); } Self::fix_urls(context, url, document); diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 2d24cdc..429378c 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -682,6 +682,31 @@ impl Util { } } + pub fn replace_emoji_images(root: &Node, document: &Document) { + let img_nodes = Util::get_elements_by_tag_name(root, "img"); + + for img_node in img_nodes { + if let Some(img_alt) = img_node.get_attribute("alt") { + let mut alt_chars = img_alt.chars(); + let first_char = alt_chars.next(); + let second_char = alt_chars.next(); + + if let (Some(char), None) = (first_char, second_char) { + if unic_emoji_char::is_emoji(char) { + if let Some(mut parent) = img_node.get_parent() { + // if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) { + // _ = parent.replace_child_node(emoji_text_node, img_node); + // } + let emoji_text_node = + Node::new_text(&char.to_string(), document).unwrap(); + _ = parent.replace_child_node(emoji_text_node, img_node); + } + } + } + } + } + } + // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 
pub fn clean_conditionally(root: &mut Node, tag: &str) { @@ -1303,4 +1328,36 @@ mod tests { "#; replace_brs(source, source.trim()) } + + fn replace_emojis(source: &str, expected: &str) { + libxml::tree::node::set_node_rc_guard(10); + + let parser = Parser::default_html(); + let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let root = document.get_root_element().unwrap(); + let body = root.get_first_child().unwrap(); + let p = body.get_first_child().unwrap(); + + Util::replace_emoji_images(&root, &document); + + let result = document.node_to_string(&p); + + assert_eq!(expected, result); + } + + #[test] + fn replace_emojis_1() { + replace_emojis( + "

      Let’s see if I did a better job of it this time by telling him he was using Arch wrong. \"😀\"/

      ", + "

      Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

      ", + ) + } + + #[test] + fn replace_emojis_2() { + replace_emojis( + "

      \"😀\"/ Abc

      ", + "

      😀 Abc

      ", + ) + } } From e01c8e9d34a28d646d26aad6e2a7a9dc093056c3 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 20:40:19 +0200 Subject: [PATCH 37/56] negative score for thumbnails with emoji alt --- article_scraper/src/full_text_parser/mod.rs | 1 + article_scraper/src/full_text_parser/tests.rs | 20 +++++++++ article_scraper/src/util.rs | 42 ++++++++++++------- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 0e63850..2857f73 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -533,6 +533,7 @@ impl FullTextParser { let score = score + Util::score_by_sibling(&img_node); let score = score + Util::score_by_dimensions(&img_node); let score = score + Util::score_by_position(len, index); + let score = score + Util::score_by_alt(&img_node); scores.insert(src, score); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 0f0370f..36ae0d0 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,3 +278,23 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } + +#[test] +fn extract_thumbnail_no_emoji() { + let html = r#" +

      I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

      +

      Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

      +

      I hope you enjoy it!

      +
      + +
      +

      And here’s the link I mention at the end: https://kde.org/community/donations 🙂

      + "#; + + let parser = Parser::default_html(); + let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx); + assert_eq!(thumb, None) +} diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 429378c..cbd5370 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -687,26 +687,28 @@ impl Util { for img_node in img_nodes { if let Some(img_alt) = img_node.get_attribute("alt") { - let mut alt_chars = img_alt.chars(); - let first_char = alt_chars.next(); - let second_char = alt_chars.next(); - - if let (Some(char), None) = (first_char, second_char) { - if unic_emoji_char::is_emoji(char) { - if let Some(mut parent) = img_node.get_parent() { - // if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) { - // _ = parent.replace_child_node(emoji_text_node, img_node); - // } - let emoji_text_node = - Node::new_text(&char.to_string(), document).unwrap(); - _ = parent.replace_child_node(emoji_text_node, img_node); - } + if Self::is_emoji(&img_alt) { + if let Some(mut parent) = img_node.get_parent() { + let emoji_text_node = Node::new_text(&img_alt, document).unwrap(); + _ = parent.replace_child_node(emoji_text_node, img_node); } } } } } + pub fn is_emoji(text: &str) -> bool { + let mut alt_chars = text.chars(); + let first_char = alt_chars.next(); + let second_char = alt_chars.next(); + + if let (Some(char), None) = (first_char, second_char) { + unic_emoji_char::is_emoji(char) + } else { + false + } + } + // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 
pub fn clean_conditionally(root: &mut Node, tag: &str) { @@ -1248,6 +1250,18 @@ impl Util { ((len as f32 / 2.0) - index as f32) as i32 } + pub fn score_by_alt(node: &Node) -> i32 { + if let Some(alt) = node.get_attribute("alt") { + if Self::is_emoji(&alt) { + -100 + } else { + 0 + } + } else { + 0 + } + } + pub fn get_content_length(response: &Response) -> Result { let status_code = response.status(); From df8ebcbb3552c2533fe47979e48378916929cd7f Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 22:06:48 +0200 Subject: [PATCH 38/56] treat iframes as valid emtry tags --- .../readability/embedded-videos/expected.html | 10 +-- .../tests/readability/engadget/expected.html | 4 +- .../readability/hukumusume/expected.html | 6 +- .../tests/readability/lemonde-1/expected.html | 4 +- .../readability/liberation-1/expected.html | 4 +- .../tests/readability/msn/expected.html | 2 +- .../tests/readability/qq/expected.html | 2 +- .../tests/readability/videos-1/expected.html | 84 ++++++++++++++----- .../tests/readability/videos-2/expected.html | 28 +++++-- .../tests/readability/yahoo-1/expected.html | 16 ++-- article_scraper/src/clean.rs | 19 +++++ article_scraper/src/constants.rs | 8 ++ article_scraper/src/full_text_parser/mod.rs | 6 +- .../src/full_text_parser/readability/tests.rs | 4 + article_scraper/src/full_text_parser/tests.rs | 20 ----- 15 files changed, 145 insertions(+), 72 deletions(-) diff --git a/article_scraper/resources/tests/readability/embedded-videos/expected.html b/article_scraper/resources/tests/readability/embedded-videos/expected.html index 690b431..6db4190 100644 --- a/article_scraper/resources/tests/readability/embedded-videos/expected.html +++ b/article_scraper/resources/tests/readability/embedded-videos/expected.html @@ -8,13 +8,13 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

      At root

      - - - + + +

      In a paragraph

      - +

      In a div

      - +

      Foo

      Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, diff --git a/article_scraper/resources/tests/readability/engadget/expected.html b/article_scraper/resources/tests/readability/engadget/expected.html index a112ae2..b9b0c8e 100644 --- a/article_scraper/resources/tests/readability/engadget/expected.html +++ b/article_scraper/resources/tests/readability/engadget/expected.html @@ -250,7 +250,9 @@ capable HDR 10 standard. That makes sense since it's more widely supported, but it would have been nice to see Dolby's, too.

      - +

      + +

      And speaking of Dolby technology, Microsoft is also highlighting Atmos support on the One X, just like it did with the One S. The company's app lets you diff --git a/article_scraper/resources/tests/readability/hukumusume/expected.html b/article_scraper/resources/tests/readability/hukumusume/expected.html index 5ecf93d..fbc27b0 100644 --- a/article_scraper/resources/tests/readability/hukumusume/expected.html +++ b/article_scraper/resources/tests/readability/hukumusume/expected.html @@ -6,7 +6,7 @@

      - @@ -80,7 +80,7 @@ @@ -109,7 +109,7 @@ - diff --git a/article_scraper/resources/tests/readability/lemonde-1/expected.html b/article_scraper/resources/tests/readability/lemonde-1/expected.html index e870560..53d1828 100644 --- a/article_scraper/resources/tests/readability/lemonde-1/expected.html +++ b/article_scraper/resources/tests/readability/lemonde-1/expected.html @@ -1,5 +1,7 @@
      - +

      + +

      Les députés ont, sans surprise, adopté à une large majorité (438 contre 86 et 42 abstentions) le projet de loi sur le renseignement défendu par le gouvernement lors d’un vote solennel, mardi 5 mai. Il sera désormais examiné par le Sénat, puis le Conseil constitutionnel, prochainement saisi par 75 députés. Dans un souci d'apaisement, François Hollande avait annoncé par avance qu'il saisirait les Sages.

      Revivez le direct du vote à l’Assemblée avec vos questions.

      Ont voté contre : 10 députés socialistes (sur 288), 35 UMP (sur 198), 11 écologistes (sur 18), 11 UDI (sur 30), 12 députés Front de gauche (sur 15) et 7 non-inscrits (sur 9). Le détail est disponible sur le site de l'Assemblée nationale.

      diff --git a/article_scraper/resources/tests/readability/liberation-1/expected.html b/article_scraper/resources/tests/readability/liberation-1/expected.html index 4b911de..2806896 100644 --- a/article_scraper/resources/tests/readability/liberation-1/expected.html +++ b/article_scraper/resources/tests/readability/liberation-1/expected.html @@ -8,7 +8,9 @@

      L’appareil, mis à disposition par Airbus, était arrivé à Katmandou mercredi matin avec 55 personnels de santé et humanitaires, ainsi que 25 tonnes de matériel (abris, médicaments, aide alimentaire). Un deuxième avion dépêché par Paris, qui était immobilisé aux Emirats depuis mardi avec 20 tonnes de matériel, est arrivé jeudi à Katmandou, dont le petit aéroport est engorgé par le trafic et l’afflux d’aide humanitaire. Il devait lui aussi ramener des Français, «les plus éprouvés» par la catastrophe et les «plus vulnérables (blessés, familles avec enfants)», selon le ministère des Affaires étrangères.

      2 209 Français ont été localisés sains et saufs tandis que 393 n’ont pas encore pu être joints, selon le Quai d’Orsay. Environ 400 Français ont demandé à être rapatriés dans les vols mis en place par la France.

      Le séisme a fait près de 5 500 morts et touche huit des 28 millions d’habitants du Népal. Des dizaines de milliers de personnes sont sans abri.

      - +

      + +

      \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/msn/expected.html b/article_scraper/resources/tests/readability/msn/expected.html index 157f0bd..597e831 100644 --- a/article_scraper/resources/tests/readability/msn/expected.html +++ b/article_scraper/resources/tests/readability/msn/expected.html @@ -16,7 +16,7 @@

      The name and basic idea might sound like one of those endless score attack games like "Temple Run," but that's not the case. "Super Mario Run" is divided into hand-crafted levels with a clear end-point like any other Mario game, meaning you're essentially getting the Mario experience for $10 without needing to control his movement.

      $10 might seem like a bit much compared to the $0 people pay for most mobile games, but it's possible the game has $10 worth of levels to play in it. It's also not iPhone exclusive, but the Android version will launch at a later, currently unknown date.

      To see "Super Mario Run" in action, check out the footage below:

      - +
      \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/qq/expected.html b/article_scraper/resources/tests/readability/qq/expected.html index baad181..20914e1 100644 --- a/article_scraper/resources/tests/readability/qq/expected.html +++ b/article_scraper/resources/tests/readability/qq/expected.html @@ -28,7 +28,7 @@

      转播到腾讯微博

      - + diff --git a/article_scraper/resources/tests/readability/videos-1/expected.html b/article_scraper/resources/tests/readability/videos-1/expected.html index e469e3d..1bd350e 100644 --- a/article_scraper/resources/tests/readability/videos-1/expected.html +++ b/article_scraper/resources/tests/readability/videos-1/expected.html @@ -11,7 +11,9 @@

      21) Star Wars: The Last Jedi

      - +
      + +

      I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic.

      @@ -21,7 +23,9 @@

      20) Faces Places

      - +
      + +

      The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center.

      @@ -32,7 +36,9 @@

      19) Ingrid Goes West

      - +
      + +

      Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison.

      @@ -42,7 +48,9 @@

      18) Lady Macbeth

      - +
      + +

      Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end.

      @@ -52,7 +60,9 @@

      17) BPM (Beats Per Minute)

      - +
      + +

      BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating.

      @@ -62,7 +72,9 @@

      16) The Big Sick

      - +
      + +

      Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love.

      @@ -72,7 +84,9 @@

      15) Mother!

      - +
      + +

      There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017.

      @@ -82,7 +96,9 @@

      14) A Ghost Story

      - +
      + +

      Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place.

      @@ -92,7 +108,9 @@

      13) The Square

      - +
      + +
      @@ -102,7 +120,9 @@

      12) Dunkirk

      - +
      + +

      Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies.

      @@ -112,7 +132,9 @@

      11) Rat Film

      - +
      + +

      Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up.

      @@ -122,7 +144,9 @@

      10) A Quiet Passion

      - +
      + +

      A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet.

      @@ -132,7 +156,9 @@

      9) Columbus

      - +
      + +

      Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us.

      @@ -142,7 +168,9 @@

      8) The Florida Project

      - +
      + +

      Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations.

      @@ -152,7 +180,9 @@

      7) Call Me by Your Name

      - +
      + +

      Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul.

      @@ -162,7 +192,9 @@

      6) Personal Shopper

      - +
      + +

      In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.)

      @@ -172,7 +204,9 @@

      5) Princess Cyd

      - +
      + +

      Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle.

      @@ -182,7 +216,9 @@

      4) Get Out

      - +
      + +

      Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor.

      @@ -192,7 +228,9 @@

      3) The Work

      - +
      + +

      The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go.

      @@ -202,7 +240,9 @@

      2) Ex Libris

      - +
      + +

      Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better.

      @@ -212,7 +252,9 @@

      1) Lady Bird

      - +
      + +

      Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?”

      diff --git a/article_scraper/resources/tests/readability/videos-2/expected.html b/article_scraper/resources/tests/readability/videos-2/expected.html index f2fe95f..f5f2c68 100644 --- a/article_scraper/resources/tests/readability/videos-2/expected.html +++ b/article_scraper/resources/tests/readability/videos-2/expected.html @@ -8,7 +8,9 @@

      Vape Wave (documentaire, 1h28, Planète+)

      - +

      + +

      Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée.

      @@ -21,7 +23,9 @@

      Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative)

      - +

      + +

      Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.)

      @@ -31,7 +35,9 @@

      The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix)

      - +

      + +

      Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente.

      @@ -44,7 +50,9 @@

      Alphonse President (série, 10x26, OCS Max)

      - +

      + +

      Un temps baptisée French Touch, la série Alphonse Président est le dernier né des programmes originaux made in OCS. On savait les budgets de la chaîne bien moins généreux que ceux de Canal+ (voire que ceux de France 3 Limousin), et cette série le prouve à nouveau régulièrement, notamment lors d’une scène de conférence de presse alternant plans larges d’une authentique conf' à l’Elysée période François Hollande et plans serrés d’acteurs filmés dans un château des Pays de la Loire où a eu lieu le tournage. Le principal atout (et quel atout) de cette série écrite et réalisée par Nicolas Castro (Des lendemains qui chantent, 2014) réside dans son interprète principal, Michel Vuillermoz.

      @@ -57,7 +65,9 @@

      Jim & Andy (documentaire, 1h33, Netflix) 

      - +

      + +

      A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman.

      @@ -70,7 +80,9 @@

      Braguino (documentaire, 50min, Arte)

      - +

      + +

      La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer.

      @@ -83,7 +95,9 @@

      6 Days (film, 1h34, Netflix)

      - +

      + +

      Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade.

      diff --git a/article_scraper/resources/tests/readability/yahoo-1/expected.html b/article_scraper/resources/tests/readability/yahoo-1/expected.html index 9d547d4..b5d5602 100644 --- a/article_scraper/resources/tests/readability/yahoo-1/expected.html +++ b/article_scraper/resources/tests/readability/yahoo-1/expected.html @@ -11,28 +11,28 @@

      Virtual reality has officially reached the consoles. And it’s pretty good! Sony’s PlayStation VR is extremely comfortable and reasonably priced, and while it’s lacking killer apps, it’s loaded with lots of interesting ones.

      But which ones should you buy? I’ve played just about every launch game, and while some are worth your time, others you might want to skip. To help you decide what’s what, I’ve put together this list of the eight PSVR games worth considering.

      “Rez Infinite” ($30)

      - +

      Beloved cult hit “Rez” gets the VR treatment to help launch the PSVR, and the results are terrific. It includes a fully remastered take on the original “Rez” – you zoom through a Matrix-like computer system, shooting down enemies to the steady beat of thumping electronica – but the VR setting makes it incredibly immersive. It gets better the more you play it, too; unlock the amazing Area X mode and you’ll find yourself flying, shooting and bobbing your head to some of the trippiest visuals yet seen in VR.

      “Thumper” ($20)

      - +

      What would happen if Tron, the board game Simon, a Clown beetle, Cthulhu and a noise band met in VR? Chaos, for sure, and also “Thumper.” Called a “violent rhythm game” by its creators, “Thumper” is, well, a violent rhythm game that’s also a gorgeous, unsettling and totally captivating assault on the senses. With simple controls and a straightforward premise – click the X button and the analog stick in time with the music as you barrel down a neon highway — it’s one of the rare games that works equally well both in and out of VR. But since you have PSVR, play it there. It’s marvelous.

      “Until Dawn: Rush of Blood” ($20)

      - +

      Cheeky horror game “Until Dawn” was a breakout hit for the PS4 last year, channeling the classic “dumb teens in the woods” horror trope into an effective interactive drama. Well, forget all that if you fire up “Rush of Blood,” because this one sticks you front and center on a rollercoaster ride from Hell. Literally. You ride through a dimly-lit carnival of terror, dual-wielding pistols as you take down targets, hideous pig monsters and, naturally, maniac clowns. Be warned: If the bad guys don’t get you, the jump scares will.

      “Headmaster” ($20)

      - +

      Soccer meets “Portal” in the weird (and weirdly fun) “Headmaster,” a game about heading soccer balls into nets, targets and a variety of other things while stuck in some diabolical training facility. While at first it seems a little basic, increasingly challenging shots and a consistently entertaining narrative keep it from running off the pitch. Funny, ridiculous and as easy as literally moving your head back and forth, it’s a pleasant PSVR surprise.

      “RIGS: Mechanized Combat League” ($50)

      - +

      Giant mechs + sports? That’s the gist of this robotic blast-a-thon, which pits two teams of three against one another in gorgeous, explosive and downright fun VR combat. At its best, “RIGS” marries the thrill of fast-paced competitive shooters with the insanity of piloting a giant mech in VR. It can, however, be one of the barfier PSVR games. So pack your Dramamine, you’re going to have to ease yourself into this one.

      “Batman Arkham VR” ($20)

      - +

      “I’m Batman,” you will say. And you’ll actually be right this time, because you are Batman in this detective yarn, and you know this because you actually grab the famous cowl and mask, stick it on your head, and stare into the mirrored reflection of Rocksteady Games’ impressive Dark Knight character model. It lacks the action of its fellow “Arkham” games and runs disappointingly short, but it’s a high-quality experience that really shows off how powerfully immersive VR can be.

      “Job Simulator” ($30)

      - +

      There are a number of good VR ports in the PSVR launch lineup, but the HTC Vive launch game “Job Simulator” might be the best. Your task? Lots of tasks, actually, from cooking food to fixing cars to working in an office, all for robots, because did I mention you were in the future? Infinitely charming and surprisingly challenging, it’s a great showpiece for VR.

      “Eve Valkyrie” ($60)

      - +

      Already a hit on the Oculus Rift, this space dogfighting game was one of the first to really show off how VR can turn a traditional game experience into something special. It’s pricey and not quite as hi-res as the Rift version, but “Eve Valkyrie” does an admirable job filling the void left since “Battlestar Galactica” ended. Too bad there aren’t any Cylons in it (or are there?)

      More games news:

        diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 7ab49e8..140ff5b 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -160,4 +160,23 @@ mod tests { Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg") ) } + + #[test] + fn pointieststick() { + let html = r#" +

        I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

        +

        Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

        +

        I hope you enjoy it!

        +
        + +
        +

        And here’s the link I mention at the end: https://kde.org/community/donations 🙂

        + "#; + + let url = Url::parse("https://pointieststick.com").unwrap(); + let res = clean_html_fragment(html, &url).unwrap(); + + assert_eq!(res.thumbnail, None); + assert!(res.html.contains("iframe")); + } } diff --git a/article_scraper/src/constants.rs b/article_scraper/src/constants.rs index 79823f8..adf6df2 100644 --- a/article_scraper/src/constants.rs +++ b/article_scraper/src/constants.rs @@ -141,6 +141,14 @@ pub static DIV_TO_P_ELEMS: Lazy> = Lazy::new(|| { pub static VALID_EMPTY_TAGS: Lazy> = Lazy::new(|| { HashSet::from([ "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK", + "IFRAME", + ]) +}); + +pub static VALID_SELF_CLOSING_TAGS: Lazy> = Lazy::new(|| { + HashSet::from([ + "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "INPUT", "LINK", "META", "PARAM", + "SOURCE", "TRACK", "WBR", ]) }); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 2857f73..fece84e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -1178,15 +1178,15 @@ impl FullTextParser { } } - fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { + pub(crate) fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { // search document for empty tags and add a empty text node as child // this prevents libxml from self closing non void elements such as iframe let xpath = "//*[not(node())]"; let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - let name = node.get_name().to_lowercase(); - if name == "meta" || name == "img" || name == "br" { + let name = node.get_name().to_uppercase(); + if constants::VALID_SELF_CLOSING_TAGS.contains(name.as_str()) { continue; } diff --git a/article_scraper/src/full_text_parser/readability/tests.rs b/article_scraper/src/full_text_parser/readability/tests.rs index 32667c7..562313f 100644 --- 
a/article_scraper/src/full_text_parser/readability/tests.rs +++ b/article_scraper/src/full_text_parser/readability/tests.rs @@ -39,6 +39,10 @@ async fn run_test(name: &str) { metadata::extract(&xpath_ctx, None, None, &mut article); super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap(); + + let article_ctx = crate::FullTextParser::get_xpath_ctx(&article_document).unwrap(); + + crate::FullTextParser::prevent_self_closing_tags(&article_ctx).unwrap(); crate::FullTextParser::post_process_document(&article_document).unwrap(); let html = Util::serialize_node(&article_document, &root); diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 36ae0d0..0f0370f 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,23 +278,3 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } - -#[test] -fn extract_thumbnail_no_emoji() { - let html = r#" -

        I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

        -

        Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

        -

        I hope you enjoy it!

        -
        - -
        -

        And here’s the link I mention at the end: https://kde.org/community/donations 🙂

        - "#; - - let parser = Parser::default_html(); - let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap(); - let ctx = Context::new(&doc).unwrap(); - - let thumb = FullTextParser::check_for_thumbnail(&ctx); - assert_eq!(thumb, None) -} From f4e4e64b9e4c250b3b2f9319aa54797d7dd555dc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 22:27:10 +0200 Subject: [PATCH 39/56] absolute default size for embedded youtube videos --- .../resources/tests/ftr/youtube/expected.html | 2 +- .../readability/embedded-videos/expected.html | 4 +- .../tests/readability/engadget/expected.html | 2 +- .../tests/readability/msn/expected.html | 2 +- .../tests/readability/videos-1/expected.html | 42 +++++++++---------- .../tests/readability/videos-2/expected.html | 12 +++--- .../tests/readability/yahoo-1/expected.html | 16 +++---- article_scraper/src/clean.rs | 1 + article_scraper/src/full_text_parser/mod.rs | 6 +-- 9 files changed, 44 insertions(+), 43 deletions(-) diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html index 570905a..e05d2c2 100644 --- a/article_scraper/resources/tests/ftr/youtube/expected.html +++ b/article_scraper/resources/tests/ftr/youtube/expected.html @@ -1 +1 @@ -
        \ No newline at end of file +
        \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/embedded-videos/expected.html b/article_scraper/resources/tests/readability/embedded-videos/expected.html index 6db4190..c520e7f 100644 --- a/article_scraper/resources/tests/readability/embedded-videos/expected.html +++ b/article_scraper/resources/tests/readability/embedded-videos/expected.html @@ -8,13 +8,13 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

        At root

        - +

        In a paragraph

        In a div

        -
        +

        Foo

        Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, diff --git a/article_scraper/resources/tests/readability/engadget/expected.html b/article_scraper/resources/tests/readability/engadget/expected.html index b9b0c8e..ed29aa5 100644 --- a/article_scraper/resources/tests/readability/engadget/expected.html +++ b/article_scraper/resources/tests/readability/engadget/expected.html @@ -251,7 +251,7 @@ more widely supported, but it would have been nice to see Dolby's, too.

        - +

        And speaking of Dolby technology, Microsoft is also highlighting Atmos support on the One X, just like diff --git a/article_scraper/resources/tests/readability/msn/expected.html b/article_scraper/resources/tests/readability/msn/expected.html index 597e831..38ea173 100644 --- a/article_scraper/resources/tests/readability/msn/expected.html +++ b/article_scraper/resources/tests/readability/msn/expected.html @@ -16,7 +16,7 @@

        The name and basic idea might sound like one of those endless score attack games like "Temple Run," but that's not the case. "Super Mario Run" is divided into hand-crafted levels with a clear end-point like any other Mario game, meaning you're essentially getting the Mario experience for $10 without needing to control his movement.

        $10 might seem like a bit much compared to the $0 people pay for most mobile games, but it's possible the game has $10 worth of levels to play in it. It's also not iPhone exclusive, but the Android version will launch at a later, currently unknown date.

        To see "Super Mario Run" in action, check out the footage below:

        -
        +
        \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/videos-1/expected.html b/article_scraper/resources/tests/readability/videos-1/expected.html index 1bd350e..0c6fd6f 100644 --- a/article_scraper/resources/tests/readability/videos-1/expected.html +++ b/article_scraper/resources/tests/readability/videos-1/expected.html @@ -12,7 +12,7 @@ 21) Star Wars: The Last Jedi
        - +

        I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic. @@ -24,7 +24,7 @@ 20) Faces Places

        - +

        The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center. @@ -37,7 +37,7 @@ 19) Ingrid Goes West

        - +

        Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison. @@ -49,7 +49,7 @@ 18) Lady Macbeth

        - +

        Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end. @@ -61,7 +61,7 @@ 17) BPM (Beats Per Minute)

        - +

        BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating. @@ -73,7 +73,7 @@ 16) The Big Sick

        - +

        Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love. @@ -85,7 +85,7 @@ 15) Mother!

        - +

        There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017. @@ -97,7 +97,7 @@ 14) A Ghost Story

        - +

        Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place. @@ -109,7 +109,7 @@ 13) The Square

        - +
        - +

        Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies. @@ -133,7 +133,7 @@ 11) Rat Film

        - +

        Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up. @@ -145,7 +145,7 @@ 10) A Quiet Passion

        - +

        A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet. @@ -157,7 +157,7 @@ 9) Columbus

        - +

        Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us. @@ -169,7 +169,7 @@ 8) The Florida Project

        - +

        Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations. @@ -181,7 +181,7 @@ 7) Call Me by Your Name

        - +

        Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul. @@ -193,7 +193,7 @@ 6) Personal Shopper

        - +

        In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.) @@ -205,7 +205,7 @@ 5) Princess Cyd

        - +

        Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle. @@ -217,7 +217,7 @@ 4) Get Out

        - +

        Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor. @@ -229,7 +229,7 @@ 3) The Work

        - +

        The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go. @@ -241,7 +241,7 @@ 2) Ex Libris

        - +

        Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better. @@ -253,7 +253,7 @@ 1) Lady Bird

        - +

        Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?” diff --git a/article_scraper/resources/tests/readability/videos-2/expected.html b/article_scraper/resources/tests/readability/videos-2/expected.html index f5f2c68..abdf514 100644 --- a/article_scraper/resources/tests/readability/videos-2/expected.html +++ b/article_scraper/resources/tests/readability/videos-2/expected.html @@ -9,7 +9,7 @@ Vape Wave (documentaire, 1h28, Planète+)

        - +

        Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée. @@ -24,7 +24,7 @@ Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative)

        - +

        Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.) @@ -36,7 +36,7 @@ The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix)

        - +

        Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente. @@ -66,7 +66,7 @@ Jim & Andy (documentaire, 1h33, Netflix) 

        - +

        A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman. @@ -81,7 +81,7 @@ Braguino (documentaire, 50min, Arte)

        - +

        La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer. @@ -96,7 +96,7 @@ 6 Days (film, 1h34, Netflix)

        - +

        Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade. diff --git a/article_scraper/resources/tests/readability/yahoo-1/expected.html b/article_scraper/resources/tests/readability/yahoo-1/expected.html index b5d5602..a2f2954 100644 --- a/article_scraper/resources/tests/readability/yahoo-1/expected.html +++ b/article_scraper/resources/tests/readability/yahoo-1/expected.html @@ -11,28 +11,28 @@

        Virtual reality has officially reached the consoles. And it’s pretty good! Sony’s PlayStation VR is extremely comfortable and reasonably priced, and while it’s lacking killer apps, it’s loaded with lots of interesting ones.

        But which ones should you buy? I’ve played just about every launch game, and while some are worth your time, others you might want to skip. To help you decide what’s what, I’ve put together this list of the eight PSVR games worth considering.

        “Rez Infinite” ($30)

        -
        +

        Beloved cult hit “Rez” gets the VR treatment to help launch the PSVR, and the results are terrific. It includes a fully remastered take on the original “Rez” – you zoom through a Matrix-like computer system, shooting down enemies to the steady beat of thumping electronica – but the VR setting makes it incredibly immersive. It gets better the more you play it, too; unlock the amazing Area X mode and you’ll find yourself flying, shooting and bobbing your head to some of the trippiest visuals yet seen in VR.

        “Thumper” ($20)

        -
        +

        What would happen if Tron, the board game Simon, a Clown beetle, Cthulhu and a noise band met in VR? Chaos, for sure, and also “Thumper.” Called a “violent rhythm game” by its creators, “Thumper” is, well, a violent rhythm game that’s also a gorgeous, unsettling and totally captivating assault on the senses. With simple controls and a straightforward premise – click the X button and the analog stick in time with the music as you barrel down a neon highway — it’s one of the rare games that works equally well both in and out of VR. But since you have PSVR, play it there. It’s marvelous.

        “Until Dawn: Rush of Blood” ($20)

        -
        +

        Cheeky horror game “Until Dawn” was a breakout hit for the PS4 last year, channeling the classic “dumb teens in the woods” horror trope into an effective interactive drama. Well, forget all that if you fire up “Rush of Blood,” because this one sticks you front and center on a rollercoaster ride from Hell. Literally. You ride through a dimly-lit carnival of terror, dual-wielding pistols as you take down targets, hideous pig monsters and, naturally, maniac clowns. Be warned: If the bad guys don’t get you, the jump scares will.

        “Headmaster” ($20)

        -
        +

        Soccer meets “Portal” in the weird (and weirdly fun) “Headmaster,” a game about heading soccer balls into nets, targets and a variety of other things while stuck in some diabolical training facility. While at first it seems a little basic, increasingly challenging shots and a consistently entertaining narrative keep it from running off the pitch. Funny, ridiculous and as easy as literally moving your head back and forth, it’s a pleasant PSVR surprise.

        “RIGS: Mechanized Combat League” ($50)

        -
        +

        Giant mechs + sports? That’s the gist of this robotic blast-a-thon, which pits two teams of three against one another in gorgeous, explosive and downright fun VR combat. At its best, “RIGS” marries the thrill of fast-paced competitive shooters with the insanity of piloting a giant mech in VR. It can, however, be one of the barfier PSVR games. So pack your Dramamine, you’re going to have to ease yourself into this one.

        “Batman Arkham VR” ($20)

        -
        +

        “I’m Batman,” you will say. And you’ll actually be right this time, because you are Batman in this detective yarn, and you know this because you actually grab the famous cowl and mask, stick it on your head, and stare into the mirrored reflection of Rocksteady Games’ impressive Dark Knight character model. It lacks the action of its fellow “Arkham” games and runs disappointingly short, but it’s a high-quality experience that really shows off how powerfully immersive VR can be.

        “Job Simulator” ($30)

        -
        +

        There are a number of good VR ports in the PSVR launch lineup, but the HTC Vive launch game “Job Simulator” might be the best. Your task? Lots of tasks, actually, from cooking food to fixing cars to working in an office, all for robots, because did I mention you were in the future? Infinitely charming and surprisingly challenging, it’s a great showpiece for VR.

        “Eve Valkyrie” ($60)

        -
        +

        Already a hit on the Oculus Rift, this space dogfighting game was one of the first to really show off how VR can turn a traditional game experience into something special. It’s pricey and not quite as hi-res as the Rift version, but “Eve Valkyrie” does an admirable job filling the void left since “Battlestar Galactica” ended. Too bad there aren’t any Cylons in it (or are there?)

        More games news:

          diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 140ff5b..e44f421 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -47,6 +47,7 @@ pub fn clean_html_fragment( if let Some(mut root) = document.get_root_element() { FullTextParser::post_process_page(&mut root)?; } + FullTextParser::prevent_self_closing_tags(&xpath_ctx)?; FullTextParser::post_process_document(&document)?; let content_node = if let Some(root) = document.get_root_element() { diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index fece84e..16868b2 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -690,9 +690,9 @@ impl FullTextParser { let success = video_wrapper .set_property("class", "videoWrapper") .ok() - .and_then(|()| node.set_property("width", "100%").ok()) - .and_then(|()| node.set_property("height", "400").ok()) - .and_then(|()| node.remove_attribute("aspect-ratio").ok()) + .and_then(|()| node.set_property("width", "480").ok()) + .and_then(|()| node.set_property("height", "360").ok()) + .and_then(|()| node.set_property("aspect-ratio", "auto").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) From c16e11fdda7d7442f91baa18134e68271240a282 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 6 Jul 2024 23:38:43 +0200 Subject: [PATCH 40/56] init parser according to (https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety) --- article_scraper/src/full_text_parser/mod.rs | 13 ++++++++----- article_scraper/src/full_text_parser/tests.rs | 6 +++--- article_scraper/src/images/image_data.rs | 1 - article_scraper/src/images/mod.rs | 11 +++-------- article_scraper/src/images/request.rs | 1 - article_scraper/src/util.rs | 7 ++----- 6 files changed, 16 insertions(+), 23 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 
16868b2..37a5f32 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -264,8 +264,7 @@ impl FullTextParser { } // parse html - let parser = Parser::default_html(); - Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| { + Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) @@ -278,7 +277,7 @@ impl FullTextParser { /// - /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { - if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) { + if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. // Or, if the value can be safely represented as a 32-bit signed integer. Ok(value as i32) @@ -290,8 +289,12 @@ impl FullTextParser { pub(crate) fn parse_html_string_patched( input: &str, - parser: &Parser, ) -> Result { + unsafe { + // https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety + libxml::bindings::xmlInitParser(); + } + let parser = Parser::default_html(); let input_bytes: &[u8] = input.as_ref(); let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char; let input_len = Self::try_usize_to_i32(input_bytes.len())?; @@ -488,7 +491,7 @@ impl FullTextParser { } pub fn thumbnail_from_html(html: &str) -> Option { - if let Ok(doc) = Parser::default_html().parse_string(html) { + if let Ok(doc) = Self::parse_html_string_patched(html) { if let Ok(ctx) = Self::get_xpath_ctx(&doc) { return Self::check_for_thumbnail(&ctx); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 0f0370f..99a5235 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -1,5 +1,5 @@ use 
super::{config::ConfigEntry, FullTextParser}; -use libxml::{parser::Parser, tree::SaveOptions, xpath::Context}; +use libxml::{tree::SaveOptions, xpath::Context}; use reqwest::{Client, Url}; async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) { @@ -194,7 +194,7 @@ herausgebracht. (Fortschritt, Wissenschaft) "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); @@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); diff --git a/article_scraper/src/images/image_data.rs b/article_scraper/src/images/image_data.rs index 2095f27..b26cfec 100644 --- a/article_scraper/src/images/image_data.rs +++ b/article_scraper/src/images/image_data.rs @@ -2,7 +2,6 @@ pub struct ImageData { pub url: String, pub data: Vec, - pub content_length: usize, pub content_type: String, } diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 6d97fd8..de0f48f 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError; use self::image_data::ImageDataBase64; use self::pair::Pair; use self::request::ImageRequest; -use crate::constants; use crate::util::Util; +use crate::{constants, FullTextParser}; use base64::Engine; use futures::StreamExt; use image::ImageFormat; -use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; use libxml::xpath::Context; pub use progress::Progress; @@ -162,9 +161,7 @@ impl ImageDownloader { html: &str, downloaded_images: Vec>, ) -> Result { - let parser = Parser::default_html(); - let 
doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { @@ -207,9 +204,7 @@ impl ImageDownloader { } fn harvest_image_urls_from_html(html: &str) -> Result>, ImageDownloadError> { - let parser = Parser::default_html(); - let doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index b7086ce..fe9adf0 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -48,7 +48,6 @@ impl ImageRequest { Ok(ImageData { url: self.url, data: result, - content_length: self.content_length, content_type: self.content_type, }) } diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index cbd5370..df76ced 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1299,13 +1299,11 @@ impl Util { mod tests { use super::Util; use crate::FullTextParser; - use libxml::parser::Parser; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let div = body.get_first_child().unwrap(); @@ -1346,8 +1344,7 @@ mod tests { fn replace_emojis(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = 
document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let p = body.get_first_child().unwrap(); From 6932902b7b2c403ec8c23f4bd167f72dbb042ecf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 6 Jul 2024 23:43:23 +0200 Subject: [PATCH 41/56] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c6a7149..5a57616 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.77 + image: rust:1.79 before_script: - rustup component add rustfmt - rustup component add clippy From b3ce28632dab8678ae04789aeae76262283b1bb0 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 10 Jul 2024 11:59:21 +0200 Subject: [PATCH 42/56] update submodule --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index 737398e..e9112fc 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit 737398ef6b121db2d72042b5406a95dfd497113f +Subproject commit e9112fc55800cae00ca70f4c38248a3ef4228861 From 11ee29fedaba674ee615b977fc97d332645f4d0e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 2 Nov 2024 11:30:29 +0100 Subject: [PATCH 43/56] thumbnail: check for attribute with name property as well (fixes #4) --- .../resources/tests/thumbnails/a-chacon.html | 808 ++++++++++++++++++ article_scraper/src/full_text_parser/mod.rs | 18 +- article_scraper/src/full_text_parser/tests.rs | 14 + 3 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 article_scraper/resources/tests/thumbnails/a-chacon.html diff --git a/article_scraper/resources/tests/thumbnails/a-chacon.html b/article_scraper/resources/tests/thumbnails/a-chacon.html new file mode 100644 index 0000000..3e1bc5e --- /dev/null +++ 
b/article_scraper/resources/tests/thumbnails/a-chacon.html @@ -0,0 +1,808 @@ + + + + + + + + + + + +PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only. | a-chacon + + + + + + + + + + + + + + + + + + + + + + +
          +

          Building an API with Rails? Discover + + OasRails, a Rails engine for generating automatic interactive documentation.

          +
          + + + + + + + + +
          +
          + + +

          PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only.

          + + + + PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only. + +
          +

          Como ya saben, una de las funcionalidades nuevas de Rails 8 es el nuevo generador básico de autenticación que viene a demostrar que no es tan complejo desarrollar todo lo que respecta a autenticación en una aplicación con Rails y que muchas veces no es necesario depender de terceros (gemas). La discusión comenzó aquí.

          + +

          Dicho esto, veamos qué pasa usando el generador en una aplicación API-Only:

          + +
           rails -v
          +Rails 8.0.0.beta1
          +
          + +
           rails new app --api & cd app
          +
          + +

          Y ejecutamos el nuevo comando:

          + +
           rails g authentication
          +      create  app/models/session.rb
          +      create  app/models/user.rb
          +      create  app/models/current.rb
          +      create  app/controllers/sessions_controller.rb
          +      create  app/controllers/concerns/authentication.rb
          +      create  app/controllers/passwords_controller.rb
          +      create  app/mailers/passwords_mailer.rb
          +      create  app/views/passwords_mailer/reset.html.erb
          +      create  app/views/passwords_mailer/reset.text.erb
          +      create  test/mailers/previews/passwords_mailer_preview.rb
          +        gsub  app/controllers/application_controller.rb
          +       route  resources :passwords, param: :token
          +       route  resource :session
          +        gsub  Gemfile
          +      bundle  install --quiet
          +    generate  migration CreateUsers email_address:string!:uniq password_digest:string! --force
          +       rails  generate migration CreateUsers email_address:string!:uniq password_digest:string! --force
          +      invoke  active_record
          +      create    db/migrate/20241016002139_create_users.rb
          +    generate  migration CreateSessions user:references ip_address:string user_agent:string --force
          +       rails  generate migration CreateSessions user:references ip_address:string user_agent:string --force
          +      invoke  active_record
          +      create    db/migrate/20241016002140_create_sessions.rb
          +
          + +

          Ok, ahora por ejemplo, si revisamos SessionsController veremos que el método de Login se ve de la siguiente forma:

          + +
            def create
          +    if user = User.authenticate_by(params.permit(:email_address, :password))
          +      start_new_session_for user
          +      redirect_to after_authentication_url
          +    else
          +      redirect_to new_session_url, alert: "Try another email address or password."
          +    end
          +  end
          +
          + +

          O sea, redirecciona a rutas y/o vistas que en nuestra API no existen ni tienen sentido, y además si inspeccionamos el método start_new_session_for nos daremos cuenta de que el sistema está basado 100% en autenticación mediante cookies. Entonces, ¿qué hacemos?

          + +

          Mi propuesta es la siguiente: el generador crea las bases para la autenticación y creo que funciona bastante bien, por lo que con unas pequeñas modificaciones podemos dejar funcionando una autenticación Bearer (Token Authentication) rápidamente en nuestra API con Rails 8 más los archivos ya generados.

          + +

          El primer paso será agregar persistencia para nuestro token, para esto modificaremos la migración que crea las sesiones y agregaremos un nuevo campo llamado token:

          + +
              create_table :sessions do |t|
          +      t.references :user, null: false, foreign_key: true
          +      t.string :ip_address
          +      t.string :user_agent
          +      t.string :token     # HERE
          +
          +      t.timestamps
          +    end
          +
          + +

          Ahora simplemente ejecuta rails db:migrate y créate un usuario de prueba por consola, yo lo haré con esta línea User.create(email_address: "[email protected]", password: "123456789") (Lo utilizaremos más tarde). Luego debemos crear un nuevo token para cada sesión nueva de un usuario, para esto lo más simple es usar un callback en el modelo Session:

          + +
          # app/models/session.rb
          +class Session < ApplicationRecord
          +  belongs_to :user
          +  before_create :generate_token # Here call
          +
          +  private
          +  def generate_token # Here implement, generate the token as you wish.
          +    self.token = Digest::SHA1.hexdigest([ Time.now, rand ].join)
          +  end
          +end
          +
          + +

          Ahora volviendo al método start_new_session_for en el concern Authentication, no es necesario que creemos una cookie, así que debemos remover esa línea y dejar el método de la siguiente forma:

          + +
          # app/controllers/concerns/authentication.rb
          +def start_new_session_for(user)
          +  user.sessions.create!(user_agent: request.user_agent, ip_address: request.remote_ip).tap do |session|
          +    Current.session = session
          +  end
          +end
          +
          + +

          Y modificaremos el create de SessionsController para que las respuestas sean en formato json y no redirecciones:

          + +
          # app/controllers/sessions_controller.rb
          +def create
          +  if user = User.authenticate_by(params.permit(:email_address, :password))
          +    start_new_session_for user
          +    render json: { data: { token: Current.session.token  } }
          +  else
          +    render json: {}, status: :unauthorized
          +  end
          +end
          +
          + +

          Para hacer que todo esto funcione debemos hacer dos cosas:

          + +
            +
          1. +

            Incluir el módulo Authentication en ApplicationController:

            + +
            # app/controllers/application_controller.rb
            +class ApplicationController < ActionController::API
            +  include Authentication
            +end
            +
            +
          2. +
          3. +

            Eliminar la línea número 6 de este mismo concern:

            + +
            # app/controllers/concerns/authentication.rb
            +  included do
            +    before_action :require_authentication
            +    helper_method :authenticated? # This, we don't use helpers in APIs
            +  end
            +
            +
          4. +
          + +

          Hasta este punto ya deberíamos tener el login funcionando. Para probar esto voy a agregar OasRails, que a propósito ya está funcionando con Rails 8 y voy a enviar un par de peticiones a ver cómo se comporta, no explicaré cómo implementar OasRails, para eso puedes ver el repositorio o leer más en este post.

          + +

          Inicio de sesión exitoso:

          + +

          + +

          Inicio de sesión fallido:

          + +

          + +
          + +

          Ya podemos generar tokens, ahora modificaremos el código para autenticarnos con ese mismo token. Para eso, cambiaremos la lógica de buscar la sesión actual del usuario con base en la cookie a buscarla basándonos en la cabecera Authorization:

          + +
          
          +# app/controllers/concerns/authentication.rb
          +  def resume_session
          +    Current.session = find_session_by_token
          +  end
          +
          +  def find_session_by_cookie
          +    Session.find_by(token: request.headers[:authorization]&.split(" ")[-1])
          +  end
          +
          + +

          Para probar esto creo que tendremos que hacer rápidamente un modelo que dependa de User y que requiera autenticación para utilizar. Intentemos con rails g scaffold project title:string description:text user:references y le agregamos al principio del controlador la línea de código before_action :require_authentication.

          + +

          Aquí les dejo una pequeña prueba del index de Projects autenticado con el token que obtuve en las pruebas anteriores:

          + +

          + +
          + +

          Con esto ya tienes gran parte de la lógica de autenticación funcionando en la aplicación API-Only. Te queda continuar con las modificaciones en el resto de los endpoints para que las respuestas sean en formato json y no supuestas vistas que no existen en la aplicación.

          + +

          Probablemente de aquí a que se lance la versión final de Rails 8 aparezca un PR solucionando esto y el generador funcione correctamente en modo API-Only. Hasta entonces, con estas pequeñas modificaciones ya puedes seguir construyendo tu API.

          + +
          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          +
          + + + + + + + + diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 37a5f32..a45b36d 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. @@ -514,6 +514,22 @@ impl FullTextParser { return Some(thumb); } + if let Ok(thumb) = Util::get_attribute( + context, + "//meta[contains(@property, 'twitter:image')]", + "content", + ) { + return Some(thumb); + } + + if let Ok(thumb) = Util::get_attribute( + context, + "//meta[contains(@property, 'og:image')]", + "content", + ) { + return Some(thumb); + } + if let Ok(thumb) = Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 99a5235..b111fcd 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,3 +278,17 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } + +#[test] +fn extract_thumbnail_a_chacon() { + let html = std::fs::read_to_string(format!("./resources/tests/thumbnails/a-chacon.html")) + .expect("Failed to read source HTML"); + let doc = FullTextParser::parse_html_string_patched(&html).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); + assert_eq!( + thumb, + 
"https://a-chacon.com/assets/images/rails8-poc-api-auth.webp" + ) +} From 7fcb781c6819528893fcbaa414a57ce15bc51125 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 2 Nov 2024 11:34:47 +0100 Subject: [PATCH 44/56] remove useless format! --- article_scraper/src/full_text_parser/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index b111fcd..8921ce9 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -281,7 +281,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo #[test] fn extract_thumbnail_a_chacon() { - let html = std::fs::read_to_string(format!("./resources/tests/thumbnails/a-chacon.html")) + let html = std::fs::read_to_string("./resources/tests/thumbnails/a-chacon.html") .expect("Failed to read source HTML"); let doc = FullTextParser::parse_html_string_patched(&html).unwrap(); let ctx = Context::new(&doc).unwrap(); From 89eb87fa85709378878032d3b3be0960f8b0fe3e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 02:55:59 +0100 Subject: [PATCH 45/56] update thiserror, ftr-site-config submodule and bump version --- Cargo.toml | 10 ++++++++-- article_scraper/Cargo.toml | 13 +++++++------ article_scraper/ftr-site-config | 2 +- article_scraper_cli/Cargo.toml | 11 ++++++----- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 03f5662..4f0884f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,12 @@ [workspace] - members = [ "article_scraper", "article_scraper_cli", -] \ No newline at end of file +] + +[workspace.package] +version = "2.1.1" +authors = ["Jan Lukas Gernert "] +edition = "2021" +license = "GPL-3.0-or-later" +repository = "https://gitlab.com/news-flash/article_scraper" \ No newline at end of file diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 6e4d003..100766c 100644 --- 
a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -1,17 +1,18 @@ [package] name = "article_scraper" -version = "2.1.0" -authors = ["Jan Lukas Gernert "] -edition = "2018" -license = "GPL-3.0-or-later" description = "Scrap article contents from the web. Powered by fivefilters full text feed configurations & mozilla readability." -repository = "https://gitlab.com/news-flash/article_scraper" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + readme = "../Readme.md" keywords = ["article", "scrape", "full-text", "readability"] exclude = ["resources/tests"] [dependencies] -thiserror = "1.0" +thiserror = "2.0" libxml = "0.3" reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index e9112fc..ccde390 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit e9112fc55800cae00ca70f4c38248a3ef4228861 +Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index b91abc5..22edcf1 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "article_scraper_cli" -version = "2.1.0" -authors = ["Jan Lukas Gernert "] -edition = "2018" -license = "GPL-3.0-or-later" description = "Cli to use the article_scraper lib" -repository = "https://gitlab.com/news-flash/article_scraper" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + [dependencies] article_scraper = { path = "../article_scraper/" } From 7c658a4ba80021c5ed108b795ee4aac17e02f321 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 02:58:41 +0100 Subject: 
[PATCH 46/56] resolver 2 --- Cargo.toml | 6 ++---- article_scraper/src/full_text_parser/mod.rs | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4f0884f..99695c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,6 @@ [workspace] -members = [ - "article_scraper", - "article_scraper_cli", -] +members = ["article_scraper", "article_scraper_cli"] +resolver = "2" [workspace.package] version = "2.1.1" diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index a45b36d..98e9478 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. 
From ca1cc47af1f7749ea7d10983e8e269afb0c57daf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:02:40 +0100 Subject: [PATCH 47/56] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a57616..7880e5d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.79 + image: rust:1.83 before_script: - rustup component add rustfmt - rustup component add clippy From 8cfcd6d9f3636a84336da4d882a9f6db5ce565b4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:05:55 +0100 Subject: [PATCH 48/56] clippy --- article_scraper/src/full_text_parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 98e9478..18fc682 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. 
From 9f56ed03b8e384378d92e01c5bc38bf80525760c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Mar 2025 13:42:31 +0100 Subject: [PATCH 49/56] article_scraper: don't specify reqwest features --- article_scraper/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 100766c..e852be9 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,17 +14,17 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = "0.12" tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" -regex = "1.10" +regex = "1.11" encoding_rs = "0.8" chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.4" -once_cell = "1.19" +rust-embed="8.6" +once_cell = "1.20" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" From 0978335d3b73e8049c602713b76ca5e3f038d9ca Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Mar 2025 17:18:03 +0100 Subject: [PATCH 50/56] [f] ignore url harvest error --- article_scraper/src/images/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index de0f48f..4be98b8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -219,7 +219,9 @@ impl ImageDownloader { let mut image_urls = Vec::new(); for node in node_vec { - image_urls.push(Self::harvest_image_urls_from_node(node)?); + if let Ok(url) = Self::harvest_image_urls_from_node(node) { + image_urls.push(url); + } } Ok(image_urls) From b92500fca276535b40d1956e71a1cca226d92437 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:45:41 +0200 Subject: [PATCH 51/56] better error messages --- article_scraper/src/error.rs | 6 +- article_scraper/src/full_text_parser/mod.rs | 96 
++++++++++----------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("Configerror {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 18fc682..4bb8a30 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -354,63 +354,61 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - 
log::debug!("Encoding extracted from headers: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + } } pub async fn download( From 9b374a28c717e57db7341fac4967b9e0114ad455 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:47:08 +0200 Subject: [PATCH 52/56] update ftr-site-config --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config 
b/article_scraper/ftr-site-config index ccde390..69aa220 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 +Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532 From f361392c04376736ce9ce2d338c7363959135878 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:34:33 +0200 Subject: [PATCH 53/56] check for empty http response and parsed documents without root element --- article_scraper/src/full_text_parser/mod.rs | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4bb8a30..ac77bf6 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -69,6 +69,11 @@ impl FullTextParser { let html = Self::get_body(response).await?; + if html.is_empty() { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { @@ -264,10 +269,17 @@ impl FullTextParser { } // parse html - Self::parse_html_string_patched(html.as_str()).map_err(|err| { + let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml - }) + })?; + + if document.get_root_element().is_none() { + log::error!("document without root"); + Err(FullTextParserError::Xml) + } else { + Ok(document) + } } /// FIXME: Here are some patched functions of libxml crate. 
@@ -368,6 +380,18 @@ impl FullTextParser { } let headers = response.headers().clone(); + + if headers + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|hv| hv.to_str().ok()) + .and_then(|str| str.parse::().ok()) + .map(|content_length| content_length == 0) + .unwrap_or(false) + { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + let bytes = response .bytes() .await @@ -420,7 +444,12 @@ impl FullTextParser { let headers = Util::generate_headers(config, global_config)?; let response = Self::get_response(url, client, headers).await?; let body = Self::get_body(response).await?; - Ok(body) + if body.is_empty() { + log::error!("Empty response body"); + Err(FullTextParserError::Http) + } else { + Ok(body) + } } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { From 06990acbc0d4cd55a44aeb20e95c1e6216074a16 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:38:46 +0200 Subject: [PATCH 54/56] fix libxml CI build --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7880e5d..159f07d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,12 +4,14 @@ stages: run-build: stage: build - image: rust:1.83 + image: rust:1.86 before_script: - rustup component add rustfmt - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - rustc --version && cargo --version + - echo $LIBXML2 - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - cargo build --release From 498008f6307c3faabfd6ac40e820871752b75039 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:51:30 +0200 Subject: [PATCH 55/56] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 99695c8..8569ad0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["article_scraper", 
"article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.1" +version = "2.1.2" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" From 9f349f8c6f2a88b277a8d1552d3d84781bdc9363 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 18:00:59 +0200 Subject: [PATCH 56/56] need reqwest streams --- article_scraper/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index e852be9..eeed67c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = "0.12" +reqwest = { version = "0.12", features = ["stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" regex = "1.11"
      - きょうの記念日
      + きょうの記念日

      元旦
      - きょうの誕生花
      + きょうの誕生花

      松(まつ)
      - きょうの世界昔話
      + きょうの世界昔話

      モンゴルの十二支話
      - きょうの日本民話
      + きょうの日本民話

      仕事の取替えっこ
      +
      -
      +