From 3e5654e1973b1a2c3e7e50bf13bac68b44ab364c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 01:02:52 +0200 Subject: [PATCH 01/23] fix tests --- article_scraper/src/full_text_parser/mod.rs | 3 ++- article_scraper/src/util.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index c7d8d9e..a25df4e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -288,7 +288,7 @@ impl FullTextParser { } } - fn parse_html_string_patched( + pub(crate) fn parse_html_string_patched( input: &str, parser: &Parser, ) -> Result { @@ -691,6 +691,7 @@ impl FullTextParser { .ok() .and_then(|()| node.set_property("width", "100%").ok()) .and_then(|()| node.set_property("height", "400").ok()) + .and_then(|()| node.remove_attribute("aspect-ratio").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index e605015..fa86b5b 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1259,14 +1259,14 @@ impl Util { #[cfg(test)] mod tests { use libxml::parser::Parser; - + use crate::FullTextParser; use super::Util; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); let parser = Parser::default_html(); - let document = parser.parse_string(source).unwrap(); + let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let div = body.get_first_child().unwrap(); From 11e9261bf28a130262ca7e5b1254eee50e3adc46 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 01:03:00 +0200 Subject: [PATCH 02/23] fmt --- article_scraper/src/util.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index fa86b5b..2d24cdc 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1258,9 +1258,9 @@ impl Util { #[cfg(test)] mod tests { - use libxml::parser::Parser; - use crate::FullTextParser; use super::Util; + use crate::FullTextParser; + use libxml::parser::Parser; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); From 06018d98d4c743fda58eff1e41027476b5cf16ff Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 8 Jun 2024 23:18:00 +0200 Subject: [PATCH 03/23] replace emoji images --- article_scraper/Cargo.toml | 3 +- article_scraper/src/full_text_parser/mod.rs | 1 + article_scraper/src/util.rs | 57 +++++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 8d92a79..6e4d003 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -22,10 +22,11 @@ chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.3" +rust-embed="8.4" once_cell = "1.19" escaper = "0.1" futures = "0.3" +unic-emoji-char = "0.9" [dev-dependencies] env_logger = "0.11" diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index a25df4e..0e63850 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -966,6 +966,7 @@ impl FullTextParser { if let Some(root) = document.get_root_element() { Util::replace_brs(&root, document); + Util::replace_emoji_images(&root, document); } Self::fix_urls(context, url, document); diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 2d24cdc..429378c 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -682,6 +682,31 @@ impl Util { } } + pub fn replace_emoji_images(root: &Node, document: &Document) { + let img_nodes = Util::get_elements_by_tag_name(root, "img"); + + for img_node in img_nodes { + if let Some(img_alt) = img_node.get_attribute("alt") { + let mut alt_chars = img_alt.chars(); + let first_char = alt_chars.next(); + let second_char = alt_chars.next(); + + if let (Some(char), None) = (first_char, second_char) { + if unic_emoji_char::is_emoji(char) { + if let Some(mut parent) = img_node.get_parent() { + // if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) { + // _ = parent.replace_child_node(emoji_text_node, img_node); + // } + let emoji_text_node = + Node::new_text(&char.to_string(), document).unwrap(); + _ = parent.replace_child_node(emoji_text_node, img_node); + } + } + } + } + } + } + // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. pub fn clean_conditionally(root: &mut Node, tag: &str) { @@ -1303,4 +1328,36 @@ mod tests { "#; replace_brs(source, source.trim()) } + + fn replace_emojis(source: &str, expected: &str) { + libxml::tree::node::set_node_rc_guard(10); + + let parser = Parser::default_html(); + let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let root = document.get_root_element().unwrap(); + let body = root.get_first_child().unwrap(); + let p = body.get_first_child().unwrap(); + + Util::replace_emoji_images(&root, &document); + + let result = document.node_to_string(&p); + + assert_eq!(expected, result); + } + + #[test] + fn replace_emojis_1() { + replace_emojis( + "

Let’s see if I did a better job of it this time by telling him he was using Arch wrong. \"😀\"/

", + "

Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

", + ) + } + + #[test] + fn replace_emojis_2() { + replace_emojis( + "

\"😀\"/ Abc

", + "

😀 Abc

", + ) + } } From e01c8e9d34a28d646d26aad6e2a7a9dc093056c3 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 20:40:19 +0200 Subject: [PATCH 04/23] negative score for thumbnails with emoji alt --- article_scraper/src/full_text_parser/mod.rs | 1 + article_scraper/src/full_text_parser/tests.rs | 20 +++++++++ article_scraper/src/util.rs | 42 ++++++++++++------- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 0e63850..2857f73 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -533,6 +533,7 @@ impl FullTextParser { let score = score + Util::score_by_sibling(&img_node); let score = score + Util::score_by_dimensions(&img_node); let score = score + Util::score_by_position(len, index); + let score = score + Util::score_by_alt(&img_node); scores.insert(src, score); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 0f0370f..36ae0d0 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,3 +278,23 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } + +#[test] +fn extract_thumbnail_no_emoji() { + let html = r#" +

I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

+

Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

+

I hope you enjoy it!

+
+ +
+

And here’s the link I mention at the end: https://kde.org/community/donations 🙂

+ "#; + + let parser = Parser::default_html(); + let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx); + assert_eq!(thumb, None) +} diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 429378c..cbd5370 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -687,26 +687,28 @@ impl Util { for img_node in img_nodes { if let Some(img_alt) = img_node.get_attribute("alt") { - let mut alt_chars = img_alt.chars(); - let first_char = alt_chars.next(); - let second_char = alt_chars.next(); - - if let (Some(char), None) = (first_char, second_char) { - if unic_emoji_char::is_emoji(char) { - if let Some(mut parent) = img_node.get_parent() { - // if let Ok(emoji_text_node) = parent.add_text_child(None, "emoji", &char.to_string()) { - // _ = parent.replace_child_node(emoji_text_node, img_node); - // } - let emoji_text_node = - Node::new_text(&char.to_string(), document).unwrap(); - _ = parent.replace_child_node(emoji_text_node, img_node); - } + if Self::is_emoji(&img_alt) { + if let Some(mut parent) = img_node.get_parent() { + let emoji_text_node = Node::new_text(&img_alt, document).unwrap(); + _ = parent.replace_child_node(emoji_text_node, img_node); } } } } } + pub fn is_emoji(text: &str) -> bool { + let mut alt_chars = text.chars(); + let first_char = alt_chars.next(); + let second_char = alt_chars.next(); + + if let (Some(char), None) = (first_char, second_char) { + unic_emoji_char::is_emoji(char) + } else { + false + } + } + // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. pub fn clean_conditionally(root: &mut Node, tag: &str) { @@ -1248,6 +1250,18 @@ impl Util { ((len as f32 / 2.0) - index as f32) as i32 } + pub fn score_by_alt(node: &Node) -> i32 { + if let Some(alt) = node.get_attribute("alt") { + if Self::is_emoji(&alt) { + -100 + } else { + 0 + } + } else { + 0 + } + } + pub fn get_content_length(response: &Response) -> Result { let status_code = response.status(); From df8ebcbb3552c2533fe47979e48378916929cd7f Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 22:06:48 +0200 Subject: [PATCH 05/23] treat iframes as valid emtry tags --- .../readability/embedded-videos/expected.html | 10 +-- .../tests/readability/engadget/expected.html | 4 +- .../readability/hukumusume/expected.html | 6 +- .../tests/readability/lemonde-1/expected.html | 4 +- .../readability/liberation-1/expected.html | 4 +- .../tests/readability/msn/expected.html | 2 +- .../tests/readability/qq/expected.html | 2 +- .../tests/readability/videos-1/expected.html | 84 ++++++++++++++----- .../tests/readability/videos-2/expected.html | 28 +++++-- .../tests/readability/yahoo-1/expected.html | 16 ++-- article_scraper/src/clean.rs | 19 +++++ article_scraper/src/constants.rs | 8 ++ article_scraper/src/full_text_parser/mod.rs | 6 +- .../src/full_text_parser/readability/tests.rs | 4 + article_scraper/src/full_text_parser/tests.rs | 20 ----- 15 files changed, 145 insertions(+), 72 deletions(-) diff --git a/article_scraper/resources/tests/readability/embedded-videos/expected.html b/article_scraper/resources/tests/readability/embedded-videos/expected.html index 690b431..6db4190 100644 --- a/article_scraper/resources/tests/readability/embedded-videos/expected.html +++ b/article_scraper/resources/tests/readability/embedded-videos/expected.html @@ -8,13 +8,13 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

At root

- - - + + +

In a paragraph

- +

In a div

- +

Foo

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, diff --git a/article_scraper/resources/tests/readability/engadget/expected.html b/article_scraper/resources/tests/readability/engadget/expected.html index a112ae2..b9b0c8e 100644 --- a/article_scraper/resources/tests/readability/engadget/expected.html +++ b/article_scraper/resources/tests/readability/engadget/expected.html @@ -250,7 +250,9 @@ capable HDR 10 standard. That makes sense since it's more widely supported, but it would have been nice to see Dolby's, too.

- +

+ +

And speaking of Dolby technology, Microsoft is also highlighting Atmos support on the One X, just like it did with the One S. The company's app lets you diff --git a/article_scraper/resources/tests/readability/hukumusume/expected.html b/article_scraper/resources/tests/readability/hukumusume/expected.html index 5ecf93d..fbc27b0 100644 --- a/article_scraper/resources/tests/readability/hukumusume/expected.html +++ b/article_scraper/resources/tests/readability/hukumusume/expected.html @@ -6,7 +6,7 @@ - + @@ -80,7 +80,7 @@ -

- +

+ +

Les députés ont, sans surprise, adopté à une large majorité (438 contre 86 et 42 abstentions) le projet de loi sur le renseignement défendu par le gouvernement lors d’un vote solennel, mardi 5 mai. Il sera désormais examiné par le Sénat, puis le Conseil constitutionnel, prochainement saisi par 75 députés. Dans un souci d'apaisement, François Hollande avait annoncé par avance qu'il saisirait les Sages.

Revivez le direct du vote à l’Assemblée avec vos questions.

Ont voté contre : 10 députés socialistes (sur 288), 35 UMP (sur 198), 11 écologistes (sur 18), 11 UDI (sur 30), 12 députés Front de gauche (sur 15) et 7 non-inscrits (sur 9). Le détail est disponible sur le site de l'Assemblée nationale.

diff --git a/article_scraper/resources/tests/readability/liberation-1/expected.html b/article_scraper/resources/tests/readability/liberation-1/expected.html index 4b911de..2806896 100644 --- a/article_scraper/resources/tests/readability/liberation-1/expected.html +++ b/article_scraper/resources/tests/readability/liberation-1/expected.html @@ -8,7 +8,9 @@

L’appareil, mis à disposition par Airbus, était arrivé à Katmandou mercredi matin avec 55 personnels de santé et humanitaires, ainsi que 25 tonnes de matériel (abris, médicaments, aide alimentaire). Un deuxième avion dépêché par Paris, qui était immobilisé aux Emirats depuis mardi avec 20 tonnes de matériel, est arrivé jeudi à Katmandou, dont le petit aéroport est engorgé par le trafic et l’afflux d’aide humanitaire. Il devait lui aussi ramener des Français, «les plus éprouvés» par la catastrophe et les «plus vulnérables (blessés, familles avec enfants)», selon le ministère des Affaires étrangères.

2 209 Français ont été localisés sains et saufs tandis que 393 n’ont pas encore pu être joints, selon le Quai d’Orsay. Environ 400 Français ont demandé à être rapatriés dans les vols mis en place par la France.

Le séisme a fait près de 5 500 morts et touche huit des 28 millions d’habitants du Népal. Des dizaines de milliers de personnes sont sans abri.

- +

+ +

\ No newline at end of file diff --git a/article_scraper/resources/tests/readability/msn/expected.html b/article_scraper/resources/tests/readability/msn/expected.html index 157f0bd..597e831 100644 --- a/article_scraper/resources/tests/readability/msn/expected.html +++ b/article_scraper/resources/tests/readability/msn/expected.html @@ -16,7 +16,7 @@

The name and basic idea might sound like one of those endless score attack games like "Temple Run," but that's not the case. "Super Mario Run" is divided into hand-crafted levels with a clear end-point like any other Mario game, meaning you're essentially getting the Mario experience for $10 without needing to control his movement.

$10 might seem like a bit much compared to the $0 people pay for most mobile games, but it's possible the game has $10 worth of levels to play in it. It's also not iPhone exclusive, but the Android version will launch at a later, currently unknown date.

To see "Super Mario Run" in action, check out the footage below:

- +
\ No newline at end of file diff --git a/article_scraper/resources/tests/readability/qq/expected.html b/article_scraper/resources/tests/readability/qq/expected.html index baad181..20914e1 100644 --- a/article_scraper/resources/tests/readability/qq/expected.html +++ b/article_scraper/resources/tests/readability/qq/expected.html @@ -28,7 +28,7 @@

转播到腾讯微博

- + diff --git a/article_scraper/resources/tests/readability/videos-1/expected.html b/article_scraper/resources/tests/readability/videos-1/expected.html index e469e3d..1bd350e 100644 --- a/article_scraper/resources/tests/readability/videos-1/expected.html +++ b/article_scraper/resources/tests/readability/videos-1/expected.html @@ -11,7 +11,9 @@

21) Star Wars: The Last Jedi

- +
+ +

I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic.

@@ -21,7 +23,9 @@

20) Faces Places

- +
+ +

The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center.

@@ -32,7 +36,9 @@

19) Ingrid Goes West

- +
+ +

Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison.

@@ -42,7 +48,9 @@

18) Lady Macbeth

- +
+ +

Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end.

@@ -52,7 +60,9 @@

17) BPM (Beats Per Minute)

- +
+ +

BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating.

@@ -62,7 +72,9 @@

16) The Big Sick

- +
+ +

Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love.

@@ -72,7 +84,9 @@

15) Mother!

- +
+ +

There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017.

@@ -82,7 +96,9 @@

14) A Ghost Story

- +
+ +

Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place.

@@ -92,7 +108,9 @@

13) The Square

- +
+ +
@@ -102,7 +120,9 @@

12) Dunkirk

- +
+ +

Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies.

@@ -112,7 +132,9 @@

11) Rat Film

- +
+ +

Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up.

@@ -122,7 +144,9 @@

10) A Quiet Passion

- +
+ +

A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet.

@@ -132,7 +156,9 @@

9) Columbus

- +
+ +

Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us.

@@ -142,7 +168,9 @@

8) The Florida Project

- +
+ +

Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations.

@@ -152,7 +180,9 @@

7) Call Me by Your Name

- +
+ +

Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul.

@@ -162,7 +192,9 @@

6) Personal Shopper

- +
+ +

In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.)

@@ -172,7 +204,9 @@

5) Princess Cyd

- +
+ +

Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle.

@@ -182,7 +216,9 @@

4) Get Out

- +
+ +

Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor.

@@ -192,7 +228,9 @@

3) The Work

- +
+ +

The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go.

@@ -202,7 +240,9 @@

2) Ex Libris

- +
+ +

Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better.

@@ -212,7 +252,9 @@

1) Lady Bird

- +
+ +

Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?”

diff --git a/article_scraper/resources/tests/readability/videos-2/expected.html b/article_scraper/resources/tests/readability/videos-2/expected.html index f2fe95f..f5f2c68 100644 --- a/article_scraper/resources/tests/readability/videos-2/expected.html +++ b/article_scraper/resources/tests/readability/videos-2/expected.html @@ -8,7 +8,9 @@

Vape Wave (documentaire, 1h28, Planète+)

- +

+ +

Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée.

@@ -21,7 +23,9 @@

Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative)

- +

+ +

Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.)

@@ -31,7 +35,9 @@

The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix)

- +

+ +

Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente.

@@ -44,7 +50,9 @@

Alphonse President (série, 10x26, OCS Max)

- +

+ +

Un temps baptisée French Touch, la série Alphonse Président est le dernier né des programmes originaux made in OCS. On savait les budgets de la chaîne bien moins généreux que ceux de Canal+ (voire que ceux de France 3 Limousin), et cette série le prouve à nouveau régulièrement, notamment lors d’une scène de conférence de presse alternant plans larges d’une authentique conf' à l’Elysée période François Hollande et plans serrés d’acteurs filmés dans un château des Pays de la Loire où a eu lieu le tournage. Le principal atout (et quel atout) de cette série écrite et réalisée par Nicolas Castro (Des lendemains qui chantent, 2014) réside dans son interprète principal, Michel Vuillermoz.

@@ -57,7 +65,9 @@

Jim & Andy (documentaire, 1h33, Netflix) 

- +

+ +

A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman.

@@ -70,7 +80,9 @@

Braguino (documentaire, 50min, Arte)

- +

+ +

La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer.

@@ -83,7 +95,9 @@

6 Days (film, 1h34, Netflix)

- +

+ +

Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade.

diff --git a/article_scraper/resources/tests/readability/yahoo-1/expected.html b/article_scraper/resources/tests/readability/yahoo-1/expected.html index 9d547d4..b5d5602 100644 --- a/article_scraper/resources/tests/readability/yahoo-1/expected.html +++ b/article_scraper/resources/tests/readability/yahoo-1/expected.html @@ -11,28 +11,28 @@

Virtual reality has officially reached the consoles. And it’s pretty good! Sony’s PlayStation VR is extremely comfortable and reasonably priced, and while it’s lacking killer apps, it’s loaded with lots of interesting ones.

But which ones should you buy? I’ve played just about every launch game, and while some are worth your time, others you might want to skip. To help you decide what’s what, I’ve put together this list of the eight PSVR games worth considering.

“Rez Infinite” ($30)

- +

Beloved cult hit “Rez” gets the VR treatment to help launch the PSVR, and the results are terrific. It includes a fully remastered take on the original “Rez” – you zoom through a Matrix-like computer system, shooting down enemies to the steady beat of thumping electronica – but the VR setting makes it incredibly immersive. It gets better the more you play it, too; unlock the amazing Area X mode and you’ll find yourself flying, shooting and bobbing your head to some of the trippiest visuals yet seen in VR.

“Thumper” ($20)

- +

What would happen if Tron, the board game Simon, a Clown beetle, Cthulhu and a noise band met in VR? Chaos, for sure, and also “Thumper.” Called a “violent rhythm game” by its creators, “Thumper” is, well, a violent rhythm game that’s also a gorgeous, unsettling and totally captivating assault on the senses. With simple controls and a straightforward premise – click the X button and the analog stick in time with the music as you barrel down a neon highway — it’s one of the rare games that works equally well both in and out of VR. But since you have PSVR, play it there. It’s marvelous.

“Until Dawn: Rush of Blood” ($20)

- +

Cheeky horror game “Until Dawn” was a breakout hit for the PS4 last year, channeling the classic “dumb teens in the woods” horror trope into an effective interactive drama. Well, forget all that if you fire up “Rush of Blood,” because this one sticks you front and center on a rollercoaster ride from Hell. Literally. You ride through a dimly-lit carnival of terror, dual-wielding pistols as you take down targets, hideous pig monsters and, naturally, maniac clowns. Be warned: If the bad guys don’t get you, the jump scares will.

“Headmaster” ($20)

- +

Soccer meets “Portal” in the weird (and weirdly fun) “Headmaster,” a game about heading soccer balls into nets, targets and a variety of other things while stuck in some diabolical training facility. While at first it seems a little basic, increasingly challenging shots and a consistently entertaining narrative keep it from running off the pitch. Funny, ridiculous and as easy as literally moving your head back and forth, it’s a pleasant PSVR surprise.

“RIGS: Mechanized Combat League” ($50)

- +

Giant mechs + sports? That’s the gist of this robotic blast-a-thon, which pits two teams of three against one another in gorgeous, explosive and downright fun VR combat. At its best, “RIGS” marries the thrill of fast-paced competitive shooters with the insanity of piloting a giant mech in VR. It can, however, be one of the barfier PSVR games. So pack your Dramamine, you’re going to have to ease yourself into this one.

“Batman Arkham VR” ($20)

- +

“I’m Batman,” you will say. And you’ll actually be right this time, because you are Batman in this detective yarn, and you know this because you actually grab the famous cowl and mask, stick it on your head, and stare into the mirrored reflection of Rocksteady Games’ impressive Dark Knight character model. It lacks the action of its fellow “Arkham” games and runs disappointingly short, but it’s a high-quality experience that really shows off how powerfully immersive VR can be.

“Job Simulator” ($30)

- +

There are a number of good VR ports in the PSVR launch lineup, but the HTC Vive launch game “Job Simulator” might be the best. Your task? Lots of tasks, actually, from cooking food to fixing cars to working in an office, all for robots, because did I mention you were in the future? Infinitely charming and surprisingly challenging, it’s a great showpiece for VR.

“Eve Valkyrie” ($60)

- +

Already a hit on the Oculus Rift, this space dogfighting game was one of the first to really show off how VR can turn a traditional game experience into something special. It’s pricey and not quite as hi-res as the Rift version, but “Eve Valkyrie” does an admirable job filling the void left since “Battlestar Galactica” ended. Too bad there aren’t any Cylons in it (or are there?)

More games news:

    diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 7ab49e8..140ff5b 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -160,4 +160,23 @@ mod tests { Some("https://cdn.finshots.app/images/2023/03/Design-8-Amul.jpg") ) } + + #[test] + fn pointieststick() { + let html = r#" +

    I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

    +

    Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

    +

    I hope you enjoy it!

    +
    + +
    +

    And here’s the link I mention at the end: https://kde.org/community/donations 🙂

    + "#; + + let url = Url::parse("https://pointieststick.com").unwrap(); + let res = clean_html_fragment(html, &url).unwrap(); + + assert_eq!(res.thumbnail, None); + assert!(res.html.contains("iframe")); + } } diff --git a/article_scraper/src/constants.rs b/article_scraper/src/constants.rs index 79823f8..adf6df2 100644 --- a/article_scraper/src/constants.rs +++ b/article_scraper/src/constants.rs @@ -141,6 +141,14 @@ pub static DIV_TO_P_ELEMS: Lazy> = Lazy::new(|| { pub static VALID_EMPTY_TAGS: Lazy> = Lazy::new(|| { HashSet::from([ "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK", + "IFRAME", + ]) +}); + +pub static VALID_SELF_CLOSING_TAGS: Lazy> = Lazy::new(|| { + HashSet::from([ + "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "INPUT", "LINK", "META", "PARAM", + "SOURCE", "TRACK", "WBR", ]) }); diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 2857f73..fece84e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -1178,15 +1178,15 @@ impl FullTextParser { } } - fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { + pub(crate) fn prevent_self_closing_tags(context: &Context) -> Result<(), FullTextParserError> { // search document for empty tags and add a empty text node as child // this prevents libxml from self closing non void elements such as iframe let xpath = "//*[not(node())]"; let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - let name = node.get_name().to_lowercase(); - if name == "meta" || name == "img" || name == "br" { + let name = node.get_name().to_uppercase(); + if constants::VALID_SELF_CLOSING_TAGS.contains(name.as_str()) { continue; } diff --git a/article_scraper/src/full_text_parser/readability/tests.rs b/article_scraper/src/full_text_parser/readability/tests.rs index 32667c7..562313f 100644 --- a/article_scraper/src/full_text_parser/readability/tests.rs +++ b/article_scraper/src/full_text_parser/readability/tests.rs @@ -39,6 +39,10 @@ async fn run_test(name: &str) { metadata::extract(&xpath_ctx, None, None, &mut article); super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap(); + + let article_ctx = crate::FullTextParser::get_xpath_ctx(&article_document).unwrap(); + + crate::FullTextParser::prevent_self_closing_tags(&article_ctx).unwrap(); crate::FullTextParser::post_process_document(&article_document).unwrap(); let html = Util::serialize_node(&article_document, &root); diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 36ae0d0..0f0370f 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,23 +278,3 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } - -#[test] -fn extract_thumbnail_no_emoji() { - let html = r#" -

    I recently went on Brodie Robertson’s Tech Over Tea channel for a second time. I guess I didn’t succeed at pissing him off enough on the first go-around, because he invited me back! Let’s see if I did a better job of it this time by telling him he was using Arch wrong. 😀

    -

    Anyway, Brodie was a fantastic host, and we talked about a number of topics such as KDE’s position in the world, institutional continuity, fundraising and financial stability, the difficulty of reporting and triaging bug, the challenges of packaging software, and windows that block WiFi signals.

    -

    I hope you enjoy it!

    -
    - -
    -

    And here’s the link I mention at the end: https://kde.org/community/donations 🙂

    - "#; - - let parser = Parser::default_html(); - let doc = FullTextParser::parse_html_string_patched(html, &parser).unwrap(); - let ctx = Context::new(&doc).unwrap(); - - let thumb = FullTextParser::check_for_thumbnail(&ctx); - assert_eq!(thumb, None) -} From f4e4e64b9e4c250b3b2f9319aa54797d7dd555dc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Jun 2024 22:27:10 +0200 Subject: [PATCH 06/23] absolute default size for embedded youtube videos --- .../resources/tests/ftr/youtube/expected.html | 2 +- .../readability/embedded-videos/expected.html | 4 +- .../tests/readability/engadget/expected.html | 2 +- .../tests/readability/msn/expected.html | 2 +- .../tests/readability/videos-1/expected.html | 42 +++++++++---------- .../tests/readability/videos-2/expected.html | 12 +++--- .../tests/readability/yahoo-1/expected.html | 16 +++---- article_scraper/src/clean.rs | 1 + article_scraper/src/full_text_parser/mod.rs | 6 +-- 9 files changed, 44 insertions(+), 43 deletions(-) diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html index 570905a..e05d2c2 100644 --- a/article_scraper/resources/tests/ftr/youtube/expected.html +++ b/article_scraper/resources/tests/ftr/youtube/expected.html @@ -1 +1 @@ -
    \ No newline at end of file +
    \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/embedded-videos/expected.html b/article_scraper/resources/tests/readability/embedded-videos/expected.html index 6db4190..c520e7f 100644 --- a/article_scraper/resources/tests/readability/embedded-videos/expected.html +++ b/article_scraper/resources/tests/readability/embedded-videos/expected.html @@ -8,13 +8,13 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    At root

    - +

    In a paragraph

    In a div

    -
    +

    Foo

    Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, diff --git a/article_scraper/resources/tests/readability/engadget/expected.html b/article_scraper/resources/tests/readability/engadget/expected.html index b9b0c8e..ed29aa5 100644 --- a/article_scraper/resources/tests/readability/engadget/expected.html +++ b/article_scraper/resources/tests/readability/engadget/expected.html @@ -251,7 +251,7 @@ more widely supported, but it would have been nice to see Dolby's, too.

    - +

    And speaking of Dolby technology, Microsoft is also highlighting Atmos support on the One X, just like diff --git a/article_scraper/resources/tests/readability/msn/expected.html b/article_scraper/resources/tests/readability/msn/expected.html index 597e831..38ea173 100644 --- a/article_scraper/resources/tests/readability/msn/expected.html +++ b/article_scraper/resources/tests/readability/msn/expected.html @@ -16,7 +16,7 @@

    The name and basic idea might sound like one of those endless score attack games like "Temple Run," but that's not the case. "Super Mario Run" is divided into hand-crafted levels with a clear end-point like any other Mario game, meaning you're essentially getting the Mario experience for $10 without needing to control his movement.

    $10 might seem like a bit much compared to the $0 people pay for most mobile games, but it's possible the game has $10 worth of levels to play in it. It's also not iPhone exclusive, but the Android version will launch at a later, currently unknown date.

    To see "Super Mario Run" in action, check out the footage below:

    -
    +
    \ No newline at end of file diff --git a/article_scraper/resources/tests/readability/videos-1/expected.html b/article_scraper/resources/tests/readability/videos-1/expected.html index 1bd350e..0c6fd6f 100644 --- a/article_scraper/resources/tests/readability/videos-1/expected.html +++ b/article_scraper/resources/tests/readability/videos-1/expected.html @@ -12,7 +12,7 @@ 21) Star Wars: The Last Jedi
    - +

    I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic. @@ -24,7 +24,7 @@ 20) Faces Places

    - +

    The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center. @@ -37,7 +37,7 @@ 19) Ingrid Goes West

    - +

    Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison. @@ -49,7 +49,7 @@ 18) Lady Macbeth

    - +

    Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end. @@ -61,7 +61,7 @@ 17) BPM (Beats Per Minute)

    - +

    BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating. @@ -73,7 +73,7 @@ 16) The Big Sick

    - +

    Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love. @@ -85,7 +85,7 @@ 15) Mother!

    - +

    There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017. @@ -97,7 +97,7 @@ 14) A Ghost Story

    - +

    Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place. @@ -109,7 +109,7 @@ 13) The Square

    - +
    - +

    Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies. @@ -133,7 +133,7 @@ 11) Rat Film

    - +

    Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up. @@ -145,7 +145,7 @@ 10) A Quiet Passion

    - +

    A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet. @@ -157,7 +157,7 @@ 9) Columbus

    - +

    Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us. @@ -169,7 +169,7 @@ 8) The Florida Project

    - +

    Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations. @@ -181,7 +181,7 @@ 7) Call Me by Your Name

    - +

    Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul. @@ -193,7 +193,7 @@ 6) Personal Shopper

    - +

    In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.) @@ -205,7 +205,7 @@ 5) Princess Cyd

    - +

    Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle. @@ -217,7 +217,7 @@ 4) Get Out

    - +

    Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor. @@ -229,7 +229,7 @@ 3) The Work

    - +

    The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go. @@ -241,7 +241,7 @@ 2) Ex Libris

    - +

    Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better. @@ -253,7 +253,7 @@ 1) Lady Bird

    - +

    Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?” diff --git a/article_scraper/resources/tests/readability/videos-2/expected.html b/article_scraper/resources/tests/readability/videos-2/expected.html index f5f2c68..abdf514 100644 --- a/article_scraper/resources/tests/readability/videos-2/expected.html +++ b/article_scraper/resources/tests/readability/videos-2/expected.html @@ -9,7 +9,7 @@ Vape Wave (documentaire, 1h28, Planète+)

    - +

    Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée. @@ -24,7 +24,7 @@ Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative)

    - +

    Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.) @@ -36,7 +36,7 @@ The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix)

    - +

    Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente. @@ -66,7 +66,7 @@ Jim & Andy (documentaire, 1h33, Netflix) 

    - +

    A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman. @@ -81,7 +81,7 @@ Braguino (documentaire, 50min, Arte)

    - +

    La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer. @@ -96,7 +96,7 @@ 6 Days (film, 1h34, Netflix)

    - +

    Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade. diff --git a/article_scraper/resources/tests/readability/yahoo-1/expected.html b/article_scraper/resources/tests/readability/yahoo-1/expected.html index b5d5602..a2f2954 100644 --- a/article_scraper/resources/tests/readability/yahoo-1/expected.html +++ b/article_scraper/resources/tests/readability/yahoo-1/expected.html @@ -11,28 +11,28 @@

    Virtual reality has officially reached the consoles. And it’s pretty good! Sony’s PlayStation VR is extremely comfortable and reasonably priced, and while it’s lacking killer apps, it’s loaded with lots of interesting ones.

    But which ones should you buy? I’ve played just about every launch game, and while some are worth your time, others you might want to skip. To help you decide what’s what, I’ve put together this list of the eight PSVR games worth considering.

    “Rez Infinite” ($30)

    -
    +

    Beloved cult hit “Rez” gets the VR treatment to help launch the PSVR, and the results are terrific. It includes a fully remastered take on the original “Rez” – you zoom through a Matrix-like computer system, shooting down enemies to the steady beat of thumping electronica – but the VR setting makes it incredibly immersive. It gets better the more you play it, too; unlock the amazing Area X mode and you’ll find yourself flying, shooting and bobbing your head to some of the trippiest visuals yet seen in VR.

    “Thumper” ($20)

    -
    +

    What would happen if Tron, the board game Simon, a Clown beetle, Cthulhu and a noise band met in VR? Chaos, for sure, and also “Thumper.” Called a “violent rhythm game” by its creators, “Thumper” is, well, a violent rhythm game that’s also a gorgeous, unsettling and totally captivating assault on the senses. With simple controls and a straightforward premise – click the X button and the analog stick in time with the music as you barrel down a neon highway — it’s one of the rare games that works equally well both in and out of VR. But since you have PSVR, play it there. It’s marvelous.

    “Until Dawn: Rush of Blood” ($20)

    -
    +

    Cheeky horror game “Until Dawn” was a breakout hit for the PS4 last year, channeling the classic “dumb teens in the woods” horror trope into an effective interactive drama. Well, forget all that if you fire up “Rush of Blood,” because this one sticks you front and center on a rollercoaster ride from Hell. Literally. You ride through a dimly-lit carnival of terror, dual-wielding pistols as you take down targets, hideous pig monsters and, naturally, maniac clowns. Be warned: If the bad guys don’t get you, the jump scares will.

    “Headmaster” ($20)

    -
    +

    Soccer meets “Portal” in the weird (and weirdly fun) “Headmaster,” a game about heading soccer balls into nets, targets and a variety of other things while stuck in some diabolical training facility. While at first it seems a little basic, increasingly challenging shots and a consistently entertaining narrative keep it from running off the pitch. Funny, ridiculous and as easy as literally moving your head back and forth, it’s a pleasant PSVR surprise.

    “RIGS: Mechanized Combat League” ($50)

    -
    +

    Giant mechs + sports? That’s the gist of this robotic blast-a-thon, which pits two teams of three against one another in gorgeous, explosive and downright fun VR combat. At its best, “RIGS” marries the thrill of fast-paced competitive shooters with the insanity of piloting a giant mech in VR. It can, however, be one of the barfier PSVR games. So pack your Dramamine, you’re going to have to ease yourself into this one.

    “Batman Arkham VR” ($20)

    -
    +

    “I’m Batman,” you will say. And you’ll actually be right this time, because you are Batman in this detective yarn, and you know this because you actually grab the famous cowl and mask, stick it on your head, and stare into the mirrored reflection of Rocksteady Games’ impressive Dark Knight character model. It lacks the action of its fellow “Arkham” games and runs disappointingly short, but it’s a high-quality experience that really shows off how powerfully immersive VR can be.

    “Job Simulator” ($30)

    -
    +

    There are a number of good VR ports in the PSVR launch lineup, but the HTC Vive launch game “Job Simulator” might be the best. Your task? Lots of tasks, actually, from cooking food to fixing cars to working in an office, all for robots, because did I mention you were in the future? Infinitely charming and surprisingly challenging, it’s a great showpiece for VR.

    “Eve Valkyrie” ($60)

    -
    +

    Already a hit on the Oculus Rift, this space dogfighting game was one of the first to really show off how VR can turn a traditional game experience into something special. It’s pricey and not quite as hi-res as the Rift version, but “Eve Valkyrie” does an admirable job filling the void left since “Battlestar Galactica” ended. Too bad there aren’t any Cylons in it (or are there?)

    More games news:

      diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index 140ff5b..e44f421 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -47,6 +47,7 @@ pub fn clean_html_fragment( if let Some(mut root) = document.get_root_element() { FullTextParser::post_process_page(&mut root)?; } + FullTextParser::prevent_self_closing_tags(&xpath_ctx)?; FullTextParser::post_process_document(&document)?; let content_node = if let Some(root) = document.get_root_element() { diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index fece84e..16868b2 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -690,9 +690,9 @@ impl FullTextParser { let success = video_wrapper .set_property("class", "videoWrapper") .ok() - .and_then(|()| node.set_property("width", "100%").ok()) - .and_then(|()| node.set_property("height", "400").ok()) - .and_then(|()| node.remove_attribute("aspect-ratio").ok()) + .and_then(|()| node.set_property("width", "480").ok()) + .and_then(|()| node.set_property("height", "360").ok()) + .and_then(|()| node.set_property("aspect-ratio", "auto").ok()) .ok_or_else(|| { node.unlink(); video_wrapper.add_child(&mut node) From c16e11fdda7d7442f91baa18134e68271240a282 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 6 Jul 2024 23:38:43 +0200 Subject: [PATCH 07/23] init parser according to (https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety) --- article_scraper/src/full_text_parser/mod.rs | 13 ++++++++----- article_scraper/src/full_text_parser/tests.rs | 6 +++--- article_scraper/src/images/image_data.rs | 1 - article_scraper/src/images/mod.rs | 11 +++-------- article_scraper/src/images/request.rs | 1 - article_scraper/src/util.rs | 7 ++----- 6 files changed, 16 insertions(+), 23 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 16868b2..37a5f32 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -264,8 +264,7 @@ impl FullTextParser { } // parse html - let parser = Parser::default_html(); - Self::parse_html_string_patched(html.as_str(), &parser).map_err(|err| { + Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml }) @@ -278,7 +277,7 @@ impl FullTextParser { /// - /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { - if cfg!(target_pointer_width = "16") || (value < i32::max_value() as usize) { + if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. // Or, if the value can be safely represented as a 32-bit signed integer. Ok(value as i32) @@ -290,8 +289,12 @@ impl FullTextParser { pub(crate) fn parse_html_string_patched( input: &str, - parser: &Parser, ) -> Result { + unsafe { + // https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Thread-safety + libxml::bindings::xmlInitParser(); + } + let parser = Parser::default_html(); let input_bytes: &[u8] = input.as_ref(); let input_ptr = input_bytes.as_ptr() as *const std::os::raw::c_char; let input_len = Self::try_usize_to_i32(input_bytes.len())?; @@ -488,7 +491,7 @@ impl FullTextParser { } pub fn thumbnail_from_html(html: &str) -> Option { - if let Ok(doc) = Parser::default_html().parse_string(html) { + if let Ok(doc) = Self::parse_html_string_patched(html) { if let Ok(ctx) = Self::get_xpath_ctx(&doc) { return Self::check_for_thumbnail(&ctx); } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 0f0370f..99a5235 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -1,5 +1,5 @@ use super::{config::ConfigEntry, FullTextParser}; -use libxml::{parser::Parser, tree::SaveOptions, xpath::Context}; +use libxml::{tree::SaveOptions, xpath::Context}; use reqwest::{Client, Url}; async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&str>) { @@ -194,7 +194,7 @@ herausgebracht. (Fortschritt, Wissenschaft) "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); @@ -269,7 +269,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "#; - let doc = Parser::default_html().parse_string(html).unwrap(); + let doc = FullTextParser::parse_html_string_patched(html).unwrap(); let ctx = Context::new(&doc).unwrap(); let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); diff --git a/article_scraper/src/images/image_data.rs b/article_scraper/src/images/image_data.rs index 2095f27..b26cfec 100644 --- a/article_scraper/src/images/image_data.rs +++ b/article_scraper/src/images/image_data.rs @@ -2,7 +2,6 @@ pub struct ImageData { pub url: String, pub data: Vec, - pub content_length: usize, pub content_type: String, } diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 6d97fd8..de0f48f 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -2,12 +2,11 @@ pub use self::error::ImageDownloadError; use self::image_data::ImageDataBase64; use self::pair::Pair; use self::request::ImageRequest; -use crate::constants; use crate::util::Util; +use crate::{constants, FullTextParser}; use base64::Engine; use futures::StreamExt; use image::ImageFormat; -use libxml::parser::Parser; use libxml::tree::{Node, SaveOptions}; use libxml::xpath::Context; pub use progress::Progress; @@ -162,9 +161,7 @@ impl ImageDownloader { html: &str, downloaded_images: Vec>, ) -> Result { - let parser = Parser::default_html(); - let doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { @@ -207,9 +204,7 @@ impl ImageDownloader { } fn harvest_image_urls_from_html(html: &str) -> Result>, ImageDownloadError> { - let parser = Parser::default_html(); - let doc = parser - .parse_string(html) + let doc = FullTextParser::parse_html_string_patched(html) .map_err(|_| ImageDownloadError::HtmlParse)?; let xpath_ctx = Context::new(&doc).map_err(|()| { diff --git a/article_scraper/src/images/request.rs b/article_scraper/src/images/request.rs index b7086ce..fe9adf0 100644 --- a/article_scraper/src/images/request.rs +++ b/article_scraper/src/images/request.rs @@ -48,7 +48,6 @@ impl ImageRequest { Ok(ImageData { url: self.url, data: result, - content_length: self.content_length, content_type: self.content_type, }) } diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index cbd5370..df76ced 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -1299,13 +1299,11 @@ impl Util { mod tests { use super::Util; use crate::FullTextParser; - use libxml::parser::Parser; fn replace_brs(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let div = body.get_first_child().unwrap(); @@ -1346,8 +1344,7 @@ mod tests { fn replace_emojis(source: &str, expected: &str) { libxml::tree::node::set_node_rc_guard(10); - let parser = Parser::default_html(); - let document = FullTextParser::parse_html_string_patched(source, &parser).unwrap(); + let document = FullTextParser::parse_html_string_patched(source).unwrap(); let root = document.get_root_element().unwrap(); let body = root.get_first_child().unwrap(); let p = body.get_first_child().unwrap(); From 6932902b7b2c403ec8c23f4bd167f72dbb042ecf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 6 Jul 2024 23:43:23 +0200 Subject: [PATCH 08/23] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c6a7149..5a57616 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.77 + image: rust:1.79 before_script: - rustup component add rustfmt - rustup component add clippy From b3ce28632dab8678ae04789aeae76262283b1bb0 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 10 Jul 2024 11:59:21 +0200 Subject: [PATCH 09/23] update submodule --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index 737398e..e9112fc 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit 737398ef6b121db2d72042b5406a95dfd497113f +Subproject commit e9112fc55800cae00ca70f4c38248a3ef4228861 From 11ee29fedaba674ee615b977fc97d332645f4d0e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 2 Nov 2024 11:30:29 +0100 Subject: [PATCH 10/23] thumbnail: check for attribute with name property as well (fixes #4) --- .../resources/tests/thumbnails/a-chacon.html | 808 ++++++++++++++++++ article_scraper/src/full_text_parser/mod.rs | 18 +- article_scraper/src/full_text_parser/tests.rs | 14 + 3 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 article_scraper/resources/tests/thumbnails/a-chacon.html diff --git a/article_scraper/resources/tests/thumbnails/a-chacon.html b/article_scraper/resources/tests/thumbnails/a-chacon.html new file mode 100644 index 0000000..3e1bc5e --- /dev/null +++ b/article_scraper/resources/tests/thumbnails/a-chacon.html @@ -0,0 +1,808 @@ + + + + + + + + + + + +PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only. | a-chacon + + + + + + + + + + + + + + + + + + + + + + +
      +

      Building an API with Rails? Discover + + OasRails, a Rails engine for generate automatic interactive documentation. +

      +
      + + + + + + + + +
      +
      + + +

      PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only.

      + + + + PoC: Usando el Generador de Autenticación de Rails 8 (Beta) En Modo API-Only. + +
      +

      Como ya saben, una de las funcionalidades nuevas de Rails 8 es el nuevo generador básico de autenticación que viene a demostrar que no es tan complejo desarrollar todo lo que respecta a autenticación en una aplicación con Rails y que muchas veces no es necesario depender de terceros (gemas). La discusión comenzó aquí.

      + +

      Dicho esto, veamos que pasa usando el generador en una aplicación API-Only:

      + +
       rails -v
      +Rails 8.0.0.beta1
      +
      + +
       rails new app --api & cd app
      +
      + +

      Y ejecutamos el nuevo comando:

      + +
       rails g authentication
      +      create  app/models/session.rb
      +      create  app/models/user.rb
      +      create  app/models/current.rb
      +      create  app/controllers/sessions_controller.rb
      +      create  app/controllers/concerns/authentication.rb
      +      create  app/controllers/passwords_controller.rb
      +      create  app/mailers/passwords_mailer.rb
      +      create  app/views/passwords_mailer/reset.html.erb
      +      create  app/views/passwords_mailer/reset.text.erb
      +      create  test/mailers/previews/passwords_mailer_preview.rb
      +        gsub  app/controllers/application_controller.rb
      +       route  resources :passwords, param: :token
      +       route  resource :session
      +        gsub  Gemfile
      +      bundle  install --quiet
      +    generate  migration CreateUsers email_address:string!:uniq password_digest:string! --force
      +       rails  generate migration CreateUsers email_address:string!:uniq password_digest:string! --force
      +      invoke  active_record
      +      create    db/migrate/20241016002139_create_users.rb
      +    generate  migration CreateSessions user:references ip_address:string user_agent:string --force
      +       rails  generate migration CreateSessions user:references ip_address:string user_agent:string --force
      +      invoke  active_record
      +      create    db/migrate/20241016002140_create_sessions.rb
      +
      + +

      Ok, ahora por ejemplo, si revisamos SessionsController veremos que el método de Login se ve de la siguiente forma:

      + +
        def create
      +    if user = User.authenticate_by(params.permit(:email_address, :password))
      +      start_new_session_for user
      +      redirect_to after_authentication_url
      +    else
      +      redirect_to new_session_url, alert: "Try another email address or password."
      +    end
      +  end
      +
      + +

      O sea, redirecciona a rutas y/o vistas que en nuestra API no existen ni hacen sentido, y además si inspeccionamos el metodo start_new_session_for nos daremos cuenta de que el sistema está basado 100% en autenticación mediante cookies. Entonces, ¿qué hacemos?

      + +

      Mi propuesta es la siguiente: el generador crea las bases para la autenticación y creo que funciona bastante bien, por lo que con unas pequeñas modificaciones podemos dejar funcionando una autenticación Bearer (Token Authentication) rápidamente en nuestra API con Rails 8 más los archivos ya generados.

      + +

      El primer paso será agregar persistencia para nuestro token, para esto modificaremos la migración que crea las sessiones y agregaremos un nuevo campo llamado token:

      + +
          create_table :sessions do |t|
      +      t.references :user, null: false, foreign_key: true
      +      t.string :ip_address
      +      t.string :user_agent
      +      t.string :token     # HERE
      +
      +      t.timestamps
      +    end
      +
      + +

      Ahora simplemente ejecuta rails db:migrate y create un usuario de prueba por consola, yo lo haré con esta línea User.create(email_address: "[email protected]", password: "123456789") (Lo utilizaremos más tarde). Luego debemos crear un nuevo token para cada sesión nueva de un usuario, para esto lo más simple es usar un callback en el modelo Session:

      + +
      # app/models/sessions.rb
      +class Session < ApplicationRecord
      +  belongs_to :user
      +  before_create :generate_token # Here call
      +
      +  private
      +  def generate_token # Here implement, generate the token as you wish.
      +    self.token = Digest::SHA1.hexdigest([ Time.now, rand ].join)
      +  end
      +end
      +
      + +

      Ahora volviendo al metodo start_new_session_for en el concern Authentication, no es necesario que creemos una cookie, asi que debemos remover esa linea y dejar el metodo de la siguiente forma:

      + +
      # app/controllers/concerns/authentication.rb
      +def start_new_session_for(user)
      +  user.sessions.create!(user_agent: request.user_agent, ip_address: request.remote_ip).tap do |session|
      +    Current.session = session
      +  end
      +end
      +
      + +

      Y modificaremos el create de SessionsController para que las respuestas sean en formato json y no redirecciones:

      + +
      # app/controllers/sessions_controller.rb
      +def create
      +  if user = User.authenticate_by(params.permit(:email_address, :password))
      +    start_new_session_for user
      +    render json: { data: { token: Current.session.token  } }
      +  else
      +    render json: {}, status: :unauthorized
      +  end
      +end
      +
      + +

      Para hacer que todo esto funcione debemos hacer dos cosas:

      + +
        +
      1. +

        Incluir el modulo Authentication en ApplicationController:

        + +
        # app/controllers/application_controller.rb
        +class ApplicationController < ActionController::API
        +  include Authentication
        +end
        +
        +
      2. +
      3. +

        Eliminar la linea numero 6 de este mismo concern:

        + +
        # app/controllers/concerns/authentication.rb
        +  included do
        +    before_action :require_authentication
        +    helper_method :authenticated? # This, we don't use helpers in APIs
        +  end
        +
        +
      4. +
      + +

      Hasta este punto ya deberíamos tener el login funcionando. Para probar esto voy a agregar OasRails, que a propósito ya está funcionando con Rails 8 y voy a enviar un par de peticiones a ver como se comporta, no explicaré como implementar OasRails, para eso puedes ver el repositorio o leer más en este post.

      + +

      Inicio de sesión exitoso:

      + +

      + +

      Inicio de sesión fallido:

      + +

      + +
      + +

      Ya podemos generar tokens, ahora modificaremos el código para autenticarnos con ese mismo token. Para eso, cambiaremos la lógica de buscar la sesión actual del usuario con base en la cookie a buscarla basándonos en la cabecera Authorization:

      + +
      
      +# app/controllers/concerns/authentication.rb
      +  def resume_session
      +    Current.session = find_session_by_token
      +  end
      +
      +  def find_session_by_cookie
      +    Session.find_by(token: request.headers[:authorization]&.split(" ")[-1])
      +  end
      +
      + +

      Para probar esto creo que tendremos que hacer rápidamente un modelo que dependa de User y que requiera autenticación para utilizar. Intentemos con rails g scaffold project title:string description:text user:references y le agregamos al principio del controlador la línea de código before_action :require_authentication.

      + +

      Aquí les dejo una pequeña prueba del index de Projects autenticado con el token que obtuve en las pruebas anteriores:

      + +

      + +
      + +

      Con esto ya tienes gran parte de la lógica de autenticación funcionando en la aplicación API-Only. Te queda continuar con las modificaciones en el resto de los endpoints para que las respuestas sean en formato json y no supuestas vistas que no existen en la aplicación.

      + +

      Probablemente de aquí a que se lance la versión final de Rails 8 aparezca un PR solucionando esto y el generador funcione correctamente en modo API-Only. Hasta entonces, con estas pequeñas modificaciones ya puedes seguir construyendo tu API.

      + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      + + + + + + + + diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 37a5f32..a45b36d 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. @@ -514,6 +514,22 @@ impl FullTextParser { return Some(thumb); } + if let Ok(thumb) = Util::get_attribute( + context, + "//meta[contains(@property, 'twitter:image')]", + "content", + ) { + return Some(thumb); + } + + if let Ok(thumb) = Util::get_attribute( + context, + "//meta[contains(@property, 'og:image')]", + "content", + ) { + return Some(thumb); + } + if let Ok(thumb) = Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href") { diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index 99a5235..b111fcd 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -278,3 +278,17 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo "https://cdn.prod.www.spiegel.de/images/a4573666-f15e-4290-8c73-a0c6cd4ad3b2_w948_r1.778_fpx29.99_fpy44.98.jpg" ) } + +#[test] +fn extract_thumbnail_a_chacon() { + let html = std::fs::read_to_string(format!("./resources/tests/thumbnails/a-chacon.html")) + .expect("Failed to read source HTML"); + let doc = FullTextParser::parse_html_string_patched(&html).unwrap(); + let ctx = Context::new(&doc).unwrap(); + + let thumb = FullTextParser::check_for_thumbnail(&ctx).unwrap(); + assert_eq!( + thumb, + "https://a-chacon.com/assets/images/rails8-poc-api-auth.webp" + ) +} From 7fcb781c6819528893fcbaa414a57ce15bc51125 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 2 Nov 2024 11:34:47 +0100 Subject: [PATCH 11/23] remove useless format! --- article_scraper/src/full_text_parser/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index b111fcd..8921ce9 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -281,7 +281,7 @@ Foto: IMAGO/Vaclav Salek / IMAGO/CTK Photo #[test] fn extract_thumbnail_a_chacon() { - let html = std::fs::read_to_string(format!("./resources/tests/thumbnails/a-chacon.html")) + let html = std::fs::read_to_string("./resources/tests/thumbnails/a-chacon.html") .expect("Failed to read source HTML"); let doc = FullTextParser::parse_html_string_patched(&html).unwrap(); let ctx = Context::new(&doc).unwrap(); From 89eb87fa85709378878032d3b3be0960f8b0fe3e Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 02:55:59 +0100 Subject: [PATCH 12/23] update thiserror, ftr-site-config submodule and bump version --- Cargo.toml | 10 ++++++++-- article_scraper/Cargo.toml | 13 +++++++------ article_scraper/ftr-site-config | 2 +- article_scraper_cli/Cargo.toml | 11 ++++++----- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 03f5662..4f0884f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,12 @@ [workspace] - members = [ "article_scraper", "article_scraper_cli", -] \ No newline at end of file +] + +[workspace.package] +version = "2.1.1" +authors = ["Jan Lukas Gernert "] +edition = "2021" +license = "GPL-3.0-or-later" +repository = "https://gitlab.com/news-flash/article_scraper" \ No newline at end of file diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 6e4d003..100766c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -1,17 +1,18 @@ [package] name = "article_scraper" -version = "2.1.0" -authors = ["Jan Lukas Gernert "] -edition = "2018" -license = "GPL-3.0-or-later" description = "Scrap article contents from the web. Powered by fivefilters full text feed configurations & mozilla readability." -repository = "https://gitlab.com/news-flash/article_scraper" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + readme = "../Readme.md" keywords = ["article", "scrape", "full-text", "readability"] exclude = ["resources/tests"] [dependencies] -thiserror = "1.0" +thiserror = "2.0" libxml = "0.3" reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index e9112fc..ccde390 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit e9112fc55800cae00ca70f4c38248a3ef4228861 +Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index b91abc5..22edcf1 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "article_scraper_cli" -version = "2.1.0" -authors = ["Jan Lukas Gernert "] -edition = "2018" -license = "GPL-3.0-or-later" description = "Cli to use the article_scraper lib" -repository = "https://gitlab.com/news-flash/article_scraper" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + [dependencies] article_scraper = { path = "../article_scraper/" } From 7c658a4ba80021c5ed108b795ee4aac17e02f321 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 02:58:41 +0100 Subject: [PATCH 13/23] resolver 2 --- Cargo.toml | 6 ++---- article_scraper/src/full_text_parser/mod.rs | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4f0884f..99695c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,6 @@ [workspace] -members = [ - "article_scraper", - "article_scraper_cli", -] +members = ["article_scraper", "article_scraper_cli"] +resolver = "2" [workspace.package] version = "2.1.1" diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index a45b36d..98e9478 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. From ca1cc47af1f7749ea7d10983e8e269afb0c57daf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:02:40 +0100 Subject: [PATCH 14/23] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a57616..7880e5d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.79 + image: rust:1.83 before_script: - rustup component add rustfmt - rustup component add clippy From 8cfcd6d9f3636a84336da4d882a9f6db5ce565b4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:05:55 +0100 Subject: [PATCH 15/23] clippy --- article_scraper/src/full_text_parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 98e9478..18fc682 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion if always safe. From 9f56ed03b8e384378d92e01c5bc38bf80525760c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Mar 2025 13:42:31 +0100 Subject: [PATCH 16/23] article_scraper: don't specify reqwest features --- article_scraper/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 100766c..e852be9 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,17 +14,17 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = "0.12" tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" -regex = "1.10" +regex = "1.11" encoding_rs = "0.8" chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.4" -once_cell = "1.19" +rust-embed="8.6" +once_cell = "1.20" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" From 0978335d3b73e8049c602713b76ca5e3f038d9ca Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Mar 2025 17:18:03 +0100 Subject: [PATCH 17/23] [f] ignore url harvest error --- article_scraper/src/images/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index de0f48f..4be98b8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -219,7 +219,9 @@ impl ImageDownloader { let mut image_urls = Vec::new(); for node in node_vec { - image_urls.push(Self::harvest_image_urls_from_node(node)?); + if let Ok(url) = Self::harvest_image_urls_from_node(node) { + image_urls.push(url); + } } Ok(image_urls) From b92500fca276535b40d1956e71a1cca226d92437 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:45:41 +0200 Subject: [PATCH 18/23] better error messages --- article_scraper/src/error.rs | 6 +- article_scraper/src/full_text_parser/mod.rs | 96 ++++++++++----------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("Configerror {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 18fc682..4bb8a30 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -354,63 +354,61 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - log::debug!("Encoding extracted from headers: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + } } pub async fn download( From 9b374a28c717e57db7341fac4967b9e0114ad455 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:47:08 +0200 Subject: [PATCH 19/23] update ftr-site-config --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index ccde390..69aa220 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 +Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532 From f361392c04376736ce9ce2d338c7363959135878 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:34:33 +0200 Subject: [PATCH 20/23] check for empty http response and parsed documents without root element --- article_scraper/src/full_text_parser/mod.rs | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4bb8a30..ac77bf6 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -69,6 +69,11 @@ impl FullTextParser { let html = Self::get_body(response).await?; + if html.is_empty() { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { @@ -264,10 +269,17 @@ impl FullTextParser { } // parse html - Self::parse_html_string_patched(html.as_str()).map_err(|err| { + let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml - }) + })?; + + if document.get_root_element().is_none() { + log::error!("document without root"); + Err(FullTextParserError::Xml) + } else { + Ok(document) + } } /// FIXME: Here are some patched functions of libxml crate. @@ -368,6 +380,18 @@ impl FullTextParser { } let headers = response.headers().clone(); + + if headers + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|hv| hv.to_str().ok()) + .and_then(|str| str.parse::().ok()) + .map(|content_length| content_length == 0) + .unwrap_or(false) + { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + let bytes = response .bytes() .await @@ -420,7 +444,12 @@ impl FullTextParser { let headers = Util::generate_headers(config, global_config)?; let response = Self::get_response(url, client, headers).await?; let body = Self::get_body(response).await?; - Ok(body) + if body.is_empty() { + log::error!("Empty response body"); + Err(FullTextParserError::Http) + } else { + Ok(body) + } } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { From 06990acbc0d4cd55a44aeb20e95c1e6216074a16 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:38:46 +0200 Subject: [PATCH 21/23] fix libxml CI build --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7880e5d..159f07d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,12 +4,14 @@ stages: run-build: stage: build - image: rust:1.83 + image: rust:1.86 before_script: - rustup component add rustfmt - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - rustc --version && cargo --version + - echo $LIBXML2 - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - cargo build --release From 498008f6307c3faabfd6ac40e820871752b75039 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:51:30 +0200 Subject: [PATCH 22/23] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 99695c8..8569ad0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.1" +version = "2.1.2" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" From 9f349f8c6f2a88b277a8d1552d3d84781bdc9363 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 18:00:59 +0200 Subject: [PATCH 23/23] need reqwest streams --- article_scraper/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index e852be9..eeed67c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = "0.12" +reqwest = { version = "0.12", features = ["stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" regex = "1.11"