From 31a803384494924e8f6cc83e4da2f875531af701 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Tue, 28 Feb 2023 01:50:13 +0100 Subject: [PATCH] fixes, more sanitation & 1 more failing test --- expected.html | 107 -- .../tests/readability/aktualne/expected.html | 1 + .../tests/readability/aktualne/source.html | 1661 +++++++++++++++++ src/constants.rs | 5 +- src/full_text_parser/mod.rs | 29 +- src/full_text_parser/readability/mod.rs | 57 +- src/full_text_parser/readability/tests.rs | 5 + src/util.rs | 290 ++- 8 files changed, 1993 insertions(+), 162 deletions(-) delete mode 100644 expected.html create mode 100644 resources/tests/readability/aktualne/expected.html create mode 100644 resources/tests/readability/aktualne/source.html diff --git a/expected.html b/expected.html deleted file mode 100644 index 2d1034a..0000000 --- a/expected.html +++ /dev/null @@ -1,107 +0,0 @@ -
-

- I don't use Facebook. I'm not technophobic — I'm a geek. I've been using email since the early 1990s, I have accounts on hundreds of services around the net, and I do software development and internet protocol design both for work and for fun. I believe that a globe-spanning communications network like the internet can be a positive social force, and I publish much of my own work on the open web. -

-

- But Facebook and other massive web companies represent a strong push toward unaccountable centralized social control, which I think makes our society more unequal and more unjust. The Cambridge Analytica scandal is one instance of this long-running problem with what I call the "surveillance economy." I don't want to submit to these power structures, and I don’t want my presence on such platforms to serve as bait that lures other people into the digital panopticon. -

-

- But while I've never "opted in" to Facebook or any of the other big social networks, Facebook still has a detailed profile that can be used to target me. I've never consented to having Facebook collect my data, which can be used to draw very detailed inferences about my life, my habits, and my relationships. As we aim to take Facebook to task for its breach of user trust, we need to think about what its capabilities imply for society overall. After all, if you do #deleteFacebook, you'll find yourself in my shoes: non-consenting, but still subject to Facebook’s globe-spanning surveillance and targeting network. -

-

- There are at least two major categories of information available to Facebook about non-participants like me: information from other Facebook users, and information from sites on the open web. -

-

Information from other Facebook users

-

- When you sign up for Facebook, it encourages you to upload your list of contacts so that the site can "find your friends." Facebook uses this contact information to learn about people, even if those people don't agree to participate. It also links people together based on who they know, even if the shared contact hasn't agreed to this use. -

-

- For example, I received an email from Facebook that lists the people who have all invited me to join Facebook: my aunt, an old co-worker, a friend from elementary school, etc. This email includes names and email addresses — including my own name — and at least one web bug designed to identify me to Facebook’s web servers when I open the email. Facebook records this group of people as my contacts, even though I've never agreed to this kind of data collection. -

-

- Similarly, I'm sure that I'm in some photographs that someone has uploaded to Facebook — and I'm probably tagged in some of them. I've never agreed to this, but Facebook could still be keeping track. -

-

- So even if you decide you need to join Facebook, remember that you might be giving the company information about someone else who didn't agree to be part of its surveillance platform. -

-

Information from sites on the open Web

-

- Nearly every website that you visit that has a "Like" button is actually encouraging your browser to tell Facebook about your browsing habits. Even if you don't click on the "Like" button, displaying it requires your browser to send a request to Facebook's servers for the "Like" button itself. That request includes information mentioning the name of the page you are visiting and any Facebook-specific cookies your browser might have collected. (See Facebook's own description of this process.) This is called a "third-party request." -

-

- This makes it possible for Facebook to create a detailed picture of your browsing history — even if you've never even visited Facebook directly, let alone signed up for a Facebook account. -

-

- Think about most of the web pages you've visited — how many of them don't have a "Like" button? If you administer a website and you include a "Like" button on every page, you're helping Facebook to build profiles of your visitors, even those who have opted out of the social network. Facebook’s “Share” buttons on other sites — along with other tools — work a bit differently from the “Like” button, but do effectively the same thing. -

-

- The profiles that Facebook builds on non-users don't necessarily include so-called "personally identifiable information" (PII) like names or email addresses. But they do include fairly unique patterns. Using Chromium's NetLog dumping, I performed a simple five-minute browsing test last week that included visits to various sites — but not Facebook. In that test, the PII-free data that was sent to Facebook included information about which news articles I was reading, my dietary preferences, and my hobbies. -

-

- Given the precision of this kind of mapping and targeting, "PII" isn’t necessary to reveal my identity. How many vegans examine specifications for computer hardware from the ACLU's offices while reading about Cambridge Analytica? Anyway, if Facebook combined that information with the "web bug" from the email mentioned above — which is clearly linked to my name and e-mail address — no guesswork would be required. -

-

- I'd be shocked if Facebook were not connecting those dots given the goals they claim for data collection: -

-

- We use the information we have to improve our advertising and measurement systems so we can show you relevant ads on and off our Services and measure the effectiveness and reach of ads and services. -

-

- This is, in essence, exactly what Cambridge Analytica did. -

-

Consent

-

- Facebook and other tech companies often deflect accusations against excessive data collection by arguing "consent" — that they harvest and use data with the consent of the users involved. -

-

- But even if we accept that clicking through a "Terms of Service" that no one reads can actually constitute true consent, even if we ignore the fact that these terms are overwhelmingly one-sided and non-negotiable, and even if we accept that it's meaningful for people to give consent when sharing data about other people who may have also opted in — what is the recourse for someone who has not opted into these systems at all? -

-

- Are those of us who have explicitly avoided agreeing to the Facebook terms of service simply fair game for an industry-wide surveillance and targeting network? -

-

Privilege

-

- I don’t mean to critique people who have created a Facebook profile or suggest they deserve whatever they get. -

-

- My ability to avoid Facebook comes from privilege — I have existing social contacts with whom I know how to stay in touch without using Facebook's network. My job does not require that I use Facebook. I can afford the time and expense to communicate with my electoral representatives and political allies via other channels. -

-

- Many people do not have these privileges and are compelled to "opt in" on Facebook's non-negotiable terms. -

-

- Many journalists, organizers, schools, politicians, and others who have good reasons to oppose Facebook's centralized social control feel compelled by Facebook's reach and scale to participate in their practices, even those we know to be harmful. That includes the ACLU. -

-

- Privacy should not be a luxury good, and while I'm happy to encourage people to opt out of these subtle and socially fraught arrangements, I do not argue that anyone who has signed up has somehow relinquished concerns about their privacy. We need to evaluate privacy concerns in their full social contexts. These are not problems that can be resolved on an individual level, because of the interpersonal nature of much of this data and the complexities of the tradeoffs involved. -

-

Technical countermeasures

-

- While they may not solve the problem, there are some technical steps people can take to limit the scope of these surveillance practices. For example, some web browsers do not send "third-party cookies" by default, or they scope cookies so that centralized surveillance doesn't get a single view of one user. The most privacy-preserving modern browser is the Tor Browser, which everyone should have installed and available, even if it's not the browser they choose to use every day. It limits the surveillance ability of systems that you have not signed up for to track you as you move around the web. -

-

- You can also modify some browsers — for example, with plug-ins for Firefox and Chrome — so that they do not send third-partyrequests at all. Firefox is also exploring even more privacy-preserving techniques.

-

- It can’t be denied, though, that these tools are harder to use than the web browsers most people are accustomed to, and they create barriers to some online activities. (For example, logging in to some sites and accessing some web applications is impossible without third-party cookies.) -

-

- Some website operators take their visitors' privacy more seriously than others, by reducing the amount of third-party requests. For example, it's possible to display "share on Facebook" or "Like" buttons without sending user requests to Facebook in the first place. The ACLU's own website does this because we believe that the right to read with privacy is a fundamental protection for civic discourse. -

-

- If you are responsible for running a website, try browsing it with a third-party-blocking extension turned on. Think about how much information you're requiring your users to send to third parties as a condition for using your site. If you care about being a good steward of your visitors' data, you can re-design your website to reduce this kind of leakage. -

-

Opting out?

-

- Some advertisers claim that you can "opt out" of their targeted advertising, and even offer a centralized place meant to help you do so. However, my experience with these tools isn't a positive one. They don't appear to work all of the time. (In a recent experiment I conducted, two advertisers’ opt-out mechanisms failed to take effect.) And while advertisers claim to allow the user to opt out of "interest-based ads," it's not clear that the opt-outs govern data collection itself, rather than just the use of the collected data for displaying ads. Moreover, opting out on their terms requires the use of third-party cookies, thereby enabling another mechanism that other advertisers can then exploit. -

-

- It's also not clear how they function over time: How frequently do I need to take these steps? Do they expire? How often should I check back to make sure I’m still opted out? I'd much prefer an approach requiring me to opt in to surveillance and targeting. -

-

Fix the surveillance economy, not just Facebook

-

- These are just a few of the mechanisms that enable online tracking. Facebook is just one culprit in this online "surveillance economy," albeit a massive one — the company owns Instagram, Atlas, WhatsApp, and dozens of other internet and technology companies and services. But it’s not the only player in this space. Google’s business model also relies on this kind of surveillance, and there are dozens of smaller players as well. -

-

- As we work to address the fallout from the current storm around Facebook and Cambridge Analytica, we can't afford to lose sight of these larger mechanisms at play. Cambridge Analytica's failures and mistakes are inherent to Facebook's business model. We need to seriously challenge the social structures that encourage people to opt in to this kind of surveillance. At the same time, we also need to protect those of us who manage to opt out. -

-
diff --git a/resources/tests/readability/aktualne/expected.html b/resources/tests/readability/aktualne/expected.html new file mode 100644 index 0000000..4c28ea5 --- /dev/null +++ b/resources/tests/readability/aktualne/expected.html @@ -0,0 +1 @@ +FIXME \ No newline at end of file diff --git a/resources/tests/readability/aktualne/source.html b/resources/tests/readability/aktualne/source.html new file mode 100644 index 0000000..2d72af9 --- /dev/null +++ b/resources/tests/readability/aktualne/source.html @@ -0,0 +1,1661 @@ + + + + + + + West Ham hrozí gigantům, okouzlil i Linekera. Součkovu práci je snadné přehlédnout - Aktuálně.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + +
+ +

+ West Ham hrozí gigantům, okouzlil i Linekera. Součka je snadné přehlédnout +

+
+
+
+
+
+ Aleš Vávra Aleš Vávra +
+ před 2 hodinami +
+
+
+ Zázrak jedné sezony? West Ham dává pochybovačům stále pádnější odpovědi a fotbalový svět si začíná uvědomovat, že se absolutní anglická fotbalová elita rozrůstá o nového člena. Tým manažera Davida Moyese prohání giganty i v aktuálním ročníku Premier League. +
+
+ +
+ Spokojený Tomáš Souček po vítězství ve Villa Parku. +
+
+ Spokojený Tomáš Souček po vítězství ve Villa Parku. | Foto: Reuters +
+
+
+

+ Pět vítězných soutěžních duelů v řadě, během nich jediný inkasovaný gól. Čtvrté místo v lize, stejný bodový zisk jako loňský šampion Manchester City a nadšené ohlasy z tábora těch nejrenomovanějších komentátorů ostrovního fotbalu. +

+ +

+ West Ham je opět v kurzu, nadšené ohlasy po nedělní jasné výhře 4:1 na hřišti Aston Villy zaplnily anglický mediální prostor. +

+

+ "Stali se excelentním týmem. Jsou skvělí ve všech částech hřiště a David Moyes si zaslouží obrovský kredit za to, do jaké pozice je dostal," píše na Twitter Gary Lineker. +

+

+ "Nenapadá mě jediný důvod, proč by letos nemohli skončit v elitní čtyřce," přidává se Emile Heskey, někdejší útočník Liverpoolu. "Je fér říct, že vypadají fantasticky. Moyes je neskutečně oživil." +

+

+ I Heskey si všiml, že se Kladiváři skvěle vyrovnávají s náročným programem a pro ně novou rolí: účastí ve více soutěžích najednou. Moyes zůstává konzervativní v určování základní sestavy, chytře ale rozšířil kádr a v Evropské lize či ligovém poháru nechává některé opory odpočívat. Výjimkou potvrzující pravidlo je přitom Tomáš Souček, o jehož nezbytnosti bude řeč níže. +

+

+ "Klíčová věc je ta, že když udělá změny, pořád jim zůstává stejná struktura. To je něco, co pravidelně říkáme třeba o Manchesteru City. Ve hře neustále zůstává nějaká fundamentální filosofie. West Ham to má podobně a už kvůli tomu je třeba před Moyesem smeknout," přirovnává Heskey. +

+

+ Podívejte se na důležité momenty zápasu Aston Villa - West Ham: +

+ + +

+ V Evropské lize má West Ham po třech zápasech plný bodový zisk. V anglickém ligovém poháru dobyl čtvrtfinále, když vyřadil oba bohaté velkokluby z Manchesteru. +

+

+ Čeští fotbalisté nicméně momentálně nejsou ve světlech těch nejjasnějších reflektorů. +

+

+ Vladimír Coufal už sice uzdravil poraněné tříslo, v sestavě ale před ním dostal přednost rozjetý Ben Johnson. Anglický mladík další působivé představení okořenil parádním gólem a potvrdil, že se stává tvrdou konkurencí pro českého reprezentačního beka. +

+

+ Tomáš Souček zůstává nepostradatelným členem základní sestavy, navzdory tomu, že jeho poslední výkony působí nenápadně. +

+

+ "Pořád toho odvádí strašnou spoustu mimo hlavní pozornost. Jsou to důležité věci, které je snadné přehlédnout," píše ve svém hodnocení server Claret and Hugh.  +

+ +

+ "S Declanem Ricem vytvořil silné partnerství a udělal spoustu těžké práce. Má dobrou rozehrávku. Jediné, na co si lze stěžovat, jsou jeho občasná špatná rozhodnutí ve finální třetině hřiště," hodnotí českého středopolaře londýnský večerník Evening Standard. +

+

+ Web Football.London to vidí podobně. "Opět byl silný ve vzduchu, na obou koncích hřiště. Ve finální fázi se ale nerozhodoval dobře, příliš často volil špatnou variantu." +

+

+ Moyes nicméně nenechává Součka oddechnout. V pěti posledních utkáních, které West Ham odehrál během pouhých čtrnácti dnů, chyběl Čech jen pár minut v závěru na Evertonu, když utrpěl zranění v obličeji. +

+

+ Fanoušci pravidelně spekulují o únavě, skotský manažer ale - jak se zdá - bude mít v sestavě raději unaveného Součka než kohokoli jiného. Zvlášť, když Alex Král, plánovaný back-up do středu zálohy, stále není k dispozici. +

+

+ Zatímco v minulé sezoně Souček častokrát zastínil svého kolegu Rice, letos je to právě anglický reprezentant, kdo si užívá zasloužené ódy na svou adresu. +

+

+ "Hraje prostě velkolepě a připomínám, že je mu stále jen dvaadvacet let," kroutí hlavou Lineker. Není sám. Ještě před pár měsíci se většina odborníků pozastavovala nad údajnou cenovkou kolem 100 milionů liber. Nyní už zaznívají hlasy o tom, jak může být i tato hranice při případném přestupu Declana Rice výrazně překročena. +

+
+ +
+ Declan Rice po vítězství na Aston Ville. +
+
+ Declan Rice po vítězství na Aston Ville. | Foto: Reuters +
+
+

+ S blížícím se zimním přestupním termínem budou spekulace nabývat na síle, fanoušci Hammers ale věří, že Rice zůstane nejméně do léta. Jeho spokojenost je do očí bijící, stejně jako ochota nechat na hřišti všechno ve prospěch Clarets and Blues. +

+

+ "Náš kolektiv je teď opravdu speciální. Působíme ve výjimečném prostředí. Každé ráno se probouzíme s obrovskou touhou po dalším tréninku. Jsme nadšení," tvrdí mladá anglická superstar. +

+

+ "Jsme na děleném třetím místě. Lidé se před sezonou hodně ptali, zda to můžeme dokázat znovu. Ukázali jsme, že ano. Ale musíme pokračovat. Tohle musí být náš standard. Nesmíme polevit, pokud chceme být velkým týmem," zdůrazňuje Rice. +

+
+ +
+ +
+   +
+ +
+

+ Pokud jste v článku zaznamenali chybu nebo překlep, dejte nám, prosím, vědět prostřednictvím kontaktního formuláře. Děkujeme! +

+
+
+ +
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+

+ Právě se děje +

+
+ +
+
+ před 3 minutami +
+
+

+ Praha postaví podél Radlické ulice téměř tři stovky družstevních bytů +

+
+

+ Podél Radlické ulice v Praze 5 vzniknou dva domy s až 266 družstevními byty. Pražští radní v pondělí schválili memorandum s pátou městskou částí, podle kterého v této lokalitě chce Praha založit bytové družstvo a byty vybudovat. Cílem je přispět k rozšíření možností cenově dostupného bydlení pro obyvatele na území Prahy. Řekla to radní Hana Kordová Marvanová (za STAN). +

+

+ Projekt podpory dostupného družstevního bydlení schválilo zastupitelstvo Prahy v prosinci 2020. Město pak hledalo vhodné pozemky a z nich zatím vybralo ty na Praze 5. Nedostatek bytů a vysoká cena trápí město dlouhodobě. +

+

+ "Praha chce tímto reagovat na dramatickou situaci s bydlením. Ceny bytů zde letí nahoru šestkrát rychleji než příjmy. Družstevní bydlení je tou nejrychlejší cestou, jak můžeme obyvatelům Prahy pomoci získat cenově dostupnější bydlení," řekla Marvanová. +

+

+ Po uzavření memoranda se následující kroky budou týkat především ekonomických a právních aspektů spojených se založením družstva. To znamená například vytvořit detailní záměr projektu, zajistit potřebnou dokumentaci a zpracovat odhad nákladů na stavbu. +

+

+ Dům na Radlické by měl mít po dokončení v příštích letech hrubou podlažní plochu zhruba 20 000 metrů čtverečních a skládat se bude minimálně ze dvou bloků se samostatnými vchody. Uvnitř bude 221 až 266 bytů a jejich průměrná výměra se má pohybovat od 54 do 65 metrů čtverečních. +

+
Zdroj: ČTK +
+
+ + + + + + + + Další zprávy +
+ +
+
+ + + +
+
+ + + +
+ + +
+
+ +
+
+ +
+
+
+ + + + + diff --git a/src/constants.rs b/src/constants.rs index 57bdc90..027b80f 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -40,8 +40,11 @@ pub static TITLE_CUT_END: Lazy = pub static WORD_COUNT: Lazy = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex")); pub static TITLE_CUT_FRONT: Lazy = Lazy::new(|| Regex::new(r#"/[^-|\\/>»]*[-|\\/>»](.*)/gi"#).expect("TITLE_CUT_FRONT regex")); - +pub static VIDEOS: Lazy = Lazy::new(|| { + Regex::new(r#"///(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i"#).expect("VIDEOS regex") +}); pub const SCORE_ATTR: &str = "content_score"; +pub const DATA_TABLE_ATTR: &str = "is_data_table"; pub const MINIMUM_TOPCANDIDATES: usize = 3; pub const UNLIKELY_ROLES: &[&str] = &[ "menu", diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 7c2c92e..f965f6b 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -594,7 +594,6 @@ impl FullTextParser { let _ = Self::fix_lazy_images(context, "lazyload", "data-src"); let _ = Self::fix_iframe_size(context, "youtube.com"); - let _ = Self::remove_attribute(context, None, "style"); let _ = Self::remove_attribute(context, Some("a"), "onclick"); let _ = Self::remove_attribute(context, Some("img"), "srcset"); let _ = Self::remove_attribute(context, Some("img"), "sizes"); @@ -610,6 +609,8 @@ impl FullTextParser { // strip elements that contain style="display: none;" let _ = Util::strip_node(context, "//*[contains(@style,'display:none')]"); + let _ = Util::strip_node(context, "//*[contains(@style,'display: none')]"); + let _ = Self::remove_attribute(context, None, "style"); // strip all comments let _ = Util::strip_node(context, "//input"); @@ -849,11 +850,6 @@ impl FullTextParser { } pub(crate) fn post_process_content(document: &Document) -> Result<(), FullTextParserError> { - if let Some(mut root) = document.get_root_element() { - Self::clean_classes(&mut root)?; - Self::simplify_nested_elements(&mut root)?; - } - let context = Context::new(document).map_err(|()| { error!("Creating xpath context failed for article HTML"); FullTextParserError::Xml @@ -884,6 +880,19 @@ impl FullTextParser { } } + Util::mark_data_tables(&context)?; + + if let Some(mut root) = document.get_root_element() { + Util::clean_conditionally(&mut root, "form")?; + Util::clean_conditionally(&mut root, "fieldset")?; + Util::clean_conditionally(&mut root, "table")?; + Util::clean_conditionally(&mut root, "ul")?; + Util::clean_conditionally(&mut root, "div")?; + + Self::clean_classes(&mut root)?; + Self::simplify_nested_elements(&mut root)?; + } + Ok(()) } @@ -904,11 +913,17 @@ impl FullTextParser { })?; } - node.remove_attribute("content_score").map_err(|e| { + node.remove_attribute(constants::SCORE_ATTR).map_err(|e| { log::error!("{e}"); FullTextParserError::Xml })?; + node.remove_attribute(constants::DATA_TABLE_ATTR) + .map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; + node_iter = Util::next_node(&node, false); } Ok(()) diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs index 4ee2264..b2a2c19 100644 --- a/src/full_text_parser/readability/mod.rs +++ b/src/full_text_parser/readability/mod.rs @@ -69,8 +69,18 @@ impl Readability { if state.strip_unlikely { if constants::UNLIELY_CANDIDATES.is_match(&match_string) && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string) - && !Util::has_ancestor_tag(node_ref, "table", None) - && !Util::has_ancestor_tag(node_ref, "code", None) + && !Util::has_ancestor_tag( + node_ref, + "table", + None, + None:: bool>, + ) + && !Util::has_ancestor_tag( + node_ref, + "code", + None, + None:: bool>, + ) && tag_name != "BODY" && tag_name != "A" { @@ -123,6 +133,10 @@ impl Readability { log::error!("{error}"); FullTextParserError::Readability })?; + node_ref.add_child(&mut new_node).map_err(|error| { + log::error!("{error}"); + FullTextParserError::Readability + })?; p.replace(new_node); } } else if let Some(p) = p.as_mut() { @@ -638,40 +652,13 @@ impl Readability { "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5, _ => 0, }; - let score = score + Self::get_class_weight(node, state); + let class_weight = if state.weigh_classes { + Util::get_class_weight(node) + } else { + 0 + }; + let score = score + class_weight; Self::set_content_score(node, score as f64)?; Ok(()) } - - fn get_class_weight(node: &Node, state: &State) -> i64 { - if !state.weigh_classes { - return 0; - } - - let mut weight = 0; - - // Look for a special classname - if let Some(class_names) = node.get_property("class") { - if constants::NEGATIVE.is_match(&class_names) { - weight -= 25; - } - - if constants::POSITIVE.is_match(&class_names) { - weight += 25; - } - } - - // Look for a special ID - if let Some(class_names) = node.get_property("id") { - if constants::NEGATIVE.is_match(&class_names) { - weight -= 25; - } - - if constants::POSITIVE.is_match(&class_names) { - weight += 25; - } - } - - weight - } } diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index a0dc446..98c605c 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -70,6 +70,11 @@ async fn aclu() { run_test("aclu").await } +#[tokio::test] +async fn aktualne() { + run_test("aktualne").await +} + #[tokio::test] async fn webmd_1() { run_test("webmd-1").await diff --git a/src/util.rs b/src/util.rs index 117a55b..22df0b2 100644 --- a/src/util.rs +++ b/src/util.rs @@ -228,10 +228,6 @@ impl Util { } pub fn is_probably_visible(node: &Node) -> bool { - let display_none = node - .get_attribute("display") - .map(|display| display == "none") - .unwrap_or(false); let is_hidden = node.has_attribute("hidden"); let aria_hidden = node .get_attribute("aria-hidden") @@ -239,7 +235,7 @@ impl Util { .unwrap_or(false); let has_fallback_image = node.get_class_names().contains("fallback-image"); - !display_none && !is_hidden && !aria_hidden || has_fallback_image + !is_hidden && !aria_hidden || has_fallback_image } pub fn is_whitespace(node: &Node) -> bool { @@ -333,7 +329,15 @@ impl Util { 1.0 - distance_b } - pub fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option) -> bool { + pub fn has_ancestor_tag( + node: &Node, + tag_name: &str, + max_depth: Option, + filter: Option, + ) -> bool + where + F: Fn(&Node) -> bool, + { let max_depth = max_depth.unwrap_or(3); let tag_name = tag_name.to_uppercase(); let mut depth = 0; @@ -349,7 +353,12 @@ impl Util { None => return false, }; - if tmp_node.get_name() == tag_name { + if tmp_node.get_name() == tag_name + && filter + .as_ref() + .map(|filter| filter(&tmp_node)) + .unwrap_or(true) + { return true; } @@ -383,15 +392,15 @@ impl Util { if let Some(node_type) = node.get_type() { let len = node.get_child_nodes().len(); - return node_type == NodeType::ElementNode - && node.get_content().trim().is_empty() + node_type == NodeType::ElementNode && (len == 0 || len == Self::get_elements_by_tag_name(node, "br").len() - + Self::get_elements_by_tag_name(node, "hr").len()); + + Self::get_elements_by_tag_name(node, "hr").len()) + && node.get_content().trim().is_empty() + } else { + false } - - false } pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec { @@ -480,4 +489,261 @@ impl Util { false } } + + // Clean an element of all tags of type "tag" if they look fishy. + // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. + pub fn clean_conditionally(root: &mut Node, tag: &str) -> Result<(), FullTextParserError> { + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time + // without effecting the traversal. + // + // TODO: Consider taking into account original contentScore here. + let nodes = Util::get_elements_by_tag_name(root, tag); + let nodes_to_remove = nodes + .into_iter() + .filter(|node| Self::should_remove(node, tag)) + .collect::>(); + + for mut node in nodes_to_remove { + node.unlink(); + } + + Ok(()) + } + + fn should_remove(node: &Node, tag: &str) -> bool { + // First check if this node IS data table, in which case don't remove it. + let mut is_list = tag == "ul" || tag == "ol"; + if !is_list { + let mut list_length = 0.0; + let ul_nodes = Self::get_elements_by_tag_name(node, "ul"); + let ol_nodes = Self::get_elements_by_tag_name(node, "ol"); + for list_node in ul_nodes { + list_length += Util::get_inner_text(&list_node, false).len() as f64; + } + for list_node in ol_nodes { + list_length += Util::get_inner_text(&list_node, false).len() as f64; + } + is_list = (list_length / Util::get_inner_text(node, false).len() as f64) > 0.9; + } + + if tag == "table" && Self::is_data_table(node) { + return false; + } + + // Next check if we're inside a data table, in which case don't remove it as well. + if Self::has_ancestor_tag(node, "table", Some(u64::MAX), Some(Self::is_data_table)) { + return false; + } + + if Self::has_ancestor_tag(node, "code", None, None:: bool>) { + return false; + } + + let weight = Self::get_class_weight(node); + if weight < 0 { + return false; + } + + if Self::get_char_count(node, ',') < 10 { + // If there are not very many commas, and the number of + // non-paragraph elements is more than paragraphs or other + // ominous signs, remove the element. + let p = Self::get_elements_by_tag_name(node, "p").len(); + let img = Self::get_elements_by_tag_name(node, "img").len(); + let li = Self::get_elements_by_tag_name(node, "li").len() as i64 - 100; + let input = Self::get_elements_by_tag_name(node, "input").len(); + let heading_density = + Self::get_text_density(node, &["h1", "h2", "h3", "h4", "h5", "h6"]); + + let mut embed_count = 0; + let embed_tags = ["object", "embed", "iframe"]; + + for embed_tag in embed_tags { + for embed_node in Self::get_elements_by_tag_name(node, embed_tag) { + // If this embed has attribute that matches video regex, don't delete it. + for (_name, value) in embed_node.get_attributes() { + if constants::VIDEOS.is_match(&value) { + return false; + } + } + + // For embed with tag, check inner HTML as well. + // if embed_node.get_name().to_lowercase() == "object" && constants::VIDEOS.is_match(embed_node.innerHTML) { + // return false; + // } + + embed_count += 1; + } + } + + let link_density = Self::get_link_density(node); + let content_length = Self::get_inner_text(node, false).len(); + + (img > 1 + && (p as f64 / img as f64) < 0.5 + && !Self::has_ancestor_tag(node, "figure", None, None:: bool>)) + || (!is_list && li > p as i64) + || (input as f64 > f64::floor(p as f64 / 3.0)) + || (!is_list + && heading_density < 0.9 + && content_length < 25 + && (img == 0 || img > 2) + && !Self::has_ancestor_tag(node, "figure", None, None:: bool>)) + || (!is_list && weight < 25 && link_density > 0.2) + || (weight >= 25 && link_density > 0.5) + || ((embed_count == 1 && content_length < 75) || embed_count > 1) + } else { + false + } + } + + pub fn get_class_weight(node: &Node) -> i64 { + let mut weight = 0; + + // Look for a special classname + if let Some(class_names) = node.get_property("class") { + if constants::NEGATIVE.is_match(&class_names) { + weight -= 25; + } + + if constants::POSITIVE.is_match(&class_names) { + weight += 25; + } + } + + // Look for a special ID + if let Some(class_names) = node.get_property("id") { + if constants::NEGATIVE.is_match(&class_names) { + weight -= 25; + } + + if constants::POSITIVE.is_match(&class_names) { + weight += 25; + } + } + + weight + } + + fn get_char_count(node: &Node, char: char) -> usize { + Util::get_inner_text(node, false).split(char).count() - 1 + } + + fn get_text_density(node: &Node, tags: &[&str]) -> f64 { + let text_length = Util::get_inner_text(node, false).len(); + if text_length == 0 { + return 0.0; + } + + let mut children_length = 0; + for tag in tags { + for child in Self::get_elements_by_tag_name(node, tag) { + children_length += Util::get_inner_text(&child, false).len() + } + } + children_length as f64 / text_length as f64 + } + + fn is_data_table(node: &Node) -> bool { + node.get_attribute(constants::DATA_TABLE_ATTR) + .and_then(|is_data_table| is_data_table.parse::().ok()) + .unwrap_or(false) + } + + pub fn mark_data_tables(context: &Context) -> Result<(), FullTextParserError> { + let nodes = Util::evaluate_xpath(context, "//table", false)?; + for mut node in nodes { + if node + .get_attribute("role") + .map(|role| role == "presentation") + .unwrap_or(false) + { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); + continue; + } + + if node + .get_attribute("datatable") + .map(|role| role == "0") + .unwrap_or(false) + { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); + continue; + } + + if node.get_attribute("summary").is_some() { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); + continue; + } + + if let Some(first_caption) = Self::get_elements_by_tag_name(&node, "caption").first() { + if !first_caption.get_child_nodes().is_empty() { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); + continue; + } + } + + // If the table has a descendant with any of these tags, consider a data table: + let data_table_descendants = ["col", "colgroup", "tfoot", "thead", "th"]; + for descendant in data_table_descendants { + if !Self::get_elements_by_tag_name(&node, descendant).is_empty() { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); + continue; + } + } + + // Nested tables indicate a layout table: + if !Self::get_elements_by_tag_name(&node, "table").is_empty() { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false"); + continue; + } + + let (rows, columns) = Self::get_row_and_column_count(&node); + if rows >= 10 || columns > 4 { + let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true"); + continue; + } + + // Now just go by size entirely: + let _ = node.set_attribute( + constants::DATA_TABLE_ATTR, + if rows * columns > 10 { "true" } else { "false" }, + ); + } + + Ok(()) + } + + pub fn get_row_and_column_count(node: &Node) -> (usize, usize) { + if node.get_name().to_uppercase() != "TABLE" { + return (0, 0); + } + + let mut rows = 0; + let mut columns = 0; + + let trs = Self::get_elements_by_tag_name(node, "tr"); + for tr in trs { + let row_span = tr + .get_attribute("rowspan") + .and_then(|span| span.parse::().ok()) + .unwrap_or(1); + rows += row_span; + + // Now look for column-related info + let mut columns_in_this_row = 0; + let cells = Self::get_elements_by_tag_name(&tr, "td"); + for cell in cells { + let colspan = cell + .get_attribute("colspan") + .and_then(|span| span.parse::().ok()) + .unwrap_or(1); + columns_in_this_row += colspan; + } + columns = usize::max(columns, columns_in_this_row); + } + + (rows, columns) + } }