From cc6ff6d7e27936a73e82bf760b79c4832ab2bcbd Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 1 Apr 2023 18:14:05 +0200 Subject: [PATCH] 6 more tags & make seattletimes test consistent --- .../cnet-svg-classes/expected.html | 2 +- .../readability/seattletimes-1/expected.html | 2 +- .../readability/seattletimes-1/source.html | 2 +- .../style-tags-removal/expected.html | 21 + .../style-tags-removal/source.html | 42 + .../readability/svg-parsing/expected.html | 38 + .../tests/readability/svg-parsing/source.html | 44 + .../table-style-attributes/expected.html | 122 ++ .../table-style-attributes/source.html | 167 ++ .../tests/readability/telegraph/expected.html | 27 + .../tests/readability/telegraph/source.html | 1821 +++++++++++++++++ .../title-and-h1-discrepancy/expected.html | 21 + .../title-and-h1-discrepancy/source.html | 30 + .../tests/readability/tmz-1/expected.html | 37 + resources/tests/readability/tmz-1/source.html | 1528 ++++++++++++++ src/full_text_parser/readability/tests.rs | 30 + 16 files changed, 3931 insertions(+), 3 deletions(-) create mode 100644 resources/tests/readability/style-tags-removal/expected.html create mode 100644 resources/tests/readability/style-tags-removal/source.html create mode 100644 resources/tests/readability/svg-parsing/expected.html create mode 100644 resources/tests/readability/svg-parsing/source.html create mode 100644 resources/tests/readability/table-style-attributes/expected.html create mode 100644 resources/tests/readability/table-style-attributes/source.html create mode 100644 resources/tests/readability/telegraph/expected.html create mode 100644 resources/tests/readability/telegraph/source.html create mode 100644 resources/tests/readability/title-and-h1-discrepancy/expected.html create mode 100644 resources/tests/readability/title-and-h1-discrepancy/source.html create mode 100644 resources/tests/readability/tmz-1/expected.html create mode 100644 resources/tests/readability/tmz-1/source.html diff --git a/resources/tests/readability/cnet-svg-classes/expected.html b/resources/tests/readability/cnet-svg-classes/expected.html index bb85b1f..0ac4eb7 100644 --- a/resources/tests/readability/cnet-svg-classes/expected.html +++ b/resources/tests/readability/cnet-svg-classes/expected.html @@ -15,7 +15,7 @@

Además, el app ocupa menos espacio en tu teléfono móvil, al reducir a 3MB su peso.

Twitter dio a conocer Twitter Lite en abril en India, y desde entonces ha estado trabajando para llevarlo a más países. La empresa en los últimos meses también se ha involucrado de forma definitiva en la eliminación de los abusos en la red social, tomando medidas incluso en la verificación de cuentas.

- +
diff --git a/resources/tests/readability/seattletimes-1/expected.html b/resources/tests/readability/seattletimes-1/expected.html index 0246f14..a7c51b9 100644 --- a/resources/tests/readability/seattletimes-1/expected.html +++ b/resources/tests/readability/seattletimes-1/expected.html @@ -72,7 +72,7 @@

- Amazon-owned Whole Foods touted a price cut on halibut as part of an announcement recently about lower prices on hundreds of items. (Ellen M. Banner / The Seattle Times) +
Amazon-owned Whole Foods touted a price cut on halibut as part of an announcement recently about lower prices on hundreds of items. (Ellen M. Banner / The Seattle Times)
diff --git a/resources/tests/readability/seattletimes-1/source.html b/resources/tests/readability/seattletimes-1/source.html index f9a577e..32bc198 100644 --- a/resources/tests/readability/seattletimes-1/source.html +++ b/resources/tests/readability/seattletimes-1/source.html @@ -1161,7 +1161,7 @@

- Amazon-owned Whole Foods touted a price cut on halibut as part of an announcement recently about lower prices on hundreds of items. (Ellen M. Banner / The Seattle Times) +
Amazon-owned Whole Foods touted a price cut on halibut as part of an announcement recently about lower prices on hundreds of items. (Ellen M. Banner / The Seattle Times)
diff --git a/resources/tests/readability/style-tags-removal/expected.html b/resources/tests/readability/style-tags-removal/expected.html new file mode 100644 index 0000000..f587bdf --- /dev/null +++ b/resources/tests/readability/style-tags-removal/expected.html @@ -0,0 +1,21 @@ +
+

Lorem

+ +

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+ +

Foo

+

+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
diff --git a/resources/tests/readability/style-tags-removal/source.html b/resources/tests/readability/style-tags-removal/source.html new file mode 100644 index 0000000..687ca39 --- /dev/null +++ b/resources/tests/readability/style-tags-removal/source.html @@ -0,0 +1,42 @@ + + + + + Style tags removal + + + +
+

Lorem

+ +
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+ +

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + + diff --git a/resources/tests/readability/svg-parsing/expected.html b/resources/tests/readability/svg-parsing/expected.html new file mode 100644 index 0000000..f0c913c --- /dev/null +++ b/resources/tests/readability/svg-parsing/expected.html @@ -0,0 +1,38 @@ +
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ + + + + + +

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
diff --git a/resources/tests/readability/svg-parsing/source.html b/resources/tests/readability/svg-parsing/source.html new file mode 100644 index 0000000..e4fb905 --- /dev/null +++ b/resources/tests/readability/svg-parsing/source.html @@ -0,0 +1,44 @@ + + + + SVG parsing + + +

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ + + + + + +

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ + diff --git a/resources/tests/readability/table-style-attributes/expected.html b/resources/tests/readability/table-style-attributes/expected.html new file mode 100644 index 0000000..4778a67 --- /dev/null +++ b/resources/tests/readability/table-style-attributes/expected.html @@ -0,0 +1,122 @@ +
+

+ linux usability +
...or, why do I bother.

© 2002, 2003 + Jamie Zawinski

+ + +
+ + + + + + + +
+

In December 2002, I tried to install some software on my computer. The experience was, shall we say, less than pleasant. On many levels. I wrote about my experience, as I so often do.

+

Then in January, the jackasses over at Slashdot posted a link to it, calling it a "review" of Linux video software. I guess you could consider it a review, if you were to squint at it just right. But really what it is is a rant about how I had an evening stolen from me by crap software design. It is a flame about the pathetic state of Linux usability in general, and the handful of video players I tried out in particular. It makes no attempt to be balanced or objective or exhaustive. It is a description of my experience. Perhaps your experience was different. Good for you.

+

So of course that day I got hundreds of emails about it. Every Linux apologist in the world wanted to make sure I was fully informed of their opinion. The replies were roughly in the following groups:

+ +
    +
  • "Right on! I had exactly the same experience! Thank you for putting it into words." (This was about 1/3 of the replies.) + +
  • +
  • "You're clearly an idiot, Linux is too sophisticated for you, you clearly are incapable of understanding anything, you should go back to kindergarten and/or use a Mac." (Oddly, all of these messages used the word `clearly' repeatedly.) + +
  • +
  • "If you don't like it, fix it yourself." + +
  • +
  • "Netscape sucks! XEmacs sucks! You suck! I never liked you anyway! And you swear too much!" + +
  • +
  • "How dare you criticize someone else's work! You got it for free! You should be on your knees thanking them for wasting your time!" + +
  • +
  • "While you have some valid complaints, I'm going to focus on this one inconsequential error you made in your characterization of one of the many roadblocks you encountered. You suck!" + +
  • +
  • "It's your fault for using Red Hat! You should be using Debian/Mandrake/Gentoo instead!" + +
  • +
  • "Red Hat 7.2 is totally obsolete! It's almost 14 months old! What were you expecting!"
  • +
+
+

While I am flattered that so many logorrheic Linux fanboys are sufficiently interested in my opinions and experiences to share their deeply heartfelt views with me, you can all rest assured that:

+ +
    +
      +
    • I've heard it before; and
    • +
    • I didn't care the first time.
    • +
    +
+

So please. Don't bother sending me any more mail about this. It's a near certainty that I will just delete it unread, so you might as well not waste your time. Feel free to call me names on your own web page if you feel the need to get it out of your system. But kindly stay out of my inbox. + + +

+
+
+ +

+ that said... +

+

I understand that one can play videos on one's computer. I understand these videos come in many different formats. Every now and then I try to figure out what the Done Thing is, as far as playing movies on one's Linux machine.

+ +
    (Really my eventual goal is to be able to create video on Linux, but I figured I'd start small, and see if I could just get playback working before trying something that is undoubtedly ten thousand times harder.)
+

I finally found RPMs of mplayer that would consent to install themselves on a Red Hat 7.2 machine, and actually got it to play some videos. Amazing. But it's a total pain in the ass to use due to rampant "themeing." Why do people do this? They map this stupid shaped window with no titlebar (oh, sorry, your choice of a dozen stupidly-shaped windows without titlebars) all of which use fonts that are way too small to read. But, here's the best part, there's no way to raise the window to the top. So if another window ever gets on top of it, well, sorry, you're out of luck. And half of the themes always map the window at the very bottom of the + screen -- conveniently under my panel where I can't reach it.

+

Resizing the window changes the aspect ratio of the video! Yeah, I'm sure someone has ever wanted that.

+

It moves the mouse to the upper left corner of every dialog box it creates! Which is great, because that means that when it gets into this cute little state of popping up a blank dialog that says "Error" five times a second, you can't even move the mouse over to another window to kill the program, you have to log in from another machine.

+

Fucking morons.

+

So I gave up on that, and tried to install gstreamer. Get this. Their propose ``solution'' for distributing binaries on Red Hat systems? They point you at an RPM that installs apt, the Debian package system! Yeah, that's a good idea, I want to struggle with two competing packaging systems on my machine just to install a single app. Well, I found some +RPMs for Red Hat 7.2, but apparently they expect you to have already rectally inserted Gnome2 on that 7.2 system first. Uh, no. I've seen the horror of Red Hat 8.0, and there's no fucking way I'm putting Gnome2 on any more of my machines for at least another six months, maybe a year.

+

Ok, no gstreamer. Let's try Xine. I found +RPMs, and it sucks about the same as mplayer, and in about the same ways, though slightly less bad: it doesn't screw the aspect ratio when you resize the window; and at least its stupidly-shaped window is always forced to be on top. I don't like that either, but it's better than never being on top. It took me ten minutes to figure out where the "Open File" dialog was. It's on the button labeled "://" whose tooltip says "MRL Browser". Then you get to select file names from an oh-so-cute window that I guess is supposed to look like a tty, or maybe an LCD screen. It conveniently centers the file names in the list, and truncates them at about 30 characters. The scrollbar is also composed of "characters": it's an underscore.

+

What are these fucktards thinking???

+

Then I checked out Ogle again, and it hasn't been updated since the last time I tried, six months ago. It's a pretty decent DVD player, if you have the physical DVD. It does on-screen menus, and you can click on them with the mouse. But I don't need a DVD player (I have a hardware DVD player that works just fine.) It can't, as far as I can tell, play anything but actual discs.

+

Oh, and even though I have libdvdcss installed (as evidenced by the fact that Ogle actually works) Xine won't play the same disc that Ogle will play. It seems to be claiming that the CSS stuff isn't installed, which it clearly is.

+

An idiocy that all of these programs have in common is that, in addition to opening a window for the movie, and a window for the control panel, they also spray a constant spatter of curses crud on the terminal they were started from. I imagine at some point, there was some user who said, ``this program is pretty nice, but you know what it's missing? It's missing a lot of pointless chatter about what plugins and fonts have been loaded!''

+ +
And here's the Random Commentary section: + +
Makali wrote: +
    + Whenever a programmer thinks, "Hey, skins, what a cool idea", their + computer's speakers should create some sort of cock-shaped soundwave + and plunge it repeatedly through their skulls. +
+

I am fully in support of this proposed audio-cock technology.

+

Various people wrote:

+
    + You shouldn't even bother compiling the GUI into mplayer! +
+

So I should solve the problem of ``crappy GUI'' by replacing it with ``no GUI at all?'' I should use the program only from the command line, or by memorizing magic keystrokes? Awesome idea.

+

Various other people wrote:

+
    + You didn't try vlc! +
+

True, I hadn't. Now I have. It has an overly-complicated UI, (the Preferences panel is a festival of overkill) but at least it uses standard menus and buttons, so it doesn't make you want to claw your eyes out immediately. But, it can only play a miniscule number of video formats, so it's mostly useless. *plonk*

+

Someone else wrote:

+
    + Have you considered changing distributions? +
+

Yes, every single time I try something like this, I very seriously consider getting a Mac.

+

Really the only thing that's stopping me is that I fear the Emacs situation.

+

(By which I mean, ``Lack of a usable version thereof.'' No, running RMSmacs inside a terminal window doesn't qualify. Nor does running an X server on the Mac: if I were going to switch, why in the world would I continue inflicting the X Windows Disaster on myself? Wouldn't getting away from that be the whole + point?)

+ +
    + (I understand there is an almost-functional Aqua version of + RMSmacs now. I'll probably check it out at some point, but the problem with me switching from XEmacs to RMSmacs is that it would probably result in another + Slashdork post, meaning I'd wake up to another 150+ poorly spelled flames in my inbox... I'm hoping for a Aquafied XEmacs, but I know that's not likely to happen any time soon.) +
+

By the way, the suggestion to switch Linux distrubutions in order to get a single app to work might sound absurd at first. And that's because it is. But I've been saturated with Unix-peanut-gallery effluvia for so long that it no longer even surprises me when every + question -- no matter how + simple -- results in someone suggesting that you either A) patch your kernel or B) change distros. It's inevitable and inescapable, like Hitler.

+
+ +
+ +

[ up ]

+
diff --git a/resources/tests/readability/table-style-attributes/source.html b/resources/tests/readability/table-style-attributes/source.html new file mode 100644 index 0000000..0e20275 --- /dev/null +++ b/resources/tests/readability/table-style-attributes/source.html @@ -0,0 +1,167 @@ + + + + + linux video + + + + + + +

+ linux usability +
...or, why do I bother.

© 2002, 2003 + Jamie Zawinski

+ +


+
+ + + + + + + +
+

In December 2002, I tried to install some software on my computer. The experience was, shall we say, less than pleasant. On many levels. I wrote about my experience, as I so often do.

+

Then in January, the jackasses over at Slashdot posted a link to it, calling it a "review" of Linux video software. I guess you could consider it a review, if you were to squint at it just right. But really what it is is a rant about how I had an evening stolen from me by crap software design. It is a flame about the pathetic state of Linux usability in general, and the handful of video players I tried out in particular. It makes no attempt to be balanced or objective or exhaustive. It is a description of my experience. Perhaps your experience was different. Good for you.

+

So of course that day I got hundreds of emails about it. Every Linux apologist in the world wanted to make sure I was fully informed of their opinion. The replies were roughly in the following groups:

+

+
    +
  • "Right on! I had exactly the same experience! Thank you for putting it into words." (This was about 1/3 of the replies.) +

    +
  • +
  • "You're clearly an idiot, Linux is too sophisticated for you, you clearly are incapable of understanding anything, you should go back to kindergarten and/or use a Mac." (Oddly, all of these messages used the word `clearly' repeatedly.) +

    +
  • +
  • "If you don't like it, fix it yourself." +

    +
  • +
  • "Netscape sucks! XEmacs sucks! You suck! I never liked you anyway! And you swear too much!" +

    +
  • +
  • "How dare you criticize someone else's work! You got it for free! You should be on your knees thanking them for wasting your time!" +

    +
  • +
  • "While you have some valid complaints, I'm going to focus on this one inconsequential error you made in your characterization of one of the many roadblocks you encountered. You suck!" +

    +
  • +
  • "It's your fault for using Red Hat! You should be using Debian/Mandrake/Gentoo instead!" +

    +
  • +
  • "Red Hat 7.2 is totally obsolete! It's almost 14 months old! What were you expecting!"
  • +
+
+

While I am flattered that so many logorrheic Linux fanboys are sufficiently interested in my opinions and experiences to share their deeply heartfelt views with me, you can all rest assured that:

+

+
    +
      +
    • I've heard it before; and
    • +
    • I didn't care the first time.
    • +
    +
+

So please. Don't bother sending me any more mail about this. It's a near certainty that I will just delete it unread, so you might as well not waste your time. Feel free to call me names on your own web page if you feel the need to get it out of your system. But kindly stay out of my inbox. + + +

+
+
+


+

+ that said... +

+

I understand that one can play videos on one's computer. I understand these videos come in many different formats. Every now and then I try to figure out what the Done Thing is, as far as playing movies on one's Linux machine.

+

+
    (Really my eventual goal is to be able to create video on Linux, but I figured I'd start small, and see if I could just get playback working before trying something that is undoubtedly ten thousand times harder.)
+

I finally found RPMs of mplayer that would consent to install themselves on a Red Hat 7.2 machine, and actually got it to play some videos. Amazing. But it's a total pain in the ass to use due to rampant "themeing." Why do people do this? They map this stupid shaped window with no titlebar (oh, sorry, your choice of a dozen stupidly-shaped windows without titlebars) all of which use fonts that are way too small to read. But, here's the best part, there's no way to raise the window to the top. So if another window ever gets on top of it, well, sorry, you're out of luck. And half of the themes always map the window at the very bottom of the + screen -- conveniently under my panel where I can't reach it.

+

Resizing the window changes the aspect ratio of the video! Yeah, I'm sure someone has ever wanted that.

+

It moves the mouse to the upper left corner of every dialog box it creates! Which is great, because that means that when it gets into this cute little state of popping up a blank dialog that says "Error" five times a second, you can't even move the mouse over to another window to kill the program, you have to log in from another machine.

+

Fucking morons.

+

So I gave up on that, and tried to install gstreamer. Get this. Their propose ``solution'' for distributing binaries on Red Hat systems? They point you at an RPM that installs apt, the Debian package system! Yeah, that's a good idea, I want to struggle with two competing packaging systems on my machine just to install a single app. Well, I found some +RPMs for Red Hat 7.2, but apparently they expect you to have already rectally inserted Gnome2 on that 7.2 system first. Uh, no. I've seen the horror of Red Hat 8.0, and there's no fucking way I'm putting Gnome2 on any more of my machines for at least another six months, maybe a year.

+

Ok, no gstreamer. Let's try Xine. I found +RPMs, and it sucks about the same as mplayer, and in about the same ways, though slightly less bad: it doesn't screw the aspect ratio when you resize the window; and at least its stupidly-shaped window is always forced to be on top. I don't like that either, but it's better than never being on top. It took me ten minutes to figure out where the "Open File" dialog was. It's on the button labeled "://" whose tooltip says "MRL Browser". Then you get to select file names from an oh-so-cute window that I guess is supposed to look like a tty, or maybe an LCD screen. It conveniently centers the file names in the list, and truncates them at about 30 characters. The scrollbar is also composed of "characters": it's an underscore.

+

What are these fucktards thinking???

+

Then I checked out Ogle again, and it hasn't been updated since the last time I tried, six months ago. It's a pretty decent DVD player, if you have the physical DVD. It does on-screen menus, and you can click on them with the mouse. But I don't need a DVD player (I have a hardware DVD player that works just fine.) It can't, as far as I can tell, play anything but actual discs.

+

Oh, and even though I have libdvdcss installed (as evidenced by the fact that Ogle actually works) Xine won't play the same disc that Ogle will play. It seems to be claiming that the CSS stuff isn't installed, which it clearly is.

+

An idiocy that all of these programs have in common is that, in addition to opening a window for the movie, and a window for the control panel, they also spray a constant spatter of curses crud on the terminal they were started from. I imagine at some point, there was some user who said, ``this program is pretty nice, but you know what it's missing? It's missing a lot of pointless chatter about what plugins and fonts have been loaded!''

+

+
And here's the Random Commentary section: +

+
Makali wrote: +
    + Whenever a programmer thinks, "Hey, skins, what a cool idea", their + computer's speakers should create some sort of cock-shaped soundwave + and plunge it repeatedly through their skulls. +
+

I am fully in support of this proposed audio-cock technology.

+

Various people wrote:

+
    + You shouldn't even bother compiling the GUI into mplayer! +
+

So I should solve the problem of ``crappy GUI'' by replacing it with ``no GUI at all?'' I should use the program only from the command line, or by memorizing magic keystrokes? Awesome idea.

+

Various other people wrote:

+
    + You didn't try vlc! +
+

True, I hadn't. Now I have. It has an overly-complicated UI, (the Preferences panel is a festival of overkill) but at least it uses standard menus and buttons, so it doesn't make you want to claw your eyes out immediately. But, it can only play a miniscule number of video formats, so it's mostly useless. *plonk*

+

Someone else wrote:

+
    + Have you considered changing distributions? +
+

Yes, every single time I try something like this, I very seriously consider getting a Mac.

+

Really the only thing that's stopping me is that I fear the Emacs situation.

+

(By which I mean, ``Lack of a usable version thereof.'' No, running RMSmacs inside a terminal window doesn't qualify. Nor does running an X server on the Mac: if I were going to switch, why in the world would I continue inflicting the X Windows Disaster on myself? Wouldn't getting away from that be the whole + point?)

+

+
    + (I understand there is an almost-functional Aqua version of + RMSmacs now. I'll probably check it out at some point, but the problem with me switching from XEmacs to RMSmacs is that it would probably result in another + Slashdork post, meaning I'd wake up to another 150+ poorly spelled flames in my inbox... I'm hoping for a Aquafied XEmacs, but I know that's not likely to happen any time soon.) +
+

By the way, the suggestion to switch Linux distrubutions in order to get a single app to work might sound absurd at first. And that's because it is. But I've been saturated with Unix-peanut-gallery effluvia for so long that it no longer even surprises me when every + question -- no matter how + simple -- results in someone suggesting that you either A) patch your kernel or B) change distros. It's inevitable and inescapable, like Hitler.

+
+

+
+

+

[ up ]

+ + + diff --git a/resources/tests/readability/telegraph/expected.html b/resources/tests/readability/telegraph/expected.html new file mode 100644 index 0000000..7b6decb --- /dev/null +++ b/resources/tests/readability/telegraph/expected.html @@ -0,0 +1,27 @@ +
+
+

Zimbabwe President Robert Mugabe, his wife Grace and two key figures from her G40 political faction are under house arrest at Mugabe's "Blue House" compound in Harare and are insisting the 93 year-old finishes his presidential term, a source said.

+

The G40 figures are cabinet ministers Jonathan Moyo and Saviour Kasukuwere, who fled to the compound after their homes were attacked by troops in Tuesday night's coup, the source, who said he had spoken to people inside the compound, told Reuters.

+

Mr Mugabe is resisting mediation by a Catholic priest to allow the former guerrilla a graceful exit after the military takeover.

+

The priest, Fidelis Mukonori, is acting as a middle-man between Mr Mugabe and the generals, who seized power in a targeted operation against "criminals" in his entourage, a senior political source told Reuters.

+

The source could not provide details of the talks, which appear to be aimed at a smooth and bloodless transition after the departure of Mr Mugabe, who has led Zimbabwe since independence in 1980.

+

Mr Mugabe, still seen by many Africans as a liberation hero, is reviled in the West as a despot whose disastrous handling of the economy and willingness to resort to violence to maintain power destroyed one of Africa's most promising states.

+
+
+

Zimbabwean intelligence reports seen by Reuters suggest that former security chief Emmerson Mnangagwa, who was ousted as vice-president this month, has been mapping out a post-Mugabe vision with the military and opposition for more than a year.

+
+
+

Fuelling speculation that Mnangagwa's plan might be rolling into action, opposition leader Morgan Tsvangirai, who has been receiving cancer treatment in Britain and South Africa, returned to Harare late on Wednesday, his spokesman said.

+

South Africa said Mr Mugabe had told President Jacob Zuma by telephone on Wednesday that he was confined to his home but was otherwise fine and the military said it was keeping him and his family, including wife Grace, safe.

+
+
+

Despite the lingering admiration for Mr Mugabe, there is little public affection for 52-year-old Grace, a former government typist who started having an affair with Mr Mugabe in the early 1990s as his first wife, Sally, was dying of kidney disease.

+

Dubbed "DisGrace" or "Gucci Grace" on account of her reputed love of shopping, she enjoyed a meteoric rise through the ranks of Mugabe's ruling Zanu-PF in the last two years, culminating in Mnangagwa's removal a week ago - a move seen as clearing the way for her to succeed her husband.

+
+
+

In contrast to the high political drama unfolding behind closed doors, the streets of the capital remained calm, with people going about their daily business, albeit under the watch of soldiers on armoured vehicles at strategic locations.

+
+
+

Whatever the final outcome, the events could signal a once-in-a-generation change for the former British colony, a regional breadbasket reduced to destitution by economic policies Mr Mugabe's critics have long blamed on him.

+
+
diff --git a/resources/tests/readability/telegraph/source.html b/resources/tests/readability/telegraph/source.html new file mode 100644 index 0000000..3aad2a1 --- /dev/null +++ b/resources/tests/readability/telegraph/source.html @@ -0,0 +1,1821 @@ + + + + + + + + + + + + + + + + + + Zimbabwe coup: Robert Mugabe and wife Grace 'insisting he finishes his term', as priest steps in to mediate + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ +
+
+ +
+
+
+
+
+ +
+
+

Zimbabwe coup: Robert Mugabe and wife Grace 'insisting he finishes his term', as priest steps in to mediate

+ +
+
+
+
+
+ +
+ +
+
+
+
+ + + +
+
+ + +
Zimbabwe president Robert Mugabe with ZDF commander general Constantino Chiwenga at State House in an image the Harare Herald claims was taken on November 16 - the day after the military takeover +Credit: HARARE HERALD +
+
+
+
+
+
+
+ + +
+
+
+
+

Zimbabwe President Robert Mugabe, his wife Grace and two key figures from her G40 political faction are under house arrest at Mugabe's "Blue House" compound in Harare and are insisting the 93 year-old finishes his presidential term, a source said.

+

The G40 figures are cabinet ministers Jonathan Moyo and Saviour Kasukuwere, who fled to the compound after their homes were attacked by troops in Tuesday night's coup, the source, who said he had spoken to people inside the compound, told Reuters.

+

Mr Mugabe is resisting mediation by a Catholic priest to allow the former guerrilla a graceful exit after the military takeover.

+

The priest, Fidelis Mukonori, is acting as a middle-man between Mr Mugabe and the generals, who seized power in a targeted operation against "criminals" in his entourage, a senior political source told Reuters.

+

The source could not provide details of the talks, which appear to be aimed at a smooth and bloodless transition after the departure of Mr Mugabe, who has led Zimbabwe since independence in 1980.

+

Mr Mugabe, still seen by many Africans as a liberation hero, is reviled in the West as a despot whose disastrous handling of the economy and willingness to resort to violence to maintain power destroyed one of Africa's most promising states.

+
+
+
+
+
+
+
+
+ +
+
+
Zimbabwean opposition leader Morgan Tsvangirai, right, meets church leaders Bishop Trevor Manhanga, centre, and Father Fidelis Mukonori in 2006. Father Mukonori is said to be mediating in the current crisis +Credit:  DESMOND KWANDE/ AFP +
+
+
+
+
+
+
+
+

Zimbabwean intelligence reports seen by Reuters suggest that former security chief Emmerson Mnangagwa, who was ousted as vice-president this month, has been mapping out a post-Mugabe vision with the military and opposition for more than a year.

+
+
+
+
+ +
+
+
+
+

Fuelling speculation that Mnangagwa's plan might be rolling into action, opposition leader Morgan Tsvangirai, who has been receiving cancer treatment in Britain and South Africa, returned to Harare late on Wednesday, his spokesman said.

+

South Africa said Mr Mugabe had told President Jacob Zuma by telephone on Wednesday that he was confined to his home but was otherwise fine and the military said it was keeping him and his family, including wife Grace, safe.

+
+
+
+
+ +
+
+
+
+

Despite the lingering admiration for Mr Mugabe, there is little public affection for 52-year-old Grace, a former government typist who started having an affair with Mr Mugabe in the early 1990s as his first wife, Sally, was dying of kidney disease.

+

Dubbed "DisGrace" or "Gucci Grace" on account of her reputed love of shopping, she enjoyed a meteoric rise through the ranks of Mugabe's ruling Zanu-PF in the last two years, culminating in Mnangagwa's removal a week ago - a move seen as clearing the way for her to succeed her husband.

+
+
+
+
+
+
+
+
+ +
+
+
A man walks past an armoured personnel carrier parked on a Harare street on Thursday +Credit: STR/AFP  +
+
+
+
+
+
+
+
+

In contrast to the high political drama unfolding behind closed doors, the streets of the capital remained calm, with people going about their daily business, albeit under the watch of soldiers on armoured vehicles at strategic locations.

+
+
+
+
+ +
+
+
+
+

Whatever the final outcome, the events could signal a once-in-a-generation change for the former British colony, a regional breadbasket reduced to destitution by economic policies Mr Mugabe's critics have long blamed on him.

+
+
+
+
+
+ +
+ + + +
+
+ + +
+ +
+ +
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/resources/tests/readability/title-and-h1-discrepancy/expected.html b/resources/tests/readability/title-and-h1-discrepancy/expected.html new file mode 100644 index 0000000..0b4f1a8 --- /dev/null +++ b/resources/tests/readability/title-and-h1-discrepancy/expected.html @@ -0,0 +1,21 @@ +
+

This is a long title with a colon: But the final text here is different

+

+ Lorem + ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Lorem + ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
diff --git a/resources/tests/readability/title-and-h1-discrepancy/source.html b/resources/tests/readability/title-and-h1-discrepancy/source.html new file mode 100644 index 0000000..d91b658 --- /dev/null +++ b/resources/tests/readability/title-and-h1-discrepancy/source.html @@ -0,0 +1,30 @@ + + + + + This is a long title with a colon: Hello there + + +
+

This is a long title with a colon: But the final text here is different

+
+ Lorem + ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ Lorem + ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/resources/tests/readability/tmz-1/expected.html b/resources/tests/readability/tmz-1/expected.html new file mode 100644 index 0000000..ea3c2f4 --- /dev/null +++ b/resources/tests/readability/tmz-1/expected.html @@ -0,0 +1,37 @@ +
+

+ + +

$150K Pearl Oscar Dress ... STOLEN!!!!

+ +

+
+ 2/26/2015 7:11 AM PST BY TMZ STAFF +
+ +
+

EXCLUSIVE +

+

+ 0225-lupita-nyongo-getty-01Lupita Nyong'o's now-famous Oscar dress + -- adorned in pearls -- was stolen right out of her hotel room ... TMZ + has learned.

+

Law enforcement sources tell TMZ ... the dress was taken out of Lupita's + room at The London West Hollywood. The dress is made of pearls ... 6,000 + white Akoya pearls. It's valued at $150,000.

+

Our sources say Lupita told cops it was taken from her room sometime between + 8 AM and 9 PM Wednesday ... while she was gone.  

+

We're told there is security footage that cops are looking at that could + catch the culprit right in the act. 

+

+ update_graphic_red_bar12:00 PM PT -- Sheriff's deputies were at The London Thursday + morning.  We know they were in the manager's office and we're told + they have looked at security footage to determine if they can ID the culprit.

+

+ 0226-SUB-london-hotel-swipe-tmz-02 +

+ + +
+ +
diff --git a/resources/tests/readability/tmz-1/source.html b/resources/tests/readability/tmz-1/source.html new file mode 100644 index 0000000..c2b19f0 --- /dev/null +++ b/resources/tests/readability/tmz-1/source.html @@ -0,0 +1,1528 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Lupita Nyong'o's $150K Pearl Oscar Dress -- STOLEN!!! | TMZ.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  + +
+ +
+
+ + + + + + + + + + + + +
+
+
+
+ +
+ +
+
+
+
+TMZ + +
+
+
+

Our TV Shows

+ +
+
+ +
+
+
+ + + +
+ +
+
+ +
+ +
+
    +
    + +
    +
    +
    + +
    + +
    +
      +
      + +
      +
      +
      +
      + + +
      +

      Got a Tip?

      + +

      Call TMZ at (888) 847-9869 or Click Here + +

      +
      + +
      +
      +
      +
      + + + + + + + + + + + +
      + + +
      + +
      +
      +
      +

      Lupita Nyong'o

      + +

      $150K Pearl Oscar Dress ... STOLEN!!!!

      + +
      +
      + 2/26/2015 7:11 AM PST BY TMZ STAFF +
      + +
      +
      EXCLUSIVE +
      +

      + 0225-lupita-nyongo-getty-01Lupita Nyong'o's now-famous Oscar dress + -- adorned in pearls -- was stolen right out of her hotel room ... TMZ + has learned.

      +

      Law enforcement sources tell TMZ ... the dress was taken out of Lupita's + room at The London West Hollywood. The dress is made of pearls ... 6,000 + white Akoya pearls. It's valued at $150,000.

      +

      Our sources say Lupita told cops it was taken from her room sometime between + 8 AM and 9 PM Wednesday ... while she was gone.  

      +

      We're told there is security footage that cops are looking at that could + catch the culprit right in the act. 

      +

      + update_graphic_red_bar12:00 PM PT -- Sheriff's deputies were at The London Thursday + morning.  We know they were in the manager's office and we're told + they have looked at security footage to determine if they can ID the culprit.

      +

      + 0226-SUB-london-hotel-swipe-tmz-02 +

      + + +
      + +
      + + + + +
      +
      +
      + + + + +
      + + +
      + +
      + +
      + +
      + + + + + + +
      +

      Around The Web

      + + + +
      + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index fbd5775..6b8245f 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -458,6 +458,36 @@ async fn social_buttons() { run_test("social-buttons").await } +#[tokio::test] +async fn style_tags_removal() { + run_test("style-tags-removal").await +} + +#[tokio::test] +async fn svg_parsing() { + run_test("svg-parsing").await +} + +#[tokio::test] +async fn table_style_attributes() { + run_test("table-style-attributes").await +} + +#[tokio::test] +async fn title_and_h1_discrepancy() { + run_test("title-and-h1-discrepancy").await +} + +#[tokio::test] +async fn tmz_1() { + run_test("tmz-1").await +} + +#[tokio::test] +async fn telegraph() { + run_test("telegraph").await +} + #[tokio::test] async fn webmd_1() { run_test("webmd-1").await