From 15eec43ad9d3b2f4ee2765831c4c82c773500ff7 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 1 Apr 2023 18:22:42 +0200 Subject: [PATCH] 6 more tests --- .../readability/toc-missing/expected.html | 1214 ++++++++++ .../tests/readability/toc-missing/source.html | 1696 ++++++++++++++ .../readability/topicseed-1/expected.html | 93 + .../tests/readability/topicseed-1/source.html | 400 ++++ .../tests/readability/tumblr/expected.html | 4 + .../tests/readability/tumblr/source.html | 793 +++++++ .../tests/readability/v8-blog/expected.html | 178 ++ .../tests/readability/v8-blog/source.html | 259 +++ .../tests/readability/videos-1/expected.html | 267 +++ .../tests/readability/videos-1/source.html | 1319 +++++++++++ .../tests/readability/videos-2/expected.html | 109 + .../tests/readability/videos-2/source.html | 2043 +++++++++++++++++ src/full_text_parser/readability/tests.rs | 30 + 13 files changed, 8405 insertions(+) create mode 100644 resources/tests/readability/toc-missing/expected.html create mode 100644 resources/tests/readability/toc-missing/source.html create mode 100644 resources/tests/readability/topicseed-1/expected.html create mode 100644 resources/tests/readability/topicseed-1/source.html create mode 100644 resources/tests/readability/tumblr/expected.html create mode 100644 resources/tests/readability/tumblr/source.html create mode 100644 resources/tests/readability/v8-blog/expected.html create mode 100644 resources/tests/readability/v8-blog/source.html create mode 100644 resources/tests/readability/videos-1/expected.html create mode 100644 resources/tests/readability/videos-1/source.html create mode 100644 resources/tests/readability/videos-2/expected.html create mode 100644 resources/tests/readability/videos-2/source.html diff --git a/resources/tests/readability/toc-missing/expected.html b/resources/tests/readability/toc-missing/expected.html new file mode 100644 index 0000000..bd72b3d --- /dev/null +++ 
b/resources/tests/readability/toc-missing/expected.html @@ -0,0 +1,1214 @@ +
+
+

+ Many developers think that having a critical bug in their code is the worst thing that can happen. Well, there is something much worse than that: Having a critical bug in your code and not knowing about it! +

+

+ To make sure I get notified about critical bugs as soon as possible, I started looking for ways to find anomalies in my data. I quickly found that information about these subjects tends to get very complicated, and involves a lot of ad-hoc tools and dependencies. +

+

+ I'm not a statistician and not a data scientist, I'm just a developer. Before I introduce dependencies into my system I make sure I really can't do without them. So, using some high school level statistics and a fair knowledge of SQL, I implemented a simple anomaly detection system that works. +

+
+ Can you spot the anomaly?<br><small>Photo by <a href="https://unsplash.com/photos/KmKZV8pso-s">Ricardo Gomez Angel</a></small> +
+ Can you spot the anomaly?
+ Photo by Ricardo Gomez Angel +
+
+
+ + Table of Contents + + +
+
+ +
+

+ Detecting Anomalies +

+

+ Anomaly in a data series is a significant deviation from some reasonable value. Looking at this series of numbers for example, which number stands out? +

+
+
2, 3, 5, 2, 3, 12, 5, 3, 4
+
+
+

+ The number that stands out in this series is 12. +

+
+ Scatter plot +
+ Scatter plot +
+
+

+ This is intuitive to a human, but computer programs don't have intuition... +

+

+ To find the anomaly in the series we first need to define what a reasonable value is, and then define how far away from this value we consider a significant deviation. A good place to start looking for a reasonable value is the mean: +

+
+
SELECT avg(n)
+FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+       avg
+────────────────────
+4.3333333333333333
+
+
+

+ The mean is ~4.33. +

+

+ Next, we need to define the deviation. Let's use Standard Deviation: +

+
+
SELECT stddev(n)
+FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+      stddev
+────────────────────
+3.0822070014844882
+
+
+

+ Standard deviation is the square root of the variance, which is the average squared distance from the mean. In this case it's 3.08. +

+

+ Now that we've defined a "reasonable" value and a deviation, we can define a range of acceptable values: +

+
+
SELECT
+   avg(n) - stddev(n) AS lower_bound,
+   avg(n) + stddev(n) AS upper_bound
+FROM
+   unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+    lower_bound    │     upper_bound
+───────────────────┼────────────────────
+1.2511263318488451 │ 7.4155403348178215
+
+
+

+ The range we defined is one standard deviation from the mean. Any value outside this range is considered an anomaly: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+bounds AS (
+   SELECT
+       avg(n) - stddev(n) AS lower_bound,
+       avg(n) + stddev(n) AS upper_bound
+   FROM
+       series
+)
+SELECT
+   n,
+   n NOT BETWEEN lower_bound AND upper_bound AS is_anomaly
+FROM
+   series,
+   bounds;
+
+n  │ is_anomaly
+───┼────────────
+ 2 │ f
+ 3 │ f
+ 5 │ f
+ 2 │ f
+ 3 │ f
+12 │ t
+ 5 │ f
+ 3 │ f
+ 4 │ f
+
+
+

+ Using the query we found that the value 12 is outside the range of acceptable values, and identified it as an anomaly. +

+

+ Understanding Z-Score +

+

+ Another way to represent a range of acceptable values is using a z-score. z-score, or Standard Score, is the number of standard deviations from the mean. In the previous section, our acceptable range was one standard deviation from the mean, or in other words, a z-score in the range ±1: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_mean,
+       stddev(n) as series_stddev
+   FROM
+       series
+)
+SELECT
+   n,
+   (n - series_mean) / series_stddev as zscore
+FROM
+   series,
+   stats;
+
+n  │         zscore
+───┼─────────────────────────
+ 2 │ -0.75703329861022517346
+ 3 │ -0.43259045634870009448
+ 5 │  0.21629522817435006346
+ 2 │ -0.75703329861022517346
+ 3 │ -0.43259045634870009448
+12 │      2.4873951240050256
+ 5 │  0.21629522817435006346
+ 3 │ -0.43259045634870009448
+ 4 │ -0.10814761408717501551
+
+
+

+ Like before, we can detect anomalies by searching for values which are outside the acceptable range using the z-score: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_avg,
+       stddev(n) as series_stddev
+   FROM
+       series
+),
+zscores AS (
+   SELECT
+       n,
+       (n - series_avg) / series_stddev AS zscore
+   FROM
+       series,
+       stats
+)
+SELECT
+   *,
+   zscore NOT BETWEEN -1 AND 1 AS is_anomaly
+FROM
+   zscores;
+
+n  │         zscore          │ is_anomaly
+───┼─────────────────────────┼────────────
+ 2 │ -0.75703329861022517346 │ f
+ 3 │ -0.43259045634870009448 │ f
+ 5 │  0.21629522817435006346 │ f
+ 2 │ -0.75703329861022517346 │ f
+ 3 │ -0.43259045634870009448 │ f
+12 │      2.4873951240050256 │ t
+ 5 │  0.21629522817435006346 │ f
+ 3 │ -0.43259045634870009448 │ f
+ 4 │ -0.10814761408717501551 │ f
+
+
+

+ Using z-score, we also identified 12 as an anomaly in this series. +

+

+ Optimizing Z-Score +

+

+ So far we used one standard deviation from the mean, or a z-score of ±1 to identify anomalies. Changing the z-score threshold can affect our results. For example, let's see what anomalies we identify when the z-score is greater than 0.5 and when it's greater than 3: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_avg,
+       stddev(n) as series_stddev
+   FROM
+       series
+),
+zscores AS (
+   SELECT
+       n,
+       (n - series_avg) / series_stddev AS zscore
+   FROM
+       series,
+       stats
+)
+SELECT
+   *,
+   zscore NOT BETWEEN -0.5 AND 0.5 AS is_anomaly_0_5,
+   zscore NOT BETWEEN -1 AND 1 AS is_anomaly_1,
+   zscore NOT BETWEEN -3 AND 3 AS is_anomaly_3
+FROM
+   zscores;
+
+n  │         zscore          │ is_anomaly_0_5 │ is_anomaly_1 │ is_anomaly_3
+───┼─────────────────────────┼────────────────┼──────────────┼──────────────
+ 2 │ -0.75703329861022517346 │ t              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+ 5 │  0.21629522817435006346 │ f              │ f            │ f
+ 2 │ -0.75703329861022517346 │ t              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+12 │      2.4873951240050256 │ t              │ t            │ f
+ 5 │  0.21629522817435006346 │ f              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+ 4 │ -0.10814761408717501551 │ f              │ f            │ f
+
+
+

+ Let's see what we got: +

+
    +
  • When we decreased the z-score threshold to 0.5, we identified the value 2 as an anomaly in addition to the value 12. +
  • +
  • When we increased the z-score threshold to 3 we did not identify any anomaly. +
  • +
+

+ The quality of our results is directly related to the parameters we set for the query. Later we'll see how using backtesting can help us identify ideal values. +

+
+

+ Analyzing a Server Log +

+

+ Application servers such as nginx, Apache and IIS write a lot of useful information to access logs. The data in these logs can be extremely useful in identifying anomalies. +

+

+ We are going to analyze logs of a web application, so the data we are most interested in is the timestamp and the status code of every response from the server. To illustrate the type of insight we can draw from just this data: +

+
    +
  • + A sudden increase in 500 status code: You may have a problem in the server. Did you just push a new version? Is there an external service you're using that started failing in unexpected ways? +
  • +
  • + A sudden increase in 400 status code: You may have a problem in the client. Did you change some validation logic and forgot to update the client? Did you make a change and forgot to handle backward compatibility? +
  • +
  • + A sudden increase in 404 status code: You may have an SEO problem. Did you move some pages and forgot to set up redirects? Is there some script kiddy running a scan on your site? +
  • +
  • + A sudden increase in 200 status code: You either have some significant legit traffic coming in, or you are under a DOS attack. Either way, you probably want to check where it's coming from. +
  • +
+

+ Preparing the Data +

+

+ Parsing and processing logs is outside the scope of this article, so let's assume we did that and we have a table that looks like this: +

+
+
CREATE TABLE server_log_summary AS (
+   period timestamptz,
+   status_code int,
+   entries int
+);
+
+
+

+ The table stores the number of entries for each status code at a given period. For example, our table stores how many responses returned each status code every minute: +

+
+
db=# SELECT * FROM server_log_summary ORDER BY period DESC LIMIT 10;
+
+        period         │ status_code │ entries
+───────────────────────┼─────────────┼─────────
+2020-08-01 18:00:00+00 │         200 │    4084
+2020-08-01 18:00:00+00 │         404 │       0
+2020-08-01 18:00:00+00 │         400 │      24
+2020-08-01 18:00:00+00 │         500 │       0
+2020-08-01 17:59:00+00 │         400 │      12
+2020-08-01 17:59:00+00 │         200 │    3927
+2020-08-01 17:59:00+00 │         500 │       0
+2020-08-01 17:59:00+00 │         404 │       0
+2020-08-01 17:58:00+00 │         400 │       2
+2020-08-01 17:58:00+00 │         200 │    3850
+
+
+

+ Note that the table has a row for every minute, even if the status code was never returned in that minute. Given a table of statuses, it's very tempting to do something like this: +

+
+
-- Wrong!
+SELECT
+   date_trunc('minute', timestamp) AS period,
+   status_code,
+   count(*) AS entries
+FROM
+   server_log
+GROUP BY
+   period,
+   status_code;
+
+
+

+ This is a common mistake and it can leave you with gaps in the data. Zero is a value, and it holds a significant meaning. A better approach is to create an "axis", and join to it: +

+
+
-- Correct!
+WITH axis AS (
+   SELECT
+       status_code,
+       generate_series(
+           date_trunc('minute', now()),
+           date_trunc('minute', now() - interval '1 hour'),
+           interval '1 minute' * -1
+       ) AS period
+   FROM (
+       VALUES (200), (400), (404), (500)
+   ) AS t(status_code)
+)
+SELECT
+   a.period,
+   a.status_code,
+   count(*) AS entries
+FROM
+   axis a
+   LEFT JOIN server_log l ON (
+       date_trunc('minute', l.timestamp) = a.period
+       AND l.status_code = a.status_code
+   )
+GROUP BY
+   period,
+   status_code;
+
+
+

+ First we generate an axis using a cartesian join between the status codes we want to track, and the times we want to monitor. To generate the axis we used two nice features of PostgreSQL: +

+
    +
  • + generate_series: function that generates a range of values. +
  • +
  • + VALUES list: special clause that can generate "constant tables", as the documentation calls it. You might be familiar with the VALUES clause from INSERT statements. In the old days, to generate data we had to use a bunch of SELECT ... UNION ALL... using VALUES is much nicer. +
  • +
+

+ After generating the axis, we left join the actual data into it to get a complete series for each status code. The resulting data has no gaps, and is ready for analysis. +

+

+ Getting a Sense of the Data +

+

+ To get a sense of the data, let's draw a stacked bar chart by status: +

+
+ stacked bar chart by status, over time +
+ stacked bar chart by status, over time +
+
+

+ The chart shows a period of 12 hours. It looks like we have a nice trend with two peaks at around 09:30 and again at 18:00. +

+

+ We also spot right away that at ~11:30 there was a significant increase in 500 errors. The burst died down after around 10 minutes. This is the type of anomalies we want to identify early on. +

+

+ It's entirely possible that there were other problems during that time, we just can't spot them with a naked eye. +

+

+ Identifying Anomalies +

+

+ In anomaly detection systems, we usually want to identify if we have an anomaly right now, and send an alert. +

+

+ To identify if the last datapoint is an anomaly, we start by calculating the mean and standard deviation for each status code in the past hour: +

+
+
db=# WITH stats AS (
+   SELECT
+       status_code,
+       (MAX(ARRAY[EXTRACT('epoch' FROM period), entries]))[2] AS last_value,
+       AVG(entries) AS mean_entries,
+       STDDEV(entries) AS stddev_entries
+   FROM
+       server_log_summary
+   WHERE
+       -- In the demo data use:
+       -- period > '2020-08-01 17:00 UTC'::timestamptz
+       period > now() - interval '1 hour'
+   GROUP BY
+       status_code
+)
+SELECT * FROM stats;
+
+status_code │ last_value │      mean_entries      │     stddev_entries
+────────────┼────────────┼────────────────────────┼────────────────────────
+        404 │          0 │ 0.13333333333333333333 │ 0.34280333180088158345
+        500 │          0 │ 0.15000000000000000000 │ 0.36008473579027553993
+        200 │       4084 │  2779.1000000000000000 │       689.219644702665
+        400 │         24 │ 0.73333333333333333333 │     3.4388935285299212
+
+
+

+ To get the last value in a GROUP BY in addition to the mean and standard deviation we used a little array trick. +

+

+ Next, we calculate the z-score for the last value for each status code: +

+
+
db=# WITH stats AS (
+   SELECT
+       status_code,
+       (MAX(ARRAY[EXTRACT('epoch' FROM period), entries]))[2] AS last_value,
+       AVG(entries) AS mean_entries,
+       STDDEV(entries) AS stddev_entries
+   FROM
+       server_log_summary
+   WHERE
+       -- In the demo data use:
+       -- period > '2020-08-01 17:00 UTC'::timestamptz
+       period > now() - interval '1 hour'
+   GROUP BY
+       status_code
+)
+SELECT
+   *,
+   (last_value - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+FROM
+   stats;
+
+status_code │ last_value │ mean_entries │ stddev_entries │  zscore
+────────────┼────────────┼──────────────┼────────────────┼────────
+        404 │          0 │ 0.133        │ 0.3428         │ -0.388
+        500 │          0 │ 0.150        │ 0.3600         │ -0.416
+        200 │       4084 │ 2779.100     │ 689.2196       │  1.893
+        400 │         24 │ 0.733        │ 3.4388         │  6.765
+
+
+

+ We calculated the z-score by finding the number of standard deviations between the last value and the mean. To avoid a "division by zero" error we transform the denominator to NULL if it's zero. +

+

+ Looking at the z-scores we got, we can spot that status code 400 got a very high z-score of 6. In the past minute we returned a 400 status code 24 times, which is significantly higher than the average of 0.73 in the past hour. +

+

+ Let's take a look at the raw data: +

+
+
SELECT *
+FROM server_log_summary
+WHERE status_code = 400
+ORDER BY period DESC
+LIMIT 20;
+
+        period         │ status_code │ entries
+───────────────────────┼─────────────┼─────────
+2020-08-01 18:00:00+00 │         400 │      24
+2020-08-01 17:59:00+00 │         400 │      12
+2020-08-01 17:58:00+00 │         400 │       2
+2020-08-01 17:57:00+00 │         400 │       0
+2020-08-01 17:56:00+00 │         400 │       1
+2020-08-01 17:55:00+00 │         400 │       0
+2020-08-01 17:54:00+00 │         400 │       0
+2020-08-01 17:53:00+00 │         400 │       0
+2020-08-01 17:52:00+00 │         400 │       0
+2020-08-01 17:51:00+00 │         400 │       0
+2020-08-01 17:50:00+00 │         400 │       0
+2020-08-01 17:49:00+00 │         400 │       0
+2020-08-01 17:48:00+00 │         400 │       0
+2020-08-01 17:47:00+00 │         400 │       0
+2020-08-01 17:46:00+00 │         400 │       0
+2020-08-01 17:45:00+00 │         400 │       0
+2020-08-01 17:44:00+00 │         400 │       0
+2020-08-01 17:43:00+00 │         400 │       0
+2020-08-01 17:42:00+00 │         400 │       0
+2020-08-01 17:41:00+00 │         400 │       0
+
+
+

+ It does look like in the last couple of minutes we are getting more errors than expected. +

+
+ Status 400 in the past hour +
+ Status 400 in the past hour +
+
+

+ What our naked eye missed in the chart and in the raw data, was found by the query, and was classified as an anomaly. We are off to a great start! +

+
+

+ Backtesting +

+

+ In the previous section we identified an anomaly. We found an increase in 400 status code because the z-score was 6. But how do we set the threshold for the z-score? Is a z-score of 3 an anomaly? What about 2, or 1? +

+

+ To find thresholds that fit our needs, we can run simulations on past data with different values, and evaluate the results. This is often called backtesting. +

+

+ Finding Past Anomalies +

+

+ The first thing we need to do is to calculate the mean and the standard deviation for each status code up until every row, just as if it’s the current value. This is a classic job for a window function: +

+
+
WITH calculations_over_window AS (
+   SELECT
+      status_code,
+      period,
+      entries,
+      AVG(entries) OVER status_window as mean_entries,
+      STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+      server_log_summary
+   WINDOW status_window AS (
+      PARTITION BY status_code
+      ORDER BY period
+      ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+)
+SELECT *
+FROM calculations_over_window
+ORDER BY period DESC
+LIMIT 20;
+
+status_code │         period         │ entries │      mean_entries      │     stddev_entries
+────────────┼────────────────────────┼─────────┼────────────────────────┼────────────────────────
+        200 │ 2020-08-01 18:00:00+00 │    4084 │  2759.9672131147540984 │       699.597407256800
+        400 │ 2020-08-01 18:00:00+00 │      24 │ 0.72131147540983606557 │     3.4114080550460080
+        404 │ 2020-08-01 18:00:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        500 │ 2020-08-01 18:00:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+        500 │ 2020-08-01 17:59:00+00 │       0 │ 0.16393442622950819672 │ 0.37328844382740000274
+        400 │ 2020-08-01 17:59:00+00 │      12 │ 0.32786885245901639344 │     1.5676023249473471
+        200 │ 2020-08-01 17:59:00+00 │    3927 │  2718.6721311475409836 │       694.466863171826
+        404 │ 2020-08-01 17:59:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        500 │ 2020-08-01 17:58:00+00 │       0 │ 0.16393442622950819672 │ 0.37328844382740000274
+        404 │ 2020-08-01 17:58:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        200 │ 2020-08-01 17:58:00+00 │    3850 │  2680.4754098360655738 │       690.967283512936
+        400 │ 2020-08-01 17:58:00+00 │       2 │ 0.13114754098360655738 │ 0.38623869286861001780
+        404 │ 2020-08-01 17:57:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        400 │ 2020-08-01 17:57:00+00 │       0 │ 0.09836065573770491803 │ 0.30027309973793774423
+        500 │ 2020-08-01 17:57:00+00 │       1 │ 0.16393442622950819672 │ 0.37328844382740000274
+        200 │ 2020-08-01 17:57:00+00 │    3702 │  2643.0327868852459016 │       688.414796645480
+        200 │ 2020-08-01 17:56:00+00 │    3739 │  2607.5081967213114754 │       688.769908918569
+        404 │ 2020-08-01 17:56:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+        400 │ 2020-08-01 17:56:00+00 │       1 │ 0.11475409836065573770 │ 0.32137001808599097120
+        500 │ 2020-08-01 17:56:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+
+
+

+ To calculate the mean and standard deviation over a sliding window of 60 minutes, we use a window function. To avoid having to repeat the WINDOW clause for every aggregate, we define a named window called "status_window". This is another nice feature of PostgreSQL. +

+

+ In the results we can now see that for every entry, we have the mean and standard deviation of the previous 60 rows. This is similar to the calculation we did in the previous section, only this time we do it for every row. +

+

+ Now we can calculate the z-score for every row: +

+
+
WITH calculations_over_window AS (
+   SELECT
+      status_code,
+      period,
+      entries,
+      AVG(entries) OVER status_window as mean_entries,
+      STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+      server_log_summary
+   WINDOW status_window AS (
+      PARTITION BY status_code
+      ORDER BY period
+      ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+)
+
+SELECT
+   status_code,
+   period,
+   zscore
+FROM
+   with_zscore
+ORDER BY
+   period DESC
+LIMIT
+   20;
+
+status_code │         period         │        zscore
+────────────┼────────────────────────┼──────────────────────
+        200 │ 2020-08-01 18:00:00+00 │   1.8925638848161648
+        400 │ 2020-08-01 18:00:00+00 │    6.823777205473068
+        404 │ 2020-08-01 18:00:00+00 │ -0.38531664163524526
+        500 │ 2020-08-01 18:00:00+00 │ -0.41260101365496504
+        500 │ 2020-08-01 17:59:00+00 │  -0.4391628750910588
+        400 │ 2020-08-01 17:59:00+00 │    7.445849602151508
+        200 │ 2020-08-01 17:59:00+00 │   1.7399359608515874
+        404 │ 2020-08-01 17:59:00+00 │ -0.38531664163524526
+        500 │ 2020-08-01 17:58:00+00 │  -0.4391628750910588
+        404 │ 2020-08-01 17:58:00+00 │ -0.38531664163524526
+        200 │ 2020-08-01 17:58:00+00 │   1.6925903990967166
+        400 │ 2020-08-01 17:58:00+00 │    4.838594613958412
+        404 │ 2020-08-01 17:57:00+00 │ -0.38531664163524526
+        400 │ 2020-08-01 17:57:00+00 │ -0.32757065425956844
+        500 │ 2020-08-01 17:57:00+00 │      2.2397306629644
+        200 │ 2020-08-01 17:57:00+00 │   1.5382691050147506
+        200 │ 2020-08-01 17:56:00+00 │   1.6427718293547886
+        404 │ 2020-08-01 17:56:00+00 │ -0.41260101365496504
+        400 │ 2020-08-01 17:56:00+00 │     2.75460015502278
+        500 │ 2020-08-01 17:56:00+00 │ -0.41260101365496504
+
+
+

+ We now have z-scores for every row, and we can try to identify anomalies: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       zscore > 3 AS alert
+   FROM
+       with_zscore
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_alert
+WHERE
+   alert
+ORDER BY
+   period DESC
+LIMIT
+   20;
+
+status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+        400 │ 2020-08-01 17:58:00+00 │       2 │  4.838594613958412 │ t
+        500 │ 2020-08-01 17:29:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 17:20:00+00 │       1 │ 3.3190952747131184 │ t
+        500 │ 2020-08-01 17:18:00+00 │       1 │ 3.7438474117708043 │ t
+        500 │ 2020-08-01 17:13:00+00 │       1 │ 3.7438474117708043 │ t
+        500 │ 2020-08-01 17:09:00+00 │       1 │  4.360778994930029 │ t
+        500 │ 2020-08-01 16:59:00+00 │       1 │ 3.7438474117708043 │ t
+        400 │ 2020-08-01 16:29:00+00 │       1 │ 3.0027309973793774 │ t
+        404 │ 2020-08-01 16:13:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 15:13:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 15:11:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 14:58:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:56:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:55:00+00 │       1 │ 3.3190952747131184 │ t
+        400 │ 2020-08-01 14:50:00+00 │       1 │ 3.3190952747131184 │ t
+        500 │ 2020-08-01 14:37:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:35:00+00 │       1 │ 3.3190952747131184 │ t
+        400 │ 2020-08-01 14:32:00+00 │       1 │ 3.3190952747131184 │ t
+
+
+

+ We decided to classify values with z-score greater than 3 as anomalies. 3 is usually the magic number you’ll see in textbooks, but don’t get sentimental about it because you can definitely change it to get better results. +

+

+ Adding Thresholds +

+

+ In the last query we detected a large number of "anomalies" with just one entry. This is very common in errors that don't happen very often. In our case, every once in a while we get a 400 status code, but because it doesn't happen very often, the standard deviation is very low so that even a single error can be considered way above the acceptable value. +

+

+ We don't really want to receive an alert in the middle of the night just because of one 400 status code. We can't have every curious developer fiddling with the devtools in his browser wake us up in the middle of the night. +

+

+ To eliminate rows with only a few entries we set a threshold: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       entries > 10 AND zscore > 3 AS alert
+   FROM
+       with_zscore
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_alert
+WHERE
+   alert
+ORDER BY
+   period DESC;
+
+status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+        500 │ 2020-08-01 11:29:00+00 │    5001 │  3.172198441961645 │ t
+        500 │ 2020-08-01 11:28:00+00 │    4812 │ 3.3971646910263917 │ t
+        500 │ 2020-08-01 11:27:00+00 │    4443 │ 3.5349400089601586 │ t
+        500 │ 2020-08-01 11:26:00+00 │    4522 │ 4.1264785335553595 │ t
+        500 │ 2020-08-01 11:25:00+00 │    5567 │   6.17629336121081 │ t
+        500 │ 2020-08-01 11:24:00+00 │    3657 │ 6.8689992361141154 │ t
+        500 │ 2020-08-01 11:23:00+00 │    1512 │  6.342260662589681 │ t
+        500 │ 2020-08-01 11:22:00+00 │    1022 │  7.682189672504754 │ t
+        404 │ 2020-08-01 07:20:00+00 │      23 │  5.142126410098476 │ t
+        404 │ 2020-08-01 07:19:00+00 │      20 │  6.091200697920824 │ t
+        404 │ 2020-08-01 07:18:00+00 │      15 │   7.57547172423804 │ t
+
+
+

+ After eliminating potential anomalies with less than 10 entries we get much fewer, and probably more relevant results. +

+

+ Eliminating Repeating Alerts +

+

+ In the previous section we eliminated potential anomalies with less than 10 entries. Using thresholds we were able to remove some non interesting anomalies. +

+

+ Let's have a look at the data for status code 400 after applying the threshold: +

+
+
status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+
+
+

+ The first alert happened at 17:59, and a minute later the z-score was still high with a large number of entries and so we classified the next row at 18:00 as an anomaly as well. +

+

+ If you think of an alerting system, we want to send an alert only when an anomaly first happens. We don't want to send an alert every minute until the z-score comes back below the threshold. In this case, we only want to send one alert at 17:59. We don't want to send another alert a minute later at 18:00. +

+

+ Let's remove alerts where the previous period was also classified as an alert: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       entries > 10 AND zscore > 3 AS alert
+   FROM
+       with_zscore
+),
+
+with_previous_alert AS (
+   SELECT
+       *,
+       LAG(alert) OVER (PARTITION BY status_code ORDER BY period) AS previous_alert
+   FROM
+       with_alert
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_previous_alert
+WHERE
+   alert AND NOT previous_alert
+ORDER BY
+   period DESC;
+
+status_code │         period         │ entries │      zscore       │ alert
+────────────┼────────────────────────┼─────────┼───────────────────┼───────
+        400 │ 2020-08-01 17:59:00+00 │      12 │ 7.445849602151508 │ t
+        500 │ 2020-08-01 11:22:00+00 │    1022 │ 7.682189672504754 │ t
+        404 │ 2020-08-01 07:18:00+00 │      15 │  7.57547172423804 │ t
+
+
+

+ By eliminating alerts that were already triggered we get a very small list of anomalies that may have happened during the day. Looking at the results we can see what anomalies we would have discovered: +

+
    +
  • Anomaly in status code 400 at 17:59: we also found that one earlier. +
  • +
+
+ Anomaly in status code 400 +
+ Anomaly in status code 400 +
+
+
    +
  • Anomaly in status code 500: we spotted this one on the chart when we started. +
  • +
+
+ Anomaly in status code 500 +
+ Anomaly in status code 500 +
+
+
    +
  • Anomaly in status code 404: this is a hidden anomaly which we did not know about until now. +
  • +
+
+ A hidden anomaly in status code 404 +
+ A hidden anomaly in status code 404 +
+
+

+ The query can now be used to fire alerts when it encounters an anomaly. +

+

+ Experiment With Different Values +

+

+ In the process so far we’ve used several constants in our calculations: +

+
    +
  • + Lookback period: How far back we calculate the mean and standard deviation for each status code. The value we used is 60 minutes. +
  • +
  • + Entries Threshold: The least amount of entries we want to get an alert for. The value we used is 10. +
  • +
  • + Z-Score Threshold: The z-score after which we classify the value as an anomaly. The value we used is 3. +
  • +
+

+ Now that we have a working query to backtest, we can experiment with different values. +

+
+ Experimenting with parameter values +
+ Experimenting with parameter values +
+
+

+ This is a chart showing the alerts our system identified in the past 12 hours: +

+
+ Backtesting with default parameters. <a href="https://popsql.com/queries/-MECQV6GiKr04WdCWM0K/simple-anomaly-detection-with-sql?access_token=2d2c0729f9a1cfa7b6a2dbb5b0adb45c">View in editor</a> +
+ Backtesting with default parameters. View in editor +
+
+

+ To get a sense of each parameter, let's adjust the values and see how it affects the number and quality of alerts we get. +

+

+ If we decrease the value of the z-score threshold from 3 to 1, we should get more alerts. With a lower threshold, more values are likely to be considered an anomaly: +

+
+ Backtesting with lower z-score threshold +
+ Backtesting with lower z-score threshold +
+
+

+ If we increase the entries threshold from 10 to 30, we should get fewer alerts: +

+
+ Backtesting with higher entries threshold +
+ Backtesting with higher entries threshold +
+
+

+ If we increase the lookback period from 60 minutes to 360 minutes, we get more alerts: +

+
+ Backtesting with longer lookback period +
+ Backtesting with longer lookback period +
+
+

+ A good alerting system is a system that produces true alerts, at a reasonable time. Using the backtesting query you can experiment with different values to find ones that produce quality alerts you can act on. +

+
+

+ Improving Accuracy +

+

+ Using a z-score for detecting anomalies is an easy way to get started with anomaly detection and see results right away. But, this method is not always the best choice, and if you don't get good alerts using this method, there are some improvements and other methods you can try using just SQL. +

+

+ Use Weighted Mean +

+

+ Our system uses a mean to determine a reasonable value, and a lookback period to determine how long back to calculate that mean over. In our case, we calculated the mean based on data from 1 hour ago. +

+

+ Using this method of calculating mean gives the same weight to entries that happened 1 hour ago and to entries that just happened. If you give more weight to recent entries at the expense of previous entries, the new weighted mean should become more sensitive to recent entries, and you may be able to identify anomalies quicker. +

+

+ To give more weight to recent entries, you can use a weighted average: +

+
+
SELECT
+   status_code,
+   avg(entries) as mean,
+   sum(
+      entries *
+      (60 - extract('seconds' from '2020-08-01 17:00 UTC'::timestamptz - period))
+   ) / (60 * 61 / 2) as weighted_mean
+FROM
+   server_log_summary
+WHERE
+   -- Last 60 periods
+   period > '2020-08-01 17:00 UTC'::timestamptz
+GROUP BY
+   status_code;
+
+ status_code │          mean          │    weighted_mean
+─────────────┼────────────────────────┼─────────────────────
+         404 │ 0.13333333333333333333 │ 0.26229508196721313
+         500 │ 0.15000000000000000000 │ 0.29508196721311475
+         200 │  2779.1000000000000000 │   5467.081967213115
+         400 │ 0.73333333333333333333 │  1.4426229508196722
+
+
+

+ In the results you can see the difference between the mean and the weighted mean for each status code. +

+

+ A weighted average is a very common indicator used by stock traders. We used a linear weighted average, but there are also exponential weighted averages and others you can try. +

+

+ Use Median +

+

+ In statistics, a mean is considered not robust because it is influenced by extreme values. Given our use case, the measure we are using to identify extreme values is itself affected by the very values we are trying to identify. +

+

+ For example, in the beginning of the article we used this series of values: +

+
+
2, 3, 5, 2, 3, 12, 5, 3, 4
+
+
+

+ The mean of this series is 4.33, and we detected 12 as an anomaly. +

+

+ If the 12 were a 120, the mean of the series would have been 16.33. Hence, our "reasonable" value is heavily affected by the values it is supposed to identify. +

+

+ A measure that is considered more robust is a median. The median of a series is the value that half the series is greater than, and half the series is less than: +

+
+
SELECT percentile_disc(0.5) within group(order by n)
+FROM unnest(ARRAY[2, 3, 5, 2, 3, 120, 5, 3, 4]) as n;
+
+ median
+────────
+      3
+
+
+

+ To calculate the median in PostgreSQL we use the function percentile_disc. In the series above, the median is 3. If we sort the list and cut it in the middle it will become more clear: +

+
+
2, 2, 3, 3, 3
+4, 5, 5, 12
+
+
+

+ If we change the value of 12 to 120, the median will not be affected at all: +

+
+
2, 2, 3, 3, 3
+4, 5, 5, 120
+
+
+

+ This is why a median is considered more robust than mean. +

+

+ Use MAD +

+

+ Median absolute deviation (MAD) is another way of finding anomalies in a series. MAD is considered better than z-score for real life data. +

+

+ MAD is calculated by finding the median of the absolute deviations from the series median. Just for comparison, the standard deviation is the square root of the average squared distance from the mean. +

+

+ Use Different Measures +

+

+ We used the number of entries per minute as an indicator. However, depending on the use case, there might be other things you can measure that can yield better results. For example: +

+
    +
  • To try and identify DOS attacks you can monitor the ratio between unique IP addresses to HTTP requests. +
  • +
  • To reduce the amount of false positives, you can normalize the number of responses to the proportion of the total responses. This way, for example, if you're using a flaky remote service that fails once after every certain amount of requests, using the proportion may not trigger an alert when the increase in errors correlates with an increase in overall traffic. +
  • +
+
+

+ Conclusion +

+

+ The method presented above is a very simple method to detect anomalies and produce actionable alerts that can potentially save you a lot of grief. There are many tools out there that provide similar functionality, but they require either tight integration or $$$. The main appeal of this approach is that you can get started with tools you probably already have, some SQL and a scheduled task! +

+
+

+ UPDATE: many readers asked me how I created the charts in this article... well, I used PopSQL. It’s a new modern SQL editor focused on collaborative editing. If you're in the market for one, go check it out... +

+
diff --git a/resources/tests/readability/toc-missing/source.html b/resources/tests/readability/toc-missing/source.html new file mode 100644 index 0000000..d79f045 --- /dev/null +++ b/resources/tests/readability/toc-missing/source.html @@ -0,0 +1,1696 @@ + + + + + Simple Anomaly Detection Using Plain SQL | Haki Benita + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +

+ Simple Anomaly Detection Using Plain SQL +

+

+ Identify Problems Before They Become Disasters +

+
+
+
+

+ Many developers think that having a critical bug in their code is the worst thing that can happen. Well, there is something much worse than that: Having a critical bug in your code and not knowing about it! +

+

+ To make sure I get notified about critical bugs as soon as possible, I started looking for ways to find anomalies in my data. I quickly found that information about these subjects tends to get very complicated, and involve a lot of ad-hoc tools and dependencies. +

+

+ I'm not a statistician and not a data scientist, I'm just a developer. Before I introduce dependencies into my system I make sure I really can't do without them. So, using some high school level statistics and a fair knowledge of SQL, I implemented a simple anomaly detection system that works. +

+
+ Can you spot the anomaly?<br><small>Photo by <a href="https://unsplash.com/photos/KmKZV8pso-s">Ricardo Gomez Angel</a></small> +
+ Can you spot the anomaly?
+ Photo by Ricardo Gomez Angel +
+
+
+ + Table of Contents + + +
+
+
+

+ Interactive Editor +

+
+ + + + + + + + + + + + + + +
+ To follow along with the article and experiment with actual data online check out the interactive editor on PopSQL ≫ +
+
+
+
+

+ Detecting Anomalies +

+

+ Anomaly in a data series is a significant deviation from some reasonable value. Looking at this series of numbers for example, which number stands out? +

+
+
2, 3, 5, 2, 3, 12, 5, 3, 4
+
+
+

+ The number that stands out in this series is 12. +

+
+ Scatter plot +
+ Scatter plot +
+
+

+ This is intuitive to a human, but computer programs don't have intuition... +

+

+ To find the anomaly in the series we first need to define what a reasonable value is, and then define how far away from this value we consider a significant deviation. A good place to start looking for a reasonable value is the mean: +

+
+
SELECT avg(n)
+FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+       avg
+────────────────────
+4.3333333333333333
+
+
+

+ The mean is ~4.33. +

+

+ Next, we need to define the deviation. Let's use Standard Deviation: +

+
+
SELECT stddev(n)
+FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+      stddev
+────────────────────
+3.0822070014844882
+
+
+

+ Standard deviation is the square root of the variance, which is the average squared distance from the mean. In this case it's 3.08. +

+

+ Now that we've defined a "reasonable" value and a deviation, we can define a range of acceptable values: +

+
+
SELECT
+   avg(n) - stddev(n) AS lower_bound,
+   avg(n) + stddev(n) AS upper_bound
+FROM
+   unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n;
+
+    lower_bound    │     upper_bound
+───────────────────┼────────────────────
+1.2511263318488451 │ 7.4155403348178215
+
+
+

+ The range we defined is one standard deviation from the mean. Any value outside this range is considered an anomaly: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+bounds AS (
+   SELECT
+       avg(n) - stddev(n) AS lower_bound,
+       avg(n) + stddev(n) AS upper_bound
+   FROM
+       series
+)
+SELECT
+   n,
+   n NOT BETWEEN lower_bound AND upper_bound AS is_anomaly
+FROM
+   series,
+   bounds;
+
+n  │ is_anomaly
+───┼────────────
+ 2 │ f
+ 3 │ f
+ 5 │ f
+ 2 │ f
+ 3 │ f
+12 │ t
+ 5 │ f
+ 3 │ f
+ 4 │ f
+
+
+

+ Using the query we found that the value 12 is outside the range of acceptable values, and identified it as an anomaly. +

+

+ Understanding Z-Score +

+

+ Another way to represent a range of acceptable values is using a z-score. z-score, or Standard Score, is the number of standard deviations from the mean. In the previous section, our acceptable range was one standard deviation from the mean, or in other words, a z-score in the range ±1: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_mean,
+       stddev(n) as series_stddev
+   FROM
+       series
+)
+SELECT
+   n,
+   (n - series_mean) / series_stddev as zscore
+FROM
+   series,
+   stats;
+
+n  │         zscore
+───┼─────────────────────────
+ 2 │ -0.75703329861022517346
+ 3 │ -0.43259045634870009448
+ 5 │  0.21629522817435006346
+ 2 │ -0.75703329861022517346
+ 3 │ -0.43259045634870009448
+12 │      2.4873951240050256
+ 5 │  0.21629522817435006346
+ 3 │ -0.43259045634870009448
+ 4 │ -0.10814761408717501551
+
+
+

+ Like before, we can detect anomalies by searching for values which are outside the acceptable range using the z-score: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_avg,
+       stddev(n) as series_stddev
+   FROM
+       series
+),
+zscores AS (
+   SELECT
+       n,
+       (n - series_avg) / series_stddev AS zscore
+   FROM
+       series,
+       stats
+)
+SELECT
+   *,
+   zscore NOT BETWEEN -1 AND 1 AS is_anomaly
+FROM
+   zscores;
+
+n  │         zscore          │ is_anomaly
+───┼─────────────────────────┼────────────
+ 2 │ -0.75703329861022517346 │ f
+ 3 │ -0.43259045634870009448 │ f
+ 5 │  0.21629522817435006346 │ f
+ 2 │ -0.75703329861022517346 │ f
+ 3 │ -0.43259045634870009448 │ f
+12 │      2.4873951240050256 │ t
+ 5 │  0.21629522817435006346 │ f
+ 3 │ -0.43259045634870009448 │ f
+ 4 │ -0.10814761408717501551 │ f
+
+
+

+ Using z-score, we also identified 12 as an anomaly in this series. +

+

+ Optimizing Z-Score +

+

+ So far we used one standard deviation from the mean, or a z-score of ±1 to identify anomalies. Changing the z-score threshold can affect our results. For example, let's see what anomalies we identify when the z-score is greater than 0.5 and when it's greater than 3: +

+
+
WITH series AS (
+   SELECT *
+   FROM unnest(array[2, 3, 5, 2, 3, 12, 5, 3, 4]) AS n
+),
+stats AS (
+   SELECT
+       avg(n) series_avg,
+       stddev(n) as series_stddev
+   FROM
+       series
+),
+zscores AS (
+   SELECT
+       n,
+       (n - series_avg) / series_stddev AS zscore
+   FROM
+       series,
+       stats
+)
+SELECT
+   *,
+   zscore NOT BETWEEN -0.5 AND 0.5 AS is_anomaly_0_5,
+   zscore NOT BETWEEN -1 AND 1 AS is_anomaly_1,
+   zscore NOT BETWEEN -3 AND 3 AS is_anomaly_3
+FROM
+   zscores;
+
+n  │         zscore          │ is_anomaly_0_5 │ is_anomaly_1 │ is_anomaly_3
+───┼─────────────────────────┼────────────────┼──────────────┼──────────────
+ 2 │ -0.75703329861022517346 │ t              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+ 5 │  0.21629522817435006346 │ f              │ f            │ f
+ 2 │ -0.75703329861022517346 │ t              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+12 │      2.4873951240050256 │ t              │ t            │ f
+ 5 │  0.21629522817435006346 │ f              │ f            │ f
+ 3 │ -0.43259045634870009448 │ f              │ f            │ f
+ 4 │ -0.10814761408717501551 │ f              │ f            │ f
+
+
+

+ Let's see what we got: +

+
    +
  • When we decreased the z-score threshold to 0.5, we identified the value 2 as an anomaly in addition to the value 12. +
  • +
  • When we increased the z-score threshold to 3 we did not identify any anomaly. +
  • +
+

+ The quality of our results is directly related to the parameters we set for the query. Later we'll see how using backtesting can help us identify ideal values. +

+
+

+ Analyzing a Server Log +

+

+ Application servers such as nginx, Apache and IIS write a lot of useful information to access logs. The data in these logs can be extremely useful in identifying anomalies. +

+

+ We are going to analyze logs of a web application, so the data we are most interested in is the timestamp and the status code of every response from the server. To illustrate the type of insight we can draw from just this data: +

+
    +
  • + A sudden increase in 500 status code: You may have a problem in the server. Did you just push a new version? Is there an external service you're using that started failing in unexpected ways? +
  • +
  • + A sudden increase in 400 status code: You may have a problem in the client. Did you change some validation logic and forget to update the client? Did you make a change and forget to handle backward compatibility? +
  • +
  • + A sudden increase in 404 status code: You may have an SEO problem. Did you move some pages and forget to set up redirects? Is there some script kiddy running a scan on your site? +
  • +
  • + A sudden increase in 200 status code: You either have some significant legit traffic coming in, or you are under a DOS attack. Either way, you probably want to check where it's coming from. +
  • +
+

+ Preparing the Data +

+

+ Parsing and processing logs is outside the scope of this article, so let's assume we did that and we have a table that looks like this: +

+
+
CREATE TABLE server_log_summary AS (
+   period timestamptz,
+   status_code int,
+   entries int
+);
+
+
+

+ The table stores the number of entries for each status code at a given period. For example, our table stores how many responses returned each status code every minute: +

+
+
db=# SELECT * FROM server_log_summary ORDER BY period DESC LIMIT 10;
+
+        period         │ status_code │ entries
+───────────────────────┼─────────────┼─────────
+2020-08-01 18:00:00+00 │         200 │    4084
+2020-08-01 18:00:00+00 │         404 │       0
+2020-08-01 18:00:00+00 │         400 │      24
+2020-08-01 18:00:00+00 │         500 │       0
+2020-08-01 17:59:00+00 │         400 │      12
+2020-08-01 17:59:00+00 │         200 │    3927
+2020-08-01 17:59:00+00 │         500 │       0
+2020-08-01 17:59:00+00 │         404 │       0
+2020-08-01 17:58:00+00 │         400 │       2
+2020-08-01 17:58:00+00 │         200 │    3850
+
+
+

+ Note that the table has a row for every minute, even if the status code was never returned in that minute. Given a table of statuses, it's very tempting to do something like this: +

+
+
-- Wrong!
+SELECT
+   date_trunc('minute', timestamp) AS period,
+   status_code,
+   count(*) AS entries
+FROM
+   server_log
+GROUP BY
+   period,
+   status_code;
+
+
+

+ This is a common mistake and it can leave you with gaps in the data. Zero is a value, and it holds a significant meaning. A better approach is to create an "axis", and join to it: +

+
+
-- Correct!
+WITH axis AS (
+   SELECT
+       status_code,
+       generate_series(
+           date_trunc('minute', now()),
+           date_trunc('minute', now() - interval '1 hour'),
+           interval '1 minute' * -1
+       ) AS period
+   FROM (
+       VALUES (200), (400), (404), (500)
+   ) AS t(status_code)
+)
+SELECT
+   a.period,
+   a.status_code,
+   count(*) AS entries
+FROM
+   axis a
+   LEFT JOIN server_log l ON (
+       date_trunc('minute', l.timestamp) = a.period
+       AND l.status_code = a.status_code
+   )
+GROUP BY
+   period,
+   status_code;
+
+
+

+ First we generate an axis using a cartesian join between the status codes we want to track, and the times we want to monitor. To generate the axis we used two nice features of PostgreSQL: +

+
    +
  • + generate_series: function that generates a range of values. +
  • +
  • + VALUES list: special clause that can generate "constant tables", as the documentation calls it. You might be familiar with the VALUES clause from INSERT statements. In the old days, to generate data we had to use a bunch of SELECT ... UNION ALL... using VALUES is much nicer. +
  • +
+

+ After generating the axis, we left join the actual data into it to get a complete series for each status code. The resulting data has no gaps, and is ready for analysis. +

+

+ Getting a Sense of the Data +

+

+ To get a sense of the data, let's draw a stacked bar chart by status: +

+
+ stacked bar chart by status, over time +
+ stacked bar chart by status, over time +
+
+

+ The chart shows a period of 12 hours. It looks like we have a nice trend with two peaks at around 09:30 and again at 18:00. +

+

+ We also spot right away that at ~11:30 there was a significant increase in 500 errors. The burst died down after around 10 minutes. This is the type of anomaly we want to identify early on. +

+

+ It's entirely possible that there were other problems during that time, we just can't spot them with a naked eye. +

+

+ Identifying Anomalies +

+

+ In anomaly detection systems, we usually want to identify if we have an anomaly right now, and send an alert. +

+

+ To identify if the last datapoint is an anomaly, we start by calculating the mean and standard deviation for each status code in the past hour: +

+
+
db=# WITH stats AS (
+   SELECT
+       status_code,
+       (MAX(ARRAY[EXTRACT('epoch' FROM period), entries]))[2] AS last_value,
+       AVG(entries) AS mean_entries,
+       STDDEV(entries) AS stddev_entries
+   FROM
+       server_log_summary
+   WHERE
+       -- In the demo data use:
+       -- period > '2020-08-01 17:00 UTC'::timestamptz
+       period > now() - interval '1 hour'
+   GROUP BY
+       status_code
+)
+SELECT * FROM stats;
+
+status_code │ last_value │      mean_entries      │     stddev_entries
+────────────┼────────────┼────────────────────────┼────────────────────────
+        404 │          0 │ 0.13333333333333333333 │ 0.34280333180088158345
+        500 │          0 │ 0.15000000000000000000 │ 0.36008473579027553993
+        200 │       4084 │  2779.1000000000000000 │       689.219644702665
+        400 │         24 │ 0.73333333333333333333 │     3.4388935285299212
+
+
+

+ To get the last value in a GROUP BY in addition to the mean and standard deviation we used a little array trick. +

+

+ Next, we calculate the z-score for the last value for each status code: +

+
+
db=# WITH stats AS (
+   SELECT
+       status_code,
+       (MAX(ARRAY[EXTRACT('epoch' FROM period), entries]))[2] AS last_value,
+       AVG(entries) AS mean_entries,
+       STDDEV(entries) AS stddev_entries
+   FROM
+       server_log_summary
+   WHERE
+       -- In the demo data use:
+       -- period > '2020-08-01 17:00 UTC'::timestamptz
+       period > now() - interval '1 hour'
+   GROUP BY
+       status_code
+)
+SELECT
+   *,
+   (last_value - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+FROM
+   stats;
+
+status_code │ last_value │ mean_entries │ stddev_entries │  zscore
+────────────┼────────────┼──────────────┼────────────────┼────────
+        404 │          0 │ 0.133        │ 0.3428         │ -0.388
+        500 │          0 │ 0.150        │ 0.3600         │ -0.416
+        200 │       4084 │ 2779.100     │ 689.2196       │  1.893
+        400 │         24 │ 0.733        │ 3.4388         │  6.765
+
+
+

+ We calculated the z-score by finding the number of standard deviations between the last value and the mean. To avoid a "division by zero" error we transform the denominator to NULL if it's zero. +

+

+ Looking at the z-scores we got, we can spot that status code 400 got a very high z-score of 6. In the past minute we returned a 400 status code 24 times, which is significantly higher than the average of 0.73 in the past hour. +

+

+ Let's take a look at the raw data: +

+
+
SELECT *
+FROM server_log_summary
+WHERE status_code = 400
+ORDER BY period DESC
+LIMIT 20;
+
+        period         │ status_code │ entries
+───────────────────────┼─────────────┼─────────
+2020-08-01 18:00:00+00 │         400 │      24
+2020-08-01 17:59:00+00 │         400 │      12
+2020-08-01 17:58:00+00 │         400 │       2
+2020-08-01 17:57:00+00 │         400 │       0
+2020-08-01 17:56:00+00 │         400 │       1
+2020-08-01 17:55:00+00 │         400 │       0
+2020-08-01 17:54:00+00 │         400 │       0
+2020-08-01 17:53:00+00 │         400 │       0
+2020-08-01 17:52:00+00 │         400 │       0
+2020-08-01 17:51:00+00 │         400 │       0
+2020-08-01 17:50:00+00 │         400 │       0
+2020-08-01 17:49:00+00 │         400 │       0
+2020-08-01 17:48:00+00 │         400 │       0
+2020-08-01 17:47:00+00 │         400 │       0
+2020-08-01 17:46:00+00 │         400 │       0
+2020-08-01 17:45:00+00 │         400 │       0
+2020-08-01 17:44:00+00 │         400 │       0
+2020-08-01 17:43:00+00 │         400 │       0
+2020-08-01 17:42:00+00 │         400 │       0
+2020-08-01 17:41:00+00 │         400 │       0
+
+
+

+ It does look like in the last couple of minutes we are getting more errors than expected. +

+
+ Status 400 in the past hour +
+ Status 400 in the past hour +
+
+

+ What our naked eye missed in the chart and in the raw data, was found by the query, and was classified as an anomaly. We are off to a great start! +

+
+

+ Backtesting +

+

+ In the previous section we identified an anomaly. We found an increase in 400 status code because the z-score was 6. But how do we set the threshold for the z-score? Is a z-score of 3 an anomaly? What about 2, or 1? +

+

+ To find thresholds that fit our needs, we can run simulations on past data with different values, and evaluate the results. This is often called backtesting. +

+

+ Finding Past Anomalies +

+

+ The first thing we need to do is to calculate the mean and the standard deviation for each status code up until every row, just as if it’s the current value. This is a classic job for a window function: +

+
+
WITH calculations_over_window AS (
+   SELECT
+      status_code,
+      period,
+      entries,
+      AVG(entries) OVER status_window as mean_entries,
+      STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+      server_log_summary
+   WINDOW status_window AS (
+      PARTITION BY status_code
+      ORDER BY period
+      ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+)
+SELECT *
+FROM calculations_over_window
+ORDER BY period DESC
+LIMIT 20;
+
+status_code │         period         │ entries │      mean_entries      │     stddev_entries
+────────────┼────────────────────────┼─────────┼────────────────────────┼────────────────────────
+        200 │ 2020-08-01 18:00:00+00 │    4084 │  2759.9672131147540984 │       699.597407256800
+        400 │ 2020-08-01 18:00:00+00 │      24 │ 0.72131147540983606557 │     3.4114080550460080
+        404 │ 2020-08-01 18:00:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        500 │ 2020-08-01 18:00:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+        500 │ 2020-08-01 17:59:00+00 │       0 │ 0.16393442622950819672 │ 0.37328844382740000274
+        400 │ 2020-08-01 17:59:00+00 │      12 │ 0.32786885245901639344 │     1.5676023249473471
+        200 │ 2020-08-01 17:59:00+00 │    3927 │  2718.6721311475409836 │       694.466863171826
+        404 │ 2020-08-01 17:59:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        500 │ 2020-08-01 17:58:00+00 │       0 │ 0.16393442622950819672 │ 0.37328844382740000274
+        404 │ 2020-08-01 17:58:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        200 │ 2020-08-01 17:58:00+00 │    3850 │  2680.4754098360655738 │       690.967283512936
+        400 │ 2020-08-01 17:58:00+00 │       2 │ 0.13114754098360655738 │ 0.38623869286861001780
+        404 │ 2020-08-01 17:57:00+00 │       0 │ 0.13114754098360655738 │ 0.34036303344446665347
+        400 │ 2020-08-01 17:57:00+00 │       0 │ 0.09836065573770491803 │ 0.30027309973793774423
+        500 │ 2020-08-01 17:57:00+00 │       1 │ 0.16393442622950819672 │ 0.37328844382740000274
+        200 │ 2020-08-01 17:57:00+00 │    3702 │  2643.0327868852459016 │       688.414796645480
+        200 │ 2020-08-01 17:56:00+00 │    3739 │  2607.5081967213114754 │       688.769908918569
+        404 │ 2020-08-01 17:56:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+        400 │ 2020-08-01 17:56:00+00 │       1 │ 0.11475409836065573770 │ 0.32137001808599097120
+        500 │ 2020-08-01 17:56:00+00 │       0 │ 0.14754098360655737705 │ 0.35758754516763638735
+
+
+

+ To calculate the mean and standard deviation over a sliding window of 60 minutes, we use a window function. To avoid having to repeat the WINDOW clause for every aggregate, we define a named window called "status_window". This is another nice feature of PostgreSQL. +

+

+ In the results we can now see that for every entry, we have the mean and standard deviation of the previous 60 rows. This is similar to the calculation we did in the previous section, only this time we do it for every row. +

+

+ Now we can calculate the z-score for every row: +

+
+
WITH calculations_over_window AS (
+   SELECT
+      status_code,
+      period,
+      entries,
+      AVG(entries) OVER status_window as mean_entries,
+      STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+      server_log_summary
+   WINDOW status_window AS (
+      PARTITION BY status_code
+      ORDER BY period
+      ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+)
+
+SELECT
+   status_code,
+   period,
+   zscore
+FROM
+   with_zscore
+ORDER BY
+   period DESC
+LIMIT
+   20;
+
+status_code │         period         │        zscore
+────────────┼────────────────────────┼──────────────────────
+        200 │ 2020-08-01 18:00:00+00 │   1.8925638848161648
+        400 │ 2020-08-01 18:00:00+00 │    6.823777205473068
+        404 │ 2020-08-01 18:00:00+00 │ -0.38531664163524526
+        500 │ 2020-08-01 18:00:00+00 │ -0.41260101365496504
+        500 │ 2020-08-01 17:59:00+00 │  -0.4391628750910588
+        400 │ 2020-08-01 17:59:00+00 │    7.445849602151508
+        200 │ 2020-08-01 17:59:00+00 │   1.7399359608515874
+        404 │ 2020-08-01 17:59:00+00 │ -0.38531664163524526
+        500 │ 2020-08-01 17:58:00+00 │  -0.4391628750910588
+        404 │ 2020-08-01 17:58:00+00 │ -0.38531664163524526
+        200 │ 2020-08-01 17:58:00+00 │   1.6925903990967166
+        400 │ 2020-08-01 17:58:00+00 │    4.838594613958412
+        404 │ 2020-08-01 17:57:00+00 │ -0.38531664163524526
+        400 │ 2020-08-01 17:57:00+00 │ -0.32757065425956844
+        500 │ 2020-08-01 17:57:00+00 │      2.2397306629644
+        200 │ 2020-08-01 17:57:00+00 │   1.5382691050147506
+        200 │ 2020-08-01 17:56:00+00 │   1.6427718293547886
+        404 │ 2020-08-01 17:56:00+00 │ -0.41260101365496504
+        400 │ 2020-08-01 17:56:00+00 │     2.75460015502278
+        500 │ 2020-08-01 17:56:00+00 │ -0.41260101365496504
+
+
+

+ We now have z-scores for every row, and we can try to identify anomalies: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       zscore > 3 AS alert
+   FROM
+       with_zscore
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_alert
+WHERE
+   alert
+ORDER BY
+   period DESC
+LIMIT
+   20;
+
+status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+        400 │ 2020-08-01 17:58:00+00 │       2 │  4.838594613958412 │ t
+        500 │ 2020-08-01 17:29:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 17:20:00+00 │       1 │ 3.3190952747131184 │ t
+        500 │ 2020-08-01 17:18:00+00 │       1 │ 3.7438474117708043 │ t
+        500 │ 2020-08-01 17:13:00+00 │       1 │ 3.7438474117708043 │ t
+        500 │ 2020-08-01 17:09:00+00 │       1 │  4.360778994930029 │ t
+        500 │ 2020-08-01 16:59:00+00 │       1 │ 3.7438474117708043 │ t
+        400 │ 2020-08-01 16:29:00+00 │       1 │ 3.0027309973793774 │ t
+        404 │ 2020-08-01 16:13:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 15:13:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 15:11:00+00 │       1 │ 3.0027309973793774 │ t
+        500 │ 2020-08-01 14:58:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:56:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:55:00+00 │       1 │ 3.3190952747131184 │ t
+        400 │ 2020-08-01 14:50:00+00 │       1 │ 3.3190952747131184 │ t
+        500 │ 2020-08-01 14:37:00+00 │       1 │ 3.0027309973793774 │ t
+        400 │ 2020-08-01 14:35:00+00 │       1 │ 3.3190952747131184 │ t
+        400 │ 2020-08-01 14:32:00+00 │       1 │ 3.3190952747131184 │ t
+
+
+

+ We decided to classify values with z-score greater than 3 as anomalies. 3 is usually the magic number you’ll see in textbooks, but don’t get sentimental about it because you can definitely change it to get better results. +

+

+ Adding Thresholds +

+

+ In the last query we detected a large number of "anomalies" with just one entry. This is very common in errors that don't happen very often. In our case, every once in a while we get a 400 status code, but because it doesn't happen very often, the standard deviation is very low so that even a single error can be considered way above the acceptable value. +

+

+ We don't really want to receive an alert in the middle of the night just because of one 400 status code. We can't have every curious developer fiddling with the devtools in his browser wake us up in the middle of the night. +

+

+ To eliminate rows with only a few entries we set a threshold: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       entries > 10 AND zscore > 3 AS alert
+   FROM
+       with_zscore
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_alert
+WHERE
+   alert
+ORDER BY
+   period DESC;
+
+status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+        500 │ 2020-08-01 11:29:00+00 │    5001 │  3.172198441961645 │ t
+        500 │ 2020-08-01 11:28:00+00 │    4812 │ 3.3971646910263917 │ t
+        500 │ 2020-08-01 11:27:00+00 │    4443 │ 3.5349400089601586 │ t
+        500 │ 2020-08-01 11:26:00+00 │    4522 │ 4.1264785335553595 │ t
+        500 │ 2020-08-01 11:25:00+00 │    5567 │   6.17629336121081 │ t
+        500 │ 2020-08-01 11:24:00+00 │    3657 │ 6.8689992361141154 │ t
+        500 │ 2020-08-01 11:23:00+00 │    1512 │  6.342260662589681 │ t
+        500 │ 2020-08-01 11:22:00+00 │    1022 │  7.682189672504754 │ t
+        404 │ 2020-08-01 07:20:00+00 │      23 │  5.142126410098476 │ t
+        404 │ 2020-08-01 07:19:00+00 │      20 │  6.091200697920824 │ t
+        404 │ 2020-08-01 07:18:00+00 │      15 │   7.57547172423804 │ t
+
+
+

+ After eliminating potential anomalies with fewer than 10 entries we get far fewer, and probably more relevant, results. +

+

+ Eliminating Repeating Alerts +

+

+ In the previous section we eliminated potential anomalies with less than 10 entries. Using thresholds we were able to remove some non interesting anomalies. +

+

+ Let's have a look at the data for status code 400 after applying the threshold: +

+
+
status_code │         period         │ entries │       zscore       │ alert
+────────────┼────────────────────────┼─────────┼────────────────────┼───────
+        400 │ 2020-08-01 18:00:00+00 │      24 │  6.823777205473068 │ t
+        400 │ 2020-08-01 17:59:00+00 │      12 │  7.445849602151508 │ t
+
+
+

+ The first alert happened at 17:59, and a minute later the z-score was still high with a large number of entries, so we classified the next row at 18:00 as an anomaly as well. +

+

+ If you think of an alerting system, we want to send an alert only when an anomaly first happens. We don't want to send an alert every minute until the z-score comes back below the threshold. In this case, we only want to send one alert at 17:59. We don't want to send another alert a minute later at 18:00. +

+

+ Let's remove alerts where the previous period was also classified as an alert: +

+
+
WITH calculations_over_window AS (
+   SELECT
+       status_code,
+       period,
+       entries,
+       AVG(entries) OVER status_window as mean_entries,
+       STDDEV(entries) OVER status_window as stddev_entries
+   FROM
+       server_log_summary
+   WINDOW status_window AS (
+       PARTITION BY status_code
+       ORDER BY period
+       ROWS BETWEEN 60 PRECEDING AND CURRENT ROW
+   )
+),
+
+with_zscore AS (
+   SELECT
+       *,
+       (entries - mean_entries) / NULLIF(stddev_entries::float, 0) as zscore
+   FROM
+       calculations_over_window
+),
+
+with_alert AS (
+
+   SELECT
+       *,
+       entries > 10 AND zscore > 3 AS alert
+   FROM
+       with_zscore
+),
+
+with_previous_alert AS (
+   SELECT
+       *,
+       LAG(alert) OVER (PARTITION BY status_code ORDER BY period) AS previous_alert
+   FROM
+       with_alert
+)
+
+SELECT
+   status_code,
+   period,
+   entries,
+   zscore,
+   alert
+FROM
+   with_previous_alert
+WHERE
+   alert AND NOT previous_alert
+ORDER BY
+   period DESC;
+
+status_code │         period         │ entries │      zscore       │ alert
+────────────┼────────────────────────┼─────────┼───────────────────┼───────
+        400 │ 2020-08-01 17:59:00+00 │      12 │ 7.445849602151508 │ t
+        500 │ 2020-08-01 11:22:00+00 │    1022 │ 7.682189672504754 │ t
+        404 │ 2020-08-01 07:18:00+00 │      15 │  7.57547172423804 │ t
+
+
+

+ By eliminating alerts that were already triggered we get a very small list of anomalies that may have happened during the day. Looking at the results we can see what anomalies we would have discovered: +

+
    +
  • Anomaly in status code 400 at 17:59: we also found that one earlier. +
  • +
+
+ Anomaly in status code 400 +
+ Anomaly in status code 400 +
+
+
    +
  • Anomaly in status code 500: we spotted this one on the chart when we started. +
  • +
+
+ Anomaly in status code 500 +
+ Anomaly in status code 500 +
+
+
    +
  • Anomaly in status code 404: this is a hidden anomaly which we did not know about until now. +
  • +
+
+ A hidden anomaly in status code 404 +
+ A hidden anomaly in status code 404 +
+
+

+ The query can now be used to fire alerts when it encounters an anomaly. +

+

+ Experiment With Different Values +

+

+ In the process so far we’ve used several constants in our calculations: +

+
    +
  • + Lookback period: How far back we calculate the mean and standard deviation for each status code. The value we used is 60 minutes. +
  • +
  • + Entries Threshold: The minimum number of entries we want to get an alert for. The value we used is 10. +
  • +
  • + Z-Score Threshold: The z-score after which we classify the value as an anomaly. The value we used is 3. +
  • +
+

+ Now that we have a working query to backtest, we can experiment with different values. +

+
+ Experimenting with parameter values +
+ Experimenting with parameter values +
+
+

+ This is a chart showing the alerts our system identified in the past 12 hours: +

+
+ Backtesting with default parameters. <a href="https://popsql.com/queries/-MECQV6GiKr04WdCWM0K/simple-anomaly-detection-with-sql?access_token=2d2c0729f9a1cfa7b6a2dbb5b0adb45c">View in editor</a> +
+ Backtesting with default parameters. View in editor +
+
+

+ To get a sense of each parameter, let's adjust the values and see how it affects the number and quality of alerts we get. +

+

+ If we decrease the value of the z-score threshold from 3 to 1, we should get more alerts. With a lower threshold, more values are likely to be considered an anomaly: +

+
+ Backtesting with lower z-score threshold +
+ Backtesting with lower z-score threshold +
+
+

+ If we increase the entries threshold from 10 to 30, we should get fewer alerts: +

+
+ Backtesting with higher entries threshold +
+ Backtesting with higher entries threshold +
+
+

+ If we increase the lookback period from 60 minutes to 360 minutes, we get more alerts: +

+
+ Backtesting with longer lookback period +
+ Backtesting with longer lookback period +
+
+

+ A good alerting system is a system that produces true alerts, at a reasonable time. Using the backtesting query you can experiment with different values to find ones that produce quality alerts you can act on. +

+
+

+ Improving Accuracy +

+

+ Using a z-score for detecting anomalies is an easy way to get started with anomaly detection and see results right away. But, this method is not always the best choice, and if you don't get good alerts using this method, there are some improvements and other methods you can try using just SQL. +

+

+ Use Weighted Mean +

+

+ Our system uses a mean to determine a reasonable value, and a lookback period to determine how long back to calculate that mean over. In our case, we calculated the mean based on data from 1 hour ago. +

+

+ Using this method of calculating mean gives the same weight to entries that happened 1 hour ago and to entries that just happened. If you give more weight to recent entries at the expense of previous entries, the new weighted mean should become more sensitive to recent entries, and you may be able to identify anomalies quicker. +

+

+ To give more weight to recent entries, you can use a weighted average: +

+
+
SELECT
+   status_code,
+   avg(entries) as mean,
+   sum(
+      entries *
+      (60 - extract('seconds' from '2020-08-01 17:00 UTC'::timestamptz - period))
+   ) / (60 * 61 / 2) as weighted_mean
+FROM
+   server_log_summary
+WHERE
+   -- Last 60 periods
+   period > '2020-08-01 17:00 UTC'::timestamptz
+GROUP BY
+   status_code;
+
+ status_code │          mean          │    weighted_mean
+─────────────┼────────────────────────┼─────────────────────
+         404 │ 0.13333333333333333333 │ 0.26229508196721313
+         500 │ 0.15000000000000000000 │ 0.29508196721311475
+         200 │  2779.1000000000000000 │   5467.081967213115
+         400 │ 0.73333333333333333333 │  1.4426229508196722
+
+
+

+ In the results you can see the difference between the mean and the weighted mean for each status code. +

+

+ A weighted average is a very common indicator used by stock traders. We used a linear weighted average, but there are also exponential weighted averages and others you can try. +

+

+ Use Median +

+

+ In statistics, a mean is considered not robust because it is influenced by extreme values. Given our use case, the measure we are using to identify extreme values, is affected by those values we are trying to identify. +

+

+ For example, in the beginning of the article we used this series of values: +

+
+
2, 3, 5, 2, 3, 12, 5, 3, 4
+
+
+

+ The mean of this series is 4.33, and we detected 12 as an anomaly. +

+

+ If the 12 were a 120, the mean of the series would have been 16.33. Hence, our "reasonable" value is heavily affected by the values it is supposed to identify. +

+

+ A measure that is considered more robust is a median. The median of a series is the value that half the series is greater than, and half the series is less than: +

+
+
SELECT percentile_disc(0.5) within group(order by n)
+FROM unnest(ARRAY[2, 3, 5, 2, 3, 120, 5, 3, 4]) as n;
+
+ median
+────────
+      3
+
+
+

+ To calculate the median in PostgreSQL we use the function percentile_disc. In the series above, the median is 3. If we sort the list and cut it in the middle it will become more clear: +

+
+
2, 2, 3, 3, 3
+4, 5, 5, 12
+
+
+

+ If we change the value of 12 to 120, the median will not be affected at all: +

+
+
2, 2, 3, 3, 3
+4, 5, 5, 120
+
+
+

+ This is why a median is considered more robust than mean. +

+

+ Use MAD +

+

+ Median absolute deviation (MAD) is another way of finding anomalies in a series. MAD is considered better than z-score for real life data. +

+

+ MAD is calculated by finding the median of the absolute deviations from the series median. Just for comparison, the standard deviation is the square root of the average squared distance from the mean. +

+

+ Use Different Measures +

+

+ We used the number of entries per minute as an indicator. However, depending on the use case, there might be other things you can measure that can yield better results. For example: +

+
    +
  • To try and identify DOS attacks you can monitor the ratio of unique IP addresses to HTTP requests. +
  • +
  • To reduce the amount of false positives, you can normalize the number of responses to the proportion of the total responses. This way, for example, if you're using a flaky remote service that fails once after every certain amount of requests, using the proportion may not trigger an alert when the increase in errors correlates with an increase in overall traffic. +
  • +
+
+

+ Conclusion +

+

+ The method presented above is a very simple method to detect anomalies and produce actionable alerts that can potentially save you a lot of grief. There are many tools out there that provide similar functionality, but they require either tight integration or $$$. The main appeal of this approach is that you can get started with tools you probably already have, some SQL and a scheduled task! +

+
+

+ UPDATE: many readers asked me how I created the charts in this article... well, I used PopSQL. It’s a new modern SQL editor focused on collaborative editing. If you're in the market for one, go check it out... +

+
+
+ +
+ +
+
+

+ Similar articles +

+ +
+
+ + +
+ + + diff --git a/resources/tests/readability/topicseed-1/expected.html b/resources/tests/readability/topicseed-1/expected.html new file mode 100644 index 0000000..471f301 --- /dev/null +++ b/resources/tests/readability/topicseed-1/expected.html @@ -0,0 +1,93 @@ +
+ +

+ Content depth is an arbitrary score or rating of how comprehensive the coverage of a specific topic is within a piece of content. Content breadth is an arbitrary grading of how many related subjects are you covering within your content. +

+

+ And this distinction is important to make and establish from the beginning. Effective topical authority can only be gained when you use both content depth and content breadth in your overall content strategy for rapid search engine optimization gains. However, because most content writers prefer to write a little bit about many things rather than write a lot about one thing, you end up with a too little substance spread very thin. +

+

+ Content depth should be the urgent priority for your content marketing strategy, and clearly defined in your content briefs. Start by dominating your own core topics, before venturing across the pond and write about linked subject matters. Otherwise, you are the opposite of an authority as the definition states that an authority is “a person with extensive or specialized knowledge about a subject; an expert”. Lastly, do not mistake article depth vs. article length: a blog post’s extreme wordcount has nothing to do with its content depth. +

+

+ Assess How Deep Is Your Content +

+

+ The first task on your list, right now, is to shortlist your core topics. What are you trying to be an expert on? Then, go through each one of your pieces of content and understand how well each blog post is covering its focus topic(s). Not how many times specific keywords appear, or how well the article is outlined and structured. +

+

+ Put yourself in the shoes of an ignorant reader who seeks information. Read your article. And ask yourself how in-depth was the content you have written? I know the excuse you will come up with: this was written for beginners, therefore, it shouldn’t be too in-depth. And you are correct. Not every blog post is about absolute content depth otherwise we would only write one 10,000-word-long article, once and for all. But then, how well your beginner-level content pointing to your expert-level content? +

+

+ In other words, each article should reach an incredible level of content depth for its expertise level. And then, provide further reading (i.e. links) to gain more knowledge, and depth. A lot of content editors write a beginner’s blog post and wait to see it perform well in order to write a more advanced sequel. Wrong. Give all the value so search engines can grade you highly on their authority scale for your core topics. Yes, it is a risk and you may write a dozen of articles on a specific topic that will never really rank at the top of SERPs, but reaching content depth is the first step towards SEO gains. +

+

+ Remember that skyscraper content and 10x content are not necessarily the answer. These content writing strategies state that in order to beat another piece of content, you need to write 10x more. Either in quantity with a 10x word count or in quality by putting times more information within your own piece of content. Such articles often become unreadable and discourage visitors from absorbing all the knowledge. The best alternative is the create pillar pages centered around core topics, and several articles dealing with each specific section in depth. This is deep content powered by a smart internal linking strategy and search engines love that in this day and age where attention spans are short! With that being said, avoid writing 600-word articles! +

+

+ Rewrite With Content Depth In Mind +

+

+ Once you know which articles are lacking depth of knowledge and information, it is time to rethink each one. For each article, make a list of what essential pieces of information or data are missing. Then decide where to fit them, and decide whether the article would benefit from a full rewrite or not. As a rule of thumb, if you need to change a third of your article, you may need to rewrite it entirely. Of course, this does not mean erasing all work done prior, but it means starting afresh! Trying to fit deep content into an existing blog post gives you constraints so doing it from scratch can actually be easier to fight thin content. +

+ +

+ As explained above, make sure you do not force yourself to write a much longer article to reach a magic word count. And if you do, it has to be natural. In many cases, articles written months or years ago may need some upkeeping: trimming the fat and removing parts that are not bringing much value. Replace these with your newer and deeper content. +

+

+ All content writers know that when you open Google Docs, WordPress, or your text editor of choice, you will inevitably count your focus keywords’ frequency. Although I understand (yet question) the value of keywords in modern SEO, do not become obsessed with reaching a magic number for your keywords. No reader coming from Google is out there counting how often your keywords are appearing. And search engine algorithms will penalize you for writing for robots, rather than humans. +

+

+ With the massive rise of voice searches, users tend to use full questions for their search queries. What used to be top bottled water brands is now OK google, what is the best bottled-water brand in Texas? The point being, keywords are losing traction to leave space for a more natural language understanding of a blog post’s textual content, and meaning. +

+

+ Yes, Content Depth and Breadth Overlap +

+

+ “A topic can be defined as the company it keeps.” A very accurate saying loved by ontologists within the fields of computational linguistics, and information science. In simpler terms, a topic and all the terminology it is encompassing will inevitably overlap with related topics. Which, in turn, will form topic clusters. +

+

+ For example, it is obvious that despite being two different topics, digital advertising and content marketing share some common phrases and terms. Inevitably, a website picking one as its core topic will use words in some blog posts that will identify the article as belonging to both topics, with a specific weight for each. +

+

+ A keyword, phrase, or term, is not a prisoner to a single concept at all. This is how algorithms in natural language understanding can understand how two topics are related (e.g. read about topic modeling). Each topic has a specific vocabulary, a list of words and phrases commonly used in its context, and some of these terms are present in different vocabularies. +

+

+ Therefore, content depth and content breadth are not to be opposed. Content marketers should use both strategies in order to reach ultimate topical authority over their choice of subject matters. +

+

+ Depth of Content = Quality + Frequency +

+

+ Up until recently, long-form blog posts generally were evergreen articles that generated a constant stream of organic traffic for a website. This was a lead magnet generation strategy which worked well: hire a writer, include the right keywords, reach over a 5,000-word word count, and hit publish. Then, wait. +

+

+ Nowadays, in-depth content requires more effort over time in order to pay off. Writing a big article, as good as it is, will not get your anywhere near the level of topical breadth required by Google to rank you first. Instead, your content marketing plan should be about having: +

+
    +
  • a comprehensive pillar page covering a unique topic, and +
  • +
  • + narrow-focused children articles to dig deeper. +
  • +
+

+ Search engines also look at how often you publish about a specific topic, and when was the last time it was written about. Nobody likes a graveyard blog, it just makes the reader lose trust; as if the writer was not good enough, therefore had no traffic, before entirely giving up. Deep content requires a sustained effort on your part to always new find ways to write about a specific subject. Sure, it will be easy at first. But what about five years later? Well, you will still need to hit publish, all about the very same topics you already covered years ago. +

+

+ Tools and platforms such as topicseed are here to help you find new article ideas pertaining to your core topics within a few clicks and a few minutes. The number of web pages, Wikipedia articles, and pieces of content, our machine-learning algorithms can analyze in seconds would take you months to digest. Our topicgraph finds closely related concepts in order for your domain to reach topical authority through content depth and content breadth. +

+
diff --git a/resources/tests/readability/topicseed-1/source.html b/resources/tests/readability/topicseed-1/source.html new file mode 100644 index 0000000..fca1c00 --- /dev/null +++ b/resources/tests/readability/topicseed-1/source.html @@ -0,0 +1,400 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Content Depth — Write Comprehensively About Your Core Topics | topicseed + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ +
+
+
+
+
+

+ Content Depth — Write Comprehensively About Your Core Topics +

+

+ Content writers and marketers find it hard to write a lot of content about a very specific topic. They lose a lot of points on their content depth because they would rather focus on pushing thin content about plenty of topics. +

+
    +
  • + On 6/13/2018 by @lazharichir +
  • +
+
+
+
+ +
+
+
+
+
+
+ +

+ Content depth is an arbitrary score or rating of how comprehensive the coverage of a specific topic is within a piece of content. Content breadth is an arbitrary grading of how many related subjects are you covering within your content. +

+

+ And this distinction is important to make and establish from the beginning. Effective topical authority can only be gained when you use both content depth and content breadth in your overall content strategy for rapid search engine optimization gains. However, because most content writers prefer to write a little bit about many things rather than write a lot about one thing, you end up with a too little substance spread very thin. +

+

+ Content depth should be the urgent priority for your content marketing strategy, and clearly defined in your content briefs. Start by dominating your own core topics, before venturing across the pond and write about linked subject matters. Otherwise, you are the opposite of an authority as the definition states that an authority is “a person with extensive or specialized knowledge about a subject; an expert”. Lastly, do not mistake article depth vs. article length: a blog post’s extreme wordcount has nothing to do with its content depth. +

+

+ Assess How Deep Is Your Content +

+

+ The first task on your list, right now, is to shortlist your core topics. What are you trying to be an expert on? Then, go through each one of your pieces of content and understand how well each blog post is covering its focus topic(s). Not how many times specific keywords appear, or how well the article is outlined and structured. +

+

+ Put yourself in the shoes of an ignorant reader who seeks information. Read your article. And ask yourself how in-depth was the content you have written? I know the excuse you will come up with: this was written for beginners, therefore, it shouldn’t be too in-depth. And you are correct. Not every blog post is about absolute content depth otherwise we would only write one 10,000-word-long article, once and for all. But then, how well your beginner-level content pointing to your expert-level content? +

+

+ In other words, each article should reach an incredible level of content depth for its expertise level. And then, provide further reading (i.e. links) to gain more knowledge, and depth. A lot of content editors write a beginner’s blog post and wait to see it perform well in order to write a more advanced sequel. Wrong. Give all the value so search engines can grade you highly on their authority scale for your core topics. Yes, it is a risk and you may write a dozen of articles on a specific topic that will never really rank at the top of SERPs, but reaching content depth is the first step towards SEO gains. +

+

+ Remember that skyscraper content and 10x content are not necessarily the answer. These content writing strategies state that in order to beat another piece of content, you need to write 10x more. Either in quantity with a 10x word count or in quality by putting times more information within your own piece of content. Such articles often become unreadable and discourage visitors from absorbing all the knowledge. The best alternative is the create pillar pages centered around core topics, and several articles dealing with each specific section in depth. This is deep content powered by a smart internal linking strategy and search engines love that in this day and age where attention spans are short! With that being said, avoid writing 600-word articles! +

+

+ Rewrite With Content Depth In Mind +

+

+ Once you know which articles are lacking depth of knowledge and information, it is time to rethink each one. For each article, make a list of what essential pieces of information or data are missing. Then decide where to fit them, and decide whether the article would benefit from a full rewrite or not. As a rule of thumb, if you need to change a third of your article, you may need to rewrite it entirely. Of course, this does not mean erasing all work done prior, but it means starting afresh! Trying to fit deep content into an existing blog post gives you constraints so doing it from scratch can actually be easier to fight thin content. +

+
+ +
+

+ As explained above, make sure you do not force yourself to write a much longer article to reach a magic word count. And if you do, it has to be natural. In many cases, articles written months or years ago may need some upkeeping: trimming the fat and removing parts that are not bringing much value. Replace these with your newer and deeper content. +

+

+ All content writers know that when you open Google Docs, WordPress, or your text editor of choice, you will inevitably count your focus keywords’ frequency. Although I understand (yet question) the value of keywords in modern SEO, do not become obsessed with reaching a magic number for your keywords. No reader coming from Google is out there counting how often your keywords are appearing. And search engine algorithms will penalize you for writing for robots, rather than humans. +

+

+ With the massive rise of voice searches, users tend to use full questions for their search queries. What used to be top bottled water brands is now OK google, what is the best bottled-water brand in Texas? The point being, keywords are losing traction to leave space for a more natural language understanding of a blog post’s textual content, and meaning. +

+

+ Yes, Content Depth and Breadth Overlap +

+

+ “A topic can be defined as the company it keeps.” A very accurate saying loved by ontologists within the fields of computational linguistics, and information science. In simpler terms, a topic and all the terminology it is encompassing will inevitably overlap with related topics. Which, in turn, will form topic clusters. +

+

+ For example, it is obvious that despite being two different topics, digital advertising and content marketing share some common phrases and terms. Inevitably, a website picking one as its core topic will use words in some blog posts that will identify the article as belonging to both topics, with a specific weight for each. +

+

+ A keyword, phrase, or term, is not a prisoner to a single concept at all. This is how algorithms in natural language understanding can understand how two topics are related (e.g. read about topic modeling). Each topic has a specific vocabulary, a list of words and phrases commonly used in its context, and some of these terms are present in different vocabularies. +

+

+ Therefore, content depth and content breadth are not to be opposed. Content marketers should use both strategies in order to reach ultimate topical authority over their choice of subject matters. +

+

+ Depth of Content = Quality + Frequency +

+

+ Up until recently, long-form blog posts generally were evergreen articles that generated a constant stream of organic traffic for a website. This was a lead magnet generation strategy which worked well: hire a writer, include the right keywords, reach over a 5,000-word word count, and hit publish. Then, wait. +

+

+ Nowadays, in-depth content requires more effort over time in order to pay off. Writing a big article, as good as it is, will not get you anywhere near the level of topical breadth required by Google to rank you first. Instead, your content marketing plan should be about having: +

+
    +
  • a comprehensive pillar page covering a unique topic, and +
  • +
  • + narrow-focused children articles to dig deeper. +
  • +
+

+ Search engines also look at how often you publish about a specific topic, and when was the last time it was written about. Nobody likes a graveyard blog, it just makes the reader lose trust; as if the writer was not good enough, therefore had no traffic, before entirely giving up. Deep content requires a sustained effort on your part to always find new ways to write about a specific subject. Sure, it will be easy at first. But what about five years later? Well, you will still need to hit publish, all about the very same topics you already covered years ago. +

+

+ Tools and platforms such as topicseed are here to help you find new article ideas pertaining to your core topics within a few clicks and a few minutes. The number of web pages, Wikipedia articles, and pieces of content, our machine-learning algorithms can analyze in seconds would take you months to digest. Our topicgraph finds closely related concepts in order for your domain to reach topical authority through content depth and content breadth. +

+
+
+
+
+
+ +
+ + +
+
+
+
+
+ + + + + + + + + + + + + diff --git a/resources/tests/readability/tumblr/expected.html b/resources/tests/readability/tumblr/expected.html new file mode 100644 index 0000000..daefbf8 --- /dev/null +++ b/resources/tests/readability/tumblr/expected.html @@ -0,0 +1,4 @@ +
+

Minecraft 1.8 - The Bountiful Update

+

+ Added Granite, Andesite, and Diorite stone blocks, with smooth versions
+ Added Slime Block
+ Added Iron Trapdoor
+ Added Prismarine and Sea Lantern blocks
+ Added the Ocean Monument
+ Added Red Sandstone
+ Added Banners
+ Added Armor Stands
+ Added Coarse Dirt (dirt where grass won’t grow)
+ Added Guardian mobs, with item drops
+ Added Endermite mob
+ Added Rabbits, with item drops
+ Added Mutton and Cooked Mutton
+ Villagers will harvest crops and plant new ones
+ Mossy Cobblestone and Mossy Stone Bricks are now craftable
+ Chiseled Stone Bricks are now craftable
+ Doors and fences now come in all wood type variants
+ Sponge block has regained its water-absorbing ability and becomes wet
+ Added a spectator game mode (game mode 3)
+ Added one new achievement
+ Added “Customized” world type
+ Added hidden “Debug Mode” world type
+ Worlds can now have a world barrier
+ Added @e target selector for Command Blocks
+ Added /blockdata command
+ Added /clone command
+ Added /execute command
+ Added /fill command
+ Added /particle command
+ Added /testforblocks command
+ Added /title command
+ Added /trigger command
+ Added /worldborder command
+ Added /stats command
+ Containers can be locked in custom maps by using the “Lock” data tag
+ Added logAdminCommands, showDeathMessages, reducedDebugInfo, sendCommandFeedback, and randomTickSpeed game rules
+ Added three new statistics
+ Player skins can now have double layers across the whole model, and left/right arms/legs can be edited independently
+ Added a new player model with smaller arms, and a new player skin called Alex?
+ Added options for configuring what pieces of the skin that are visible
+ Blocks can now have custom visual variations in the resource packs
+ Minecraft Realms now has an activity chart, so you can see who has been online
+ Minecraft Realms now lets you upload your maps
* Difficulty setting is saved per world, and can be locked if wanted
* Enchanting has been redone, now costs lapis lazuli in addition to enchantment levels
* Villager trading has been rebalanced
* Anvil repairing has been rebalanced
* Considerably faster client-side performance
* Max render distance has been increased to 32 chunks (512 blocks)
* Adventure mode now prevents you from destroying blocks, unless your items have the CanDestroy data tag
* Resource packs can now also define the shape of blocks and items, and not just their textures
* Scoreboards have been given a lot of new features
* Tweaked the F3 debug screen
* Block ID numbers (such as 1 for stone), are being replaced by ID names (such as minecraft:stone)
* Server list has been improved
* A few minor changes to village and temple generation
* Mob heads for players now show both skin layers
* Buttons can now be placed on the ceiling
* Lots and lots of other changes
* LOTS AND LOTS of other changes
- Removed Herobrine

+
diff --git a/resources/tests/readability/tumblr/source.html b/resources/tests/readability/tumblr/source.html new file mode 100644 index 0000000..14069d2 --- /dev/null +++ b/resources/tests/readability/tumblr/source.html @@ -0,0 +1,793 @@ + + + + + + + + + + Minecraft 1.8 - The Bountiful Update - Minecraft 1.8 - The Bountiful Update - Minecraft Update News + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Minecraft News

+
+ + Powered by Tumblr + +
+
+
+
+
+
+

Minecraft 1.8 - The Bountiful Update

+

+ Added Granite, Andesite, and Diorite stone blocks, with smooth versions
+ Added Slime Block
+ Added Iron Trapdoor
+ Added Prismarine and Sea Lantern blocks
+ Added the Ocean Monument
+ Added Red Sandstone
+ Added Banners
+ Added Armor Stands
+ Added Coarse Dirt (dirt where grass won’t grow)
+ Added Guardian mobs, with item drops
+ Added Endermite mob
+ Added Rabbits, with item drops
+ Added Mutton and Cooked Mutton
+ Villagers will harvest crops and plant new ones
+ Mossy Cobblestone and Mossy Stone Bricks are now craftable
+ Chiseled Stone Bricks are now craftable
+ Doors and fences now come in all wood type variants
+ Sponge block has regained its water-absorbing ability and becomes wet
+ Added a spectator game mode (game mode 3)
+ Added one new achievement
+ Added “Customized” world type
+ Added hidden “Debug Mode” world type
+ Worlds can now have a world barrier
+ Added @e target selector for Command Blocks
+ Added /blockdata command
+ Added /clone command
+ Added /execute command
+ Added /fill command
+ Added /particle command
+ Added /testforblocks command
+ Added /title command
+ Added /trigger command
+ Added /worldborder command
+ Added /stats command
+ Containers can be locked in custom maps by using the “Lock” data tag
+ Added logAdminCommands, showDeathMessages, reducedDebugInfo, sendCommandFeedback, and randomTickSpeed game rules
+ Added three new statistics
+ Player skins can now have double layers across the whole model, and left/right arms/legs can be edited independently
+ Added a new player model with smaller arms, and a new player skin called Alex?
+ Added options for configuring what pieces of the skin that are visible
+ Blocks can now have custom visual variations in the resource packs
+ Minecraft Realms now has an activity chart, so you can see who has been online
+ Minecraft Realms now lets you upload your maps
* Difficulty setting is saved per world, and can be locked if wanted
* Enchanting has been redone, now costs lapis lazuli in addition to enchantment levels
* Villager trading has been rebalanced
* Anvil repairing has been rebalanced
* Considerably faster client-side performance
* Max render distance has been increased to 32 chunks (512 blocks)
* Adventure mode now prevents you from destroying blocks, unless your items have the CanDestroy data tag
* Resource packs can now also define the shape of blocks and items, and not just their textures
* Scoreboards have been given a lot of new features
* Tweaked the F3 debug screen
* Block ID numbers (such as 1 for stone), are being replaced by ID names (such as minecraft:stone)
* Server list has been improved
* A few minor changes to village and temple generation
* Mob heads for players now show both skin layers
* Buttons can now be placed on the ceiling
* Lots and lots of other changes
* LOTS AND LOTS of other changes
- Removed Herobrine

+
+ +
+
+
+ + + + + + diff --git a/resources/tests/readability/v8-blog/expected.html b/resources/tests/readability/v8-blog/expected.html new file mode 100644 index 0000000..e5874fc --- /dev/null +++ b/resources/tests/readability/v8-blog/expected.html @@ -0,0 +1,178 @@ +
+

+ Emscripten has always focused first and foremost on compiling to the Web and other JavaScript environments like Node.js. But as WebAssembly starts to be used without JavaScript, new use cases are appearing, and so we've been working on support for emitting standalone Wasm files from Emscripten, that do not depend on the Emscripten JS runtime! This post explains why that's interesting. +

+

+ Using standalone mode in Emscripten # +

+

+ First, let's see what you can do with this new feature! Similar to this post let's start with a "hello world" type program that exports a single function that adds two numbers: +

+
// add.c
#include <emscripten.h>

EMSCRIPTEN_KEEPALIVE
int add(int x, int y) {
return x + y;
}
+

+ We'd normally build this with something like emcc -O3 add.c -o add.js which would emit add.js and add.wasm. Instead, let's ask emcc to only emit Wasm: +

+
emcc -O3 add.c -o add.wasm
+
+

+ When emcc sees we only want Wasm then it makes it "standalone" - a Wasm file that can run by itself as much as possible, without any JavaScript runtime code from Emscripten. +

+

+ Disassembling it, it's very minimal - just 87 bytes! It contains the obvious add function +

+
(func $add (param $0 i32) (param $1 i32) (result i32)
(i32.add
(local.get $0)
(local.get $1)
)
)
+

+ and one more function, _start, +

+
(func $_start
(nop)
)
+

+ _start is part of the WASI spec, and Emscripten's standalone mode emits it so that we can run in WASI runtimes. (Normally _start would do global initialization, but here we just don't need any so it's empty.) +

+

+ Write your own JavaScript loader # +

+

+ One nice thing about a standalone Wasm file like this is that you can write custom JavaScript to load and run it, which can be very minimal depending on your use case. For example, we can do this in Node.js: +

+
// load-add.js
const binary = require('fs').readFileSync('add.wasm');

WebAssembly.instantiate(binary).then(({ instance }) => {
console.log(instance.exports.add(40, 2));
});
+

+ Just 4 lines! Running that prints 42 as expected. Note that while this example is very simplistic, there are cases where you simply don't need much JavaScript, and may be able to do better than Emscripten's default JavaScript runtime (which supports a bunch of environments and options). A real-world example of that is in zeux's meshoptimizer - just 57 lines, including memory management, growth, etc.! +

+

+ Running in Wasm runtimes # +

+

+ Another nice thing about standalone Wasm files is that you can run them in Wasm runtimes like wasmer, wasmtime, or WAVM. For example, consider this hello world: +

+
// hello.cpp
#include <stdio.h>

int main() {
printf("hello, world!\n");
return 0;
}
+

+ We can build and run that in any of those runtimes: +

+
$ emcc hello.cpp -O3 -o hello.wasm
$ wasmer run hello.wasm
hello, world!
$ wasmtime hello.wasm
hello, world!
$ wavm run hello.wasm
hello, world!
+

+ Emscripten uses WASI APIs as much as possible, so programs like this end up using 100% WASI and can run in WASI-supporting runtimes (see notes later on what programs require more than WASI). +

+

+ Building Wasm plugins # +

+

+ Aside from the Web and the server, an exciting area for Wasm is plugins. For example, an image editor might have Wasm plugins that can perform filters and other operations on the image. For that type of use case you want a standalone Wasm binary, just like in the examples so far, but where it also has a proper API for the embedding application. +

+

+ Plugins are sometimes related to dynamic libraries, as dynamic libraries are one way to implement them. Emscripten has support for dynamic libraries with the SIDE_MODULE option, and this has been a way to build Wasm plugins. The new standalone Wasm option described here is an improvement on that in several ways: First, a dynamic library has relocatable memory, which adds overhead if you don’t need it (and you don’t if you aren’t linking the Wasm with another Wasm after loading it). Second, standalone output is designed to run in Wasm runtimes as well, as mentioned earlier. +

+

+ Okay, so far so good: Emscripten can either emit JavaScript + WebAssembly as it always did, and now it can also emit just WebAssembly by itself, which lets you run it in places that don't have JavaScript like Wasm runtimes, or you can write your own custom JavaScript loader code, etc. Now let's talk about the background and the technical details! +

+

+ WebAssembly's two standard APIs # +

+

+ WebAssembly can only access the APIs it receives as imports - the core Wasm spec has no concrete API details. Given the current trajectory of Wasm, it looks like there will be 3 main categories of APIs that people import and use: +

+
    +
  • + Web APIs: This is what Wasm programs use on the Web, which are the existing standardized APIs that JavaScript can use too. Currently these are called indirectly, through JS glue code, but in the future with interface types they will be called directly. +
  • +
  • + WASI APIs: WASI focuses on standardizing APIs for Wasm on the server. +
  • +
  • + Other APIs: Various custom embeddings will define their own application-specific APIs. For example, we gave the example earlier of an image editor with Wasm plugins that implement an API to do visual effects. Note that a plugin might also have access to “system” APIs, like a native dynamic library would, or it might be very sandboxed and have no imports at all (if the embedding just calls its methods). +
  • +
+

+ WebAssembly is in the interesting position of having two standardized sets of APIs. This does make sense in that one is for the Web and one for the server, and those environments do have different requirements; for similar reasons Node.js does not have identical APIs to JavaScript on the Web. +

+

+ However, there is more than the Web and the server, in particular there are also Wasm plugins. For one thing, plugins can run inside an application that may be on the Web (just like JS plugins) or off the Web; for another, regardless of where the embedding application is, a plugin environment is not a Web nor a server environment. So it's not immediately obvious which sets of APIs will be used - it may depend on the code being ported, the Wasm runtime being embedded, etc. +

+

+ Let's unify as much as possible # +

+

+ One concrete way Emscripten hopes to help here is that by using WASI APIs as much as possible we can avoid unnecessary API differences. As mentioned earlier, on the Web Emscripten code accesses Web APIs indirectly, through JavaScript, so where that JavaScript API could look like WASI, we'd be removing an unnecessary API difference, and that same binary can also run on the server. In other words, if Wasm wants to log some info, it needs to call into JS, something like this: +

+
wasm   =>   function musl_writev(..) { .. console.log(..) .. }
+

+ musl_writev is an implementation of the Linux syscall interface that musl libc uses to write data to a file descriptor, and that ends up calling console.log with the proper data. The Wasm module imports and calls that musl_writev, which defines an ABI between the JS and the Wasm. That ABI is arbitrary (and in fact Emscripten has changed its ABI over time to optimize it). If we replace that with an ABI that matches WASI, we can get this: +

+
wasm   =>   function __wasi_fd_write(..) { .. console.log(..) .. }
+

+ This isn't a big change, just requiring some refactoring of the ABI, and when running in a JS environment it doesn't matter much. But now the Wasm can run without the JS since that WASI API is recognized by WASI runtimes! That’s how the standalone Wasm examples from before work, just by refactoring Emscripten to use WASI APIs. +

+

+ Another advantage of Emscripten using WASI APIs is that we can help the WASI spec by finding real-world issues. For example, we found that changing the WASI "whence" constants would be useful, and we've started some discussions around code size and POSIX compatibility. +

+

+ Emscripten using WASI as much as possible is also useful in that it lets users use a single SDK to target Web, server, and plugin environments. Emscripten isn't the only SDK allowing that, as the WASI SDK's output can be run on the Web using the WASI Web Polyfill or Wasmer's wasmer-js, but Emscripten’s Web output is more compact, so it lets a single SDK be used without compromising Web performance. +

+

+ Speaking of which, you can emit a standalone Wasm file from Emscripten with optional JS in a single command: +

+
emcc -O3 add.c -o add.js -s STANDALONE_WASM
+
+

+ That emits add.js and add.wasm. The Wasm file is standalone just like earlier when we only emitted a Wasm file by itself (STANDALONE_WASM was set automatically when we said -o add.wasm), but now in addition there is a JS file that can load and run it. The JS is useful for running it on the Web if you don't want to write your own JS for that. +

+

+ Do we need non-standalone Wasm? # +

+

+ Why does the STANDALONE_WASM flag exist? In theory Emscripten could always set STANDALONE_WASM, which would be simpler. But standalone Wasm files can't depend on JS, and that has some downsides: +

+
    +
  • We can't minify the Wasm import and export names, as the minification only works if both sides agree, the Wasm and what loads it. +
  • +
  • Normally we create the Wasm Memory in JS so that JS can start to use it during startup, which lets us do work in parallel. But in standalone Wasm we have to create the Memory in the Wasm. +
  • +
  • Some APIs are just easy to do in JS. For example __assert_fail, which is called when a C assertion fails, is normally implemented in JS. It takes just a single line, and even if you include the JS functions it calls, the total code size is quite small. On the other hand, in a standalone build we can't depend on JS, so we use musl's assert.c. That uses fprintf, which means it ends up pulling in a bunch of C stdio support, including things with indirect calls that make it hard to remove unused functions. Overall, there are many such details that end up making a difference in total code size. +
  • +
+

+ If you want to run both on the Web and elsewhere, and you want 100% optimal code size and startup times, you should make two separate builds, one with -s STANDALONE_WASM and one without. That's very easy as it's just flipping one flag! +

+

+ Necessary API differences # +

+

+ We saw that Emscripten uses WASI APIs as much as possible to avoid unnecessary API differences. Are there any necessary ones? Sadly, yes - some WASI APIs require tradeoffs. For example: +

+
    +
  • WASI does not support various POSIX features, like user/group/world file permissions, as a result of which you can't fully implement a (Linux) system ls for example (see details in that link). Emscripten's existing filesystem layer does support some of those things, so if we switched to WASI APIs for all filesystem operations then we'd be losing some POSIX support. +
  • +
  • WASI's path_open has a cost in code size because it forces extra permissions handling in the Wasm itself. That code is unnecessary on the Web. +
  • +
  • WASI doesn't provide a notification API for memory growth, and as a result, JS runtimes must constantly check if memory grew and if so update their views, on every import and export. To avoid that overhead, Emscripten provides a notification API, emscripten_notify_memory_growth, which you can see implemented in a single line in zeux's meshoptimizer that we mentioned earlier. +
  • +
+

+ In time WASI may add more POSIX support, a memory growth notification, etc. - WASI is still highly experimental and expected to change significantly. For now, to avoid regressions in Emscripten we do not emit 100% WASI binaries if you use certain features. In particular, opening files uses a POSIX method instead of WASI, which means that if you call fopen then the resulting Wasm file will not be 100% WASI - however, if all you do is use printf, which operates on the already-open stdout, then it will be 100% WASI, as in the "hello world" example we saw near the beginning, where Emscripten's output does run in WASI runtimes. +

+

+ If it would be useful for users we can add a PURE_WASI option which would sacrifice code size in return for strict WASI compliance, but if that's not urgent (and most plugin use cases we’ve seen so far don’t need full file I/O) then maybe we can wait for WASI to improve to where Emscripten can remove these non-WASI APIs. That would be the best outcome, and we’re working towards that as you can see in the links above. +

+

+ However, even if WASI does improve, there is no avoiding the fact that Wasm has two standardized APIs as mentioned earlier. In the future I expect Emscripten will call Web APIs directly using interface types, because that will be more compact than calling a WASI-looking JS API that then calls a Web API (as in the musl_writev example from before). We could have a polyfill or a translation layer of some sort to help here, but we wouldn't want to use it unnecessarily, so we will need separate builds for Web and WASI environments. (This is somewhat unfortunate; in theory this could have been avoided if WASI were a superset of Web APIs, but obviously that would have meant compromises on the server side.) +

+

+ Current status # +

+

+ Quite a lot works already! The main limitations are: +

+
    +
  • + WebAssembly limitations: Various features, like C++ exceptions, setjmp, and pthreads, depend on JavaScript due to Wasm limitations, and there is no good non-JS replacement yet. (Emscripten may start to support some of them using Asyncify, or maybe we'll just wait for native Wasm features to arrive to VMs.) +
  • +
  • + WASI limitations: Libraries and APIs like OpenGL and SDL don't have corresponding WASI APIs yet. +
  • +
+

+ You can still use all those in Emscripten's standalone mode, but the output will contain calls to JS runtime support code. As a result, it will not be 100% WASI (for similar reasons those features also do not work in the WASI SDK). Those Wasm files won't run in WASI runtimes, but you can use them on the Web and you can write your own JS runtime for them. You can also use them as plugins; for example, a game engine could have plugins that render using OpenGL, and the developer would compile them in standalone mode and then implement the OpenGL imports in the engine's Wasm runtime. Standalone Wasm mode still helps here because it makes the output as standalone as Emscripten can make it. +

+

+ You may also find APIs that do have a non-JS replacement that we haven’t converted yet, as work is still ongoing. Please file bugs, and as always help is welcome! +

+
diff --git a/resources/tests/readability/v8-blog/source.html b/resources/tests/readability/v8-blog/source.html new file mode 100644 index 0000000..4a23d09 --- /dev/null +++ b/resources/tests/readability/v8-blog/source.html @@ -0,0 +1,259 @@ + + + + + + Outside the web: standalone WebAssembly binaries using Emscripten · V8 + + + + + + + + + + + + + +
+
+
+

+ Outside the web: standalone WebAssembly binaries using Emscripten +

+

+ Published · Tagged with WebAssembly tooling +

+
+
+

+ Emscripten has always focused first and foremost on compiling to the Web and other JavaScript environments like Node.js. But as WebAssembly starts to be used without JavaScript, new use cases are appearing, and so we've been working on support for emitting standalone Wasm files from Emscripten, that do not depend on the Emscripten JS runtime! This post explains why that's interesting. +

+

+ Using standalone mode in Emscripten # +

+

+ First, let's see what you can do with this new feature! Similar to this post let's start with a "hello world" type program that exports a single function that adds two numbers: +

+
// add.c
#include <emscripten.h>

EMSCRIPTEN_KEEPALIVE
int add(int x, int y) {
return x + y;
}
+

+ We'd normally build this with something like emcc -O3 add.c -o add.js which would emit add.js and add.wasm. Instead, let's ask emcc to only emit Wasm: +

+
emcc -O3 add.c -o add.wasm
+
+

+ When emcc sees we only want Wasm then it makes it "standalone" - a Wasm file that can run by itself as much as possible, without any JavaScript runtime code from Emscripten. +

+

+ Disassembling it, it's very minimal - just 87 bytes! It contains the obvious add function +

+
(func $add (param $0 i32) (param $1 i32) (result i32)
(i32.add
(local.get $0)
(local.get $1)
)
)
+

+ and one more function, _start, +

+
(func $_start
(nop)
)
+

+ _start is part of the WASI spec, and Emscripten's standalone mode emits it so that we can run in WASI runtimes. (Normally _start would do global initialization, but here we just don't need any so it's empty.) +

+

+ Write your own JavaScript loader # +

+

+ One nice thing about a standalone Wasm file like this is that you can write custom JavaScript to load and run it, which can be very minimal depending on your use case. For example, we can do this in Node.js: +

+
// load-add.js
const binary = require('fs').readFileSync('add.wasm');

WebAssembly.instantiate(binary).then(({ instance }) => {
console.log(instance.exports.add(40, 2));
});
+

+ Just 4 lines! Running that prints 42 as expected. Note that while this example is very simplistic, there are cases where you simply don't need much JavaScript, and may be able to do better than Emscripten's default JavaScript runtime (which supports a bunch of environments and options). A real-world example of that is in zeux's meshoptimizer - just 57 lines, including memory management, growth, etc.! +

+

+ Running in Wasm runtimes # +

+

+ Another nice thing about standalone Wasm files is that you can run them in Wasm runtimes like wasmer, wasmtime, or WAVM. For example, consider this hello world: +

+
// hello.cpp
#include <stdio.h>

int main() {
printf("hello, world!\n");
return 0;
}
+

+ We can build and run that in any of those runtimes: +

+
$ emcc hello.cpp -O3 -o hello.wasm
$ wasmer run hello.wasm
hello, world!
$ wasmtime hello.wasm
hello, world!
$ wavm run hello.wasm
hello, world!
+

+ Emscripten uses WASI APIs as much as possible, so programs like this end up using 100% WASI and can run in WASI-supporting runtimes (see notes later on what programs require more than WASI). +

+

+ Building Wasm plugins # +

+

+ Aside from the Web and the server, an exciting area for Wasm is plugins. For example, an image editor might have Wasm plugins that can perform filters and other operations on the image. For that type of use case you want a standalone Wasm binary, just like in the examples so far, but where it also has a proper API for the embedding application. +

+

+ Plugins are sometimes related to dynamic libraries, as dynamic libraries are one way to implement them. Emscripten has support for dynamic libraries with the SIDE_MODULE option, and this has been a way to build Wasm plugins. The new standalone Wasm option described here is an improvement on that in several ways: First, a dynamic library has relocatable memory, which adds overhead if you don’t need it (and you don’t if you aren’t linking the Wasm with another Wasm after loading it). Second, standalone output is designed to run in Wasm runtimes as well, as mentioned earlier. +

+

+ Okay, so far so good: Emscripten can either emit JavaScript + WebAssembly as it always did, and now it can also emit just WebAssembly by itself, which lets you run it in places that don't have JavaScript like Wasm runtimes, or you can write your own custom JavaScript loader code, etc. Now let's talk about the background and the technical details! +

+

+ WebAssembly's two standard APIs # +

+

+ WebAssembly can only access the APIs it receives as imports - the core Wasm spec has no concrete API details. Given the current trajectory of Wasm, it looks like there will be 3 main categories of APIs that people import and use: +

+
    +
  • + Web APIs: This is what Wasm programs use on the Web, which are the existing standardized APIs that JavaScript can use too. Currently these are called indirectly, through JS glue code, but in the future with interface types they will be called directly. +
  • +
  • + WASI APIs: WASI focuses on standardizing APIs for Wasm on the server. +
  • +
  • + Other APIs: Various custom embeddings will define their own application-specific APIs. For example, we gave the example earlier of an image editor with Wasm plugins that implement an API to do visual effects. Note that a plugin might also have access to “system” APIs, like a native dynamic library would, or it might be very sandboxed and have no imports at all (if the embedding just calls its methods). +
  • +
+

+ WebAssembly is in the interesting position of having two standardized sets of APIs. This does make sense in that one is for the Web and one for the server, and those environments do have different requirements; for similar reasons Node.js does not have identical APIs to JavaScript on the Web. +

+

+ However, there is more than the Web and the server, in particular there are also Wasm plugins. For one thing, plugins can run inside an application that may be on the Web (just like JS plugins) or off the Web; for another, regardless of where the embedding application is, a plugin environment is not a Web nor a server environment. So it's not immediately obvious which sets of APIs will be used - it may depend on the code being ported, the Wasm runtime being embedded, etc. +

+

+ Let's unify as much as possible # +

+

+ One concrete way Emscripten hopes to help here is that by using WASI APIs as much as possible we can avoid unnecessary API differences. As mentioned earlier, on the Web Emscripten code accesses Web APIs indirectly, through JavaScript, so where that JavaScript API could look like WASI, we'd be removing an unnecessary API difference, and that same binary can also run on the server. In other words, if Wasm wants to log some info, it needs to call into JS, something like this: +

+
wasm   =>   function musl_writev(..) { .. console.log(..) .. }
+

+ musl_writev is an implementation of the Linux syscall interface that musl libc uses to write data to a file descriptor, and that ends up calling console.log with the proper data. The Wasm module imports and calls that musl_writev, which defines an ABI between the JS and the Wasm. That ABI is arbitrary (and in fact Emscripten has changed its ABI over time to optimize it). If we replace that with an ABI that matches WASI, we can get this: +

+
wasm   =>   function __wasi_fd_write(..) { .. console.log(..) .. }
+

+ This isn't a big change, just requiring some refactoring of the ABI, and when running in a JS environment it doesn't matter much. But now the Wasm can run without the JS since that WASI API is recognized by WASI runtimes! That’s how the standalone Wasm examples from before work, just by refactoring Emscripten to use WASI APIs. +

+

+ Another advantage of Emscripten using WASI APIs is that we can help the WASI spec by finding real-world issues. For example, we found that changing the WASI "whence" constants would be useful, and we've started some discussions around code size and POSIX compatibility. +

+

+ Emscripten using WASI as much as possible is also useful in that it lets users use a single SDK to target Web, server, and plugin environments. Emscripten isn't the only SDK allowing that, as the WASI SDK's output can be run on the Web using the WASI Web Polyfill or Wasmer's wasmer-js, but Emscripten’s Web output is more compact, so it lets a single SDK be used without compromising Web performance. +

+

+ Speaking of which, you can emit a standalone Wasm file from Emscripten with optional JS in a single command: +

+
emcc -O3 add.c -o add.js -s STANDALONE_WASM
+
+

+ That emits add.js and add.wasm. The Wasm file is standalone just like earlier when we only emitted a Wasm file by itself (STANDALONE_WASM was set automatically when we said -o add.wasm), but now in addition there is a JS file that can load and run it. The JS is useful for running it on the Web if you don't want to write your own JS for that. +

+

+ Do we need non-standalone Wasm? # +

+

+ Why does the STANDALONE_WASM flag exist? In theory Emscripten could always set STANDALONE_WASM, which would be simpler. But standalone Wasm files can't depend on JS, and that has some downsides: +

+
    +
  • We can't minify the Wasm import and export names, as the minification only works if both sides agree, the Wasm and what loads it. +
  • +
  • Normally we create the Wasm Memory in JS so that JS can start to use it during startup, which lets us do work in parallel. But in standalone Wasm we have to create the Memory in the Wasm. +
  • +
  • Some APIs are just easy to do in JS. For example __assert_fail, which is called when a C assertion fails, is normally implemented in JS. It takes just a single line, and even if you include the JS functions it calls, the total code size is quite small. On the other hand, in a standalone build we can't depend on JS, so we use musl's assert.c. That uses fprintf, which means it ends up pulling in a bunch of C stdio support, including things with indirect calls that make it hard to remove unused functions. Overall, there are many such details that end up making a difference in total code size. +
  • +
+

+ If you want to run both on the Web and elsewhere, and you want 100% optimal code size and startup times, you should make two separate builds, one with -s STANDALONE_WASM and one without. That's very easy as it's just flipping one flag! +

+

+ Necessary API differences # +

+

+ We saw that Emscripten uses WASI APIs as much as possible to avoid unnecessary API differences. Are there any necessary ones? Sadly, yes - some WASI APIs require tradeoffs. For example: +

+
    +
  • WASI does not support various POSIX features, like user/group/world file permissions, as a result of which you can't fully implement a (Linux) system ls for example (see details in that link). Emscripten's existing filesystem layer does support some of those things, so if we switched to WASI APIs for all filesystem operations then we'd be losing some POSIX support. +
  • +
  • WASI's path_open has a cost in code size because it forces extra permissions handling in the Wasm itself. That code is unnecessary on the Web. +
  • +
  • WASI doesn't provide a notification API for memory growth, and as a result, JS runtimes must constantly check if memory grew and if so update their views, on every import and export. To avoid that overhead, Emscripten provides a notification API, emscripten_notify_memory_growth, which you can see implemented in a single line in zeux's meshoptimizer that we mentioned earlier. +
  • +
+

+ In time WASI may add more POSIX support, a memory growth notification, etc. - WASI is still highly experimental and expected to change significantly. For now, to avoid regressions in Emscripten we do not emit 100% WASI binaries if you use certain features. In particular, opening files uses a POSIX method instead of WASI, which means that if you call fopen then the resulting Wasm file will not be 100% WASI - however, if all you do is use printf, which operates on the already-open stdout, then it will be 100% WASI, as in the "hello world" example we saw near the beginning, where Emscripten's output does run in WASI runtimes. +

+

+ If it would be useful for users we can add a PURE_WASI option which would sacrifice code size in return for strict WASI compliance, but if that's not urgent (and most plugin use cases we’ve seen so far don’t need full file I/O) then maybe we can wait for WASI to improve to where Emscripten can remove these non-WASI APIs. That would be the best outcome, and we’re working towards that as you can see in the links above. +

+

+ However, even if WASI does improve, there is no avoiding the fact that Wasm has two standardized APIs as mentioned earlier. In the future I expect Emscripten will call Web APIs directly using interface types, because that will be more compact than calling a WASI-looking JS API that then calls a Web API (as in the musl_writev example from before). We could have a polyfill or a translation layer of some sort to help here, but we wouldn't want to use it unnecessarily, so we will need separate builds for Web and WASI environments. (This is somewhat unfortunate; in theory this could have been avoided if WASI were a superset of Web APIs, but obviously that would have meant compromises on the server side.) +

+

+ Current status # +

+

+ Quite a lot works already! The main limitations are: +

+
    +
  • + WebAssembly limitations: Various features, like C++ exceptions, setjmp, and pthreads, depend on JavaScript due to Wasm limitations, and there is no good non-JS replacement yet. (Emscripten may start to support some of them using Asyncify, or maybe we'll just wait for native Wasm features to arrive to VMs.) +
  • +
  • + WASI limitations: Libraries and APIs like OpenGL and SDL don't have corresponding WASI APIs yet. +
  • +
+

+ You can still use all those in Emscripten's standalone mode, but the output will contain calls to JS runtime support code. As a result, it will not be 100% WASI (for similar reasons those features also do not work in the WASI SDK). Those Wasm files won't run in WASI runtimes, but you can use them on the Web and you can write your own JS runtime for them. You can also use them as plugins; for example, a game engine could have plugins that render using OpenGL, and the developer would compile them in standalone mode and then implement the OpenGL imports in the engine's Wasm runtime. Standalone Wasm mode still helps here because it makes the output as standalone as Emscripten can make it. +

+

+ You may also find APIs that do have a non-JS replacement that we haven’t converted yet, as work is still ongoing. Please file bugs, and as always help is welcome! +

+
+ +
+
+ + + + + + diff --git a/resources/tests/readability/videos-1/expected.html b/resources/tests/readability/videos-1/expected.html new file mode 100644 index 0000000..47fe812 --- /dev/null +++ b/resources/tests/readability/videos-1/expected.html @@ -0,0 +1,267 @@ +
+

+ In the introduction to her review anthology For Keeps: 30 Years at the Movies, the legendary film critic Pauline Kael wrote, “I’m frequently asked why I don’t write my memoirs. I think I have.” She meant what most movie critics realize at some point: that reading your past reviews and revisiting the lists of films you liked most during the year reveals not just something about a particular year in cinema, but something about you as well. +

+

+ That’s the feeling I get constructing my list of the best films of 2017, a year that overflowed with great films in every genre, from horror and romantic comedy to documentary and arthouse drama. Some of the films on my list have commonalities — ghosts, meditations on memory and interpersonal connection, and women who refuse to behave — but mostly they underscore just how vibrant cinema remains as an art form, even in the midst of massive cultural shifts in the industry and beyond. And it is a keen reminder to me of all the 2017 conversations I’ve had around and at the movies — and the ways I will never be the same. +

+

+ Here are my top 21 films of 2017 and how to watch them at home, with 14 honorable mentions. +

+

+ 21) Star Wars: The Last Jedi +

+
+ +
+

+ I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic. +

+

+ Star Wars: The Last Jedi is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+

+ 20) Faces Places +

+
+ +
+

+ The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center. +

+

+ Faces Places is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+ +

+ 19) Ingrid Goes West +

+
+ +
+

+ Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison. +

+

+ Ingrid Goes West is currently streaming on Hulu and available to digitally rent on YouTube and Google Play. +

+

+ 18) Lady Macbeth +

+
+ +
+

+ Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end. +

+

+ Lady Macbeth is currently streaming on HBO Go and HBO Now, and it is available to digitally rent on Amazon Prime, Vudu, YouTube, iTunes, and Google Play. +

+

+ 17) BPM (Beats Per Minute) +

+
+ +
+

+ BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating. +

+

+ BPM (Beats Per Minute) is currently streaming on Hulu and available to digitally rent on Google Play and YouTube. +

+

+ 16) The Big Sick +

+
+ +
+

+ Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love. +

+

+ The Big Sick is currently streaming on Amazon Prime and available to digitally rent on iTunes, Vudu, Amazon, YouTube, and Google Play. +

+

+ 15) Mother! +

+
+ +
+

+ There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017. +

+

+ Mother! is available to digitally purchase on Google Play and YouTube. +

+

+ 14) A Ghost Story +

+
+ +
+

+ Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place. +

+

+ A Ghost Story is available to digitally rent on iTunes, Vudu, Amazon, Google Play, and YouTube. +

+

+ 13) The Square +

+
+ +
+ +

+ The Square is currently streaming on Hulu and available to digitally rent on Google Play and YouTube. +

+

+ 12) Dunkirk +

+
+ +
+

+ Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies. +

+

+ Dunkirk is currently streaming on HBO Go and HBO Now, and available to digitally rent on Google Play and YouTube. +

+

+ 11) Rat Film +

+
+ +
+

+ Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up. +

+

+ Rat Film is available to digitally rent on YouTube and Google Play. +

+

+ 10) A Quiet Passion +

+
+ +
+

+ A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet. +

+

+ A Quiet Passion is currently streaming on Amazon Prime and available to digitally rent or purchase on iTunes, Vudu, Amazon, YouTube, and Google Play. +

+

+ 9) Columbus +

+
+ +
+

+ Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us. +

+

+ Columbus is currently streaming on Hulu and available to rent on Google Play and YouTube. +

+

+ 8) The Florida Project +

+
+ +
+

+ Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations. +

+

+ The Florida Project is currently streaming on Amazon Prime and available to digitally rent on YouTube, Vudu, and Google Play. +

+

+ 7) Call Me by Your Name +

+
+ +
+

+ Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul. +

+

+ Call Me by Your Name is available to digitally purchase on Amazon, YouTube, and Google Play. +

+

+ 6) Personal Shopper +

+
+ +
+

+ In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.) +

+

+ Personal Shopper is currently streaming on Showtime and available to rent on Vudu, YouTube, Amazon, iTunes, and Google Play. +

+

+ 5) Princess Cyd +

+
+ +
+

+ Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle. +

+

+ Princess Cyd is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+

+ 4) Get Out +

+
+ +
+

+ Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor. +

+

+ Get Out is currently streaming on HBO Go and HBO Now, and is available to digitally rent on iTunes, Amazon, Google Play, YouTube, and Vudu. +

+

+ 3) The Work +

+
+ +
+

+ The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go. +

+

+ The Work is streaming on Topic.com and available to digitally rent on Google Play and YouTube. +

+

+ 2) Ex Libris +

+
+ +
+

+ Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better. +

+

+ Ex Libris will air on PBS in the fall and then be available to cardholders in many library systems across the country via Kanopy. +

+

+ 1) Lady Bird +

+
+ +
+

+ Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?” +

+

+ Lady Bird is currently streaming on Amazon Prime and available to digitally rent on Amazon, Google Play, and YouTube. +

+

+ Honorable mentions: Marjorie Prime, Phantom Thread, Casting JonBenet, The Post, The Shape of Water, Logan Lucky, I, Tonya, The Lost City of Z, Graduation, Spettacolo, Loveless, Restless Creature: Wendy Whelan, In Transit, The Reagan Show +

+
diff --git a/resources/tests/readability/videos-1/source.html b/resources/tests/readability/videos-1/source.html new file mode 100644 index 0000000..9e407bc --- /dev/null +++ b/resources/tests/readability/videos-1/source.html @@ -0,0 +1,1319 @@ + + + + + How to watch the 21 best films of 2017 - Vox + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + clock + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + menu + + + + + + more-arrow + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + + + + + + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+
+
+

+ The 21 best movies of 2017 +

+
+

+ How to watch the greatest movies of the year, from Lady Bird and Dunkirk to Get Out and The Big Sick. +

+ +
+ +
+
+
+
+ Javier Zarracina/Vox +
+ +
+

+ In the introduction to her review anthology For Keeps: 30 Years at the Movies, the legendary film critic Pauline Kael wrote, “I’m frequently asked why I don’t write my memoirs. I think I have.” She meant what most movie critics realize at some point: that reading your past reviews and revisiting the lists of films you liked most during the year reveals not just something about a particular year in cinema, but something about you as well. +

+

+ That’s the feeling I get constructing my list of the best films of 2017, a year that overflowed with great films in every genre, from horror and romantic comedy to documentary and arthouse drama. Some of the films on my list have commonalities — ghosts, meditations on memory and interpersonal connection, and women who refuse to behave — but mostly they underscore just how vibrant cinema remains as an art form, even in the midst of massive cultural shifts in the industry and beyond. And it is a keen reminder to me of all the 2017 conversations I’ve had around and at the movies — and the ways I will never be the same. +

+

+ Here are my top 21 films of 2017 and how to watch them at home, with 14 honorable mentions. +

+

+ 21) Star Wars: The Last Jedi +

+
+
+ +
+
+

+ I am as shocked as anyone that a Star Wars movie found its way onto my list — but I was bowled over by The Last Jedi, which may be one of the series’ best. In the hands of writer-director Rian Johnson (who will also oversee a new Star Wars trilogy), The Last Jedi is beautiful to look at and keeps its eye on the relationships between characters and how they communicate with one another, in addition to the bigger galactic story. The same characters are back, but they seem infused with new life, and the galaxy with a new kind of hope. The movie’s best details are in the strong bonds that develop between characters, and I left the film with the realization that for the first time in my life, I loved a Star Wars movie. Now I understand the magic. +

+

+ Star Wars: The Last Jedi is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+

+ 20) Faces Places +

+
+
+ +
+
+

+ The unusual documentary Faces Places (in French, Visages Villages) turns on the friendship between the accomplished street artist JR and legendary film director Agnès Varda, whose work was central to the development of the French New Wave movement. The pair (whose difference in age is 55 years) met after years of admiring each other’s work and decided to create a documentary portrait of France — by making a number of actual portraits. The film chronicles a leg of the "Inside Outside Project," a roving art initiative in which JR makes enormous portraits of people he meets and pastes them onto buildings and walls. In the film, Varda joins him, and as they talk to people around the country, they grow in their understanding of themselves and of each other. The development of their friendship, which is both affectionate and mutually sharpening, forms Faces Places’ emotional center. +

+

+ Faces Places is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+ +

+ 19) Ingrid Goes West +

+
+
+ +
+
+

+ Ingrid Goes West is a twisted and dark comedy — part addiction narrative, part stalker story — and yet it’s set in a world that’s almost pathologically cheery: the glossy, sunny, nourishing, superfood- and superlative-loving universe of Instagram celebrity. But despite Ingrid Goes West’s spot-on take on that world, the best thing about the film is that it refuses to traffic in lazy buzzwords and easy skewering, particularly at the expense of young women. Instead, the movie conveys that behind every Instagram image and meltdown is a real person, with real insecurities, real feelings, and real problems. And it recognizes that living a life performed in public can be its own kind of self-deluding prison. +

+

+ Ingrid Goes West is currently streaming on Hulu and available to digitally rent on YouTube and Google Play. +

+

+ 18) Lady Macbeth +

+
+
+ +
+
+

+ Lady Macbeth is no placid costume drama. Adapted from an 1865 Russian novella by Nikolai Leskov, the movie follows Katherine (the astounding Florence Pugh), a woman in the Lady Macbeth line characterized by a potent cocktail of very few scruples and a lot of determination. She's a chilling avatar for the ways that class and privilege — both obvious and hidden — insulate some people from the consequences of their actions while damning others. Lady Macbeth is also a dazzling directorial debut from William Oldroyd, a thrilling combination of sex, murder, intrigue, and power plays. It’s visually stunning, each frame composed so carefully and deliberately that the wildness and danger roiling just below the surface feels even more frightening. Each scene ratchets up the tension to an explosive, chilling end. +

+

+ Lady Macbeth is currently streaming on HBO Go and HBO Now, and it is available to digitally rent on Amazon Prime, Vudu, YouTube, iTunes, and Google Play. +

+

+ 17) BPM (Beats Per Minute) +

+
+
+ +
+
+

+ BPM (Beats Per Minute) is a remarkably tender and stirring story of the Paris chapter of ACT UP, an AIDS activism group, and the young people who found themselves caught in the crosshairs of the AIDS crisis in the early 1990s. The film follows both the group's actions and the individual members’ shifting relationships to one another — enemies becoming friends, friends becoming lovers, lovers becoming caretakers — as well as their struggles with the disease wracking their community. As an account of the period, it’s riveting; as an exploration of life and love set at the urgent intersection of the political and the personal, it’s devastating. +

+

+ BPM (Beats Per Minute) is currently streaming on Hulu and available to digitally rent on Google Play and YouTube. +

+

+ 16) The Big Sick +

+
+
+ +
+
+

+ Few 2017 movies could top the charm and tenderness of The Big Sick, which hits all the right romantic comedy notes with one unusual distinction: It feels like real life. That’s probably because The Big Sick is written by real-life married couple Emily V. Gordon and Silicon Valley's Kumail Nanjiani, and based on their real-life romance. The Big Sick — which stars Nanjiani as a version of himself, alongside Zoe Kazan as Emily — is funny and sweet while not backing away from matters that romantic comedies don’t usually touch on, like serious illness, struggles in long-term marriages, and religion. As it tells the couple’s story, which takes a serious turn when Emily falls ill with a mysterious infection and her parents (played by Holly Hunter and Ray Romano) come to town, it becomes a funny and wise story about real love. +

+

+ The Big Sick is currently streaming on Amazon Prime and available to digitally rent on iTunes, Vudu, Amazon, YouTube, and Google Play. +

+

+ 15) Mother! +

+
+
+ +
+
+

+ There’s so much pulsing beneath the surface of Mother! that it’s hard to grab on to just one theme as what it “means.” It’s full-on apocalyptic fiction, and like all stories of apocalypse, it’s intended to draw back the veil on reality and show us what’s really beneath. And this movie gets wild: If its gleeful cracking apart of traditional theologies doesn’t get you (there’s a lot of Catholic folk imagery here, complete with an Ash Wednesday-like mud smearing on the foreheads of the faithful), its bonkers scenes of chaos probably will. Mother! is a movie designed to provoke fury, ecstasy, madness, catharsis, and more than a little awe. Watching it, and then participating in the flurry of arguments and discussions unpacking it, was among my best moviegoing experiences of 2017. +

+

+ Mother! is available to digitally purchase on Google Play and YouTube. +

+

+ 14) A Ghost Story +

+
+
+ +
+
+

+ Director David Lowery filmed A Ghost Story in secret, then premiered it at the Sundance Film Festival to critical acclaim. The movie starts out being about a grieving widow (Rooney Mara) trying to live through the pain of losing her beloved husband, but it soon shifts focus to the ghost of her husband (Casey Affleck, covered in a sheet), evolving into a compelling rumination on the nature of time, memory, history, and the universe. Bathed in warm humor and wistful longing, it's a film that stays with you long after it’s over, a lingering reminder of the inextricable link between love and place. +

+

+ A Ghost Story is available to digitally rent on iTunes, Vudu, Amazon, Google Play, and YouTube. +

+

+ 13) The Square +

+
+
+ +
+
+ +

+ The Square is currently streaming on Hulu and available to digitally rent on Google Play and YouTube. +

+

+ 12) Dunkirk +

+
+
+ +
+
+

+ Dunkirk, a true cinematic achievement from acclaimed director Christopher Nolan, backs off conventional notions of narrative and chronology as much as possible, while leaning headfirst into everything else that makes a movie a visceral work of art aimed at the senses: the images, the sounds, the scale, the swelling vibrations of it all. You can’t smell the sea spray, but your brain may trick you into thinking you can. Nolan’s camera pushes the edges of the screen as far as it can as Dunkirk engulfs the audience in something that feels like a lot more than a war movie. It’s a symphony for the brave and broken, and it resolves in a major key — but one with an undercurrent of sorrow, and of sober warning. Courage in the face of danger is not just for characters in movies. +

+

+ Dunkirk is currently streaming on HBO Go and HBO Now, and available to digitally rent on Google Play and YouTube. +

+

+ 11) Rat Film +

+
+
+ +
+
+

+ Rat Film is about rats, yes — and rat poison experts and rat hunters and people who keep rats as pets. But it’s also about the history of eugenics, dubious science, “redlining,” and segregated housing in Baltimore. All these pieces come together to form one big essay, where the meaning of each vignette only becomes clearer in light of the whole. It’s a fast-paced, no-holds-barred exploration of a damning history, and it accrues meaning as the images, sounds, and text pile up. +

+

+ Rat Film is available to digitally rent on YouTube and Google Play. +

+

+ 10) A Quiet Passion +

+
+
+ +
+
+

+ A Quiet Passion is technically a biographical film about Emily Dickinson, but it transcends its genre to become something more like poetry. It’s a perplexing and challenging film, crafted without the traditional guardrails that guide most biographical movies — dates, times, major accomplishments, and so on. Time slips away in the film almost imperceptibly, and the narrative arc doesn’t yield easily to the viewer. Cynthia Nixon plays Emily Dickinson, whose poetry and life is a perfect match for the signature style of director Terence Davies: rich in detail, deeply enigmatic, and weighed down with a kind of sparkling, joy-tinged sorrow. A Quiet Passion is a portrait, both visual and narrative, of the kind of saint most modern people can understand: one who is certain of her uncertainty, and yearning to walk the path on which her passion and longing meet. +

+

+ A Quiet Passion is currently streaming on Amazon Prime and available to digitally rent or purchase on iTunes, Vudu, Amazon, YouTube, and Google Play. +

+

+ 9) Columbus +

+
+
+ +
+
+

+ Columbus is a stunner of a debut from video essayist turned director Kogonada. Haley Lu Richardson stars as Casey, a young woman living in Columbus, Indiana, who cares for her mother, works at a library, and harbors a passion for architecture. (Columbus is a mecca for modernist architecture scholars and enthusiasts.) When a visiting architecture scholar falls into a coma in Columbus, his estranged son Jin (John Cho) arrives to wait for him and strikes up a friendship with Casey, who starts to show him her favorite buildings. The two begin to unlock something in each other that’s hard to define but life-changing for both. Columbus is beautiful and subtle, letting us feel how the places we build and the people we let near us move and mold us. +

+

+ Columbus is currently streaming on Hulu and available to rent on Google Play and YouTube. +

+

+ 8) The Florida Project +

+
+
+ +
+
+

+ Sean Baker’s The Florida Project unfolds at first like a series of sketches about the characters who live in a purple-painted, $35-a-night motel called the Magic Castle down the street from Disney World. The film is held together by the hysterical antics of a kid named Moonee and her pack of young friends, as well as long-suffering hotel manager Bobby (a splendid, warm Willem Dafoe), who tries to put up with it all while keeping some kind of order. But as The Florida Project goes on, a narrative starts to form, one that chronicles with heartbreaking attention the sort of dilemmas that face poor parents and their children in America, and the broken systems that try to cope with impossible situations. +

+

+ The Florida Project is currently streaming on Amazon Prime and available to digitally rent on YouTube, Vudu, and Google Play. +

+

+ 7) Call Me by Your Name +

+
+
+ +
+
+

+ Luca Guadagnino’s gorgeous film Call Me by Your Name adapts André Aciman’s 2007 novel about a precocious 17-year-old named Elio (Timothée Chalamet), who falls in lust and love with his father’s 24-year-old graduate student Oliver (Armie Hammer). It’s remarkable for how it turns literature into pure cinema, all emotion and image and heady sensation. Set in 1983 in Northern Italy, Call Me by Your Name is less about coming out than coming of age, but it also captures a particular sort of love that’s equal parts passion and torment, a kind of irrational heart fire that opens a gate into something longer-lasting. The film is a lush, heady experience for the body, but it’s also an arousal for the soul. +

+

+ Call Me By Your Name is available to digitally purchase on Amazon, YouTube, and Google Play. +

+

+ 6) Personal Shopper +

+
+
+ +
+
+

+ In her second collaboration with French director Olivier Assayas, Kristen Stewart plays a personal shopper to a wealthy socialite, with a sideline as an amateur ghost hunter who’s searching for her dead twin brother. Personal Shopper is deeper than it seems at first blush, a meditation on grief and an exploration of “between” places — on the fringes of wealth, and in the space between life and death. Some souls are linked in a way that can’t be shaken, and whether or not there’s an afterlife doesn’t change the fact that we see and sense them everywhere. (Personal Shopper also has one of the most tense extended scenes involving text messaging ever seen onscreen.) +

+

+ Personal Shopper is currently streaming on Showtime and available to rent on Vudu, YouTube, Amazon, iTunes, and Google Play. +

+

+ 5) Princess Cyd +

+
+
+ +
+
+

+ Stephen Cone is a master of small, carefully realized filmmaking; his earlier films such as The Wise Kids and Henry Gamble’s Birthday Party combine an unusual level of empathy for his characters with an unusual combination of interests: love, desire, sexual awakenings, and religion. Princess Cyd is his most accomplished film yet, about a young woman named Cyd (Jessie Pinnick) who finds herself attracted to Katie (Malic White), a barista, while visiting her Aunt Miranda (Rebecca Spence, playing a character modeled on the author Marilynne Robinson) in Chicago. As she works through her own sexual awakening with Katie, Cyd unwinds some of the ways Miranda’s life has gotten too safe. They provoke each other while forming a bond and being prodded toward a bigger understanding of the world. It is a graceful and honest film, and it feels like a modest miracle. +

+

+ Princess Cyd is currently streaming on Netflix and available to digitally rent on Google Play and YouTube. +

+

+ 4) Get Out +

+
+
+ +
+
+

+ Racism is sinister, frightening, and deadly. But Get Out (a stunning directorial debut from Key & Peele's Jordan Peele) isn’t about the blatantly, obviously scary kind of racism — burning crosses and lynchings and snarling hate. Instead, it’s interested in showing how the parts of racism that try to be aggressively unscary are just as horrifying, and it’s interested in making us feel that horror in a visceral, bodily way. In the tradition of the best classic social thrillers, Get Out takes a topic that is often approached cerebrally — casual racism — and turns it into something you feel in your tummy. And it does it with a wicked sense of humor. +

+

+ Get Out is currently streaming on HBO Go and HBO Now, and is available to digitally rent on iTunes, Amazon, Google Play, YouTube, and Vudu. +

+

+ 3) The Work +

+
+
+ +
+
+

+ The Work is an outstanding, astonishing accomplishment and a viewing experience that will leave you shaken (but in a good way). At Folsom Prison in California, incarcerated men regularly participate in group therapy, and each year other men from the “outside” apply to participate in an intense four-day period of group therapy alongside Folsom’s inmates. The Work spends almost all of its time inside the room where that therapy happens, observing the strong, visceral, and sometimes violent emotions the men feel as they expose the hurt and raw nerves that have shaped how they encounter the world. Watching is not always easy, but by letting us peek in, the film invites viewers to become part of the experience — as if we, too, are being asked to let go. +

+

+ The Work is streaming on Topic.com and available to digitally rent on Google Play and YouTube. +

+

+ 2) Ex Libris +

+
+
+ +
+
+

+ Frederick Wiseman is one of the towering giants of nonfiction film, a keen observer of American institutions — ranging from prisons to dance companies to welfare offices — for the past half-century. Ex Libris is his mesmerizing look at the New York Public Library and the many functions it fills, which go far beyond housing books. Wiseman works in the observational mode, which means his films contain no captions, dates, or talking-head interviews: We just see what his camera captured, which in this case includes community meetings, benefit dinners, after-school programs, readings with authors and scholars (including Richard Dawkins and Ta-Nehisi Coates), and NYPL patrons going about their business in the library’s branches all over the city. The result is almost hypnotic and, perhaps surprisingly, deeply moving. It makes a case for having faith in the public institutions where ordinary people work — away from the limelight, without trying to score political points — in order to make our communities truly better. +

+

+ Ex Libris will air on PBS in the fall and then be available to cardholders in many library systems across the country via Kanopy. +

+

+ 1) Lady Bird +

+
+
+ +
+
+

+ Lady Bird topped my list almost instantly, and only rose in my estimation on repeated viewings. For many who saw it (including me), it felt like a movie made not just for but about me. Lady Bird is a masterful, exquisite coming-of-age comedy starring the great Saoirse Ronan as Christine — or “Lady Bird,” as she’s re-christened herself — and it’s as funny, smart, and filled with yearning as its heroine. Writer-director Greta Gerwig made the film as an act of love, not just toward her hometown of Sacramento but also toward girlhood, and toward the feeling of always being on the outside of wherever real life is happening. Lady Bird is the rare movie that manages to be affectionate, entertaining, hilarious, witty, and confident. And one line from it struck me as the guiding principle of many of the year’s best films: “Don’t you think they are the same thing? Love, and attention?” +

+

+ Lady Bird is currently streaming on Amazon Prime and available to digitally rent on Amazon, Google Play, and YouTube. +

+

+ Honorable mentions: Marjorie Prime, Phantom Thread, Casting JonBenet, The Post, The Shape of Water, Logan Lucky, I, Tonya, The Lost City of Z, Graduation, Spettacolo, Loveless, Restless Creature: Wendy Whelan, In Transit, The Reagan Show +

+
+
+
+ +
+
+
+
+ + +
+
+ +
+ +
+
+
+
+ +
+ +
+
+ + +
+ + + + + + diff --git a/resources/tests/readability/videos-2/expected.html b/resources/tests/readability/videos-2/expected.html new file mode 100644 index 0000000..55e1498 --- /dev/null +++ b/resources/tests/readability/videos-2/expected.html @@ -0,0 +1,109 @@ +
+

+ Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine. +

+

+ Pour dépasser le tabac +

+

+ Vape Wave (documentaire, 1h28, Planète+) +

+

+ +

+

+ Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée. +

+

+ Si le film du réalisateur de Dobermann et 99 Francs part un peu dans tous les sens, il a le mérite de défendre avec une passion contagieuse ce qui semble, de loin, être le meilleur et plus sain substitut à la clope, n’en déplaise aux mesures restrictives imposées en France à son égard. Financé en partie via crowdfunding, le documentaire a été présenté par Kounen à travers toute la France lors de projection tenant quasiment de l’évangélisation. Disponible en VOD/DVD, il a été diffusé cette semaine sur la chaîne Planète+, qui le rediffusera les 25/11, 30/11 et 02/12 prochains. (Alexandre Hervaud) +

+

+ Pour écouter parler un génie +

+

+ Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative) +

+

+ +

+

+ Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.) +

+

+ Pour honorer la mémoire d’une icône queer +

+

+ The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix) +

+

+ +

+

+ Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente. +

+

+ Marsha P. Johnson de son nom complet, icône queer, femme transgenre noire américaine et emblème de la lutte pour les droits des LGBTQ avait été l’une des premières à s’engager lors des émeutes de Stonewall à New York, en 1969 : «C’est la révolution. Dieu merci.» Marsha était une fleur souriante au parfum d’espoir. Le documentaire The Death and Life of Marsha P. Johnson du cinéaste David France relate l’enquête de l’activiste Victoria Cruz, membre de l’organisation Anti-Violence Project à New York qui, avant de prendre sa retraite, réclame que lumière soit faite sur la disparition de l’icône […] Lire la suite de la critique de Jérémy Piette sur Libération.fr +

+

+ Pour Michel Vuilermoz (et rien d’autre) +

+

+ Alphonse President (série, 10x26, OCS Max) +

+

+ +

+

+ Un temps baptisée French Touch, la série Alphonse Président est le dernier né des programmes originaux made in OCS. On savait les budgets de la chaîne bien moins généreux que ceux de Canal+ (voire que ceux de France 3 Limousin), et cette série le prouve à nouveau régulièrement, notamment lors d’une scène de conférence de presse alternant plans larges d’une authentique conf' à l’Elysée période François Hollande et plans serrés d’acteurs filmés dans un château des Pays de la Loire où a eu lieu le tournage. Le principal atout (et quel atout) de cette série écrite et réalisée par Nicolas Castro (Des lendemains qui chantent, 2014) réside dans son interprète principal, Michel Vuillermoz. +

+

+ Dans le rôle d’un sénateur ringard devenu par un concours de circonstances président de la République, ce pensionnaire de la Comédie-Française et complice d’Albert Dupontel fait des merveilles, notamment lorsque le scénario lui prête des répliques enflammées typiques de la langue de bois politicienne – pas étonnant qu’il brasse du vent, son personnage de prof d’histoire retraité s’appelle Alphonse Dumoulin. C’est lorsqu’il n’est plus à l’écran que les choses se gâtent : si Jean-Michel Lahmi (de la bande d’Edouard Baer) fait le job en grand patron des flics, difficile de croire une seconde à Nabiha Akkari dans le rôle de la Première ministre – et pas uniquement parce que l’idée d’avoir une femme trentenaire issue de la diversité à Matignon sonne hélas comme un doux rêve en 2017. Si, en matière de fiction politique sérieuse, un Baron Noir n’a pas grand-chose à envier à un House of Cards, côté comique la France est encore loin d’avoir son Veep. Gageons que la génération LREM saura largement inspirer des scénaristes moqueurs. (A.H.) +

+

+ Pour les coulisses d’un tournage dément +

+

+ Jim & Andy (documentaire, 1h33, Netflix)  +

+

+ +

+

+ A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman. +

+

+ Dans le passionnant Jim & Andy : the Great Beyond, disponible sur Netflix, Chris Smith a monté ces documents inédits parallèlement à un entretien dans lequel Jim Carrey revient sur cette expérience unique. Lire la suite de la critique de Marcos Uzal sur Liberation.fr +

+

+ Pour un trip sibérien en totale autarcie +

+

+ Braguino (documentaire, 50min, Arte) +

+

+ +

+

+ La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer. +

+

+ La lecture d’Ermites dans la taïga (1992) de Vassili Peskov, authentique récit sur la famille Lykov opérant une migration similaire en 1938, a poussé l’artiste Clément Cogitore à rencontrer les Braguine, puis à se faire témoin de la bisbille de voisinage en 2016. Il en est revenu avec un nouveau film d’une cinquantaine de minutes : Braguino, soutenu par le prix Le Bal de la jeune création avec l’ADAGP. Le documentaire y frôle son déguisement fictionnel, tant ce qui s’y déroule convoque une dramaturgie comme invoquée par on ne sait quel rituel vaudou […] Lire la suite de la critique de Jérémy Piette sur Liberation.fr, le film diffusé cette semaine sur Arte est visible en intégralité ci-dessus. +

+

+ Pour un thriller tiré de faits réels +

+

+ 6 Days (film, 1h34, Netflix) +

+

+ +

+

+ Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade. +

+

+ Inspiré par cet épisode, 6 Days de Toa Fraser (The Dead Lands, 2014) est un thriller carré pouvant compter sur l'autorité naturelle de Mark Strong (Kingsman) ici recyclé en flic londonien et sur la néo-badass attitude de Jamie Bell, bien loin du freluquet danseur de Billy Elliot puisqu'on le retrouve ici en soldat chargé d’organiser l’opération de secours. Attention, la bande-annonce ci-dessus dévoile à peu près l’intégralité des scènes d’action du film. (A.H.) +

+

Alexandre Hervaud , Jérémy Piette +

+
diff --git a/resources/tests/readability/videos-2/source.html b/resources/tests/readability/videos-2/source.html new file mode 100644 index 0000000..5cfc0ba --- /dev/null +++ b/resources/tests/readability/videos-2/source.html @@ -0,0 +1,2043 @@ + + + + + Screenshot : «Vape Wave», «6 Days», «Alphonse Président»… - Culture / Next + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Search + + + + + + + + + Direct + + + + + + User + + + + + + + + + 100 + + + + + + + + Zoom + + + + + + Quiz + + + ? + + + + + + Libération Diamond + + + + + + clock + + + + + + + + + xml + + + + + + netvibes + + + + + + + + live + + + + + + anciens-numeros + + + + + + + + data + + + + + + + + + desintox + + + + + + + + + + + + + + + diapo + + + + + + + + edito + + + + + + + + + election-2017 + + + + + + + + + + + election + + + + + + + + essentiel + + + + + + + + generique + + + + + + idee + + + + + + + + + + + + + + + jo + + + + + + + + + + + + next + + + + + + + + portrait + + + + + + + + + + radio + + + + + + + + + + + son + + + + + + + + + alerte + + + + + + top-100 + + + + + + + + + star + + + + + + une + + + + + + + video + + + + + + + scroll + + + + + + + politiques + + + + + + + + + + + + + + + + food + + + + + + + + + + + + sciences + + + + + + + + + + + + Facebook + + + + + + Whatsapp + + + + + + + + Twitter + + + + + + insta + + + + + + vine + + + + + + later + + + + + + + + glass + + + + + + Mail + + + + + + print + + + + + + Facebook + + + + + + Instagram + + + + + + Twitter + + + + + + Calendar + + + + + + download + + + + + + cross + + + + + + zoom-in + + + + + + + + + zoom-out + + + + + + + + + previous + + + + + + + + + next + + + + + + + + 
+ truck + + + + + + + + + + + visa + + + + + + + + + mastercard + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + user-libe + + + + + + user-doc + + + + + + user-doc-list + + + + + + + + + user-mail + + + + + + + user-security + + + + + + user-settings + + + + + + user-shop + + + + + + + + + user-star + + + + + + + Ruban abo + + + + + + Losange orange + + + + + + List check + + + + + + Most read + + + + + + Ptit Libé + + + + + + + sport + + + + + + + + + blog + + + + + + + + voyage + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +
+ +
+ + + + +
+ +
+
+
+
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + Sur vos écrans cette semaine +
    +

    + Screenshot : «Vape Wave», «6 Days», «Alphonse Président»… +

    +
    + Par Alexandre Hervaud et Jérémy Piette +
    +
    +
    +
    +
    +
    +
    + «Vape Wave», «6 Days», «Alphonse président» et «Braguino» + +
    + «Vape Wave», «6 Days», «Alphonse président» et «Braguino» DR + +
    +
    +
    +

    + Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine. +

    +

    + Pour dépasser le tabac +

    +

    + Vape Wave (documentaire, 1h28, Planète+) +

    +

    + +

    +

    + Pendant quelques jours, le doute a plané : l’Etat comptait-il vraiment légiférer contre la cigarette dans les films français, que ce soit via une interdiction pure et simple ou via un système de «punition» (coupe des aides CNC, par exemple) pour les longs-métrages qui sentent le mégot ? Si le rétropédalage de la ministre Buzyn n’en est pas vraiment un (elle n’avait jamais clairement menacé le septième art), la polémique a le mérite de pointer la (sur)représentation clopesque sur écran. Et si, comme c’est le cas dans la vie quotidienne, on voyait progressivement les cigarettes électroniques remplacer les tiges nicotinées authentiques ? Que ceux qui mettraient en doute le potentiel cinématographique des vapoteuses se ruent sur Vape Wave, documentaire militant signé Jan Kounen, ex-fumeur reconverti à la vape dont les images magnifient les volutes de vapeur recrachée. +

    +

    + Si le film du réalisateur de Dobermann et 99 Francs part un peu dans tous les sens, il a le mérite de défendre avec une passion contagieuse ce qui semble, de loin, être le meilleur et plus sain substitut à la clope, n’en déplaise aux mesures restrictives imposées en France à son égard. Financé en partie via crowdfunding, le documentaire a été présenté par Kounen à travers toute la France lors de projection tenant quasiment de l’évangélisation. Disponible en VOD/DVD, il a été diffusé cette semaine sur la chaîne Planète+, qui le rediffusera les 25/11, 30/11 et 02/12 prochains. (Alexandre Hervaud) +

    +

    + Pour écouter parler un génie +

    +

    + Dans la tête d’Alan Moore (websérie documentaire, 8x5min, Arte Creative) +

    +

    + +

    +

    + Le week-end dernier, Libération publiait un portrait de der consacré à l’auteur britannique Alan Moore, connu pour ses BD cultes (V pour Vendetta, Watchmen, From Hell), à l’occasion de la sortie de son deuxième roman, le pavé Jérusalem. En attendant l’imminente sortie d’une version longue de son entretien avec Libé, on pourra se replonger dans les épisodes d’une websérie documentaire d’Arte Creative en 8 épisodes consacré au maître. Brexit, magie, Anonymous font partie des sujets discutés avec le maître au fil de ce programme sobrement intitulé Dans la tête d’Alan Moore. (A.H.) +

    +

    + Pour honorer la mémoire d’une icône queer +

    +

    + The Death and Life of Marsha P. Johnson (docu, 1h45, Netflix) +

    +

    + +

    +

    + Marsha, la «Rosa Parks du mouvement LGBTQ». Marsha «la prostituée, l’actrice et la sainte, modèle d’Andy Warhol» ou encore Marsha l’élaborée, la radicale, «avec ses plumes et ce maquillage qu’elle ne mettait jamais bien». «Queen Marsha» a été retrouvée morte dans l’Hudson en juillet 1992, alors qu’on la voyait encore parader dans les rues de Greenwich Village quelques jours auparavant. Un choc glaçant. Là où son corps a été repêché puis ingratement déposé, les sans-abri ont constitué le lendemain un mémorial de bouteilles et de plantes qui délimitent les contours de l’absente. +

    +

    + Marsha P. Johnson de son nom complet, icône queer, femme transgenre noire américaine et emblème de la lutte pour les droits des LGBTQ avait été l’une des premières à s’engager lors des émeutes de Stonewall à New York, en 1969 : «C’est la révolution. Dieu merci.» Marsha était une fleur souriante au parfum d’espoir. Le documentaire The Death and Life of Marsha P. Johnson du cinéaste David France relate l’enquête de l’activiste Victoria Cruz, membre de l’organisation Anti-Violence Project à New York qui, avant de prendre sa retraite, réclame que lumière soit faite sur la disparition de l’icône […] Lire la suite de la critique de Jérémy Piette sur Libération.fr +

    +

    + Pour Michel Vuilermoz (et rien d’autre) +

    +

    + Alphonse President (série, 10x26, OCS Max) +

    +

    + +

    +

    + Un temps baptisée French Touch, la série Alphonse Président est le dernier né des programmes originaux made in OCS. On savait les budgets de la chaîne bien moins généreux que ceux de Canal+ (voire que ceux de France 3 Limousin), et cette série le prouve à nouveau régulièrement, notamment lors d’une scène de conférence de presse alternant plans larges d’une authentique conf' à l’Elysée période François Hollande et plans serrés d’acteurs filmés dans un château des Pays de la Loire où a eu lieu le tournage. Le principal atout (et quel atout) de cette série écrite et réalisée par Nicolas Castro (Des lendemains qui chantent, 2014) réside dans son interprète principal, Michel Vuillermoz. +

    +

    + Dans le rôle d’un sénateur ringard devenu par un concours de circonstances président de la République, ce pensionnaire de la Comédie-Française et complice d’Albert Dupontel fait des merveilles, notamment lorsque le scénario lui prête des répliques enflammées typiques de la langue de bois politicienne – pas étonnant qu’il brasse du vent, son personnage de prof d’histoire retraité s’appelle Alphonse Dumoulin. C’est lorsqu’il n’est plus à l’écran que les choses se gâtent : si Jean-Michel Lahmi (de la bande d’Edouard Baer) fait le job en grand patron des flics, difficile de croire une seconde à Nabiha Akkari dans le rôle de la Première ministre – et pas uniquement parce que l’idée d’avoir une femme trentenaire issue de la diversité à Matignon sonne hélas comme un doux rêve en 2017. Si, en matière de fiction politique sérieuse, un Baron Noir n’a pas grand-chose à envier à un House of Cards, côté comique la France est encore loin d’avoir son Veep. Gageons que la génération LREM saura largement inspirer des scénaristes moqueurs. (A.H.) +

    +

    + Pour les coulisses d’un tournage dément +

    +

    + Jim & Andy (documentaire, 1h33, Netflix)  +

    +

    + +

    +

    + A la sortie de Man on the Moon (2000), le magnifique film de Milos Forman consacré à Andy Kaufman – comique et génie de la performance absurde mort en 1984 –, le cinéaste et les acteurs insistaient dans chaque interview sur l’in­croyable comportement de Jim Carrey pendant le tournage : il aurait été comme possédé par Kaufman, se prenant pour lui 24 heures sur 24. Certains affirmaient même ne jamais avoir eu l’impression que l’acteur était présent, tant son modèle avait littéralement pris sa place. Nous en avons aujourd’hui la preuve en images car tout cela avait été filmé par Bob Zmuda et Lynne Margulies, l’ancien complice et la veuve de Kaufman. +

    +

    + Dans le passionnant Jim & Andy : the Great Beyond, disponible sur Netflix, Chris Smith a monté ces documents inédits parallèlement à un entretien dans lequel Jim Carrey revient sur cette expérience unique. Lire la suite de la critique de Marcos Uzal sur Liberation.fr +

    +

    + Pour un trip sibérien en totale autarcie +

    +

    + Braguino (documentaire, 50min, Arte) +

    +

    + +

    +

    + La querelle peut se trouver derrière toutes les portes, y compris celle de l’exil. On a beau croire avoir tourné le dos à tout, à cette inclination humaine à nourrir sa propre haine, l’allergie peut regermer fissa sur une peau qui frissonne à l’approche de ce voisin que l’on ne comprend pas. Issu d’une lignée de vieux-croyants orthodoxes russes, Sacha Braguine a pris sa famille sous le bras, loin de toute autre présence humaine en taïga sibérienne. Un autre groupe, les Kiline, a décidé d’en faire de même et de s’installer de l’autre côté de la rivière. Qui est arrivé en premier ? Qui menace l’autre ? L’histoire de l’impossible communauté peut commencer. +

    +

    + La lecture d’Ermites dans la taïga (1992) de Vassili Peskov, authentique récit sur la famille Lykov opérant une migration similaire en 1938, a poussé l’artiste Clément Cogitore à rencontrer les Braguine, puis à se faire témoin de la bisbille de voisinage en 2016. Il en est revenu avec un nouveau film d’une cinquantaine de minutes : Braguino, soutenu par le prix Le Bal de la jeune création avec l’ADAGP. Le documentaire y frôle son déguisement fictionnel, tant ce qui s’y déroule convoque une dramaturgie comme invoquée par on ne sait quel rituel vaudou […] Lire la suite de la critique de Jérémy Piette sur Liberation.fr, le film diffusé cette semaine sur Arte est visible en intégralité ci-dessus. +

    +

    + Pour un thriller tiré de faits réels +

    +

    + 6 Days (film, 1h34, Netflix) +

    +

    + +

    +

    + Fin avril 1980, l’ambassade d’Iran à Londres a été le théâtre d’une prise d’otages largement médiatisée : une trentaine de personnes ont ainsi été retenues pendant six jours par des soldats iraniens dissidents exigeant la libération de 91 prisonniers. Avec Margaret Thatcher au 10 Downing Street à l’époque, pas question pour l’Angleterre d’avoir l’air mou du genou sur la réponse à apporter à cette crise scrutée par les caméras du monde entier. Le SAS (Special Air Service) est sur le coup : l’opération Nimrod se met en place pour prendre d’assaut l’ambassade. +

    +

    + Inspiré par cet épisode, 6 Days de Toa Fraser (The Dead Lands, 2014) est un thriller carré pouvant compter sur l'autorité naturelle de Mark Strong (Kingsman) ici recyclé en flic londonien et sur la néo-badass attitude de Jamie Bell, bien loin du freluquet danseur de Billy Elliot puisqu'on le retrouve ici en soldat chargé d’organiser l’opération de secours. Attention, la bande-annonce ci-dessus dévoile à peu près l’intégralité des scènes d’action du film. (A.H.) +

    Alexandre Hervaud , Jérémy Piette +
    +
    + +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    + +
    + Un mot à ajouter ? +
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index 6b8245f..fa6d113 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -488,6 +488,36 @@ async fn telegraph() { run_test("telegraph").await } +#[tokio::test] +async fn toc_missing() { + run_test("toc-missing").await +} + +#[tokio::test] +async fn topicseed_1() { + run_test("topicseed-1").await +} + +#[tokio::test] +async fn tumblr() { + run_test("tumblr").await +} + +#[tokio::test] +async fn v8_blog() { + run_test("v8-blog").await +} + +#[tokio::test] +async fn videos_1() { + run_test("videos-1").await +} + +#[tokio::test] +async fn videos_2() { + run_test("videos-2").await +} + #[tokio::test] async fn webmd_1() { run_test("webmd-1").await