From e3246af28b45b679488a1b684987c8629167c357 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 25 Feb 2023 00:42:26 +0100 Subject: [PATCH] refactor & more testing --- resources/tests/readability/001/expected.html | 132 ++ .../001/source.html} | 0 resources/tests/readability/002/expected.html | 594 +++++++++ resources/tests/readability/002/source.html | 1131 +++++++++++++++++ src/article.rs | 3 +- src/constants.rs | 2 + .../config/config_collection.rs | 8 +- src/full_text_parser/config/config_entry.rs | 20 +- src/full_text_parser/metadata.rs | 4 +- src/full_text_parser/mod.rs | 74 +- src/full_text_parser/readability/mod.rs | 41 +- src/full_text_parser/readability/tests.rs | 50 +- src/full_text_parser/tests.rs | 8 +- src/util.rs | 3 +- 14 files changed, 1969 insertions(+), 101 deletions(-) create mode 100644 resources/tests/readability/001/expected.html rename resources/tests/{readability-test-1.html => readability/001/source.html} (100%) create mode 100644 resources/tests/readability/002/expected.html create mode 100644 resources/tests/readability/002/source.html diff --git a/resources/tests/readability/001/expected.html b/resources/tests/readability/001/expected.html new file mode 100644 index 0000000..3b869e4 --- /dev/null +++ b/resources/tests/readability/001/expected.html @@ -0,0 +1,132 @@ +
+

So finally you're testing your frontend JavaScript code? Great! The more you +write tests, the more confident you are with your code… but how much precisely? +That's where code coverage might +help. +

+

The idea behind code coverage is to record which parts of your code (functions, + statements, conditionals and so on) have been executed by your test suite, + to compute metrics out of these data and usually to provide tools for navigating + and inspecting them.

+

Not a lot of frontend developers I know actually test their frontend code, + and I can barely imagine how many of them have ever setup code coverage… + Mostly because there are not many frontend-oriented tools in this area + I guess.

+

Actually I've only found one which provides an adapter for Mocha and + actually works…

+
+

Drinking game for web devs: +
(1) Think of a noun +
(2) Google "<noun>.js" +
(3) If a library with that name exists - drink

— Shay Friedman (@ironshay) + August 22, 2013 +
+

Blanket.js is an easy to install, easy to configure, +and easy to use JavaScript code coverage library that works both in-browser and +with nodejs. +

+

Its use is dead easy, adding Blanket support to your Mocha test suite + is just matter of adding this simple line to your HTML test file:

+
<script src="vendor/blanket.js"
+        data-cover-adapter="vendor/mocha-blanket.js"></script>
+
+ +

Source files: blanket.js, + mocha-blanket.js +

+

As an example, let's reuse the silly Cow example we used + in a previous episode:

+
// cow.js
+(function(exports) {
+  "use strict";
+
+  function Cow(name) {
+    this.name = name || "Anon cow";
+  }
+  exports.Cow = Cow;
+
+  Cow.prototype = {
+    greets: function(target) {
+      if (!target)
+        throw new Error("missing target");
+      return this.name + " greets " + target;
+    }
+  };
+})(this);
+
+ +

And its test suite, powered by Mocha and Chai:

+
var expect = chai.expect;
+
+describe("Cow", function() {
+  describe("constructor", function() {
+    it("should have a default name", function() {
+      var cow = new Cow();
+      expect(cow.name).to.equal("Anon cow");
+    });
+
+    it("should set cow's name if provided", function() {
+      var cow = new Cow("Kate");
+      expect(cow.name).to.equal("Kate");
+    });
+  });
+
+  describe("#greets", function() {
+    it("should greet passed target", function() {
+      var greetings = (new Cow("Kate")).greets("Baby");
+      expect(greetings).to.equal("Kate greets Baby");
+    });
+  });
+});
+
+ +

Let's create the HTML test file for it, featuring Blanket and its adapter + for Mocha:

+
<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <title>Test</title>
+  <link rel="stylesheet" media="all" href="vendor/mocha.css">
+</head>
+<body>
+  <div id="mocha"></div>
+  <div id="messages"></div>
+  <div id="fixtures"></div>
+  <script src="vendor/mocha.js"></script>
+  <script src="vendor/chai.js"></script>
+  <script src="vendor/blanket.js"
+          data-cover-adapter="vendor/mocha-blanket.js"></script>
+  <script>mocha.setup('bdd');</script>
+  <script src="cow.js" data-cover></script>
+  <script src="cow_test.js"></script>
+  <script>mocha.run();</script>
+</body>
+</html>
+
+ +

Notes:

+
    +
  • Notice the data-cover attribute we added to the script tag + loading the source of our library;
  • +
  • The HTML test file must be served over HTTP for the adapter to + be loaded.
  • +
+

Running the tests now gives us something like this:

+

+ screenshot +

+

As you can see, the report at the bottom highlights that we haven't actually + tested the case where an error is raised in case a target name is missing. + We've been informed of that, nothing more, nothing less. We simply know + we're missing a test here. Isn't this cool? I think so!

+

Just remember that code coverage will only bring you numbers and + raw information, not actual proofs that the whole of your code logic has + been actually covered. If you ask me, the best inputs you can get about + your code logic and implementation ever are the ones issued out of pair programming +sessions + and code reviews — + but that's another story.

+

So is code coverage silver bullet? No. Is it useful? Definitely. Happy testing! +

+
diff --git a/resources/tests/readability-test-1.html b/resources/tests/readability/001/source.html similarity index 100% rename from resources/tests/readability-test-1.html rename to resources/tests/readability/001/source.html diff --git a/resources/tests/readability/002/expected.html b/resources/tests/readability/002/expected.html new file mode 100644 index 0000000..af685ed --- /dev/null +++ b/resources/tests/readability/002/expected.html @@ -0,0 +1,594 @@ +
+
+

For more than a decade the Web has used XMLHttpRequest (XHR) to achieve + asynchronous requests in JavaScript. While very useful, XHR is not a very + nice API. It suffers from lack of separation of concerns. The input, output + and state are all managed by interacting with one object, and state is + tracked using events. Also, the event-based model doesn’t play well with + JavaScript’s recent focus on Promise- and generator-based asynchronous + programming.

+

The Fetch API intends + to fix most of these problems. It does this by introducing the same primitives + to JS that are used in the HTTP protocol. In addition, it introduces a + utility function fetch() that succinctly captures the intention + of retrieving a resource from the network.

+

The Fetch specification, which + defines the API, nails down the semantics of a user agent fetching a resource. + This, combined with ServiceWorkers, is an attempt to:

+
    +
  1. Improve the offline experience.
  2. +
  3. Expose the building blocks of the Web to the platform as part of the + extensible web movement.
  4. +
+

As of this writing, the Fetch API is available in Firefox 39 (currently + Nightly) and Chrome 42 (currently dev). Github has a Fetch polyfill.

+ +

Feature detection

+ +

Fetch API support can be detected by checking for Headers,Request, Response or fetch on + the window or worker scope.

+ +

Simple fetching

+ +

The most useful, high-level part of the Fetch API is the fetch() function. + In its simplest form it takes a URL and returns a promise that resolves + to the response. The response is captured as a Response object.

+

+ + + + + + +
+
fetch("/data.json").then(function(res) {
+  // res instanceof Response == true.
+  if (res.ok) {
+    res.json().then(function(data) {
+      console.log(data.entries);
+    });
+  } else {
+    console.log("Looks like the response wasn't perfect, got status", res.status);
+  }
+}, function(e) {
+  console.log("Fetch failed!", e);
+});
+
+

+

Submitting some parameters, it would look like this:

+

+ + + + + + +
+
fetch("http://www.example.org/submit.php", {
+  method: "POST",
+  headers: {
+    "Content-Type": "application/x-www-form-urlencoded"
+  },
+  body: "firstName=Nikhil&favColor=blue&password=easytoguess"
+}).then(function(res) {
+  if (res.ok) {
+    alert("Perfect! Your settings are saved.");
+  } else if (res.status == 401) {
+    alert("Oops! You are not authorized.");
+  }
+}, function(e) {
+  alert("Error submitting form!");
+});
+
+

+

The fetch() function’s arguments are the same as those passed + to the +
+Request() constructor, so you may directly pass arbitrarily + complex requests to fetch() as discussed below.

+ +

Headers

+ +

Fetch introduces 3 interfaces. These are Headers, Request and +
+Response. They map directly to the underlying HTTP concepts, + but have +
certain visibility filters in place for privacy and security reasons, + such as +
supporting CORS rules and ensuring cookies aren’t readable by third parties.

+

The Headers interface is + a simple multi-map of names to values:

+

+ + + + + + +
+
var content = "Hello World";
+var reqHeaders = new Headers();
+reqHeaders.append("Content-Type", "text/plain"
+reqHeaders.append("Content-Length", content.length.toString());
+reqHeaders.append("X-Custom-Header", "ProcessThisImmediately");
+
+

+

The same can be achieved by passing an array of arrays or a JS object + literal +
to the constructor:

+

+ + + + + + +
+
reqHeaders = new Headers({
+  "Content-Type": "text/plain",
+  "Content-Length": content.length.toString(),
+  "X-Custom-Header": "ProcessThisImmediately",
+});
+
+

+

The contents can be queried and retrieved:

+

+ + + + + + +
+
console.log(reqHeaders.has("Content-Type")); // true
+console.log(reqHeaders.has("Set-Cookie")); // false
+reqHeaders.set("Content-Type", "text/html");
+reqHeaders.append("X-Custom-Header", "AnotherValue");
+ 
+console.log(reqHeaders.get("Content-Length")); // 11
+console.log(reqHeaders.getAll("X-Custom-Header")); // ["ProcessThisImmediately", "AnotherValue"]
+ 
+reqHeaders.delete("X-Custom-Header");
+console.log(reqHeaders.getAll("X-Custom-Header")); // []
+
+

+

Some of these operations are only useful in ServiceWorkers, but they provide +
a much nicer API to Headers.

+

Since Headers can be sent in requests, or received in responses, and have + various limitations about what information can and should be mutable, Headers objects + have a guard property. This is not exposed to the Web, but + it affects which mutation operations are allowed on the Headers object. +
Possible values are:

+
    +
  • “none”: default.
  • +
  • “request”: guard for a Headers object obtained from a Request (Request.headers).
  • +
  • “request-no-cors”: guard for a Headers object obtained from a Request + created +
    with mode “no-cors”.
  • +
  • “response”: naturally, for Headers obtained from Response (Response.headers).
  • +
  • “immutable”: Mostly used for ServiceWorkers, renders a Headers object +
    read-only.
  • +
+

The details of how each guard affects the behaviors of the Headers object + are +
in the specification. For example, + you may not append or set a “request” guarded Headers’ “Content-Length” + header. Similarly, inserting “Set-Cookie” into a Response header is not + allowed so that ServiceWorkers may not set cookies via synthesized Responses.

+

All of the Headers methods throw TypeError if name is not a + valid HTTP Header name. The mutation operations will throw TypeError + if there is an immutable guard. Otherwise they fail silently. For example:

+

+ + + + + + +
+
var res = Response.error();
+try {
+  res.headers.set("Origin", "http://mybank.com");
+} catch(e) {
+  console.log("Cannot pretend to be a bank!");
+}
+
+

+ +

Request

+ +

The Request interface defines a request to fetch a resource over HTTP. + URL, method and headers are expected, but the Request also allows specifying + a body, a request mode, credentials and cache hints.

+

The simplest Request is of course, just a URL, as you may do to GET a + resource.

+

+ + + + + + +
+
var req = new Request("/index.html");
+console.log(req.method); // "GET"
+console.log(req.url); // "http://example.com/index.html"
+
+

+

You may also pass a Request to the Request() constructor to + create a copy. +
(This is not the same as calling the clone() method, which + is covered in +
the “Reading bodies” section.).

+

+ + + + + + +
+
var copy = new Request(req);
+console.log(copy.method); // "GET"
+console.log(copy.url); // "http://example.com/index.html"
+
+

+

Again, this form is probably only useful in ServiceWorkers.

+

The non-URL attributes of the Request can only be set by passing + initial +
values as a second argument to the constructor. This argument is a dictionary.

+

+ + + + + + +
+
var uploadReq = new Request("/uploadImage", {
+  method: "POST",
+  headers: {
+    "Content-Type": "image/png",
+  },
+  body: "image data"
+});
+
+

+

The Request’s mode is used to determine if cross-origin requests lead + to valid responses, and which properties on the response are readable. + Legal mode values are "same-origin", "no-cors" (default) + and "cors".

+

The "same-origin" mode is simple, if a request is made to another + origin with this mode set, the result is simply an error. You could use + this to ensure that +
a request is always being made to your origin.

+

+ + + + + + +
+
var arbitraryUrl = document.getElementById("url-input").value;
+fetch(arbitraryUrl, { mode: "same-origin" }).then(function(res) {
+  console.log("Response succeeded?", res.ok);
+}, function(e) {
+  console.log("Please enter a same-origin URL!");
+});
+
+

+

The "no-cors" mode captures what the web platform does by default + for scripts you import from CDNs, images hosted on other domains, and so + on. First, it prevents the method from being anything other than “HEAD”, + “GET” or “POST”. Second, if any ServiceWorkers intercept these requests, + they may not add or override any headers except for these. + Third, JavaScript may not access any properties of the resulting Response. + This ensures that ServiceWorkers do not affect the semantics of the Web + and prevents security and privacy issues that could arise from leaking + data across domains.

+

"cors" mode is what you’ll usually use to make known cross-origin + requests to access various APIs offered by other vendors. These are expected + to adhere to +
the CORS protocol. + Only a limited set of + headers is exposed in the Response, but the body is readable. For example, + you could get a list of Flickr’s most interesting photos + today like this:

+

+ + + + + + +
+
var u = new URLSearchParams();
+u.append('method', 'flickr.interestingness.getList');
+u.append('api_key', '<insert api key here>');
+u.append('format', 'json');
+u.append('nojsoncallback', '1');
+ 
+var apiCall = fetch('https://api.flickr.com/services/rest?' + u);
+ 
+apiCall.then(function(response) {
+  return response.json().then(function(json) {
+    // photo is a list of photos.
+    return json.photos.photo;
+  });
+}).then(function(photos) {
+  photos.forEach(function(photo) {
+    console.log(photo.title);
+  });
+});
+
+

+

You may not read out the “Date” header since Flickr does not allow it + via +
+Access-Control-Expose-Headers.

+

+ + + + + + +
+
response.headers.get("Date"); // null
+
+

+

The credentials enumeration determines if cookies for the other + domain are +
sent to cross-origin requests. This is similar to XHR’s withCredentials +
flag, but tri-valued as "omit" (default), "same-origin" and "include".

+

The Request object will also give the ability to offer caching hints to + the user-agent. This is currently undergoing some security review. + Firefox exposes the attribute, but it has no effect.

+

Requests have two read-only attributes that are relevant to ServiceWorkers +
intercepting them. There is the string referrer, which is + set by the UA to be +
the referrer of the Request. This may be an empty string. The other is +
+context which is a rather large enumeration defining + what sort of resource is being fetched. This could be “image” if the request + is from an + <img>tag in the controlled document, “worker” if it is an attempt to load a + worker script, and so on. When used with the fetch() function, + it is “fetch”.

+ +

Response

+ +

Response instances are returned by calls to fetch(). + They can also be created by JS, but this is only useful in ServiceWorkers.

+

We have already seen some attributes of Response when we looked at fetch(). + The most obvious candidates are status, an integer (default + value 200) and statusText (default value “OK”), which correspond + to the HTTP status code and reason. The ok attribute is just + a shorthand for checking that status is in the range 200-299 + inclusive.

+

headers is the Response’s Headers object, with guard “response”. + The url attribute reflects the URL of the corresponding request.

+

Response also has a type, which is “basic”, “cors”, “default”, + “error” or +
“opaque”.

+
    +
  • +"basic": normal, same origin response, with all headers exposed + except +
    “Set-Cookie” and “Set-Cookie2″.
  • +
  • +"cors": response was received from a valid cross-origin request. + Certain headers and the bodymay be accessed.
  • +
  • +"error": network error. No useful information describing + the error is available. The Response’s status is 0, headers are empty and + immutable. This is the type for a Response obtained from Response.error().
  • +
  • +"opaque": response for “no-cors” request to cross-origin + resource. Severely
    + restricted
    +
  • +
+

The “error” type results in the fetch() Promise rejecting with + TypeError.

+

There are certain attributes that are useful only in a ServiceWorker scope. + The +
idiomatic way to return a Response to an intercepted request in ServiceWorkers + is:

+

+ + + + + + +
+
addEventListener('fetch', function(event) {
+  event.respondWith(new Response("Response body", {
+    headers: { "Content-Type" : "text/plain" }
+  });
+});
+
+

+

As you can see, Response has a two argument constructor, where both arguments + are optional. The first argument is a body initializer, and the second + is a dictionary to set the status, statusText and headers.

+

The static method Response.error() simply returns an error + response. Similarly, Response.redirect(url, status) returns + a Response resulting in +
a redirect to url.

+ +

Dealing with bodies

+ +

Both Requests and Responses may contain body data. We’ve been glossing + over it because of the various data types body may contain, but we will + cover it in detail now.

+

A body is an instance of any of the following types.

+ +

In addition, Request and Response both offer the following methods to + extract their body. These all return a Promise that is eventually resolved + with the actual content.

+
    +
  • +arrayBuffer() +
  • +
  • +blob() +
  • +
  • +json() +
  • +
  • +text() +
  • +
  • +formData() +
  • +
+

This is a significant improvement over XHR in terms of ease of use of + non-text data!

+

Request bodies can be set by passing body parameters:

+

+ + + + + + +
+
var form = new FormData(document.getElementById('login-form'));
+fetch("/login", {
+  method: "POST",
+  body: form
+})
+
+

+

Responses take the first argument as the body.

+

+ + + + + + +
+
var res = new Response(new File(["chunk", "chunk"], "archive.zip",
+                       { type: "application/zip" }));
+
+

+

Both Request and Response (and by extension the fetch() function), + will try to intelligently determine the content type. + Request will also automatically set a “Content-Type” header if none is + set in the dictionary.

+ +

Streams and cloning

+ +

It is important to realise that Request and Response bodies can only be + read once! Both interfaces have a boolean attribute bodyUsed to + determine if it is safe to read or not.

+

+ + + + + + +
+
var res = new Response("one time use");
+console.log(res.bodyUsed); // false
+res.text().then(function(v) {
+  console.log(res.bodyUsed); // true
+});
+console.log(res.bodyUsed); // true
+ 
+res.text().catch(function(e) {
+  console.log("Tried to read already consumed Response");
+});
+
+

+

This decision allows easing the transition to an eventual stream-based Fetch + API. The intention is to let applications consume data as it arrives, allowing + for JavaScript to deal with larger files like videos, and perform things + like compression and editing on the fly.

+

Often, you’ll want access to the body multiple times. For example, you + can use the upcoming Cache API to + store Requests and Responses for offline use, and Cache requires bodies + to be available for reading.

+

So how do you read out the body multiple times within such constraints? + The API provides a clone() method on the two interfaces. This + will return a clone of the object, with a ‘new’ body. clone() MUST + be called before the body of the corresponding object has been used. That + is, clone() first, read later.

+

+ + + + + + +
+
addEventListener('fetch', function(evt) {
+  var sheep = new Response("Dolly");
+  console.log(sheep.bodyUsed); // false
+  var clone = sheep.clone();
+  console.log(clone.bodyUsed); // false
+ 
+  clone.text();
+  console.log(sheep.bodyUsed); // false
+  console.log(clone.bodyUsed); // true
+ 
+  evt.respondWith(cache.add(sheep.clone()).then(function(e) {
+    return sheep;
+  });
+});
+
+

+ +

Future improvements

+ +

Along with the transition to streams, Fetch will eventually have the ability + to abort running fetch()es and some way to report the progress + of a fetch. These are provided by XHR, but are a little tricky to fit in + the Promise-based nature of the Fetch API.

+

You can contribute to the evolution of this API by participating in discussions + on the WHATWG mailing list and + in the issues in the Fetch and + ServiceWorkerspecifications.

+

For a better web!

+

The author would like to thank Andrea Marchesini, Anne van Kesteren and Ben
+Kelly for helping with the specification and implementation.
+

+ +
+ + +
diff --git a/resources/tests/readability/002/source.html b/resources/tests/readability/002/source.html new file mode 100644 index 0000000..48befba --- /dev/null +++ b/resources/tests/readability/002/source.html @@ -0,0 +1,1131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog + + + + + + + + + + + + + + + + + + +
+ +
+ + + + Mozilla + +
+ +
+
+ +

This API is so Fetching!

+ + +
+ + +
+
+ +
+
+

For more than a decade the Web has used XMLHttpRequest (XHR) to achieve + asynchronous requests in JavaScript. While very useful, XHR is not a very + nice API. It suffers from lack of separation of concerns. The input, output + and state are all managed by interacting with one object, and state is + tracked using events. Also, the event-based model doesn’t play well with + JavaScript’s recent focus on Promise- and generator-based asynchronous + programming.

+

The Fetch API intends + to fix most of these problems. It does this by introducing the same primitives + to JS that are used in the HTTP protocol. In addition, it introduces a + utility function fetch() that succinctly captures the intention + of retrieving a resource from the network.

+

The Fetch specification, which + defines the API, nails down the semantics of a user agent fetching a resource. + This, combined with ServiceWorkers, is an attempt to:

+
    +
  1. Improve the offline experience.
  2. +
  3. Expose the building blocks of the Web to the platform as part of the + extensible web movement.
  4. +
+

As of this writing, the Fetch API is available in Firefox 39 (currently + Nightly) and Chrome 42 (currently dev). Github has a Fetch polyfill.

+ +

Feature detection

+ +

Fetch API support can be detected by checking for Headers,Request, Response or fetch on + the window or worker scope.

+ +

Simple fetching

+ +

The most useful, high-level part of the Fetch API is the fetch() function. + In its simplest form it takes a URL and returns a promise that resolves + to the response. The response is captured as a Response object.

+
+ + + + + + +
fetch("/data.json").then(function(res) {
+  // res instanceof Response == true.
+  if (res.ok) {
+    res.json().then(function(data) {
+      console.log(data.entries);
+    });
+  } else {
+    console.log("Looks like the response wasn't perfect, got status", res.status);
+  }
+}, function(e) {
+  console.log("Fetch failed!", e);
+});
+
+
+

Submitting some parameters, it would look like this:

+
+ + + + + + +
fetch("http://www.example.org/submit.php", {
+  method: "POST",
+  headers: {
+    "Content-Type": "application/x-www-form-urlencoded"
+  },
+  body: "firstName=Nikhil&favColor=blue&password=easytoguess"
+}).then(function(res) {
+  if (res.ok) {
+    alert("Perfect! Your settings are saved.");
+  } else if (res.status == 401) {
+    alert("Oops! You are not authorized.");
+  }
+}, function(e) {
+  alert("Error submitting form!");
+});
+
+
+

The fetch() function’s arguments are the same as those passed + to the +
+Request() constructor, so you may directly pass arbitrarily + complex requests to fetch() as discussed below.

+ +

Headers

+ +

Fetch introduces 3 interfaces. These are Headers, Request and +
+Response. They map directly to the underlying HTTP concepts, + but have +
certain visibility filters in place for privacy and security reasons, + such as +
supporting CORS rules and ensuring cookies aren’t readable by third parties.

+

The Headers interface is + a simple multi-map of names to values:

+
+ + + + + + +
var content = "Hello World";
+var reqHeaders = new Headers();
+reqHeaders.append("Content-Type", "text/plain"
+reqHeaders.append("Content-Length", content.length.toString());
+reqHeaders.append("X-Custom-Header", "ProcessThisImmediately");
+
+
+

The same can be achieved by passing an array of arrays or a JS object + literal +
to the constructor:

+
+ + + + + + +
reqHeaders = new Headers({
+  "Content-Type": "text/plain",
+  "Content-Length": content.length.toString(),
+  "X-Custom-Header": "ProcessThisImmediately",
+});
+
+
+

The contents can be queried and retrieved:

+
+ + + + + + +
console.log(reqHeaders.has("Content-Type")); // true
+console.log(reqHeaders.has("Set-Cookie")); // false
+reqHeaders.set("Content-Type", "text/html");
+reqHeaders.append("X-Custom-Header", "AnotherValue");
+ 
+console.log(reqHeaders.get("Content-Length")); // 11
+console.log(reqHeaders.getAll("X-Custom-Header")); // ["ProcessThisImmediately", "AnotherValue"]
+ 
+reqHeaders.delete("X-Custom-Header");
+console.log(reqHeaders.getAll("X-Custom-Header")); // []
+
+
+

Some of these operations are only useful in ServiceWorkers, but they provide +
a much nicer API to Headers.

+

Since Headers can be sent in requests, or received in responses, and have + various limitations about what information can and should be mutable, Headers objects + have a guard property. This is not exposed to the Web, but + it affects which mutation operations are allowed on the Headers object. +
Possible values are:

+
    +
  • “none”: default.
  • +
  • “request”: guard for a Headers object obtained from a Request (Request.headers).
  • +
  • “request-no-cors”: guard for a Headers object obtained from a Request + created +
    with mode “no-cors”.
  • +
  • “response”: naturally, for Headers obtained from Response (Response.headers).
  • +
  • “immutable”: Mostly used for ServiceWorkers, renders a Headers object +
    read-only.
  • +
+

The details of how each guard affects the behaviors of the Headers object + are +
in the specification. For example, + you may not append or set a “request” guarded Headers’ “Content-Length” + header. Similarly, inserting “Set-Cookie” into a Response header is not + allowed so that ServiceWorkers may not set cookies via synthesized Responses.

+

All of the Headers methods throw TypeError if name is not a + valid HTTP Header name. The mutation operations will throw TypeError + if there is an immutable guard. Otherwise they fail silently. For example:

+
+ + + + + + +
var res = Response.error();
+try {
+  res.headers.set("Origin", "http://mybank.com");
+} catch(e) {
+  console.log("Cannot pretend to be a bank!");
+}
+
+
+ +

Request

+ +

The Request interface defines a request to fetch a resource over HTTP. + URL, method and headers are expected, but the Request also allows specifying + a body, a request mode, credentials and cache hints.

+

The simplest Request is of course, just a URL, as you may do to GET a + resource.

+
+ + + + + + +
var req = new Request("/index.html");
+console.log(req.method); // "GET"
+console.log(req.url); // "http://example.com/index.html"
+
+
+

You may also pass a Request to the Request() constructor to + create a copy. +
(This is not the same as calling the clone() method, which + is covered in +
the “Reading bodies” section.).

+
+ + + + + + +
var copy = new Request(req);
+console.log(copy.method); // "GET"
+console.log(copy.url); // "http://example.com/index.html"
+
+
+

Again, this form is probably only useful in ServiceWorkers.

+

The non-URL attributes of the Request can only be set by passing + initial +
values as a second argument to the constructor. This argument is a dictionary.

+
+ + + + + + +
var uploadReq = new Request("/uploadImage", {
+  method: "POST",
+  headers: {
+    "Content-Type": "image/png",
+  },
+  body: "image data"
+});
+
+
+

The Request’s mode is used to determine if cross-origin requests lead + to valid responses, and which properties on the response are readable. + Legal mode values are "same-origin", "no-cors" (default) + and "cors".

+

The "same-origin" mode is simple, if a request is made to another + origin with this mode set, the result is simply an error. You could use + this to ensure that +
a request is always being made to your origin.

+
+ + + + + + +
var arbitraryUrl = document.getElementById("url-input").value;
+fetch(arbitraryUrl, { mode: "same-origin" }).then(function(res) {
+  console.log("Response succeeded?", res.ok);
+}, function(e) {
+  console.log("Please enter a same-origin URL!");
+});
+
+
+

The "no-cors" mode captures what the web platform does by default + for scripts you import from CDNs, images hosted on other domains, and so + on. First, it prevents the method from being anything other than “HEAD”, + “GET” or “POST”. Second, if any ServiceWorkers intercept these requests, + they may not add or override any headers except for these. + Third, JavaScript may not access any properties of the resulting Response. + This ensures that ServiceWorkers do not affect the semantics of the Web + and prevents security and privacy issues that could arise from leaking + data across domains.

+

"cors" mode is what you’ll usually use to make known cross-origin + requests to access various APIs offered by other vendors. These are expected + to adhere to +
the CORS protocol. + Only a limited set of + headers is exposed in the Response, but the body is readable. For example, + you could get a list of Flickr’s most interesting photos + today like this:

+
+ + + + + + +
var u = new URLSearchParams();
+u.append('method', 'flickr.interestingness.getList');
+u.append('api_key', '<insert api key here>');
+u.append('format', 'json');
+u.append('nojsoncallback', '1');
+ 
+var apiCall = fetch('https://api.flickr.com/services/rest?' + u);
+ 
+apiCall.then(function(response) {
+  return response.json().then(function(json) {
+    // photo is a list of photos.
+    return json.photos.photo;
+  });
+}).then(function(photos) {
+  photos.forEach(function(photo) {
+    console.log(photo.title);
+  });
+});
+
+
+

You may not read out the “Date” header since Flickr does not allow it + via +
+Access-Control-Expose-Headers.

+
+ + + + + + +
response.headers.get("Date"); // null
+
+
+

The credentials enumeration determines if cookies for the other + domain are +
sent to cross-origin requests. This is similar to XHR’s withCredentials +
flag, but tri-valued as "omit" (default), "same-origin" and "include".

+

The Request object will also give the ability to offer caching hints to + the user-agent. This is currently undergoing some security review. + Firefox exposes the attribute, but it has no effect.

+

Requests have two read-only attributes that are relevant to ServiceWorkers +
intercepting them. There is the string referrer, which is + set by the UA to be +
the referrer of the Request. This may be an empty string. The other is +
+context which is a rather large enumeration defining + what sort of resource is being fetched. This could be “image” if the request + is from an + <img> tag in the controlled document, “worker” if it is an attempt to load a + worker script, and so on. When used with the fetch() function, + it is “fetch”.

+ +

Response

+ +

Response instances are returned by calls to fetch(). + They can also be created by JS, but this is only useful in ServiceWorkers.

+

We have already seen some attributes of Response when we looked at fetch(). + The most obvious candidates are status, an integer (default + value 200) and statusText (default value “OK”), which correspond + to the HTTP status code and reason. The ok attribute is just + a shorthand for checking that status is in the range 200-299 + inclusive.

+

headers is the Response’s Headers object, with guard “response”. + The url attribute reflects the URL of the corresponding request.

+

Response also has a type, which is “basic”, “cors”, “default”, + “error” or +
“opaque”.

+
    +
  • "basic": normal, same origin response, with all headers exposed + except +
    “Set-Cookie” and “Set-Cookie2”.
  • +
  • "cors": response was received from a valid cross-origin request. + Certain headers and the body may be accessed.
  • +
  • "error": network error. No useful information describing + the error is available. The Response’s status is 0, headers are empty and + immutable. This is the type for a Response obtained from Response.error().
  • +
  • "opaque": response for “no-cors” request to cross-origin + resource. Severely
    + restricted
    +
  • +
+

The “error” type results in the fetch() Promise rejecting with + TypeError.

+

There are certain attributes that are useful only in a ServiceWorker scope. + The +
idiomatic way to return a Response to an intercepted request in ServiceWorkers + is:

+
+ + + + + + +
addEventListener('fetch', function(event) {
+  event.respondWith(new Response("Response body", {
+    headers: { "Content-Type" : "text/plain" }
+  }));
+});
+
+
+

As you can see, Response has a two-argument constructor, where both arguments + are optional. The first argument is a body initializer, and the second + is a dictionary to set the status, statusText and headers.

+

The static method Response.error() simply returns an error + response. Similarly, Response.redirect(url, status) returns + a Response resulting in +
a redirect to url.

+ +

Dealing with bodies

+ +

Both Requests and Responses may contain body data. We’ve been glossing + over it because of the various data types body may contain, but we will + cover it in detail now.

+

A body is an instance of any of the following types.

+ +

In addition, Request and Response both offer the following methods to + extract their body. These all return a Promise that is eventually resolved + with the actual content.

+
    +
  • arrayBuffer() +
  • +
  • blob() +
  • +
  • json() +
  • +
  • text() +
  • +
  • formData() +
  • +
+

This is a significant improvement over XHR in terms of ease of use of + non-text data!

+

Request bodies can be set by passing body parameters:

+
+ + + + + + +
var form = new FormData(document.getElementById('login-form'));
+fetch("/login", {
+  method: "POST",
+  body: form
+})
+
+
+

Responses take the first argument as the body.

+
+ + + + + + +
var res = new Response(new File(["chunk", "chunk"], "archive.zip",
+                       { type: "application/zip" }));
+
+
+

Both Request and Response (and by extension the fetch() function), + will try to intelligently determine the content type. + Request will also automatically set a “Content-Type” header if none is + set in the dictionary.

+ +

Streams and cloning

+ +

It is important to realise that Request and Response bodies can only be + read once! Both interfaces have a boolean attribute bodyUsed to + determine if it is safe to read or not.

+
+ + + + + + +
var res = new Response("one time use");
+console.log(res.bodyUsed); // false
+res.text().then(function(v) {
+  console.log(res.bodyUsed); // true
+});
+console.log(res.bodyUsed); // true
+ 
+res.text().catch(function(e) {
+  console.log("Tried to read already consumed Response");
+});
+
+
+

This decision allows easing the transition to an eventual stream-based Fetch + API. The intention is to let applications consume data as it arrives, allowing + for JavaScript to deal with larger files like videos, and perform things + like compression and editing on the fly.

+

Often, you’ll want access to the body multiple times. For example, you + can use the upcoming Cache API to + store Requests and Responses for offline use, and Cache requires bodies + to be available for reading.

+

So how do you read out the body multiple times within such constraints? + The API provides a clone() method on the two interfaces. This + will return a clone of the object, with a ‘new’ body. clone() MUST + be called before the body of the corresponding object has been used. That + is, clone() first, read later.

+
+ + + + + + +
addEventListener('fetch', function(evt) {
+  var sheep = new Response("Dolly");
+  console.log(sheep.bodyUsed); // false
+  var clone = sheep.clone();
+  console.log(clone.bodyUsed); // false
+ 
+  clone.text();
+  console.log(sheep.bodyUsed); // false
+  console.log(clone.bodyUsed); // true
+ 
+  evt.respondWith(cache.add(sheep.clone()).then(function(e) {
+    return sheep;
+  }));
+});
+
+
+ +

Future improvements

+ +

Along with the transition to streams, Fetch will eventually have the ability + to abort running fetch()es and some way to report the progress + of a fetch. These are provided by XHR, but are a little tricky to fit in + the Promise-based nature of the Fetch API.

+

You can contribute to the evolution of this API by participating in discussions + on the WHATWG mailing list and + in the issues in the Fetch and + ServiceWorker specifications.

+

For a better web!

+

The author would like to thank Andrea Marchesini, Anne van Kesteren and Ben
+Kelly for helping with the specification and implementation.
+

+ +
+
+
+
+

2 comments

+ +

Post a comment +

+
+
    +
  1. +

    + Alexander Petrov wrote on March 11th, 2015 at 02:57: + +

    +
    +

    how do you abort a fetch?

    +
    +

    Reply +

    +
      +
    1. +

      + Nikhil Marathe + + wrote on March 11th, 2015 at 08:00: + +

      +
      +

      At this point there is no way to do so. As mentioned in the future improvements + sections, there are ongoing attempts to find a nice way to plug abort() + into Promise based APIs.

      +
      +

      Reply +

      +
    2. + +
    + +
  2. + +
+
+
+
+
+ Post Your Comment + +

+

+
    +
  1. + + +
  2. +
  3. + + +
  4. +
  5. + + +
  6. +
  7. + + +
  8. +
  9. + + +
  10. +
  11. + + + +

    + +

    + +

    + +

    + +

    + +

    +
  12. +
+
+
+
+
+ +
+ +
+ +
+ +
+ + + + + + +
+ + + + + + diff --git a/src/article.rs b/src/article.rs index d171947..3408e15 100644 --- a/src/article.rs +++ b/src/article.rs @@ -32,8 +32,7 @@ impl Article { .map(|doc| doc.to_string_with_options(options)) } - #[allow(dead_code)] - pub(crate) fn save_html(&self, path: &PathBuf) -> Result<(), Error> { + pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { if let Some(ref html) = self.get_content() { if let Ok(()) = std::fs::create_dir_all(path) { let mut file_name = match self.title.clone() { diff --git a/src/constants.rs b/src/constants.rs index d18338c..57bdc90 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -4,6 +4,8 @@ use once_cell::sync::Lazy; use regex::Regex; pub const DEFAULT_CHAR_THRESHOLD: usize = 500; +pub static IS_IMAGE: Lazy = + Lazy::new(|| Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).expect("IS_IMAGE regex")); pub static SIBLING_CONTENT: Lazy = Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex")); pub static BYLINE: Lazy = Lazy::new(|| { diff --git a/src/full_text_parser/config/config_collection.rs b/src/full_text_parser/config/config_collection.rs index 5c9ad34..6937ac0 100644 --- a/src/full_text_parser/config/config_collection.rs +++ b/src/full_text_parser/config/config_collection.rs @@ -21,7 +21,13 @@ impl ConfigCollection { for (file_name, entry) in EmbededConfigFiles::iter() .filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e))) { - let entry = ConfigEntry::parse_data(entry.data).await.unwrap(); + let entry = match ConfigEntry::parse_data(entry.data).await { + Ok(entry) => entry, + Err(error) => { + log::error!("{error}"); + continue; + } + }; let file_name: &str = file_name.borrow(); embedded_entries.insert(file_name.to_owned(), entry); } diff --git a/src/full_text_parser/config/config_entry.rs b/src/full_text_parser/config/config_entry.rs index 3395ca2..02b89ae 100644 --- a/src/full_text_parser/config/config_entry.rs +++ b/src/full_text_parser/config/config_entry.rs @@ -19,7 
+19,7 @@ pub struct Header { pub value: String, } -#[derive(Clone)] +#[derive(Clone, Default)] pub struct ConfigEntry { pub xpath_title: Vec, pub xpath_author: Vec, @@ -34,24 +34,6 @@ pub struct ConfigEntry { pub next_page_link: Option, } -impl Default for ConfigEntry { - fn default() -> Self { - Self { - xpath_title: Vec::new(), - xpath_author: Vec::new(), - xpath_date: Vec::new(), - xpath_body: Vec::new(), - xpath_strip: Vec::new(), - strip_id_or_class: Vec::new(), - strip_image_src: Vec::new(), - replace: Vec::new(), - header: Vec::new(), - single_page_link: None, - next_page_link: None, - } - } -} - impl ConfigEntry { pub async fn parse_path(config_path: &Path) -> Result { let mut file = fs::File::open(&config_path).await?; diff --git a/src/full_text_parser/metadata.rs b/src/full_text_parser/metadata.rs index dd859a4..47f89ae 100644 --- a/src/full_text_parser/metadata.rs +++ b/src/full_text_parser/metadata.rs @@ -23,9 +23,9 @@ pub fn extract( let new_title = constants::TITLE_CUT_END.replace(&title, "$1"); let word_count = constants::WORD_COUNT.split(&title).count(); if word_count < 3 { - constants::TITLE_CUT_FRONT.replace(&title, "$1").to_string() + constants::TITLE_CUT_FRONT.replace(&title, "$1").trim().to_string() } else { - new_title.to_string() + new_title.trim().to_string() } } else { title diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 1fcf6e6..ac02695 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -11,6 +11,7 @@ use self::config::{ConfigCollection, ConfigEntry}; use self::error::FullTextParserError; use self::readability::Readability; use crate::article::Article; +use crate::constants; use crate::util::Util; use encoding_rs::Encoding; @@ -19,9 +20,8 @@ use libxml::parser::Parser; use libxml::tree::{Document, Node}; use libxml::xpath::Context; use log::{debug, error, info, warn}; -use regex::Regex; use reqwest::header::HeaderMap; -use reqwest::Client; +use reqwest::{Client, Url}; use 
std::path::Path; use std::str::from_utf8; @@ -40,6 +40,8 @@ impl FullTextParser { url: &url::Url, client: &Client, ) -> Result { + libxml::tree::node::set_node_rc_guard(3); + info!("Scraping article: '{}'", url.as_str()); // check if we have a config for the url @@ -106,7 +108,6 @@ impl FullTextParser { self.parse_pages( &mut article, - &url, &html, &mut root, config, @@ -137,7 +138,6 @@ impl FullTextParser { async fn parse_pages( &self, article: &mut Article, - url: &url::Url, html: &str, root: &mut Node, config: Option<&ConfigEntry>, @@ -183,7 +183,8 @@ impl FullTextParser { if article.thumbnail_url.is_none() { Self::check_for_thumbnail(&xpath_ctx, article); } - Self::strip_junk(&xpath_ctx, config, global_config, url); + Self::strip_junk(&xpath_ctx, config, global_config); + Self::fix_urls(&xpath_ctx, &article.url); Self::unwrap_noscript_images(&xpath_ctx)?; let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?; @@ -200,7 +201,8 @@ impl FullTextParser { let html = Self::download(&url, client, headers).await?; document = Self::parse_html(&html, config, global_config)?; xpath_ctx = Self::get_xpath_ctx(&document)?; - Self::strip_junk(&xpath_ctx, config, global_config, &url); + Self::strip_junk(&xpath_ctx, config, global_config); + Self::fix_urls(&xpath_ctx, &url); Self::unwrap_noscript_images(&xpath_ctx)?; Self::extract_body(&xpath_ctx, root, config, global_config)?; } @@ -256,7 +258,8 @@ impl FullTextParser { let xpath_ctx = Self::get_xpath_ctx(&document)?; metadata::extract(&xpath_ctx, config, Some(global_config), article); Self::check_for_thumbnail(&xpath_ctx, article); - Self::strip_junk(&xpath_ctx, config, global_config, url); + Self::strip_junk(&xpath_ctx, config, global_config); + Self::fix_urls(&xpath_ctx, url); Self::extract_body(&xpath_ctx, root, config, global_config)?; Ok(()) @@ -543,12 +546,15 @@ impl FullTextParser { Ok(url) } - fn strip_junk( - context: &Context, - config: Option<&ConfigEntry>, - global_config: &ConfigEntry, - 
url: &url::Url, - ) { + fn fix_urls(context: &Context, url: &Url) { + let _ = Self::repair_urls(context, "//img", "src", url); + let _ = Self::repair_urls(context, "//a", "src", url); + let _ = Self::repair_urls(context, "//a", "href", url); + let _ = Self::repair_urls(context, "//object", "data", url); + let _ = Self::repair_urls(context, "//iframe", "src", url); + } + + fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) { // strip specified xpath if let Some(config) = config { for xpath_strip in &config.xpath_strip { @@ -596,12 +602,6 @@ impl FullTextParser { let _ = Self::remove_attribute(context, Some("img"), "sizes"); let _ = Self::add_attribute(context, Some("a"), "target", "_blank"); - let _ = Self::repair_urls(context, "//img", "src", url); - let _ = Self::repair_urls(context, "//a", "src", url); - let _ = Self::repair_urls(context, "//a", "href", url); - let _ = Self::repair_urls(context, "//object", "data", url); - let _ = Self::repair_urls(context, "//iframe", "src", url); - // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See http://blog.instapaper.com/post/730281947 @@ -638,7 +638,6 @@ impl FullTextParser { fn unwrap_noscript_images(ctx: &Context) -> Result<(), FullTextParserError> { // Find img without source or attributes that might contains image, and remove it. // This is done to prevent a placeholder img is replaced by img from noscript in next step. 
- let img_regex = Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).unwrap(); let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?; for mut img_node in img_nodes { let attrs = img_node.get_attributes(); @@ -648,7 +647,7 @@ impl FullTextParser { || name == "srcset" || name == "data-src" || name == "data-srcset" - || img_regex.is_match(&value) + || constants::IS_IMAGE.is_match(value) }); if !keep { img_node.unlink(); @@ -668,34 +667,44 @@ impl FullTextParser { // attributes that might contains image. if let Some(prev) = noscript_node.get_prev_element_sibling() { if Util::is_single_image(&prev) { - { let mut prev_img = prev.clone(); if prev_img.get_name().to_uppercase() != "IMG" { - if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img").into_iter().next() { + if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img") + .into_iter() + .next() + { prev_img = img_node; } } - let new_img = Util::get_elements_by_tag_name(&noscript_node, "img").into_iter().next(); + let new_img = Util::get_elements_by_tag_name(&noscript_node, "img") + .into_iter() + .next(); if let Some(mut new_img) = new_img { for (key, value) in prev_img.get_attributes() { if value.is_empty() { continue; } - if key == "src" || key == "srcset" || img_regex.is_match(&value) { + if key == "src" + || key == "srcset" + || constants::IS_IMAGE.is_match(&value) + { if new_img.get_attribute(&key).as_deref() == Some(&value) { continue; } - + let mut attr_name = key; if new_img.has_attribute(&attr_name) { attr_name = format!("data-old-{attr_name}"); } - - new_img.set_attribute(&attr_name, &value).unwrap(); + + new_img.set_attribute(&attr_name, &value).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; } } } @@ -703,7 +712,10 @@ impl FullTextParser { if let Some(mut parent) = noscript_node.get_parent() { if let Some(first_child) = noscript_node.get_first_child() { - parent.replace_child_node(first_child, prev).unwrap(); + parent.replace_child_node(first_child, 
prev).map_err(|e| { + log::error!("{e}"); + FullTextParserError::Xml + })?; noscript_node.unlink(); } } @@ -825,7 +837,9 @@ impl FullTextParser { Ok(()) } - pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> { + pub(crate) fn post_process_content( + root: &mut Node + ) -> Result<(), FullTextParserError> { Self::clean_classes(root)?; Self::simplify_nested_elements(root)?; Ok(()) diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs index d61c876..602fe73 100644 --- a/src/full_text_parser/readability/mod.rs +++ b/src/full_text_parser/readability/mod.rs @@ -5,7 +5,7 @@ mod tests; use std::cmp::Ordering; -use libxml::tree::{node, Document, Node, NodeType}; +use libxml::tree::{Document, Node, NodeType}; use self::state::State; use super::error::FullTextParserError; @@ -19,8 +19,6 @@ impl Readability { root: &mut Node, title: Option<&str>, ) -> Result { - node::set_node_rc_guard(6); - let mut state = State::default(); let mut document = document; let mut attempts: Vec<(Node, usize, Document)> = Vec::new(); @@ -253,12 +251,11 @@ impl Readability { let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| { // If we still have no top candidate, just use the body as a last resort. // We also have to copy the body node so it is something we can modify. 
- let mut rt = document.get_root_element().unwrap(); - Self::initialize_node(&mut rt, &state).unwrap(); + let mut rt = document.get_root_element().expect("doc should have root"); + Self::initialize_node(&mut rt, &state).expect("init should not fail"); needed_to_create_top_candidate = true; rt }); - let mut parent_of_top_candidate = None; let mut alternative_candidate_ancestors = Vec::new(); // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array @@ -274,25 +271,21 @@ impl Readability { } if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES { - parent_of_top_candidate = top_candidate.get_parent(); + let mut parent_of_top_candidate = top_candidate.get_parent(); - loop { - if let Some(parent) = &parent_of_top_candidate { - let mut lists_containing_this_ancestor = 0; - let tmp = usize::min( - alternative_candidate_ancestors.len(), - constants::MINIMUM_TOPCANDIDATES, - ); - for ancestor in alternative_candidate_ancestors.iter().take(tmp) { - lists_containing_this_ancestor += - if ancestor == parent { 1 } else { 0 }; - } + while let Some(parent) = &parent_of_top_candidate { + let mut lists_containing_this_ancestor = 0; + let tmp = usize::min( + alternative_candidate_ancestors.len(), + constants::MINIMUM_TOPCANDIDATES, + ); + for ancestor in alternative_candidate_ancestors.iter().take(tmp) { + lists_containing_this_ancestor += + if ancestor == parent { 1 } else { 0 }; + } - if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES { - top_candidate = parent.clone(); - break; - } - } else { + if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES { + top_candidate = parent.clone(); break; } @@ -311,7 +304,7 @@ impl Readability { // lurking in other places that we want to unify in. The sibling stuff // below does some of that - but only if we've looked high enough up the DOM // tree. 
- parent_of_top_candidate = top_candidate.get_parent(); + let mut parent_of_top_candidate = top_candidate.get_parent(); let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0); // The scores shouldn't get too low. diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index b84bc58..06b57c8 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -1,7 +1,4 @@ -use libxml::{ - tree::{Document, Node}, - xpath::Context, -}; +use libxml::tree::{Document, Node}; use reqwest::Url; use crate::{ @@ -9,13 +6,21 @@ use crate::{ full_text_parser::{config::ConfigEntry, metadata}, }; -async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) { +async fn run_test(name: &str) { + libxml::tree::node::set_node_rc_guard(3); + let _ = env_logger::builder().is_test(true).try_init(); + let empty_config = ConfigEntry::default(); - let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap(); + + let url = Url::parse("http://google.com").unwrap(); + let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html")) + .expect("Failed to read source HTML"); + let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap(); let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap(); - crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url); + + crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config); crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap(); - let article = Article { + let mut article = Article { title: None, author: None, url: url.clone(), @@ -23,17 +28,6 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) { thumbnail_url: None, document: None, }; - (document, xpath_ctx, article) -} - -#[tokio::test] -async fn test_1() { - let _ = env_logger::builder().is_test(true).try_init(); - - let html = 
std::fs::read_to_string(r"./resources/tests/readability-test-1.html") - .expect("Failed to read HTML"); - let url = Url::parse("http://google.com").unwrap(); - let (document, xpath_ctx, mut article) = prepare(&html, &url).await; let mut article_document = Document::new().unwrap(); let mut root = Node::new("article", None, &document).unwrap(); @@ -48,5 +42,21 @@ async fn test_1() { article.document = Some(article_document); let html = article.get_content().unwrap(); - std::fs::write("test.html", html).unwrap(); + + let expected = std::fs::read_to_string(format!("./resources/tests/readability/{name}/expected.html")) + .expect("Failed to read expected HTML"); + + //std::fs::write("expected.html", &html).unwrap(); + + assert_eq!(expected, html); +} + +#[tokio::test(flavor = "current_thread")] +async fn test_001() { + run_test("001").await +} + +#[tokio::test(flavor = "current_thread")] +async fn test_002() { + run_test("002").await } diff --git a/src/full_text_parser/tests.rs b/src/full_text_parser/tests.rs index 1e2fc33..76caec1 100644 --- a/src/full_text_parser/tests.rs +++ b/src/full_text_parser/tests.rs @@ -1,10 +1,11 @@ -use super::{FullTextParser, config::ConfigEntry}; +use super::{config::ConfigEntry, FullTextParser}; use libxml::tree::SaveOptions; use reqwest::Client; use std::path::PathBuf; #[tokio::test] async fn golem() { + let _ = env_logger::builder().is_test(true).try_init(); let out_path = PathBuf::from(r"./test_output"); let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap(); @@ -29,6 +30,7 @@ async fn golem() { #[tokio::test] async fn phoronix() { + let _ = env_logger::builder().is_test(true).try_init(); let out_path = PathBuf::from(r"./test_output"); let url = url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1") @@ -48,6 +50,7 @@ async fn phoronix() { #[tokio::test] async fn youtube() { + let _ = 
env_logger::builder().is_test(true).try_init(); let out_path = PathBuf::from(r"./test_output"); let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap(); @@ -57,7 +60,7 @@ async fn youtube() { assert_eq!( article.title.as_deref(), - Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn") + Some("RIGGED! Arena Shuffler is BROKEN") ); assert!(article .get_content() @@ -67,6 +70,7 @@ async fn youtube() { #[tokio::test] async fn encoding_windows_1252() { + let _ = env_logger::builder().is_test(true).try_init(); let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap(); let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new()) .await diff --git a/src/util.rs b/src/util.rs index 169c2e3..9defaa2 100644 --- a/src/util.rs +++ b/src/util.rs @@ -86,7 +86,8 @@ impl Util { for node in res { let content = node.get_content(); let url_str = if content.trim().is_empty() && node.has_attribute("href") { - node.get_attribute("href").unwrap() + node.get_attribute("href") + .expect("already checked for href") } else { content };