mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
refactor & more testing
This commit is contained in:
parent
7ae98904d4
commit
e3246af28b
14 changed files with 1969 additions and 101 deletions
132
resources/tests/readability/001/expected.html
Normal file
132
resources/tests/readability/001/expected.html
Normal file
|
@ -0,0 +1,132 @@
|
|||
<article><DIV id="readability-page-1" class="page"><section>
|
||||
<p><strong>So finally you're <a href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/" target="_blank">testing your frontend JavaScript code</a>? Great! The more you
|
||||
write tests, the more confident you are with your code… but how much precisely?
|
||||
That's where <a href="http://en.wikipedia.org/wiki/Code_coverage" target="_blank">code coverage</a> might
|
||||
help.</strong>
|
||||
</p>
|
||||
<p>The idea behind code coverage is to record which parts of your code (functions,
|
||||
statements, conditionals and so on) have been executed by your test suite,
|
||||
to compute metrics out of these data and usually to provide tools for navigating
|
||||
and inspecting them.</p>
|
||||
<p>Not a lot of frontend developers I know actually test their frontend code,
|
||||
and I can barely imagine how many of them have ever set up code coverage…
|
||||
Mostly because there are not many frontend-oriented tools in this area
|
||||
I guess.</p>
|
||||
<p>Actually I've only found one which provides an adapter for <a href="http://visionmedia.github.io/mocha/" target="_blank">Mocha</a> and
|
||||
actually works…</p>
|
||||
<blockquote>
|
||||
<p>Drinking game for web devs:
|
||||
<br>(1) Think of a noun
|
||||
<br>(2) Google "&lt;noun&gt;.js"
|
||||
<br>(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay)
|
||||
<a href="https://twitter.com/ironshay/statuses/370525864523743232" target="_blank">August 22, 2013</a>
|
||||
</blockquote>
|
||||
<p><strong><a href="http://blanketjs.org/" target="_blank">Blanket.js</a></strong> is an <em>easy to install, easy to configure,
|
||||
and easy to use JavaScript code coverage library that works both in-browser and
|
||||
with nodejs.</em>
|
||||
</p>
|
||||
<p>Its use is dead easy, adding Blanket support to your Mocha test suite
|
||||
is just a matter of adding this simple line to your HTML test file:</p>
|
||||
<pre><code><script src="vendor/blanket.js"
|
||||
data-cover-adapter="vendor/mocha-blanket.js"></script>
|
||||
</code></pre>
|
||||
|
||||
<p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js" target="_blank">blanket.js</a>,
|
||||
<a href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js" target="_blank">mocha-blanket.js</a>
|
||||
</p>
|
||||
<p>As an example, let's reuse the silly <code>Cow</code> example we used
|
||||
<a href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/" target="_blank">in a previous episode</a>:</p>
|
||||
<pre><code>// cow.js
|
||||
(function(exports) {
|
||||
"use strict";
|
||||
|
||||
function Cow(name) {
|
||||
this.name = name || "Anon cow";
|
||||
}
|
||||
exports.Cow = Cow;
|
||||
|
||||
Cow.prototype = {
|
||||
greets: function(target) {
|
||||
if (!target)
|
||||
throw new Error("missing target");
|
||||
return this.name + " greets " + target;
|
||||
}
|
||||
};
|
||||
})(this);
|
||||
</code></pre>
|
||||
|
||||
<p>And its test suite, powered by Mocha and <a href="http://chaijs.com/" target="_blank">Chai</a>:</p>
|
||||
<pre><code>var expect = chai.expect;
|
||||
|
||||
describe("Cow", function() {
|
||||
describe("constructor", function() {
|
||||
it("should have a default name", function() {
|
||||
var cow = new Cow();
|
||||
expect(cow.name).to.equal("Anon cow");
|
||||
});
|
||||
|
||||
it("should set cow's name if provided", function() {
|
||||
var cow = new Cow("Kate");
|
||||
expect(cow.name).to.equal("Kate");
|
||||
});
|
||||
});
|
||||
|
||||
describe("#greets", function() {
|
||||
it("should greet passed target", function() {
|
||||
var greetings = (new Cow("Kate")).greets("Baby");
|
||||
expect(greetings).to.equal("Kate greets Baby");
|
||||
});
|
||||
});
|
||||
});
|
||||
</code></pre>
|
||||
|
||||
<p>Let's create the HTML test file for it, featuring Blanket and its adapter
|
||||
for Mocha:</p>
|
||||
<pre><code><!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Test</title>
|
||||
<link rel="stylesheet" media="all" href="vendor/mocha.css">
|
||||
</head>
|
||||
<body>
|
||||
<div id="mocha"></div>
|
||||
<div id="messages"></div>
|
||||
<div id="fixtures"></div>
|
||||
<script src="vendor/mocha.js"></script>
|
||||
<script src="vendor/chai.js"></script>
|
||||
<script src="vendor/blanket.js"
|
||||
data-cover-adapter="vendor/mocha-blanket.js"></script>
|
||||
<script>mocha.setup('bdd');</script>
|
||||
<script src="cow.js" data-cover></script>
|
||||
<script src="cow_test.js"></script>
|
||||
<script>mocha.run();</script>
|
||||
</body>
|
||||
</html>
|
||||
</code></pre>
|
||||
|
||||
<p><strong>Notes</strong>:</p>
|
||||
<ul>
|
||||
<li>Notice the <code>data-cover</code> attribute we added to the script tag
|
||||
loading the source of our library;</li>
|
||||
<li>The HTML test file <em>must</em> be served over HTTP for the adapter to
|
||||
be loaded.</li>
|
||||
</ul>
|
||||
<p>Running the tests now gives us something like this:</p>
|
||||
<p>
|
||||
<img alt="screenshot" src="/static/code/2013/blanket-coverage.png">
|
||||
</p>
|
||||
<p>As you can see, the report at the bottom highlights that we haven't actually
|
||||
tested the case where an error is raised in case a target name is missing.
|
||||
We've been informed of that, nothing more, nothing less. We simply know
|
||||
we're missing a test here. Isn't this cool? I think so!</p>
|
||||
<p>Just remember that code coverage will only <a href="http://codebetter.com/karlseguin/2008/12/09/code-coverage-use-it-wisely/" target="_blank">bring you numbers</a> and
|
||||
raw information, not actual proofs that the whole of your <em>code logic</em> has
|
||||
been actually covered. If you ask me, the best inputs you can get about
|
||||
your code logic and implementation ever are the ones issued out of <a href="http://www.extremeprogramming.org/rules/pair.html" target="_blank">pair programming</a>
|
||||
sessions
|
||||
and <a href="http://alexgaynor.net/2013/sep/26/effective-code-review/" target="_blank">code reviews</a> —
|
||||
but that's another story.</p>
|
||||
<p><strong>So is code coverage a silver bullet? No. Is it useful? Definitely. Happy testing!</strong>
|
||||
</p>
|
||||
</section></DIV></article>
|
594
resources/tests/readability/002/expected.html
Normal file
594
resources/tests/readability/002/expected.html
Normal file
|
@ -0,0 +1,594 @@
|
|||
<article><DIV id="readability-page-1" class="page">
|
||||
<article role="article">
|
||||
<p>For more than a decade the Web has used XMLHttpRequest (XHR) to achieve
|
||||
asynchronous requests in JavaScript. While very useful, XHR is not a very
|
||||
nice API. It suffers from lack of separation of concerns. The input, output
|
||||
and state are all managed by interacting with one object, and state is
|
||||
tracked using events. Also, the event-based model doesn’t play well with
|
||||
JavaScript’s recent focus on Promise- and generator-based asynchronous
|
||||
programming.</p>
|
||||
<p>The <a href="https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API" target="_blank">Fetch API</a> intends
|
||||
to fix most of these problems. It does this by introducing the same primitives
|
||||
to JS that are used in the HTTP protocol. In addition, it introduces a
|
||||
utility function <code>fetch()</code> that succinctly captures the intention
|
||||
of retrieving a resource from the network.</p>
|
||||
<p>The <a href="https://fetch.spec.whatwg.org" target="_blank">Fetch specification</a>, which
|
||||
defines the API, nails down the semantics of a user agent fetching a resource.
|
||||
This, combined with ServiceWorkers, is an attempt to:</p>
|
||||
<ol>
|
||||
<li>Improve the offline experience.</li>
|
||||
<li>Expose the building blocks of the Web to the platform as part of the
|
||||
<a href="https://extensiblewebmanifesto.org/" target="_blank">extensible web movement</a>.</li>
|
||||
</ol>
|
||||
<p>As of this writing, the Fetch API is available in Firefox 39 (currently
|
||||
Nightly) and Chrome 42 (currently dev). Github has a <a href="https://github.com/github/fetch" target="_blank">Fetch polyfill</a>.</p>
|
||||
|
||||
<h2>Feature detection</h2>
|
||||
|
||||
<p>Fetch API support can be detected by checking for <code>Headers</code>, <code>Request</code>, <code>Response</code> or <code>fetch</code> on
|
||||
the <code>window</code> or <code>worker</code> scope.</p>
|
||||
|
||||
<h2>Simple fetching</h2>
|
||||
|
||||
<p>The most useful, high-level part of the Fetch API is the <code>fetch()</code> function.
|
||||
In its simplest form it takes a URL and returns a promise that resolves
|
||||
to the response. The response is captured as a <code>Response</code> object.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>fetch<span>(</span><span>"/data.json"</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>res<span>)</span> <span>{</span>
|
||||
<span>// res instanceof Response == true.</span>
|
||||
<span>if</span> <span>(</span>res.<span>ok</span><span>)</span> <span>{</span>
|
||||
res.<span>json</span><span>(</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>data<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span>data.<span>entries</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span>
|
||||
<span>}</span> <span>else</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Looks like the response wasn't perfect, got status"</span><span>,</span> res.<span>status</span><span>)</span><span>;</span>
|
||||
<span>}</span>
|
||||
<span>}</span><span>,</span> <span>function</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Fetch failed!"</span><span>,</span> e<span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>Submitting some parameters, it would look like this:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>fetch<span>(</span><span>"http://www.example.org/submit.php"</span><span>,</span> <span>{</span>
|
||||
method<span>:</span> <span>"POST"</span><span>,</span>
|
||||
headers<span>:</span> <span>{</span>
|
||||
<span>"Content-Type"</span><span>:</span> <span>"application/x-www-form-urlencoded"</span>
|
||||
<span>}</span><span>,</span>
|
||||
body<span>:</span> <span>"firstName=Nikhil&favColor=blue&password=easytoguess"</span>
|
||||
<span>}</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>res<span>)</span> <span>{</span>
|
||||
<span>if</span> <span>(</span>res.<span>ok</span><span>)</span> <span>{</span>
|
||||
alert<span>(</span><span>"Perfect! Your settings are saved."</span><span>)</span><span>;</span>
|
||||
<span>}</span> <span>else</span> <span>if</span> <span>(</span>res.<span>status</span> <span>==</span> <span>401</span><span>)</span> <span>{</span>
|
||||
alert<span>(</span><span>"Oops! You are not authorized."</span><span>)</span><span>;</span>
|
||||
<span>}</span>
|
||||
<span>}</span><span>,</span> <span>function</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
alert<span>(</span><span>"Error submitting form!"</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The <code>fetch()</code> function’s arguments are the same as those passed
|
||||
to the
|
||||
<br>
|
||||
<code>Request()</code> constructor, so you may directly pass arbitrarily
|
||||
complex requests to <code>fetch()</code> as discussed below.</p>
|
||||
|
||||
<h2>Headers</h2>
|
||||
|
||||
<p>Fetch introduces 3 interfaces. These are <code>Headers</code>, <code>Request</code> and
|
||||
<br>
|
||||
<code>Response</code>. They map directly to the underlying HTTP concepts,
|
||||
but have
|
||||
<br>certain visibility filters in place for privacy and security reasons,
|
||||
such as
|
||||
<br>supporting CORS rules and ensuring cookies aren’t readable by third parties.</p>
|
||||
<p>The <a href="https://fetch.spec.whatwg.org/#headers-class" target="_blank">Headers interface</a> is
|
||||
a simple multi-map of names to values:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> content <span>=</span> <span>"Hello World"</span><span>;</span>
|
||||
<span>var</span> reqHeaders <span>=</span> <span>new</span> Headers<span>(</span><span>)</span><span>;</span>
|
||||
reqHeaders.<span>append</span><span>(</span><span>"Content-Type"</span><span>,</span> <span>"text/plain"</span><span>)</span><span>;</span>
|
||||
reqHeaders.<span>append</span><span>(</span><span>"Content-Length"</span><span>,</span> content.<span>length</span>.<span>toString</span><span>(</span><span>)</span><span>)</span><span>;</span>
|
||||
reqHeaders.<span>append</span><span>(</span><span>"X-Custom-Header"</span><span>,</span> <span>"ProcessThisImmediately"</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The same can be achieved by passing an array of arrays or a JS object
|
||||
literal
|
||||
<br>to the constructor:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>reqHeaders <span>=</span> <span>new</span> Headers<span>(</span><span>{</span>
|
||||
<span>"Content-Type"</span><span>:</span> <span>"text/plain"</span><span>,</span>
|
||||
<span>"Content-Length"</span><span>:</span> content.<span>length</span>.<span>toString</span><span>(</span><span>)</span><span>,</span>
|
||||
<span>"X-Custom-Header"</span><span>:</span> <span>"ProcessThisImmediately"</span><span>,</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The contents can be queried and retrieved:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>console.<span>log</span><span>(</span>reqHeaders.<span>has</span><span>(</span><span>"Content-Type"</span><span>)</span><span>)</span><span>;</span> <span>// true</span>
|
||||
console.<span>log</span><span>(</span>reqHeaders.<span>has</span><span>(</span><span>"Set-Cookie"</span><span>)</span><span>)</span><span>;</span> <span>// false</span>
|
||||
reqHeaders.<span>set</span><span>(</span><span>"Content-Type"</span><span>,</span> <span>"text/html"</span><span>)</span><span>;</span>
|
||||
reqHeaders.<span>append</span><span>(</span><span>"X-Custom-Header"</span><span>,</span> <span>"AnotherValue"</span><span>)</span><span>;</span>
|
||||
|
||||
console.<span>log</span><span>(</span>reqHeaders.<span>get</span><span>(</span><span>"Content-Length"</span><span>)</span><span>)</span><span>;</span> <span>// 11</span>
|
||||
console.<span>log</span><span>(</span>reqHeaders.<span>getAll</span><span>(</span><span>"X-Custom-Header"</span><span>)</span><span>)</span><span>;</span> <span>// ["ProcessThisImmediately", "AnotherValue"]</span>
|
||||
|
||||
reqHeaders.<span>delete</span><span>(</span><span>"X-Custom-Header"</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>reqHeaders.<span>getAll</span><span>(</span><span>"X-Custom-Header"</span><span>)</span><span>)</span><span>;</span> <span>// []</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>Some of these operations are only useful in ServiceWorkers, but they provide
|
||||
<br>a much nicer API to Headers.</p>
|
||||
<p>Since Headers can be sent in requests, or received in responses, and have
|
||||
various limitations about what information can and should be mutable, <code>Headers</code> objects
|
||||
have a <strong>guard</strong> property. This is not exposed to the Web, but
|
||||
it affects which mutation operations are allowed on the Headers object.
|
||||
<br>Possible values are:</p>
|
||||
<ul>
|
||||
<li>“none”: default.</li>
|
||||
<li>“request”: guard for a Headers object obtained from a Request (<code>Request.headers</code>).</li>
|
||||
<li>“request-no-cors”: guard for a Headers object obtained from a Request
|
||||
created
|
||||
<br>with mode “no-cors”.</li>
|
||||
<li>“response”: naturally, for Headers obtained from Response (<code>Response.headers</code>).</li>
|
||||
<li>“immutable”: Mostly used for ServiceWorkers, renders a Headers object
|
||||
<br>read-only.</li>
|
||||
</ul>
|
||||
<p>The details of how each guard affects the behaviors of the Headers object
|
||||
are
|
||||
<br>in the <a href="https://fetch.spec.whatwg.org" target="_blank">specification</a>. For example,
|
||||
you may not append or set a “request” guarded Headers’ “Content-Length”
|
||||
header. Similarly, inserting “Set-Cookie” into a Response header is not
|
||||
allowed so that ServiceWorkers may not set cookies via synthesized Responses.</p>
|
||||
<p>All of the Headers methods throw TypeError if <code>name</code> is not a
|
||||
<a href="https://fetch.spec.whatwg.org/#concept-header-name" target="_blank">valid HTTP Header name</a>. The mutation operations will throw TypeError
|
||||
if there is an immutable guard. Otherwise they fail silently. For example:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> res <span>=</span> Response.<span>error</span><span>(</span><span>)</span><span>;</span>
|
||||
<span>try</span> <span>{</span>
|
||||
res.<span>headers</span>.<span>set</span><span>(</span><span>"Origin"</span><span>,</span> <span>"http://mybank.com"</span><span>)</span><span>;</span>
|
||||
<span>}</span> <span>catch</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Cannot pretend to be a bank!"</span><span>)</span><span>;</span>
|
||||
<span>}</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
|
||||
<h2>Request</h2>
|
||||
|
||||
<p>The Request interface defines a request to fetch a resource over HTTP.
|
||||
URL, method and headers are expected, but the Request also allows specifying
|
||||
a body, a request mode, credentials and cache hints.</p>
|
||||
<p>The simplest Request is of course, just a URL, as you may do to GET a
|
||||
resource.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> req <span>=</span> <span>new</span> Request<span>(</span><span>"/index.html"</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>req.<span>method</span><span>)</span><span>;</span> <span>// "GET"</span>
|
||||
console.<span>log</span><span>(</span>req.<span>url</span><span>)</span><span>;</span> <span>// "http://example.com/index.html"</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>You may also pass a Request to the <code>Request()</code> constructor to
|
||||
create a copy.
|
||||
<br>(This is not the same as calling the <code>clone()</code> method, which
|
||||
is covered in
|
||||
<br>the “Reading bodies” section).</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> copy <span>=</span> <span>new</span> Request<span>(</span>req<span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>copy.<span>method</span><span>)</span><span>;</span> <span>// "GET"</span>
|
||||
console.<span>log</span><span>(</span>copy.<span>url</span><span>)</span><span>;</span> <span>// "http://example.com/index.html"</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>Again, this form is probably only useful in ServiceWorkers.</p>
|
||||
<p>The non-URL attributes of the <code>Request</code> can only be set by passing
|
||||
initial
|
||||
<br>values as a second argument to the constructor. This argument is a dictionary.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> uploadReq <span>=</span> <span>new</span> Request<span>(</span><span>"/uploadImage"</span><span>,</span> <span>{</span>
|
||||
method<span>:</span> <span>"POST"</span><span>,</span>
|
||||
headers<span>:</span> <span>{</span>
|
||||
<span>"Content-Type"</span><span>:</span> <span>"image/png"</span><span>,</span>
|
||||
<span>}</span><span>,</span>
|
||||
body<span>:</span> <span>"image data"</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The Request’s mode is used to determine if cross-origin requests lead
|
||||
to valid responses, and which properties on the response are readable.
|
||||
Legal mode values are <code>"same-origin"</code>, <code>"no-cors"</code> (default)
|
||||
and <code>"cors"</code>.</p>
|
||||
<p>The <code>"same-origin"</code> mode is simple, if a request is made to another
|
||||
origin with this mode set, the result is simply an error. You could use
|
||||
this to ensure that
|
||||
<br>a request is always being made to your origin.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> arbitraryUrl <span>=</span> document.<span>getElementById</span><span>(</span><span>"url-input"</span><span>)</span>.<span>value</span><span>;</span>
|
||||
fetch<span>(</span>arbitraryUrl<span>,</span> <span>{</span> mode<span>:</span> <span>"same-origin"</span> <span>}</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>res<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Response succeeded?"</span><span>,</span> res.<span>ok</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>,</span> <span>function</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Please enter a same-origin URL!"</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The <code>"no-cors"</code> mode captures what the web platform does by default
|
||||
for scripts you import from CDNs, images hosted on other domains, and so
|
||||
on. First, it prevents the method from being anything other than “HEAD”,
|
||||
“GET” or “POST”. Second, if any ServiceWorkers intercept these requests,
|
||||
they may not add or override any headers except for <a href="https://fetch.spec.whatwg.org/#simple-header" target="_blank">these</a>.
|
||||
Third, JavaScript may not access any properties of the resulting Response.
|
||||
This ensures that ServiceWorkers do not affect the semantics of the Web
|
||||
and prevents security and privacy issues that could arise from leaking
|
||||
data across domains.</p>
|
||||
<p><code>"cors"</code> mode is what you’ll usually use to make known cross-origin
|
||||
requests to access various APIs offered by other vendors. These are expected
|
||||
to adhere to
|
||||
<br>the <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Access_control_CORS" target="_blank">CORS protocol</a>.
|
||||
Only a <a href="https://fetch.spec.whatwg.org/#concept-filtered-response-cors" target="_blank">limited set</a> of
|
||||
headers is exposed in the Response, but the body is readable. For example,
|
||||
you could get a list of Flickr’s <a href="https://www.flickr.com/services/api/flickr.interestingness.getList.html" target="_blank">most interesting</a> photos
|
||||
today like this:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> u <span>=</span> <span>new</span> URLSearchParams<span>(</span><span>)</span><span>;</span>
|
||||
u.<span>append</span><span>(</span><span>'method'</span><span>,</span> <span>'flickr.interestingness.getList'</span><span>)</span><span>;</span>
|
||||
u.<span>append</span><span>(</span><span>'api_key'</span><span>,</span> <span>'<insert api key here>'</span><span>)</span><span>;</span>
|
||||
u.<span>append</span><span>(</span><span>'format'</span><span>,</span> <span>'json'</span><span>)</span><span>;</span>
|
||||
u.<span>append</span><span>(</span><span>'nojsoncallback'</span><span>,</span> <span>'1'</span><span>)</span><span>;</span>
|
||||
|
||||
<span>var</span> apiCall <span>=</span> fetch<span>(</span><span>'https://api.flickr.com/services/rest?'</span> <span>+</span> u<span>)</span><span>;</span>
|
||||
|
||||
apiCall.<span>then</span><span>(</span><span>function</span><span>(</span>response<span>)</span> <span>{</span>
|
||||
<span>return</span> response.<span>json</span><span>(</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>json<span>)</span> <span>{</span>
|
||||
<span>// photo is a list of photos.</span>
|
||||
<span>return</span> json.<span>photos</span>.<span>photo</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>photos<span>)</span> <span>{</span>
|
||||
photos.<span>forEach</span><span>(</span><span>function</span><span>(</span>photo<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span>photo.<span>title</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>You may not read out the “Date” header since Flickr does not allow it
|
||||
via
|
||||
<br>
|
||||
<code>Access-Control-Expose-Headers</code>.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>response.<span>headers</span>.<span>get</span><span>(</span><span>"Date"</span><span>)</span><span>;</span> <span>// null</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>The <code>credentials</code> enumeration determines if cookies for the other
|
||||
domain are
|
||||
<br>sent to cross-origin requests. This is similar to XHR’s <code>withCredentials</code>
|
||||
<br>flag, but tri-valued as <code>"omit"</code> (default), <code>"same-origin"</code> and <code>"include"</code>.</p>
|
||||
<p>The Request object will also give the ability to offer caching hints to
|
||||
the user-agent. This is currently undergoing some <a href="https://github.com/slightlyoff/ServiceWorker/issues/585" target="_blank">security review</a>.
|
||||
Firefox exposes the attribute, but it has no effect.</p>
|
||||
<p>Requests have two read-only attributes that are relevant to ServiceWorkers
|
||||
<br>intercepting them. There is the string <code>referrer</code>, which is
|
||||
set by the UA to be
|
||||
<br>the referrer of the Request. This may be an empty string. The other is
|
||||
<br>
|
||||
<code>context</code> which is a rather <a href="https://fetch.spec.whatwg.org/#requestcredentials" target="_blank">large enumeration</a> defining
|
||||
what sort of resource is being fetched. This could be “image” if the request
|
||||
is from an
|
||||
&lt;img&gt; tag in the controlled document, “worker” if it is an attempt to load a
|
||||
worker script, and so on. When used with the <code>fetch()</code> function,
|
||||
it is “fetch”.</p>
|
||||
|
||||
<h2>Response</h2>
|
||||
|
||||
<p><code>Response</code> instances are returned by calls to <code>fetch()</code>.
|
||||
They can also be created by JS, but this is only useful in ServiceWorkers.</p>
|
||||
<p>We have already seen some attributes of Response when we looked at <code>fetch()</code>.
|
||||
The most obvious candidates are <code>status</code>, an integer (default
|
||||
value 200) and <code>statusText</code> (default value “OK”), which correspond
|
||||
to the HTTP status code and reason. The <code>ok</code> attribute is just
|
||||
a shorthand for checking that <code>status</code> is in the range 200-299
|
||||
inclusive.</p>
|
||||
<p><code>headers</code> is the Response’s Headers object, with guard “response”.
|
||||
The <code>url</code> attribute reflects the URL of the corresponding request.</p>
|
||||
<p>Response also has a <code>type</code>, which is “basic”, “cors”, “default”,
|
||||
“error” or
|
||||
<br>“opaque”.</p>
|
||||
<ul>
|
||||
<li>
|
||||
<code>"basic"</code>: normal, same origin response, with all headers exposed
|
||||
except
|
||||
<br>“Set-Cookie” and “Set-Cookie2”.</li>
|
||||
<li>
|
||||
<code>"cors"</code>: response was received from a valid cross-origin request.
|
||||
<a href="https://fetch.spec.whatwg.org/#concept-filtered-response-cors" target="_blank">Certain headers and the body</a> may be accessed.</li>
|
||||
<li>
|
||||
<code>"error"</code>: network error. No useful information describing
|
||||
the error is available. The Response’s status is 0, headers are empty and
|
||||
immutable. This is the type for a Response obtained from <code>Response.error()</code>.</li>
|
||||
<li>
|
||||
<code>"opaque"</code>: response for “no-cors” request to cross-origin
|
||||
resource. <a href="https://fetch.spec.whatwg.org/#concept-filtered-response-opaque" target="_blank">Severely<br>
|
||||
restricted</a>
|
||||
</li>
|
||||
</ul>
|
||||
<p>The “error” type results in the <code>fetch()</code> Promise rejecting with
|
||||
TypeError.</p>
|
||||
<p>There are certain attributes that are useful only in a ServiceWorker scope.
|
||||
The
|
||||
<br>idiomatic way to return a Response to an intercepted request in ServiceWorkers
|
||||
is:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>addEventListener<span>(</span><span>'fetch'</span><span>,</span> <span>function</span><span>(</span>event<span>)</span> <span>{</span>
|
||||
event.<span>respondWith</span><span>(</span><span>new</span> Response<span>(</span><span>"Response body"</span><span>,</span> <span>{</span>
|
||||
headers<span>:</span> <span>{</span> <span>"Content-Type"</span> <span>:</span> <span>"text/plain"</span> <span>}</span>
|
||||
<span>}</span><span>)</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>As you can see, Response has a two argument constructor, where both arguments
|
||||
are optional. The first argument is a body initializer, and the second
|
||||
is a dictionary to set the <code>status</code>, <code>statusText</code> and <code>headers</code>.</p>
|
||||
<p>The static method <code>Response.error()</code> simply returns an error
|
||||
response. Similarly, <code>Response.redirect(url, status)</code> returns
|
||||
a Response resulting in
|
||||
<br>a redirect to <code>url</code>.</p>
|
||||
|
||||
<h2>Dealing with bodies</h2>
|
||||
|
||||
<p>Both Requests and Responses may contain body data. We’ve been glossing
|
||||
over it because of the various data types body may contain, but we will
|
||||
cover it in detail now.</p>
|
||||
<p>A body is an instance of any of the following types.</p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/ArrayBuffer" target="_blank">ArrayBuffer</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/API/ArrayBufferView" target="_blank">ArrayBufferView</a> (Uint8Array
|
||||
and friends)</li>
|
||||
<li>
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/API/Blob" target="_blank">Blob</a>/
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/API/File" target="_blank">File</a>
|
||||
</li>
|
||||
<li>string</li>
|
||||
<li>
|
||||
<a href="https://url.spec.whatwg.org/#interface-urlsearchparams" target="_blank">URLSearchParams</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/API/FormData" target="_blank">FormData</a> –
|
||||
currently not supported by either Gecko or Blink. Firefox expects to ship
|
||||
this in version 39 along with the rest of Fetch.</li>
|
||||
</ul>
|
||||
<p>In addition, Request and Response both offer the following methods to
|
||||
extract their body. These all return a Promise that is eventually resolved
|
||||
with the actual content.</p>
|
||||
<ul>
|
||||
<li>
|
||||
<code>arrayBuffer()</code>
|
||||
</li>
|
||||
<li>
|
||||
<code>blob()</code>
|
||||
</li>
|
||||
<li>
|
||||
<code>json()</code>
|
||||
</li>
|
||||
<li>
|
||||
<code>text()</code>
|
||||
</li>
|
||||
<li>
|
||||
<code>formData()</code>
|
||||
</li>
|
||||
</ul>
|
||||
<p>This is a significant improvement over XHR in terms of ease of use of
|
||||
non-text data!</p>
|
||||
<p>Request bodies can be set by passing <code>body</code> parameters:</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> form <span>=</span> <span>new</span> FormData<span>(</span>document.<span>getElementById</span><span>(</span><span>'login-form'</span><span>)</span><span>)</span><span>;</span>
|
||||
fetch<span>(</span><span>"/login"</span><span>,</span> <span>{</span>
|
||||
method<span>:</span> <span>"POST"</span><span>,</span>
|
||||
body<span>:</span> form
|
||||
<span>}</span><span>)</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>Responses take the first argument as the body.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> res <span>=</span> <span>new</span> Response<span>(</span><span>new</span> File<span>(</span><span>[</span><span>"chunk"</span><span>,</span> <span>"chunk"</span><span>]</span><span>,</span> <span>"archive.zip"</span><span>,</span>
|
||||
<span>{</span> type<span>:</span> <span>"application/zip"</span> <span>}</span><span>)</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>Both Request and Response (and by extension the <code>fetch()</code> function),
|
||||
will try to intelligently <a href="https://fetch.spec.whatwg.org/#concept-bodyinit-extract" target="_blank">determine the content type</a>.
|
||||
Request will also automatically set a “Content-Type” header if none is
|
||||
set in the dictionary.</p>
|
||||
|
||||
<h3>Streams and cloning</h3>
|
||||
|
||||
<p>It is important to realise that Request and Response bodies can only be
|
||||
read once! Both interfaces have a boolean attribute <code>bodyUsed</code> to
|
||||
determine if it is safe to read or not.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre><span>var</span> res <span>=</span> <span>new</span> Response<span>(</span><span>"one time use"</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>res.<span>bodyUsed</span><span>)</span><span>;</span> <span>// false</span>
|
||||
res.<span>text</span><span>(</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>v<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span>res.<span>bodyUsed</span><span>)</span><span>;</span> <span>// true</span>
|
||||
<span>}</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>res.<span>bodyUsed</span><span>)</span><span>;</span> <span>// true</span>
|
||||
|
||||
res.<span>text</span><span>(</span><span>)</span>.<span>catch</span><span>(</span><span>function</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
console.<span>log</span><span>(</span><span>"Tried to read already consumed Response"</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
<p>This decision allows easing the transition to an eventual <a href="https://streams.spec.whatwg.org/" target="_blank">stream-based</a> Fetch
|
||||
API. The intention is to let applications consume data as it arrives, allowing
|
||||
for JavaScript to deal with larger files like videos, and perform things
|
||||
like compression and editing on the fly.</p>
|
||||
<p>Often, you’ll want access to the body multiple times. For example, you
|
||||
can use the upcoming <a href="http://slightlyoff.github.io/ServiceWorker/spec/service_worker/index.html#cache-objects" target="_blank">Cache API</a> to
|
||||
store Requests and Responses for offline use, and Cache requires bodies
|
||||
to be available for reading.</p>
|
||||
<p>So how do you read out the body multiple times within such constraints?
|
||||
The API provides a <code>clone()</code> method on the two interfaces. This
|
||||
will return a clone of the object, with a ‘new’ body. <code>clone()</code> MUST
|
||||
be called before the body of the corresponding object has been used. That
|
||||
is, <code>clone()</code> first, read later.</p>
|
||||
<P>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<pre>addEventListener<span>(</span><span>'fetch'</span><span>,</span> <span>function</span><span>(</span>evt<span>)</span> <span>{</span>
|
||||
<span>var</span> sheep <span>=</span> <span>new</span> Response<span>(</span><span>"Dolly"</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>sheep.<span>bodyUsed</span><span>)</span><span>;</span> <span>// false</span>
|
||||
<span>var</span> clone <span>=</span> sheep.<span>clone</span><span>(</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>clone.<span>bodyUsed</span><span>)</span><span>;</span> <span>// false</span>
|
||||
|
||||
clone.<span>text</span><span>(</span><span>)</span><span>;</span>
|
||||
console.<span>log</span><span>(</span>sheep.<span>bodyUsed</span><span>)</span><span>;</span> <span>// false</span>
|
||||
console.<span>log</span><span>(</span>clone.<span>bodyUsed</span><span>)</span><span>;</span> <span>// true</span>
|
||||
|
||||
evt.<span>respondWith</span><span>(</span>cache.<span>add</span><span>(</span>sheep.<span>clone</span><span>(</span><span>)</span><span>)</span>.<span>then</span><span>(</span><span>function</span><span>(</span>e<span>)</span> <span>{</span>
|
||||
<span>return</span> sheep<span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span>
|
||||
<span>}</span><span>)</span><span>;</span></pre>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</P>
|
||||
|
||||
<h2>Future improvements</h2>
|
||||
|
||||
<p>Along with the transition to streams, Fetch will eventually have the ability
|
||||
to abort running <code>fetch()</code>es and some way to report the progress
|
||||
of a fetch. These are provided by XHR, but are a little tricky to fit in
|
||||
the Promise-based nature of the Fetch API.</p>
|
||||
<p>You can contribute to the evolution of this API by participating in discussions
|
||||
on the <a href="https://whatwg.org/mailing-list" target="_blank">WHATWG mailing list</a> and
|
||||
in the issues in the <a href="https://www.w3.org/Bugs/Public/buglist.cgi?product=WHATWG&component=Fetch&resolution=---" target="_blank">Fetch</a> and
|
||||
<a href="https://github.com/slightlyoff/ServiceWorker/issues" target="_blank">ServiceWorker</a>specifications.</p>
|
||||
<p>For a better web!</p>
|
||||
<p><em>The author would like to thank Andrea Marchesini, Anne van Kesteren and Ben<br>
|
||||
Kelly for helping with the specification and implementation.</em>
|
||||
</p>
|
||||
<footer>
|
||||
<p>Posted by <a href="https://hacks.mozilla.org/author/nmarathemozilla-com/" title="Posts by Nikhil Marathe" rel="author" target="_blank">Nikhil Marathe</a>
|
||||
on
|
||||
<time datetime="2015-03-10T08:05:41-07:00">March 10, 2015</time>at
|
||||
<time datetime="PDT08:05:41-07:00">08:05</time>
|
||||
</p>
|
||||
<P>
|
||||
|
||||
</P>
|
||||
</footer>
|
||||
</article>
|
||||
|
||||
|
||||
</DIV></article>
|
1131
resources/tests/readability/002/source.html
Normal file
1131
resources/tests/readability/002/source.html
Normal file
File diff suppressed because it is too large
Load diff
|
@ -32,8 +32,7 @@ impl Article {
|
|||
.map(|doc| doc.to_string_with_options(options))
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
||||
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
||||
if let Some(ref html) = self.get_content() {
|
||||
if let Ok(()) = std::fs::create_dir_all(path) {
|
||||
let mut file_name = match self.title.clone() {
|
||||
|
|
|
@ -4,6 +4,8 @@ use once_cell::sync::Lazy;
|
|||
use regex::Regex;
|
||||
|
||||
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||
pub static IS_IMAGE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).expect("IS_IMAGE regex"));
|
||||
pub static SIBLING_CONTENT: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
|
||||
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
|
||||
|
|
|
@ -21,7 +21,13 @@ impl ConfigCollection {
|
|||
for (file_name, entry) in EmbededConfigFiles::iter()
|
||||
.filter_map(|file_name| EmbededConfigFiles::get(&file_name).map(|e| (file_name, e)))
|
||||
{
|
||||
let entry = ConfigEntry::parse_data(entry.data).await.unwrap();
|
||||
let entry = match ConfigEntry::parse_data(entry.data).await {
|
||||
Ok(entry) => entry,
|
||||
Err(error) => {
|
||||
log::error!("{error}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let file_name: &str = file_name.borrow();
|
||||
embedded_entries.insert(file_name.to_owned(), entry);
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ pub struct Header {
|
|||
pub value: String,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Default)]
|
||||
pub struct ConfigEntry {
|
||||
pub xpath_title: Vec<String>,
|
||||
pub xpath_author: Vec<String>,
|
||||
|
@ -34,24 +34,6 @@ pub struct ConfigEntry {
|
|||
pub next_page_link: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for ConfigEntry {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
xpath_title: Vec::new(),
|
||||
xpath_author: Vec::new(),
|
||||
xpath_date: Vec::new(),
|
||||
xpath_body: Vec::new(),
|
||||
xpath_strip: Vec::new(),
|
||||
strip_id_or_class: Vec::new(),
|
||||
strip_image_src: Vec::new(),
|
||||
replace: Vec::new(),
|
||||
header: Vec::new(),
|
||||
single_page_link: None,
|
||||
next_page_link: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ConfigEntry {
|
||||
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
|
||||
let mut file = fs::File::open(&config_path).await?;
|
||||
|
|
|
@ -23,9 +23,9 @@ pub fn extract(
|
|||
let new_title = constants::TITLE_CUT_END.replace(&title, "$1");
|
||||
let word_count = constants::WORD_COUNT.split(&title).count();
|
||||
if word_count < 3 {
|
||||
constants::TITLE_CUT_FRONT.replace(&title, "$1").to_string()
|
||||
constants::TITLE_CUT_FRONT.replace(&title, "$1").trim().to_string()
|
||||
} else {
|
||||
new_title.to_string()
|
||||
new_title.trim().to_string()
|
||||
}
|
||||
} else {
|
||||
title
|
||||
|
|
|
@ -11,6 +11,7 @@ use self::config::{ConfigCollection, ConfigEntry};
|
|||
use self::error::FullTextParserError;
|
||||
use self::readability::Readability;
|
||||
use crate::article::Article;
|
||||
use crate::constants;
|
||||
use crate::util::Util;
|
||||
|
||||
use encoding_rs::Encoding;
|
||||
|
@ -19,9 +20,8 @@ use libxml::parser::Parser;
|
|||
use libxml::tree::{Document, Node};
|
||||
use libxml::xpath::Context;
|
||||
use log::{debug, error, info, warn};
|
||||
use regex::Regex;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
use reqwest::{Client, Url};
|
||||
use std::path::Path;
|
||||
use std::str::from_utf8;
|
||||
|
||||
|
@ -40,6 +40,8 @@ impl FullTextParser {
|
|||
url: &url::Url,
|
||||
client: &Client,
|
||||
) -> Result<Article, FullTextParserError> {
|
||||
libxml::tree::node::set_node_rc_guard(3);
|
||||
|
||||
info!("Scraping article: '{}'", url.as_str());
|
||||
|
||||
// check if we have a config for the url
|
||||
|
@ -106,7 +108,6 @@ impl FullTextParser {
|
|||
|
||||
self.parse_pages(
|
||||
&mut article,
|
||||
&url,
|
||||
&html,
|
||||
&mut root,
|
||||
config,
|
||||
|
@ -137,7 +138,6 @@ impl FullTextParser {
|
|||
async fn parse_pages(
|
||||
&self,
|
||||
article: &mut Article,
|
||||
url: &url::Url,
|
||||
html: &str,
|
||||
root: &mut Node,
|
||||
config: Option<&ConfigEntry>,
|
||||
|
@ -183,7 +183,8 @@ impl FullTextParser {
|
|||
if article.thumbnail_url.is_none() {
|
||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
}
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
Self::strip_junk(&xpath_ctx, config, global_config);
|
||||
Self::fix_urls(&xpath_ctx, &article.url);
|
||||
Self::unwrap_noscript_images(&xpath_ctx)?;
|
||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
|
@ -200,7 +201,8 @@ impl FullTextParser {
|
|||
let html = Self::download(&url, client, headers).await?;
|
||||
document = Self::parse_html(&html, config, global_config)?;
|
||||
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, &url);
|
||||
Self::strip_junk(&xpath_ctx, config, global_config);
|
||||
Self::fix_urls(&xpath_ctx, &url);
|
||||
Self::unwrap_noscript_images(&xpath_ctx)?;
|
||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
}
|
||||
|
@ -256,7 +258,8 @@ impl FullTextParser {
|
|||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
metadata::extract(&xpath_ctx, config, Some(global_config), article);
|
||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
Self::strip_junk(&xpath_ctx, config, global_config);
|
||||
Self::fix_urls(&xpath_ctx, url);
|
||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
Ok(())
|
||||
|
@ -543,12 +546,15 @@ impl FullTextParser {
|
|||
Ok(url)
|
||||
}
|
||||
|
||||
fn strip_junk(
|
||||
context: &Context,
|
||||
config: Option<&ConfigEntry>,
|
||||
global_config: &ConfigEntry,
|
||||
url: &url::Url,
|
||||
) {
|
||||
/// Runs `repair_urls` against `url` for every URL-carrying attribute handled here:
/// `img[src]`, `a[src]`, `a[href]`, `object[data]` and `iframe[src]`.
///
/// Each result is deliberately discarded (`let _ =`), so a failure on one
/// element/attribute pair does not stop the remaining ones from being processed.
// NOTE(review): `repair_urls` is not visible here — presumably it resolves relative
// URLs against `url`; confirm against its definition.
fn fix_urls(context: &Context, url: &Url) {
|
||||
let _ = Self::repair_urls(context, "//img", "src", url);
|
||||
let _ = Self::repair_urls(context, "//a", "src", url);
|
||||
let _ = Self::repair_urls(context, "//a", "href", url);
|
||||
let _ = Self::repair_urls(context, "//object", "data", url);
|
||||
let _ = Self::repair_urls(context, "//iframe", "src", url);
|
||||
}
|
||||
|
||||
fn strip_junk(context: &Context, config: Option<&ConfigEntry>, global_config: &ConfigEntry) {
|
||||
// strip specified xpath
|
||||
if let Some(config) = config {
|
||||
for xpath_strip in &config.xpath_strip {
|
||||
|
@ -596,12 +602,6 @@ impl FullTextParser {
|
|||
let _ = Self::remove_attribute(context, Some("img"), "sizes");
|
||||
let _ = Self::add_attribute(context, Some("a"), "target", "_blank");
|
||||
|
||||
let _ = Self::repair_urls(context, "//img", "src", url);
|
||||
let _ = Self::repair_urls(context, "//a", "src", url);
|
||||
let _ = Self::repair_urls(context, "//a", "href", url);
|
||||
let _ = Self::repair_urls(context, "//object", "data", url);
|
||||
let _ = Self::repair_urls(context, "//iframe", "src", url);
|
||||
|
||||
// strip elements using Readability.com and Instapaper.com ignore class names
|
||||
// .entry-unrelated and .instapaper_ignore
|
||||
// See http://blog.instapaper.com/post/730281947
|
||||
|
@ -638,7 +638,6 @@ impl FullTextParser {
|
|||
fn unwrap_noscript_images(ctx: &Context) -> Result<(), FullTextParserError> {
|
||||
// Find img without source or attributes that might contain an image, and remove it.
|
||||
// This prevents a placeholder img from being replaced by the img from noscript in the next step.
|
||||
let img_regex = Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).unwrap();
|
||||
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
|
||||
for mut img_node in img_nodes {
|
||||
let attrs = img_node.get_attributes();
|
||||
|
@ -648,7 +647,7 @@ impl FullTextParser {
|
|||
|| name == "srcset"
|
||||
|| name == "data-src"
|
||||
|| name == "data-srcset"
|
||||
|| img_regex.is_match(&value)
|
||||
|| constants::IS_IMAGE.is_match(value)
|
||||
});
|
||||
if !keep {
|
||||
img_node.unlink();
|
||||
|
@ -668,24 +667,31 @@ impl FullTextParser {
|
|||
// attributes that might contain an image.
|
||||
if let Some(prev) = noscript_node.get_prev_element_sibling() {
|
||||
if Util::is_single_image(&prev) {
|
||||
|
||||
{
|
||||
let mut prev_img = prev.clone();
|
||||
|
||||
if prev_img.get_name().to_uppercase() != "IMG" {
|
||||
if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img").into_iter().next() {
|
||||
if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img")
|
||||
.into_iter()
|
||||
.next()
|
||||
{
|
||||
prev_img = img_node;
|
||||
}
|
||||
}
|
||||
|
||||
let new_img = Util::get_elements_by_tag_name(&noscript_node, "img").into_iter().next();
|
||||
let new_img = Util::get_elements_by_tag_name(&noscript_node, "img")
|
||||
.into_iter()
|
||||
.next();
|
||||
if let Some(mut new_img) = new_img {
|
||||
for (key, value) in prev_img.get_attributes() {
|
||||
if value.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if key == "src" || key == "srcset" || img_regex.is_match(&value) {
|
||||
if key == "src"
|
||||
|| key == "srcset"
|
||||
|| constants::IS_IMAGE.is_match(&value)
|
||||
{
|
||||
if new_img.get_attribute(&key).as_deref() == Some(&value) {
|
||||
continue;
|
||||
}
|
||||
|
@ -695,7 +701,10 @@ impl FullTextParser {
|
|||
attr_name = format!("data-old-{attr_name}");
|
||||
}
|
||||
|
||||
new_img.set_attribute(&attr_name, &value).unwrap();
|
||||
new_img.set_attribute(&attr_name, &value).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -703,7 +712,10 @@ impl FullTextParser {
|
|||
|
||||
if let Some(mut parent) = noscript_node.get_parent() {
|
||||
if let Some(first_child) = noscript_node.get_first_child() {
|
||||
parent.replace_child_node(first_child, prev).unwrap();
|
||||
parent.replace_child_node(first_child, prev).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
noscript_node.unlink();
|
||||
}
|
||||
}
|
||||
|
@ -825,7 +837,9 @@ impl FullTextParser {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
pub(crate) fn post_process_content(
|
||||
root: &mut Node
|
||||
) -> Result<(), FullTextParserError> {
|
||||
Self::clean_classes(root)?;
|
||||
Self::simplify_nested_elements(root)?;
|
||||
Ok(())
|
||||
|
|
|
@ -5,7 +5,7 @@ mod tests;
|
|||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use libxml::tree::{node, Document, Node, NodeType};
|
||||
use libxml::tree::{Document, Node, NodeType};
|
||||
|
||||
use self::state::State;
|
||||
use super::error::FullTextParserError;
|
||||
|
@ -19,8 +19,6 @@ impl Readability {
|
|||
root: &mut Node,
|
||||
title: Option<&str>,
|
||||
) -> Result<bool, FullTextParserError> {
|
||||
node::set_node_rc_guard(6);
|
||||
|
||||
let mut state = State::default();
|
||||
let mut document = document;
|
||||
let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
|
||||
|
@ -253,12 +251,11 @@ impl Readability {
|
|||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
// We also have to copy the body node so it is something we can modify.
|
||||
let mut rt = document.get_root_element().unwrap();
|
||||
Self::initialize_node(&mut rt, &state).unwrap();
|
||||
let mut rt = document.get_root_element().expect("doc should have root");
|
||||
Self::initialize_node(&mut rt, &state).expect("init should not fail");
|
||||
needed_to_create_top_candidate = true;
|
||||
rt
|
||||
});
|
||||
let mut parent_of_top_candidate = None;
|
||||
|
||||
let mut alternative_candidate_ancestors = Vec::new();
|
||||
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
||||
|
@ -274,25 +271,21 @@ impl Readability {
|
|||
}
|
||||
|
||||
if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
let mut parent_of_top_candidate = top_candidate.get_parent();
|
||||
|
||||
loop {
|
||||
if let Some(parent) = &parent_of_top_candidate {
|
||||
let mut lists_containing_this_ancestor = 0;
|
||||
let tmp = usize::min(
|
||||
alternative_candidate_ancestors.len(),
|
||||
constants::MINIMUM_TOPCANDIDATES,
|
||||
);
|
||||
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
|
||||
lists_containing_this_ancestor +=
|
||||
if ancestor == parent { 1 } else { 0 };
|
||||
}
|
||||
while let Some(parent) = &parent_of_top_candidate {
|
||||
let mut lists_containing_this_ancestor = 0;
|
||||
let tmp = usize::min(
|
||||
alternative_candidate_ancestors.len(),
|
||||
constants::MINIMUM_TOPCANDIDATES,
|
||||
);
|
||||
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
|
||||
lists_containing_this_ancestor +=
|
||||
if ancestor == parent { 1 } else { 0 };
|
||||
}
|
||||
|
||||
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
||||
top_candidate = parent.clone();
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
||||
top_candidate = parent.clone();
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -311,7 +304,7 @@ impl Readability {
|
|||
// lurking in other places that we want to unify in. The sibling stuff
|
||||
// below does some of that - but only if we've looked high enough up the DOM
|
||||
// tree.
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
let mut parent_of_top_candidate = top_candidate.get_parent();
|
||||
let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
|
||||
|
||||
// The scores shouldn't get too low.
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
use libxml::{
|
||||
tree::{Document, Node},
|
||||
xpath::Context,
|
||||
};
|
||||
use libxml::tree::{Document, Node};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{
|
||||
|
@ -9,13 +6,21 @@ use crate::{
|
|||
full_text_parser::{config::ConfigEntry, metadata},
|
||||
};
|
||||
|
||||
async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
|
||||
async fn run_test(name: &str) {
|
||||
libxml::tree::node::set_node_rc_guard(3);
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let empty_config = ConfigEntry::default();
|
||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
|
||||
|
||||
let url = Url::parse("http://google.com").unwrap();
|
||||
let html = std::fs::read_to_string(format!("./resources/tests/readability/{name}/source.html"))
|
||||
.expect("Failed to read source HTML");
|
||||
let document = crate::FullTextParser::parse_html(&html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
|
||||
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
|
||||
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
|
||||
let article = Article {
|
||||
let mut article = Article {
|
||||
title: None,
|
||||
author: None,
|
||||
url: url.clone(),
|
||||
|
@ -23,17 +28,6 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
|
|||
thumbnail_url: None,
|
||||
document: None,
|
||||
};
|
||||
(document, xpath_ctx, article)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_1() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
|
||||
.expect("Failed to read HTML");
|
||||
let url = Url::parse("http://google.com").unwrap();
|
||||
let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
|
||||
|
||||
let mut article_document = Document::new().unwrap();
|
||||
let mut root = Node::new("article", None, &document).unwrap();
|
||||
|
@ -48,5 +42,21 @@ async fn test_1() {
|
|||
|
||||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
std::fs::write("test.html", html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!("./resources/tests/readability/{name}/expected.html"))
|
||||
.expect("Failed to read expected HTML");
|
||||
|
||||
//std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
assert_eq!(expected, html);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
async fn test_001() {
|
||||
run_test("001").await
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
async fn test_002() {
|
||||
run_test("002").await
|
||||
}
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
use super::{FullTextParser, config::ConfigEntry};
|
||||
use super::{config::ConfigEntry, FullTextParser};
|
||||
use libxml::tree::SaveOptions;
|
||||
use reqwest::Client;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[tokio::test]
|
||||
async fn golem() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
||||
|
||||
|
@ -29,6 +30,7 @@ async fn golem() {
|
|||
|
||||
#[tokio::test]
|
||||
async fn phoronix() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url =
|
||||
url::Url::parse("http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1")
|
||||
|
@ -48,6 +50,7 @@ async fn phoronix() {
|
|||
|
||||
#[tokio::test]
|
||||
async fn youtube() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap();
|
||||
|
||||
|
@ -57,7 +60,7 @@ async fn youtube() {
|
|||
|
||||
assert_eq!(
|
||||
article.title.as_deref(),
|
||||
Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
|
||||
Some("RIGGED! Arena Shuffler is BROKEN")
|
||||
);
|
||||
assert!(article
|
||||
.get_content()
|
||||
|
@ -67,6 +70,7 @@ async fn youtube() {
|
|||
|
||||
#[tokio::test]
|
||||
async fn encoding_windows_1252() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
|
||||
let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new())
|
||||
.await
|
||||
|
|
|
@ -86,7 +86,8 @@ impl Util {
|
|||
for node in res {
|
||||
let content = node.get_content();
|
||||
let url_str = if content.trim().is_empty() && node.has_attribute("href") {
|
||||
node.get_attribute("href").unwrap()
|
||||
node.get_attribute("href")
|
||||
.expect("already checked for href")
|
||||
} else {
|
||||
content
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue