diff --git a/resources/tests/readability/002/expected.html b/resources/tests/readability/002/expected.html index e8b4241..6007921 100644 --- a/resources/tests/readability/002/expected.html +++ b/resources/tests/readability/002/expected.html @@ -33,13 +33,13 @@ console.log("Looks like the response wasn't perfect, got status", res.status);}},function(e){ console.log("Fetch failed!", e);});

Submitting some parameters, it would look like this:

-
fetch("http://www.example.org/submit.php",{
+
fetch("http://www.example.org/submit.php",{
   method:"POST",
   headers:{"Content-Type":"application/x-www-form-urlencoded"},
   body:"firstName=Nikhil&favColor=blue&password=easytoguess"}).then(function(res){if(res.ok){
     alert("Perfect! Your settings are saved.");}else if(res.status==401){
     alert("Oops! You are not authorized.");}},function(e){
-  alert("Error submitting form!");});
+ alert("Error submitting form!");});

The fetch() function’s arguments are the same as those passed to the
Request() constructor, so you may directly pass arbitrarily @@ -53,16 +53,16 @@
supporting CORS rules and ensuring cookies aren’t readable by third parties.

The Headers interface is a simple multi-map of names to values:

-
var content ="Hello World";var reqHeaders =new Headers();
+
var content ="Hello World";var reqHeaders =new Headers();
 reqHeaders.append("Content-Type","text/plain");
 reqHeaders.append("Content-Length", content.length.toString());
-reqHeaders.append("X-Custom-Header","ProcessThisImmediately");
+reqHeaders.append("X-Custom-Header","ProcessThisImmediately");

The same can be achieved by passing an array of arrays or a JS object literal
to the constructor:

-
reqHeaders =new Headers({"Content-Type":"text/plain","Content-Length": content.length.toString(),"X-Custom-Header":"ProcessThisImmediately",});
+
reqHeaders =new Headers({"Content-Type":"text/plain","Content-Length": content.length.toString(),"X-Custom-Header":"ProcessThisImmediately",});

The contents can be queried and retrieved:

-
console.log(reqHeaders.has("Content-Type"));// true
+
console.log(reqHeaders.has("Content-Type"));// true
 console.log(reqHeaders.has("Set-Cookie"));// false
 reqHeaders.set("Content-Type","text/html");
 reqHeaders.append("X-Custom-Header","AnotherValue");
@@ -71,7 +71,7 @@ console.log(reqHeaders.get(<
 console.log(reqHeaders.getAll("X-Custom-Header"));// ["ProcessThisImmediately", "AnotherValue"]
  
 reqHeaders.delete("X-Custom-Header");
-console.log(reqHeaders.getAll("X-Custom-Header"));// []
+console.log(reqHeaders.getAll("X-Custom-Header"));// []

Some of these operations are only useful in ServiceWorkers, but they provide
a much nicer API to Headers.

Since Headers can be sent in requests, or received in responses, and have @@ -98,34 +98,34 @@ console.log(reqHeaders.getAll(All of the Headers methods throw TypeError if name is not a valid HTTP Header name. The mutation operations will throw TypeError if there is an immutable guard. Otherwise they fail silently. For example:

-
var res = Response.error();try{
+
var res = Response.error();try{
   res.headers.set("Origin","http://mybank.com");}catch(e){
-  console.log("Cannot pretend to be a bank!");}
+ console.log("Cannot pretend to be a bank!");}

Request

The Request interface defines a request to fetch a resource over HTTP. URL, method and headers are expected, but the Request also allows specifying a body, a request mode, credentials and cache hints.

The simplest Request is, of course, just a URL, as you may do to GET a resource.

-
var req =new Request("/index.html");
+
var req =new Request("/index.html");
 console.log(req.method);// "GET"
-console.log(req.url);// "http://example.com/index.html"
+console.log(req.url);// "http://example.com/index.html"

You may also pass a Request to the Request() constructor to create a copy.
(This is not the same as calling the clone() method, which is covered in
the “Reading bodies” section.)

-
var copy =new Request(req);
+
var copy =new Request(req);
 console.log(copy.method);// "GET"
-console.log(copy.url);// "http://example.com/index.html"
+console.log(copy.url);// "http://example.com/index.html"

Again, this form is probably only useful in ServiceWorkers.

The non-URL attributes of the Request can only be set by passing initial
values as a second argument to the constructor. This argument is a dictionary.

-
var uploadReq =new Request("/uploadImage",{
+
var uploadReq =new Request("/uploadImage",{
   method:"POST",
   headers:{"Content-Type":"image/png",},
-  body:"image data"});
+ body:"image data"});

The Request’s mode is used to determine if cross-origin requests lead to valid responses, and which properties on the response are readable. Legal mode values are "same-origin", "no-cors" (default) @@ -134,10 +134,10 @@ console.log(copy.url); origin with this mode set, the result is simply an error. You could use this to ensure that
a request is always being made to your origin.

-
var arbitraryUrl = document.getElementById("url-input").value;
+
var arbitraryUrl = document.getElementById("url-input").value;
 fetch(arbitraryUrl,{ mode:"same-origin"}).then(function(res){
   console.log("Response succeeded?", res.ok);},function(e){
-  console.log("Please enter a same-origin URL!");});
+ console.log("Please enter a same-origin URL!");});

The "no-cors" mode captures what the web platform does by default for scripts you import from CDNs, images hosted on other domains, and so on. First, it prevents the method from being anything other than “HEAD”, @@ -155,7 +155,7 @@ fetch(arbitraryUrl,{ mode:most interesting photos today like this:

-
var u =new URLSearchParams();
+
var u =new URLSearchParams();
 u.append('method','flickr.interestingness.getList');
 u.append('api_key','<insert api key here>');
 u.append('format','json');
@@ -163,11 +163,11 @@ u.append('nojsoncallback',then(function(response){return response.json().then(function(json){// photo is a list of photos.return json.photos.photo;});}).then(function(photos){
   photos.forEach(function(photo){
-    console.log(photo.title);});});
+ console.log(photo.title);});});

You may not read out the “Date” header since Flickr does not allow it via
Access-Control-Expose-Headers.

-
response.headers.get("Date");// null
+
response.headers.get("Date");// null

The credentials enumeration determines if cookies for the other domain are
sent to cross-origin requests. This is similar to XHR’s withCredentials
flag, but tri-valued as "omit" (default), "same-origin" and "include".

@@ -222,9 +222,9 @@ apiCall.then(function(respon The
idiomatic way to return a Response to an intercepted request in ServiceWorkers is:

-
addEventListener('fetch',function(event){
+
addEventListener('fetch',function(event){
   event.respondWith(new Response("Response body",{
-    headers:{"Content-Type":"text/plain"}});});
+ headers:{"Content-Type":"text/plain"}});});

As you can see, Response has a two-argument constructor, where both arguments are optional. The first argument is a body initializer, and the second is a dictionary to set the status, statusText and headers.

@@ -266,13 +266,13 @@ apiCall.then(function(respon

This is a significant improvement over XHR in terms of ease of use of non-text data!

Request bodies can be set by passing body parameters:

-
var form =new FormData(document.getElementById('login-form'));
+
var form =new FormData(document.getElementById('login-form'));
 fetch("/login",{
   method:"POST",
   body: form
-})
+})

Responses take the first argument as the body.

-
var res =new Response(new File(["chunk","chunk"],"archive.zip",{ type:"application/zip"}));
+
var res =new Response(new File(["chunk","chunk"],"archive.zip",{ type:"application/zip"}));

Both Request and Response (and by extension the fetch() function), will try to intelligently determine the content type. Request will also automatically set a “Content-Type” header if none is @@ -281,14 +281,14 @@ fetch("/login",{

It is important to realise that Request and Response bodies can only be read once! Both interfaces have a boolean attribute bodyUsed to determine if it is safe to read or not.

-
var res =new Response("one time use");
+
var res =new Response("one time use");
 console.log(res.bodyUsed);// false
 res.text().then(function(v){
   console.log(res.bodyUsed);// true
 });
 console.log(res.bodyUsed);// true
  
 res.text().catch(function(e){
-  console.log("Tried to read already consumed Response");});
+ console.log("Tried to read already consumed Response");});

This decision allows easing the transition to an eventual stream-based Fetch API. The intention is to let applications consume data as it arrives, allowing for JavaScript to deal with larger files like videos, and perform things @@ -302,7 +302,7 @@ res.text().catch(clone() MUST be called before the body of the corresponding object has been used. That is, clone() first, read later.

-
addEventListener('fetch',function(evt){var sheep =new Response("Dolly");
+
addEventListener('fetch',function(evt){var sheep =new Response("Dolly");
   console.log(sheep.bodyUsed);// false
   var clone = sheep.clone();
   console.log(clone.bodyUsed);// false
  
@@ -310,7 +310,7 @@ res.text().catch(log(sheep.bodyUsed);// false
   console.log(clone.bodyUsed);// true
  
-  evt.respondWith(cache.add(sheep.clone()).then(function(e){return sheep;});});
+ evt.respondWith(cache.add(sheep.clone()).then(function(e){return sheep;});});

Future improvements

Along with the transition to streams, Fetch will eventually have the ability to abort running fetch()es and some way to report the progress diff --git a/resources/tests/readability/buzzfeed-1/expected.html b/resources/tests/readability/buzzfeed-1/expected.html index dbbfa3b..19e4069 100644 --- a/resources/tests/readability/buzzfeed-1/expected.html +++ b/resources/tests/readability/buzzfeed-1/expected.html @@ -16,13 +16,9 @@

Facebook

-
-

-

-
+

Facebook

-

West Mercia police said the tablets were believed to contain dinitrophenol, known as DNP, which is a highly toxic industrial chemical.

diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index eb3d893..c146463 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -856,8 +856,8 @@ impl FullTextParser { Self::clean_attributes(&mut root)?; Self::simplify_nested_elements(&mut root)?; - Self::remove_extra_p_and_div(&mut root); Self::remove_single_cell_tables(&mut root); + Self::remove_extra_p_and_div(&mut root); } Ok(()) @@ -887,7 +887,7 @@ impl FullTextParser { cell.set_name(if all_phrasing_content { "P" } else { "DIV" }) .unwrap(); if let Some(mut parent) = node.get_parent() { - node_iter = Util::next_node(&node, false); + node_iter = Util::next_node(&node, true); parent.replace_child_node(cell, node.clone()).unwrap(); continue; } @@ -914,7 +914,7 @@ impl FullTextParser { let total_count = img_count + embed_count + object_count + iframe_count; if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() { - node_iter = Util::next_node(&node, false); + node_iter = Util::next_node(&node, true); node.unlink(); continue; } diff --git a/src/util.rs b/src/util.rs index 8c69339..8f44e8a 100644 --- a/src/util.rs +++ b/src/util.rs @@ -289,12 +289,7 @@ impl Util { // (because this is depth-first traversal, we will have already // seen the parent nodes themselves). loop { - let parent = node.get_parent(); - if parent.is_none() { - break; - } - - if let Some(parent) = parent { + if let Some(parent) = node.get_parent() { let parent_name = parent.get_name().to_uppercase(); if parent_name == "HTML" { break; @@ -306,6 +301,8 @@ impl Util { } else { node = parent; } + } else { + break; } }