From 848291e4f3ebbf0fd6f725cd6889c3846899ffac Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert Submitting some parameters, it would look like this: The The Headers interface is
a simple multi-map of names to values: The same can be achieved by passing an array of arrays or a JS object
literal
The contents can be queried and retrieved: Some of these operations are only useful in ServiceWorkers, but they provide
Since Headers can be sent in requests, or received in responses, and have
@@ -98,34 +98,34 @@ console.log(reqHeaders.getAll(All of the Headers methods throw TypeError if The Request interface defines a request to fetch a resource over HTTP.
URL, method and headers are expected, but the Request also allows specifying
a body, a request mode, credentials and cache hints. The simplest Request is of course, just a URL, as you may do to GET a
resource. You may also pass a Request to the Again, this form is probably only useful in ServiceWorkers. The non-URL attributes of the The Request’s mode is used to determine if cross-origin requests lead
to valid responses, and which properties on the response are readable.
Legal mode values are The You may not read out the “Date” header since Flickr does not allow it
via
The fetch("http://www.example.org/submit.php",{
+
fetch("http://www.example.org/submit.php",{
method:"POST",
headers:{"Content-Type":"application/x-www-form-urlencoded"},
body:"firstName=Nikhil&favColor=blue&password=easytoguess"}).then(function(res){if(res.ok){
alert("Perfect! Your settings are saved.");}elseif(res.status==401){
alert("Oops! You are not authorized.");}},function(e){
- alert("Error submitting form!");});
fetch()
function’s arguments are the same as those passed
to the
Request()
constructor, so you may directly pass arbitrarily
@@ -53,16 +53,16 @@
supporting CORS rules and ensuring cookies aren’t readable by third parties.var content ="Hello World";var reqHeaders =new Headers();
+
var content ="Hello World";var reqHeaders =new Headers();
reqHeaders.append("Content-Type","text/plain"
reqHeaders.append("Content-Length", content.length.toString());
-reqHeaders.append("X-Custom-Header","ProcessThisImmediately");
to the constructor:reqHeaders =new Headers({"Content-Type":"text/plain","Content-Length": content.length.toString(),"X-Custom-Header":"ProcessThisImmediately",});
reqHeaders =new Headers({"Content-Type":"text/plain","Content-Length": content.length.toString(),"X-Custom-Header":"ProcessThisImmediately",});
console.log(reqHeaders.has("Content-Type"));// true
+
console.log(reqHeaders.has("Content-Type"));// true
console.log(reqHeaders.has("Set-Cookie"));// false
reqHeaders.set("Content-Type","text/html");
reqHeaders.append("X-Custom-Header","AnotherValue");
@@ -71,7 +71,7 @@ console.log(reqHeaders.get(<
console.log(reqHeaders.getAll("X-Custom-Header"));// ["ProcessThisImmediately", "AnotherValue"]
reqHeaders.delete("X-Custom-Header");
-console.log(reqHeaders.getAll("X-Custom-Header"));// []
a much nicer API to Headers.name
is not a
valid HTTP Header name. The mutation operations will throw TypeError
if there is an immutable guard. Otherwise they fail silently. For example:var res = Response.error();try{
+
var res = Response.error();try{
res.headers.set("Origin","http://mybank.com");}catch(e){
- console.log("Cannot pretend to be a bank!");}
Request
var req =new Request("/index.html");
+
var req =new Request("/index.html");
console.log(req.method);// "GET"
-console.log(req.url);// "http://example.com/index.html"
Request()
constructor to
create a copy.
(This is not the same as calling the clone()
method, which
is covered in
the “Reading bodies” section.).var copy =new Request(req);
+
var copy =new Request(req);
console.log(copy.method);// "GET"
-console.log(copy.url);// "http://example.com/index.html"
Request
can only be set by passing
initial
values as a second argument to the constructor. This argument is a dictionary.var uploadReq =new Request("/uploadImage",{
+
var uploadReq =new Request("/uploadImage",{
method:"POST",
headers:{"Content-Type":"image/png",},
- body:"image data"});
"same-origin"
, "no-cors"
(default)
@@ -134,10 +134,10 @@ console.log(copy.url);
origin with this mode set, the result is simply an error. You could use
this to ensure that
a request is always being made to your origin.var arbitraryUrl = document.getElementById("url-input").value;
+
var arbitraryUrl = document.getElementById("url-input").value;
fetch(arbitraryUrl,{ mode:"same-origin"}).then(function(res){
console.log("Response succeeded?", res.ok);},function(e){
- console.log("Please enter a same-origin URL!");});
"no-cors"
mode captures what the web platform does by default
for scripts you import from CDNs, images hosted on other domains, and so
on. First, it prevents the method from being anything other than “HEAD”,
@@ -155,7 +155,7 @@ fetch(arbitraryUrl,{ mode:most interesting photos
today like this:var u =new URLSearchParams();
+
var u =new URLSearchParams();
u.append('method','flickr.interestingness.getList');
u.append('api_key','<insert api key here>');
u.append('format','json');
@@ -163,11 +163,11 @@ u.append('nojsoncallback',
then(function(response){return response.json().then(function(json){// photo is a list of photos.return json.photos.photo;});}).then(function(photos){
photos.forEach(function(photo){
- console.log(photo.title);});});Access-Control-Expose-Headers
.response.headers.get("Date");// null
response.headers.get("Date");// null
credentials
enumeration determines if cookies for the other
domain are
sent to cross-origin requests. This is similar to XHR’s withCredentials
flag, but tri-valued as "omit"
(default), "same-origin"
and "include"
.
idiomatic way to return a Response to an intercepted request in ServiceWorkers
is:
addEventListener('fetch',function(event){ + |
As you can see, Response has a two argument constructor, where both arguments
are optional. The first argument is a body initializer, and the second
is a dictionary to set the status
, statusText
and headers
.
This is a significant improvement over XHR in terms of ease of use of non-text data!
Request bodies can be set by passing body
parameters:
var form =new FormData(document.getElementById('login-form')); + |
Responses take the first argument as the body.
-var res =new Response(new File(["chunk","chunk"],"archive.zip",{ type:"application/zip"})); |
var res =new Response(new File(["chunk","chunk"],"archive.zip",{ type:"application/zip"}));
Both Request and Response (and by extension the fetch()
function),
will try to intelligently determine the content type.
Request will also automatically set a “Content-Type” header if none is
@@ -281,14 +281,14 @@ fetch("/login",{
It is important to realise that Request and Response bodies can only be
read once! Both interfaces have a boolean attribute bodyUsed
to
determine if it is safe to read or not.
var res =new Response("one time use"); + |
This decision allows easing the transition to an eventual stream-based Fetch
API. The intention is to let applications consume data as it arrives, allowing
for JavaScript to deal with larger files like videos, and perform things
@@ -302,7 +302,7 @@ res.text().catch(clone() MUST
be called before the body of the corresponding object has been used. That
is, clone()
first, read later.
addEventListener('fetch',function(evt){var sheep =new Response("Dolly"); + |
Along with the transition to streams, Fetch will eventually have the ability
to abort running fetch()
es and some way to report the progress
diff --git a/resources/tests/readability/buzzfeed-1/expected.html b/resources/tests/readability/buzzfeed-1/expected.html
index dbbfa3b..19e4069 100644
--- a/resources/tests/readability/buzzfeed-1/expected.html
+++ b/resources/tests/readability/buzzfeed-1/expected.html
@@ -16,13 +16,9 @@
West Mercia police said the tablets were believed to contain dinitrophenol, known as DNP, which is a highly toxic industrial chemical.
diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index eb3d893..c146463 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -856,8 +856,8 @@ impl FullTextParser { Self::clean_attributes(&mut root)?; Self::simplify_nested_elements(&mut root)?; - Self::remove_extra_p_and_div(&mut root); Self::remove_single_cell_tables(&mut root); + Self::remove_extra_p_and_div(&mut root); } Ok(()) @@ -887,7 +887,7 @@ impl FullTextParser { cell.set_name(if all_phrasing_content { "P" } else { "DIV" }) .unwrap(); if let Some(mut parent) = node.get_parent() { - node_iter = Util::next_node(&node, false); + node_iter = Util::next_node(&node, true); parent.replace_child_node(cell, node.clone()).unwrap(); continue; } @@ -914,7 +914,7 @@ impl FullTextParser { let total_count = img_count + embed_count + object_count + iframe_count; if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() { - node_iter = Util::next_node(&node, false); + node_iter = Util::next_node(&node, true); node.unlink(); continue; } diff --git a/src/util.rs b/src/util.rs index 8c69339..8f44e8a 100644 --- a/src/util.rs +++ b/src/util.rs @@ -289,12 +289,7 @@ impl Util { // (because this is depth-first traversal, we will have already // seen the parent nodes themselves). loop { - let parent = node.get_parent(); - if parent.is_none() { - break; - } - - if let Some(parent) = parent { + if let Some(parent) = node.get_parent() { let parent_name = parent.get_name().to_uppercase(); if parent_name == "HTML" { break; @@ -306,6 +301,8 @@ impl Util { } else { node = parent; } + } else { + break; } }