From 027fab7602466f044de581e607c5230d8541292d Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Thu, 30 Mar 2023 21:27:35 +0200 Subject: [PATCH] fix url completion for hash urls --- .../expected.html | 2 +- .../tests/readability/engadget/expected.html | 4 +- .../google-sre-book-1/expected.html | 12 +- .../readability/guardian-1/expected.html | 26 +- .../tests/readability/ietf-1/expected.html | 242 +++--- .../tests/readability/mercurial/expected.html | 738 ++++++++++++++++++ src/constants.rs | 2 +- src/full_text_parser/mod.rs | 29 +- src/full_text_parser/readability/mod.rs | 2 - src/full_text_parser/readability/tests.rs | 8 +- 10 files changed, 895 insertions(+), 170 deletions(-) create mode 100644 resources/tests/readability/mercurial/expected.html diff --git a/resources/tests/readability/base-url-base-element-relative/expected.html b/resources/tests/readability/base-url-base-element-relative/expected.html index de59b81..b8214f2 100644 --- a/resources/tests/readability/base-url-base-element-relative/expected.html +++ b/resources/tests/readability/base-url-base-element-relative/expected.html @@ -12,7 +12,7 @@

link

link

link

-

link

+

link

link

link

link

diff --git a/resources/tests/readability/engadget/expected.html b/resources/tests/readability/engadget/expected.html index b38701e..628a9b0 100644 --- a/resources/tests/readability/engadget/expected.html +++ b/resources/tests/readability/engadget/expected.html @@ -19,7 +19,7 @@

Gallery: Xbox One X | 14 Photos

-

+

@@ -193,7 +193,7 @@

Gallery: Xbox One X screenshots | 9 Photos

-

+

diff --git a/resources/tests/readability/google-sre-book-1/expected.html b/resources/tests/readability/google-sre-book-1/expected.html index 46d0359..2b9ce5b 100644 --- a/resources/tests/readability/google-sre-book-1/expected.html +++ b/resources/tests/readability/google-sre-book-1/expected.html @@ -41,7 +41,7 @@

- A notification intended to be read by a human and that is pushed to a system such as a bug or ticket queue, an email alias, or a pager. Respectively, these alerts are classified as tickets, email alerts,22 and pages. + A notification intended to be read by a human and that is pushed to a system such as a bug or ticket queue, an email alias, or a pager. Respectively, these alerts are classified as tickets, email alerts,22 and pages.

@@ -101,7 +101,7 @@

- Dashboards should answer basic questions about your service, and normally include some form of the four golden signals (discussed in The Four Golden Signals). + Dashboards should answer basic questions about your service, and normally include some form of the four golden signals (discussed in The Four Golden Signals).

@@ -149,7 +149,7 @@ Your monitoring system should address two questions: what’s broken, and why?

- The "what’s broken" indicates the symptom; the "why" indicates a (possibly intermediate) cause. Table 6-1 lists some hypothetical symptoms and corresponding causes. + The "what’s broken" indicates the symptom; the "why" indicates a (possibly intermediate) cause. Table 6-1 lists some hypothetical symptoms and corresponding causes.

@@ -285,7 +285,7 @@ Worrying About Your Tail (or, Instrumentation and Performance)

- When building a monitoring system from scratch, it’s tempting to design a system based upon the mean of some quantity: the mean latency, the mean CPU usage of your nodes, or the mean fullness of your databases. The danger presented by the latter two cases is obvious: CPUs and databases can easily be utilized in a very imbalanced way. The same holds for latency. If you run a web service with an average latency of 100 ms at 1,000 requests per second, 1% of requests might easily take 5 seconds.23 If your users depend on several such web services to render their page, the 99th percentile of one backend can easily become the median response of your frontend. + When building a monitoring system from scratch, it’s tempting to design a system based upon the mean of some quantity: the mean latency, the mean CPU usage of your nodes, or the mean fullness of your databases. The danger presented by the latter two cases is obvious: CPUs and databases can easily be utilized in a very imbalanced way. The same holds for latency. If you run a web service with an average latency of 100 ms at 1,000 requests per second, 1% of requests might easily take 5 seconds.23 If your users depend on several such web services to render their page, the 99th percentile of one backend can easily become the median response of your frontend.

The simplest way to differentiate between a slow average and a very slow "tail" of requests is to collect request counts bucketed by latencies (suitable for rendering a histogram), rather than actual latencies: how many requests did I serve that took between 0 ms and 10 ms, between 10 ms and 30 ms, between 30 ms and 100 ms, between 100 ms and 300 ms, and so on? Distributing the histogram boundaries approximately exponentially (in this case by factors of roughly 3) is often an easy way to visualize the distribution of your requests. @@ -362,10 +362,10 @@ The principles discussed in this chapter can be tied together into a philosophy on monitoring and alerting that’s widely endorsed and followed within Google SRE teams. While this monitoring philosophy is a bit aspirational, it’s a good starting point for writing or reviewing a new alert, and it can help your organization ask the right questions, regardless of the size of your organization or the complexity of your service or system.

- When creating rules for monitoring and alerting, asking the following questions can help you avoid false positives and pager burnout:24 + When creating rules for monitoring and alerting, asking the following questions can help you avoid false positives and pager burnout:24

    -
  • Does this rule detect an otherwise undetected condition that is urgent, actionable, and actively or imminently user-visible?25 +
  • Does this rule detect an otherwise undetected condition that is urgent, actionable, and actively or imminently user-visible?25
  • Will I ever be able to ignore this alert, knowing it’s benign? When and why will I be able to ignore this alert, and how can I avoid this scenario?
  • diff --git a/resources/tests/readability/guardian-1/expected.html b/resources/tests/readability/guardian-1/expected.html index 21c388e..0d4a648 100644 --- a/resources/tests/readability/guardian-1/expected.html +++ b/resources/tests/readability/guardian-1/expected.html @@ -12,7 +12,7 @@ - +
    @@ -38,7 +38,7 @@ - +
    @@ -64,7 +64,7 @@ - +
    @@ -77,7 +77,7 @@ - +
    @@ -90,7 +90,7 @@ - +
    @@ -137,7 +137,7 @@ - +
    @@ -163,7 +163,7 @@ - +
    @@ -189,7 +189,7 @@ - +
    @@ -218,7 +218,7 @@ - +
    @@ -231,7 +231,7 @@ - +
    @@ -253,7 +253,7 @@ - +
    @@ -282,7 +282,7 @@ - +
    @@ -302,7 +302,7 @@ - +
    diff --git a/resources/tests/readability/ietf-1/expected.html b/resources/tests/readability/ietf-1/expected.html index 6059c46..5c9f350 100644 --- a/resources/tests/readability/ietf-1/expected.html +++ b/resources/tests/readability/ietf-1/expected.html @@ -51,50 +51,50 @@ Copyright Notice publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must - include Simplified BSD License text as described in Section 4.e of + include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as described in the Simplified BSD License. de Jong [Page 1] -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
     Table of Contents
     
    -   1. Introduction...................................................2
    -   2. Terminology....................................................3
    -   3. Storage model..................................................3
    -   4. Requests.......................................................4
    -   5. Response codes.................................................7
    -   6. Versioning.....................................................7
    -   7. CORS headers...................................................8
    -   8. Session description............................................8
    -   9. Bearer tokens and access control...............................9
    -  10. Application-first bearer token issuance.......................10
    -  11. Storage-first bearer token issuance...........................11
    -  12. Example wire transcripts......................................12
    -     12.1. WebFinger................................................12
    -     12.2. OAuth dialog form........................................13
    -     12.3. OAuth dialog form submission.............................14
    -     12.4. OPTIONS preflight........................................15
    -     12.5. Initial PUT..............................................15
    -     12.6. Subsequent PUT...........................................16
    -     12.7. GET......................................................16
    -     12.8. DELETE...................................................17
    -  13. Distributed versioning........................................17
    -  14. Security Considerations.......................................19
    -  15. IANA Considerations...........................................20
    -  16. Acknowledgments...............................................20
    -  17. References....................................................21
    -     17.1. Normative References.....................................21
    -     17.2. Informative References...................................21
    -  18. Authors' addresses............................................22
    +   1. Introduction...................................................2
    +   2. Terminology....................................................3
    +   3. Storage model..................................................3
    +   4. Requests.......................................................4
    +   5. Response codes.................................................7
    +   6. Versioning.....................................................7
    +   7. CORS headers...................................................8
    +   8. Session description............................................8
    +   9. Bearer tokens and access control...............................9
    +  10. Application-first bearer token issuance.......................10
    +  11. Storage-first bearer token issuance...........................11
    +  12. Example wire transcripts......................................12
    +     12.1. WebFinger................................................12
    +     12.2. OAuth dialog form........................................13
    +     12.3. OAuth dialog form submission.............................14
    +     12.4. OPTIONS preflight........................................15
    +     12.5. Initial PUT..............................................15
    +     12.6. Subsequent PUT...........................................16
    +     12.7. GET......................................................16
    +     12.8. DELETE...................................................17
    +  13. Distributed versioning........................................17
    +  14. Security Considerations.......................................19
    +  15. IANA Considerations...........................................20
    +  16. Acknowledgments...............................................20
    +  17. References....................................................21
    +     17.1. Normative References.....................................21
    +     17.2. Informative References...................................21
    +  18. Authors' addresses............................................22
     
     
    -1.  Introduction
    +1.  Introduction
     
         Many services for data storage are available over the internet. This
         specification describes a vendor-independent interface for such
    @@ -109,7 +109,7 @@ Table of Contents
     
     de Jong                                                         [Page 2]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -128,11 +128,11 @@ Table of Contents
         The exact details of these four actions are described in this
         specification.
     
    -2. Terminology
    +2. Terminology
     
         The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
         "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
    -    document are to be interpreted as described in RFC 2119 [WORDS].
    +    document are to be interpreted as described in RFC 2119 [WORDS].
     
         "SHOULD" and "SHOULD NOT" are appropriate when valid exceptions to a
         general requirement are known to exist or appear to exist, and it is
    @@ -141,7 +141,7 @@ Table of Contents
         implement the general requirement when such failure would result in
         interoperability failure.
     
    -3. Storage model
    +3. Storage model
     
         The server stores data in nodes that form a tree structure.
         Internal nodes are called 'folders' and leaf nodes are called
    @@ -160,7 +160,7 @@ Table of Contents
     
     de Jong                                                         [Page 3]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -170,12 +170,12 @@ Table of Contents
            * content length
            * content
     
    -4. Requests
    +4. Requests
     
    -    Client-to-server requests SHOULD be made over https [HTTPS], and
    -    servers MUST comply with HTTP/1.1 [HTTP]. Specifically, they
    +    Client-to-server requests SHOULD be made over https [HTTPS], and
    +    servers MUST comply with HTTP/1.1 [HTTP]. Specifically, they
         MUST support chunked transfer coding on PUT requests. Servers MAY
    -    also offer an optional switch from https to SPDY [SPDY].
    +    also offer an optional switch from https to SPDY [SPDY].
     
         A request is considered successful if the HTTP response code is in
         the 2xx range (e.g. 200 OK, 201 Created), and unsuccessful if an
    @@ -211,14 +211,14 @@ Table of Contents
     
     de Jong                                                         [Page 4]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
         field, representing the folder's current version.
     
         A successful GET request to a folder MUST be responded to with a
    -    JSON-LD [JSON-LD] document (content type 'application/ld+json'),
    +    JSON-LD [JSON-LD] document (content type 'application/ld+json'),
         containing as its 'items' field a map in which contained documents
         appear as entries <item_name> to a document description, and
         contained non-empty folders appear as entries <item_name> '/' to a
    @@ -262,7 +262,7 @@ Table of Contents
     
     de Jong                                                         [Page 5]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -273,9 +273,9 @@ Table of Contents
         gzipped when requested by the client, since the two bodies would not
         be identical byte-for-byte.
     
    -    Servers MAY support Content-Range headers [RANGE] on GET requests,
    +    Servers MAY support Content-Range headers [RANGE] on GET requests,
         but whether or not they do SHOULD be announced through the <ranges>
    -    variable mentioned below in section 10.
    +    variable mentioned below in section 10.
     
         A successful PUT request to a document MUST result in:
     
    @@ -288,7 +288,7 @@ Table of Contents
              document's new content type,
            * its version being updated, as well as that of its parent folder
              and further ancestor folders, using a strong validator [HTTP,
    -         section 7.2].
    +         section 7.2].
     
         The response MUST contain a strong ETag header, with the document's
         new version (for instance a hash of its contents) as its value.
    @@ -313,14 +313,14 @@ Table of Contents
     
     de Jong                                                         [Page 6]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    -5. Response codes
    +5. Response codes
     
    -    Response codes SHOULD be given as defined by [HTTP, section 6] and
    -    [BEARER, section 3.1]. The following is a non-normative checklist
    +    Response codes SHOULD be given as defined by [HTTP, section 6] and
    +    [BEARER, section 3.1]. The following is a non-normative checklist
         of status codes that are likely to occur in practice:
     
            * 500 if an internal server error occurs,
    @@ -350,13 +350,13 @@ Table of Contents
         Clients SHOULD also handle the case where a response takes too long
         to arrive, or where no response is received at all.
     
    -6. Versioning
    +6. Versioning
     
    -    All successful requests MUST return an 'ETag' header [HTTP] with, in
    +    All successful requests MUST return an 'ETag' header [HTTP] with, in
         the case of GET, the current version, in the case of PUT, the new
         version, and in case of DELETE, the version that was deleted. All
         successful GET requests MUST return an 'Expires: 0' header. PUT and
    -    DELETE requests MAY have an 'If-Match' request header [COND], and
    +    DELETE requests MAY have an 'If-Match' request header [COND], and
         MUST fail with a 412 response code if that doesn't match the
         document's current version.
     
    @@ -364,14 +364,14 @@ Table of Contents
     
     de Jong                                                         [Page 7]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
         GET requests MAY have a comma-separated list of revisions in an
    -    'If-None-Match' header [COND], and SHOULD be responded to with a 304
    +    'If-None-Match' header [COND], and SHOULD be responded to with a 304
         response if that list includes the document or folder's current
    -    version. A PUT request MAY have an 'If-None-Match: *' header [COND],
    +    version. A PUT request MAY have an 'If-None-Match: *' header [COND],
         in which case it MUST fail with a 412 response code if the document
         already exists.
     
    @@ -381,14 +381,14 @@ Table of Contents
         A provider MAY offer version rollback functionality to its users,
         but this specification does not define the user interface for that.
     
    -7. CORS headers
    +7. CORS headers
     
    -    All responses MUST carry CORS headers [CORS]. The server MUST also
    +    All responses MUST carry CORS headers [CORS]. The server MUST also
         reply to OPTIONS requests as per CORS. For GET requests, a wildcard
         origin MAY be returned, but for PUT and DELETE requests, the
         response MUST echo back the Origin header sent by the client.
     
    -8. Session description
    +8. Session description
     
         The information that a client needs to receive in order to be able
         to connect to a server SHOULD reach the client as described in the
    @@ -396,12 +396,12 @@ Table of Contents
     
            * <storage_root>, consisting of 'https://' followed by a server
              host, and optionally a server port and a path prefix as per
    -         [IRI]. Examples:
    +         [IRI]. Examples:
              * 'https://example.com' (host only)
              * 'https://example.com:8080' (host and port)
              * 'https://example.com/path/to/storage' (host, port and
                path prefix; note there is no trailing slash)
    -       * <access_token> as per [OAUTH]. The token SHOULD be hard to
    +       * <access_token> as per [OAUTH]. The token SHOULD be hard to
              guess and SHOULD NOT be reused from one client to another. It
              can however be reused in subsequent interactions with the same
              client, as long as that client is still trusted. Example:
    @@ -415,7 +415,7 @@ Table of Contents
     
     de Jong                                                         [Page 8]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -430,7 +430,7 @@ Table of Contents
         * https://storage.example.com/bob/public/documents/
         * https://storage.example.com/bob/public/documents/draft.txt
     
    -9. Bearer tokens and access control
    +9. Bearer tokens and access control
     
         A bearer token represents one or more access scopes. These access
         scopes are represented as strings of the form <module> <level>,
    @@ -452,7 +452,7 @@ Table of Contents
         As a special exceptions, GET requests to a document (but not a
         folder) whose path starts with '/public/' are always allowed. They,
         as well as OPTIONS requests, can be made without a bearer token.
    -    Unless [KERBEROS] is used (see section 10 below), all other requests
    +    Unless [KERBEROS] is used (see section 10 below), all other requests
         SHOULD present a bearer token with sufficient access scope, using a
         header of the following form (no double quotes here):
     
    @@ -461,21 +461,21 @@ Table of Contents
         In addition, providing the access token via a HTTP query parameter
         for GET requests MAY be supported by the server, although its use
         is not recommended, due to its security deficiencies; see [BEARER,
    -    section 2.3].
    +    section 2.3].
     
     
     de Jong                                                         [Page 9]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
     
    -10. Application-first bearer token issuance
    +10. Application-first bearer token issuance
     
         To make a remoteStorage server available as 'the remoteStorage of
         <account> at <host>', exactly one link of the following format
    -    SHOULD be added to the WebFinger record [WEBFINGER] of <account> at
    +    SHOULD be added to the WebFinger record [WEBFINGER] of <account> at
         <host>:
     
         {
    @@ -490,7 +490,7 @@ Table of Contents
     
         Here <storage_root> and <storage_api> are as per "Session
         description" above, and <auth-dialog> SHOULD be either null or a
    -    URL where an OAuth 2.0 implicit-grant flow dialog [OAUTH] is
    +    URL where an OAuth 2.0 implicit-grant flow dialog [OAUTH] is
         presented.
     
         If <auth-dialog> is a URL, the user can supply their credentials
    @@ -502,7 +502,7 @@ Table of Contents
     
         If <auth-dialog> is null, the client will not have a way to obtain
         an access token, and SHOULD send all requests without Authorization
    -    header, and rely on Kerberos [KERBEROS] instead for requests that
    +    header, and rely on Kerberos [KERBEROS] instead for requests that
         would normally be sent with a bearer token, but servers SHOULD NOT
         impose any such access barriers for resources that would normally
         not require an access token.
    @@ -511,20 +511,20 @@ Table of Contents
         Non-breaking examples that have been proposed so far, include a
         "http://tools.ietf.org/html/rfc6750#section-2.3" property, set to
         the string value "true" if the server supports passing the bearer
    -    token in the URI query parameter as per section 2.3 of [BEARER],
    +    token in the URI query parameter as per section 2.3 of [BEARER],
         instead of in the request header.
     
     
     de Jong                                                        [Page 10]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
     
         Another example is "http://tools.ietf.org/html/rfc7233" with a
         string value of "GET" if Content-Range headers are supported for
    -    GET requests as per [RANGE], "PUT" if they are supported for PUT
    +    GET requests as per [RANGE], "PUT" if they are supported for PUT
         requests, and "GET,PUT" if supported for both.
     
         Both these proposals are non-breaking extensions, since the client
    @@ -535,7 +535,7 @@ Table of Contents
         A "http://remotestorage.io/spec/web-authoring" property has been
         proposed with a string value of the fully qualified domain name to
         which web authoring content is published if the server supports web
    -    authoring as per [AUTHORING]. Note that this extension is a breaking
    +    authoring as per [AUTHORING]. Note that this extension is a breaking
         extension in the sense that it divides users into "haves", whose
         remoteStorage accounts allow them to author web content, and
         "have-nots", whose remoteStorage account does not support this
    @@ -547,10 +547,10 @@ Table of Contents
         client_id parameter in favor of relying on the redirect_uri
         parameter for client identification.
     
    -11. Storage-first bearer token issuance
    +11. Storage-first bearer token issuance
     
         The provider MAY also present a dashboard to the user, where they
    -    have some way to add open web app manifests [MANIFEST]. Adding a
    +    have some way to add open web app manifests [MANIFEST]. Adding a
         manifest to the dashboard is considered equivalent to clicking
         'accept' in the dialog of the application-first flow. Removing one
         is considered equivalent to revoking its access token.
    @@ -559,7 +559,7 @@ Table of Contents
         field SHOULD be present in the root of such an application manifest
         document, with entries <module> -> '{"access": "readonly"}' for
         <level> 'r' or '{"access": "readwrite"}' for <level> 'rw', as
    -    prescribed in [DATASTORE].
    +    prescribed in [DATASTORE].
     
         When the user gestures they want to use a certain application whose
         manifest is present on the dashboard, the dashboard SHOULD redirect
    @@ -568,7 +568,7 @@ Table of Contents
     
     de Jong                                                        [Page 11]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -606,12 +606,12 @@ Table of Contents
         debug tool, thus bypassing the need for an OAuth dance. Clients
         SHOULD NOT rely on this in production.
     
    -12. Example wire transcripts
    +12. Example wire transcripts
     
         The following examples are not normative ("\" indicates a line was
         wrapped).
     
    -12.1. WebFinger
    +12.1. WebFinger
     
         In application-first, an in-browser application might issue the
         following request, using XMLHttpRequest and CORS:
    @@ -619,7 +619,7 @@ Table of Contents
     
     de Jong                                                        [Page 12]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -659,7 +659,7 @@ motestorage-04",
                }]
              }
     
    -12.2. OAuth dialog form
    +12.2. OAuth dialog form
     
         Once the in-browser application has discovered the server's OAuth
         end-point, it will typically redirect the user to this URL, in
    @@ -670,7 +670,7 @@ motestorage-04",
     
     de Jong                                                        [Page 13]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -690,7 +690,7 @@ unhosted.5apps.com&response_type=token HTTP/1.1
                 <title>Allow access?</title>
             ...
     
    -12.3. OAuth dialog form submission
    +12.3. OAuth dialog form submission
     
         When the user submits the form, the request would look something
         like this:
    @@ -715,13 +715,13 @@ low
             Location:https://drinks-unhosted.5apps.com/#access_token=j2YnGt\
     XjzzzHNjkd1CJxoQubA1o%3D&token_type=bearer&state=
     
    -12.4. OPTIONS preflight
    +12.4. OPTIONS preflight
     
     
     
     de Jong                                                        [Page 14]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -744,7 +744,7 @@ XjzzzHNjkd1CJxoQubA1o%3D&token_type=bearer&state=
             Access-Control-Allow-Headers: Authorization, Content-Length, Co\
     ntent-Type, Origin, X-Requested-With, If-Match, If-None-Match
     
    -12.5. Initial PUT
    +12.5. Initial PUT
     
         An initial PUT may contain an 'If-None-Match: *' header, like this:
     
    @@ -767,12 +767,12 @@ ntent-Type, Origin, X-Requested-With, If-Match, If-None-Match
             Access-Control-Allow-Origin: https://drinks-unhosted.5apps.com
             ETag: "1382694045000"
     
    -12.6. Subsequent PUT
    +12.6. Subsequent PUT
     
     
     de Jong                                                        [Page 15]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -798,7 +798,7 @@ e.io/spec/modules/myfavoritedrinks/drink"}
             Access-Control-Allow-Origin: https://drinks-unhosted.5apps.com
             ETag: "1382694048000"
     
    -12.7. GET
    +12.7. GET
     
         A GET request would also include the bearer token, and optionally
         an If-None-Match header:
    @@ -823,7 +823,7 @@ e.io/spec/modules/myfavoritedrinks/drink"}
     
     de Jong                                                        [Page 16]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -858,7 +858,7 @@ charset=UTF-8","Content-Length":106}}}
             HTTP/1.1 404 Not Found
             Access-Control-Allow-Origin: https://drinks-unhosted.5apps.com
     
    -12.8. DELETE
    +12.8. DELETE
     
         A DELETE request may look like this:
     
    @@ -874,7 +874,7 @@ charset=UTF-8","Content-Length":106}}}
     
     de Jong                                                        [Page 17]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -884,7 +884,7 @@ charset=UTF-8","Content-Length":106}}}
             Access-Control-Allow-Origin: https://drinks-unhosted.5apps.com
             ETag: "1382694048000"
     
    -13. Distributed versioning
    +13. Distributed versioning
     
         This section is non-normative, and is intended to explain some of
         the design choices concerning ETags and folder listings. At the
    @@ -925,7 +925,7 @@ charset=UTF-8","Content-Length":106}}}
     
     de Jong                                                        [Page 18]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -947,7 +947,7 @@ charset=UTF-8","Content-Length":106}}}
         but it is up to whichever client discovers a given version
         conflict, to resolve it.
     
    -14. Security Considerations
    +14. Security Considerations
     
         To prevent man-in-the-middle attacks, the use of https instead of
         http is important for both the interface itself and all end-points
    @@ -976,7 +976,7 @@ charset=UTF-8","Content-Length":106}}}
     
     de Jong                                                        [Page 19]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
    @@ -993,7 +993,7 @@ charset=UTF-8","Content-Length":106}}}
         The server SHOULD also detect and stop denial-of-service attacks
         that aim to overwhelm its interface with too much traffic.
     
    -15. IANA Considerations
    +15. IANA Considerations
     
         This document registers the 'remotestorage' link relation, as well
         as the following WebFinger properties:
    @@ -1003,7 +1003,7 @@ charset=UTF-8","Content-Length":106}}}
           * "http://tools.ietf.org/html/rfc7233"
           * "http://remotestorage.io/spec/web-authoring"
     
    -16. Acknowledgements
    +16. Acknowledgements
     
         The authors would like to thank everybody who contributed to the
         development of this protocol, including Kenny Bentley, Javier Diaz,
    @@ -1016,88 +1016,88 @@ charset=UTF-8","Content-Length":106}}}
         Rick van Rein, Mark Nottingham, Julian Reschke, and Markus
         Lanthaler, among many others.
     
    -17. References
    +17. References
     
    -17.1. Normative References
    +17.1. Normative References
     
    -    [WORDS]
    +    [WORDS]
             Bradner, S., "Key words for use in RFCs to Indicate Requirement
             Levels", BCP 14, RFC 2119, March 1997.
     
     
     de Jong                                                        [Page 20]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
     
    -    [IRI]
    +    [IRI]
             Duerst, M., "Internationalized Resource Identifiers (IRIs)",
             RFC 3987, January 2005.
     
    -    [WEBFINGER]
    +    [WEBFINGER]
             Jones, P., Salguerio, G., Jones, M, and Smarr, J.,
             "WebFinger", RFC7033, September 2013.
     
    -    [OAUTH]
    -        "Section 4.2: Implicit Grant", in: Hardt, D. (ed), "The OAuth
    +    [OAUTH]
    +        "Section 4.2: Implicit Grant", in: Hardt, D. (ed), "The OAuth
             2.0 Authorization Framework", RFC6749, October 2012.
     
    -17.2. Informative References
    +17.2. Informative References
     
    -    [HTTPS]
    +    [HTTPS]
             Rescorla, E., "HTTP Over TLS", RFC2818, May 2000.
     
    -    [HTTP]
    +    [HTTP]
             Fielding et al., "Hypertext Transfer Protocol (HTTP/1.1):
             Semantics and Content", RFC7231, June 2014.
     
    -    [COND]
    +    [COND]
             Fielding et al., "Hypertext Transfer Protocol (HTTP/1.1):
             Conditional Requests", RFC7232, June 2014.
     
    -    [RANGE]
    +    [RANGE]
             Fielding et al., "Hypertext Transfer Protocol (HTTP/1.1):
             Conditional Requests", RFC7233, June 2014.
     
    -    [SPDY]
    +    [SPDY]
             Mark Belshe, Roberto Peon, "SPDY Protocol - Draft 3.1", http://
             www.chromium.org/spdy/spdy-protocol/spdy-protocol-draft3-1,
             September 2013.
     
    -    [JSON-LD]
    +    [JSON-LD]
             M. Sporny, G. Kellogg, M. Lanthaler, "JSON-LD 1.0", W3C
             Proposed Recommendation,
             http://www.w3.org/TR/2014/REC-json-ld-20140116/, January 2014.
     
    -    [CORS]
    +    [CORS]
             van Kesteren, Anne (ed), "Cross-Origin Resource Sharing --
             W3C Candidate Recommendation 29 January 2013",
     
     
     de Jong                                                        [Page 21]
     
    -
     
    +
     
     Internet-Draft              remoteStorage                  December 2014
     
     
             http://www.w3.org/TR/cors/, January 2013.
     
    -    [MANIFEST]
    +    [MANIFEST]
             Mozilla Developer Network (ed), "App manifest -- Revision
             330541", https://developer.mozilla.org/en-
             US/Apps/Build/Manifest$revision/566677, April 2014.
     
    -    [DATASTORE]
    +    [DATASTORE]
             "WebAPI/DataStore", MozillaWiki, retrieved May 2014.
             https://wiki.mozilla.org/WebAPI/DataStore#Manifest
     
    -    [KERBEROS]
    +    [KERBEROS]
             C. Neuman et al., "The Kerberos Network Authentication Service
             (V5)", RFC4120, July 2005.
     
    -    [BEARER]
    +    [BEARER]
             M. Jones, D. Hardt, "The OAuth 2.0 Authorization Framework:
             Bearer Token Usage", RFC6750, October 2012.
     
    @@ -1106,7 +1106,7 @@ charset=UTF-8","Content-Length":106}}}
             September 2014. https://github.com/michielbdejong/resite/wiki
             /Using-remoteStorage-for-web-authoring
     
    -18. Authors' addresses
    +18. Authors' addresses
     
         Michiel B. de Jong
         IndieHosters
    diff --git a/resources/tests/readability/mercurial/expected.html b/resources/tests/readability/mercurial/expected.html
    new file mode 100644
    index 0000000..c670ccc
    --- /dev/null
    +++ b/resources/tests/readability/mercurial/expected.html
    @@ -0,0 +1,738 @@
    +
    + + +

    + Once you have mastered the art of mutable history in a single repository (see the user guide), you can move up to the next level: shared mutable history. evolve lets you push and pull draft changesets between repositories along with their obsolescence markers. This opens up a number of interesting possibilities. +

    +

    + The simplest scenario is a single developer working across two computers. Say you’re working on code that must be tested on a remote test server, probably in a rack somewhere, only accessible by SSH, and running an “enterprise-grade” (out-of-date) OS. But you probably prefer to write code locally: everything is setup the way you like it, and you can use your preferred editor, IDE, merge/diff tools, etc. +

    +

    + Traditionally, your options are limited: either +

    +
    +
    +
      +
    • (ab)use your source control system by committing half-working code in order to get it onto the remote test server, or +
    • +
    • go behind source control’s back by using rsync (or similar) to transfer your code back-and-forth until it is ready to commit +
    • +
    +
    +
    +

    + The former is less bad with distributed version control systems like Mercurial, but it’s still far from ideal. (One important version control “best practice” is that every commit should make things just a little bit better, i.e. you should never commit code that is worse than what came before.) The latter, avoiding version control entirely, means that you’re walking a tightrope without a safety net. One accidental rsync in the wrong direction could destroy hours of work. +

    +

    + Using Mercurial with evolve to share mutable history solves these problems. As with single-repository evolve, you can commit whenever the code is demonstrably better, even if all the tests aren’t passing yet—just hg amend when they are. And you can transfer those half-baked changesets between repositories to try things out on your test server before anything is carved in stone. +

    +

    + A less common scenario is multiple developers sharing mutable history, typically for code review. We’ll cover this scenario later. First, we will cover single-user sharing. +

    +
    +

    + Sharing with a single developer +

    +
    +

    + Publishing and non-publishing repositories +

    +

    + The key to shared mutable history is to keep your changesets in draft phase as you pass them around. Recall that by default, hg push promotes changesets from draft to public, and public changesets are immutable. You can change this behaviour by reconfiguring the remote repository so that it is non-publishing. (Short version: set phases.publish to false. Long version follows.) +

    +
    +
    +

    + Setting up +

    +

    + We’ll work through an example with three local repositories, although in the real world they’d most likely be on three different computers. First, the public repository is where tested, polished changesets live, and it is where you synchronize with the rest of your team. +

    + +

    + We’ll need two clones where work gets done, test-repo and dev-repo: +

    +
    +
    $ hg clone public test-repo
    +updating to branch default
    +0 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +$ hg clone test-repo dev-repo
    +updating to branch default
    +0 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +
    +
    +

    + dev-repo is your local machine, with GUI merge tools and IDEs and everything configured just the way you like it. test-repo is the test server in a rack somewhere behind SSH. So for the most part, we’ll develop in dev-repo, push to test-repo, test and polish there, and push to public. +

    +

    + The key to shared mutable history is to make the target repository, in this case test-repo, non-publishing. And, of course, we have to enable the evolve extension in both test-repo and dev-repo. +

    +

    + First, edit the configuration for test-repo: +

    +
    +
    $ hg -R test-repo config --edit --local
    +
    +
    +

    + and add +

    +
    +
    [phases]
    +publish = false
    +
    +[extensions]
    +evolve =
    +
    +
    +

    + Then edit the configuration for dev-repo: +

    +
    +
    $ hg -R dev-repo config --edit --local
    +
    +
    +

    + and add +

    + +

    + Keep in mind that in real life, these repositories would probably be on separate computers, so you’d have to login to each one to configure each repository. +

    +

    + To start things off, let’s make one public, immutable changeset: +

    +
    +
    $ cd test-repo
    +$ echo 'my new project' > file1
    +$ hg add file1
    +$ hg commit -m 'create new project'
    +$ hg push
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +
    +
    +

    + and pull that into the development repository: +

    +
    +
    $ cd ../dev-repo
    +$ hg pull -u
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +
    +
    +
    + +
    +

    + Example 2: Amend again, locally +

    +

    + This process can repeat. Perhaps you figure out a more elegant fix to the bug, and want to mutate history so nobody ever knows you had a less-than-perfect idea. We’ll implement it locally in dev-repo and push to test-repo: +

    +
    +
    $ echo 'Fix, fix, and fix.' > file1
    +$ hg amend
    +$ hg push
    +
    +
    +

    + This time around, the temporary amend commit is in dev-repo, and it is not transferred to test-repo—the same as before, just in the opposite direction. Figure 4 shows the two repositories after amending in dev-repo and pushing to test-repo. +

    +
    +

    + [figure SG04: each repo has one temporary amend commit, but they’re different in each one] +

    +
    +

    + Let’s hop over to test-repo to test the more elegant fix: +

    +
    +
    $ cd ../test-repo
    +$ hg update
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +
    +
    +

    + This time, all the tests pass, so no further amending is required. This bug fix is finished, so we push it to the public repository: +

    +
    +
    $ hg push
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +
    +
    +

    + Note that only one changeset—the final version, after two amendments—was actually pushed. Again, Mercurial doesn’t transfer hidden changesets on push and pull. +

    +

    + So the picture in public is much simpler than in either dev-repo or test-repo. Neither of our missteps nor our amendments are publicly visible, just the final, beautifully polished changeset: +

    +
    +

    + [figure SG05: public repo with rev 0:0dc9, 1:de61, both public] +

    +
    +

    + There is one important step left to do. Because we pushed from test-repo to public, the pushed changeset is in public phase in those two repositories. But dev-repo has been out-of-the-loop; changeset de61 is still draft there. If we’re not careful, we might mutate history in dev-repo, obsoleting a changeset that is already public. Let’s avoid that situation for now by pushing up to dev-repo: +

    +
    +
    $ hg push ../dev-repo
    +pushing to ../dev-repo
    +searching for changes
    +no changes found
    +
    +
    +

    + Even though no changesets were pushed, Mercurial still pushed obsolescence markers and phase changes to dev-repo. +

    +

    + A final note: since this fix is now public, it is immutable. It’s no longer possible to amend it: +

    +
    +
    $ hg amend -m 'fix bug 37'
    +abort: cannot amend public changesets
    +
    +
    +

    + This is, after all, the whole point of Mercurial’s phases: to prevent rewriting history that has already been published. +

    +
    +
    +
    +

    + Sharing with multiple developers: code review +

    +

    + Now that you know how to share your own mutable history across multiple computers, you might be wondering if it makes sense to share mutable history with others. It does, but you have to be careful, stay alert, and communicate with your peers. +

    +

    + Code review is a good use case for sharing mutable history across multiple developers: Alice commits a draft changeset, submits it for review, and amends her changeset until her reviewer is satisfied. Meanwhile, Bob is also committing draft changesets for review, amending until his reviewer is satisfied. Once a particular changeset passes review, the respective author (Alice or Bob) pushes it to the public (publishing) repository. +

    +

    + Incidentally, the reviewers here can be anyone: maybe Bob and Alice review each other’s work; maybe the same third party reviews both; or maybe they pick different experts to review their work on different parts of a large codebase. Similarly, it doesn’t matter if reviews are conducted in person, by email, or by carrier pigeon. Code review is outside of the scope of Mercurial, so all we’re looking at here is the mechanics of committing, amending, pushing, and pulling. +

    +
    +

    + Setting up +

    +

    + To demonstrate, let’s start with the public repository as we left it in the last example, with two immutable changesets (figure 5 above). We’ll clone a review repository from it, and then Alice and Bob will both clone from review. +

    +
    +
    $ hg clone public review
    +updating to branch default
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +$ hg clone review alice
    +updating to branch default
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +$ hg clone review bob
    +updating to branch default
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +
    +
    +

    + We need to configure Alice’s and Bob’s working repositories to enable evolve. First, edit Alice’s configuration with +

    +
    +
    $ hg -R alice config --edit --local
    +
    +
    +

    + and add +

    + +

    + Then edit Bob’s repository configuration: +

    +
    +
    $ hg -R bob config --edit --local
    +
    +
    +

    + and add the same text. +

    +
    +
    +

    + Example 3: Alice commits and amends a draft fix +

    +

    + We’ll follow Alice working on a bug fix. We’re going to use bookmarks to make it easier to understand multiple branch heads in the review repository, so Alice starts off by creating a bookmark and committing her first attempt at a fix: +

    +
    +
    $ hg bookmark bug15
    +$ echo 'fix' > file2
    +$ hg commit -A -u alice -m 'fix bug 15 (v1)'
    +adding file2
    +
    +
    +

    + Note the unorthodox “(v1)” in the commit message. We’re just using that to make this tutorial easier to follow; it’s not something we’d recommend in real life. +

    +

    + Of course Alice wouldn’t commit unless her fix worked to her satisfaction, so it must be time to solicit a code review. She does this by pushing to the review repository: +

    +
    +
    $ hg push -B bug15
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +exporting bookmark bug15
    +
    +
    +

    + (The use of -B is important to ensure that we only push the bookmarked head, and that the bookmark itself is pushed. See this guide to bookmarks, especially the Sharing Bookmarks section, if you’re not familiar with bookmarks.) +

    +

    + Some time passes, and Alice receives her code review. As a result, Alice revises her fix and submits it for a second review: +

    +
    +
    $ echo 'Fix.' > file2
    +$ hg amend -m 'fix bug 15 (v2)'
    +$ hg push
    +[...]
    +added 1 changesets with 1 changes to 1 files (+1 heads)
    +updating bookmark bug15
    +
    +
    +

    + Figure 6 shows the state of the review repository at this point. +

    +
    +

    + [figure SG06: rev 2:fn1e is Alice’s obsolete v1, rev 3:cbdf is her v2; both children of rev 1:de61] +

    +
    +

    + After a busy morning of bug fixing, Alice stops for lunch. Let’s see what Bob has been up to. +

    +
    +
    +

    + Example 4: Bob implements and publishes a new feature +

    +

    + Meanwhile, Bob has been working on a new feature. Like Alice, he’ll use a bookmark to track his work, and he’ll push that bookmark to the review repository, so that reviewers know which changesets to review. +

    +
    +
    $ cd ../bob
    +$ echo 'stuff' > file1
    +$ hg bookmark featureX
    +$ hg commit -u bob -m 'implement feature X (v1)'          # rev 4:1636
    +$ hg push -B featureX
    +[...]
    +added 1 changesets with 1 changes to 1 files (+1 heads)
    +exporting bookmark featureX
    +
    +
    +

    + When Bob receives his code review, he improves his implementation a bit, amends, and submits the resulting changeset for review: +

    +
    +
    $ echo 'do stuff' > file1
    +$ hg amend -m 'implement feature X (v2)'                  # rev 5:0eb7
    +$ hg push
    +[...]
    +added 1 changesets with 1 changes to 1 files (+1 heads)
    +updating bookmark featureX
    +
    +
    +

    + Unfortunately, that still doesn’t pass muster. Bob’s reviewer insists on proper capitalization and punctuation. +

    +
    +
    $ echo 'Do stuff.' > file1
    +$ hg amend -m 'implement feature X (v3)'                  # rev 6:540b
    +
    +
    +

    + On the bright side, the second review said, “Go ahead and publish once you fix that.” So Bob immediately publishes his third attempt: +

    +
    +
    $ hg push ../public
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +
    +
    +

    + It’s not enough just to update public, though! Other people also use the review repository, and right now it doesn’t have Bob’s latest amendment (“v3”, revision 6:540b), nor does it know that the precursor of that changeset (“v2”, revision 5:0eb7) is obsolete. Thus, Bob pushes to review as well: +

    +
    +
    $ hg push ../review
    +[...]
    +added 1 changesets with 1 changes to 1 files (+1 heads)
    +updating bookmark featureX
    +
    +
    +

    + Figure 7 shows the result of Bob’s work in both review and public. +

    +
    +

    + [figure SG07: review includes Alice’s draft work on bug 15, as well as Bob’s v1, v2, and v3 changes for feature X: v1 and v2 obsolete, v3 public. public contains only the final, public implementation of feature X] +

    +
    +

    + Incidentally, it’s important that Bob push to public before review. If he pushed to review first, then revision 6:540b would still be in draft phase in review, but it would be public in both Bob’s local repository and the public repository. That could lead to confusion at some point, which is easily avoided by pushing first to public. +

    +
    +
    +

    + Example 5: Alice integrates and publishes +

    +

    + Finally, Alice gets back from lunch and sees that the carrier pigeon with her second review has arrived (or maybe it’s in her email inbox). Alice’s reviewer approved her amended changeset, so she pushes it to public: +

    +
    +
    $ hg push ../public
    +[...]
    +remote has heads on branch 'default' that are not known locally: 540ba8f317e6
    +abort: push creates new remote head cbdfbd5a5db2!
    +(pull and merge or see "hg help push" for details about pushing new heads)
    +
    +
    +

    + Oops! Bob has won the race to push first to public. So Alice needs to integrate with Bob: let’s pull his changeset(s) and see what the branch heads are. +

    +
    +
    $ hg pull ../public
    +[...]
    +added 1 changesets with 1 changes to 1 files (+1 heads)
    +(run 'hg heads' to see heads, 'hg merge' to merge)
    +$ hg log -G -q -r 'head()' --template '{rev}:{node|short}  ({author})\n'
    +o  5:540ba8f317e6  (bob)
    +|
    +| @  4:cbdfbd5a5db2  (alice)
    +|/
    +
    +
    +

    + We’ll assume Alice and Bob are perfectly comfortable with rebasing changesets. (After all, they’re already using mutable history in the form of amend.) So Alice rebases her changeset on top of Bob’s and publishes the result: +

    +
    +
    $ hg rebase -d 5
    +$ hg push ../public
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +$ hg push ../review
    +[...]
    +added 1 changesets with 0 changes to 0 files
    +updating bookmark bug15
    +
    +
    +

    + The result, in both review and public repositories, is shown in figure 8. +

    +
    +

    + [figure SG08: review shows v1 and v2 of Alice’s fix, then v1, v2, v3 of Bob’s feature, finally Alice’s fix rebased onto Bob’s. public just shows the final public version of each changeset] +

    +
    +
    +
    +
    +

    + Getting into trouble with shared mutable history +

    +

    + Mercurial with evolve is a powerful tool, and using powerful tools can have consequences. (You can cut yourself badly with a sharp knife, but every competent chef keeps several around. Ever try to chop onions with a spoon?) +

    +

    + In the user guide, we saw examples of unstbale changesets, which are the most common type of troubled changeset. (Recall that a non-obsolete changeset with obsolete ancestors is an orphan.) +

    +

    + Two other types of troubles can happen: divergent and bumped changesets. Both are more likely with shared mutable history, especially mutable history shared by multiple developers. +

    +
    +

    + Setting up +

    +

    + For these examples, we’re going to use a slightly different workflow: as before, Alice and Bob share a public repository. But this time there is no review repository. Instead, Alice and Bob put on their cowboy hats, throw good practice to the wind, and pull directly from each other’s working repositories. +

    +

    + So we throw away everything except public and reclone: +

    +
    +
    $ rm -rf review alice bob
    +$ hg clone public alice
    +updating to branch default
    +2 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +$ hg clone public bob
    +updating to branch default
    +2 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +
    +
    +

    + Once again we have to configure their repositories: enable evolve and (since Alice and Bob will be pulling directly from each other) make their repositories non-publishing. Edit Alice’s configuration: +

    +
    +
    $ hg -R alice config --edit --local
    +
    +
    +

    + and add +

    +
    +
    [extensions]
    +rebase =
    +evolve =
    +
    +[phases]
    +publish = false
    +
    +
    +

    + Then edit Bob’s repository configuration: +

    +
    +
    $ hg -R bob config --edit --local
    +
    +
    +

    + and add the same text. +

    +
    +
    +

    + Example 6: Divergent changesets +

    +

    + When an obsolete changeset has two successors, those successors are divergent. One way to get into such a situation is by failing to communicate with your teammates. Let’s see how that might happen. +

    +

    + First, we’ll have Bob commit a bug fix that could still be improved: +

    +
    +
    $ cd bob
    +$ echo 'pretty good fix' >> file1
    +$ hg commit -u bob -m 'fix bug 24 (v1)'                   # rev 4:2fe6
    +
    +
    +

    + Since Alice and Bob are now in cowboy mode, Alice pulls Bob’s draft changeset and amends it herself. +

    +
    +
    $ cd ../alice
    +$ hg pull -u ../bob
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +$ echo 'better fix (alice)' >> file1
    +$ hg amend -u alice -m 'fix bug 24 (v2 by alice)'
    +
    +
    +

    + But Bob has no idea that Alice just did this. (See how important good communication is?) So he implements a better fix of his own: +

    +
    +
    $ cd ../bob
    +$ echo 'better fix (bob)' >> file1
    +$ hg amend -u bob -m 'fix bug 24 (v2 by bob)'             # rev 6:a360
    +
    +
    +

    + At this point, the divergence exists, but only in theory: Bob’s original changeset, 4:2fe6, is obsolete and has two successors. But those successors are in different repositories, so the trouble is not visible to anyone yet. It will be as soon as Bob pulls from Alice’s repository (or vice-versa). +

    +
    +
    $ hg pull ../alice
    +[...]
    +added 1 changesets with 1 changes to 2 files (+1 heads)
    +(run 'hg heads' to see heads, 'hg merge' to merge)
    +2 new divergent changesets
    +
    +
    +

    + Figure 9 shows the situation in Bob’s repository. +

    +
    +

    + [figure SG09: Bob’s repo with 2 heads for the 2 divergent changesets, 6:a360 and 7:e3f9; wc is at 6:a360; both are successors of obsolete 4:2fe6, hence divergence] +

    +
    +

    + Now we need to get out of trouble. As usual, the answer is to evolve history. +

    +
    +
    $ HGMERGE=internal:other hg evolve
    +merge:[6] fix bug 24 (v2 by bob)
    +with: [7] fix bug 24 (v2 by alice)
    +base: [4] fix bug 24 (v1)
    +0 files updated, 1 files merged, 0 files removed, 0 files unresolved
    +
    +
    +

    + Figure 10 shows how Bob’s repository looks now. +

    +
    +

    + [figure SG10: only one visible head, 9:5ad6, successor to hidden 6:a360 and 7:e3f9] +

    +
    +

    + We carefully dodged a merge conflict by specifying a merge tool (internal:other) that will take Alice’s changes over Bob’s. (You might wonder why Bob wouldn’t prefer his own changes by using internal:local. He’s avoiding a bug in evolve that occurs when evolving divergent changesets using internal:local.) +

    +

    + # XXX this link does not work .. bug: https://bitbucket.org/marmoute/mutable-history/issue/48/ +

    +

    + ** STOP HERE: WORK IN PROGRESS ** +

    +
    +
    +

    + Phase-divergence: when a rewritten changeset is made public +

    +

    + If Alice and Bob are collaborating on some mutable changesets, it’s possible to get into a situation where an otherwise worthwhile changeset cannot be pushed to the public repository; it is phase-divergent with another changeset that was made public first. Let’s demonstrate one way this could happen. +

    +

    + It starts with Alice committing a bug fix. Right now, we don’t yet know if this bug fix is good enough to push to the public repository, but it’s good enough for Alice to commit. +

    +
    +
    $ cd alice
    +$ echo 'fix' > file2
    +$ hg commit -A -m 'fix bug 15'
    +adding file2
    +
    +
    +

    + Now Bob has a bad idea: he decides to pull whatever Alice is working on and tweak her bug fix to his taste: +

    +
    +
    $ cd ../bob
    +$ hg pull -u ../alice
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +1 files updated, 0 files merged, 0 files removed, 0 files unresolved
    +$ echo 'Fix.' > file2
    +$ hg amend -A -m 'fix bug 15 (amended)'
    +
    +
    +

    + (Note the lack of communication between Alice and Bob. Failing to communicate with your colleagues is a good way to get into trouble. Nevertheless, evolve can usually sort things out, as we will see.) +

    +
    +

    + [figure SG06: Bob’s repo with one amendment] +

    +
    +

    + After some testing, Alice realizes her bug fix is just fine as it is: no need for further polishing and amending, this changeset is ready to publish. +

    +
    +
    $ cd ../alice
    +$ hg push
    +[...]
    +added 1 changesets with 1 changes to 1 files
    +
    +
    +

    + This introduces a contradiction: in Bob’s repository, changeset 2:e011 (his copy of Alice’s fix) is obsolete, since Bob amended it. But in Alice’s repository (and the public repository), that changeset is public: it is immutable, carved in stone for all eternity. No changeset can be both obsolete and public, so Bob is in for a surprise the next time he pulls from public: +

    +
    +
    $ cd ../bob
    +$ hg pull -q -u
    +1 new phase-divergent changesets
    +
    +
    +

    + Figure 7 shows what just happened to Bob’s repository: changeset 2:e011 is now public, so it can’t be obsolete. When that changeset was obsolete, it made perfect sense for it to have a successor, namely Bob’s amendment of Alice’s fix (changeset 4:fe88). But it’s illogical for a public changeset to have a successor, so 4:fe88 is troubled: it has become bumped. +

    +
    +

    + [figure SG07: 2:e011 now public not obsolete, 4:fe88 now bumped] +

    +
    +

    + As usual when there’s trouble in your repository, the solution is to evolve it: +

    + +

    + Figure 8 illustrates Bob’s repository after evolving away the bumped changeset. Ignoring the obsolete changesets, Bob now has a nice, clean, simple history. His amendment of Alice’s bug fix lives on, as changeset 5:227d—albeit with a software-generated commit message. (Bob should probably amend that changeset to improve the commit message.) But the important thing is that his repository no longer has any troubled changesets, thanks to evolve. +

    +
    +

    + [figure SG08: 5:227d is new, formerly bumped changeset 4:fe88 now hidden] +

    +
    +
    +
    +
    +

    + Conclusion +

    +

    + Mutable history is a powerful tool. Like a sharp knife, an experienced user can do wonderful things with it, much more wonderful than with a dull knife (never mind a rusty spoon). At the same time, an inattentive or careless user can do harm to himself or others. Mercurial with evolve goes to great lengths to limit the harm you can do by trying to handle all possible types of “troubled” changesets. Nevertheless, having a first-aid kit nearby does not mean you should stop being careful with sharp knives. +

    +

    + Mutable history shared across multiple repositories by a single developer is a natural extension of this model. Once you are used to using a single sharp knife on its own, it’s pretty straightforward to chop onions and mushrooms using the same knife, or to alternate between two chopping boards with different knives. +

    +

    + Mutable history shared by multiple developers is a scary place to go. Imagine a professional kitchen full of expert chefs tossing their favourite knives back and forth, with the occasional axe or chainsaw thrown in to spice things up. If you’re confident that you and your colleagues can do it without losing a limb, go for it. But be sure to practice a lot first before you rely on it! +

    +
    +
    diff --git a/src/constants.rs b/src/constants.rs index f5fb834..afd44f0 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -49,7 +49,7 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy = Lazy::new(|| { }); pub static HAS_CONTENT: Lazy = Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex")); -pub static HASH_URL: Lazy = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex")); +pub static HASH_URL: Lazy = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex")); pub static POSITIVE: Lazy = Lazy::new(|| { RegexBuilder::new( diff --git a/src/full_text_parser/mod.rs b/src/full_text_parser/mod.rs index 41859a2..e8a112d 100644 --- a/src/full_text_parser/mod.rs +++ b/src/full_text_parser/mod.rs @@ -551,24 +551,6 @@ impl FullTextParser { Ok(()) } - fn add_attribute( - context: &Context, - tag: Option<&str>, - attribute: &str, - value: &str, - ) -> Result<(), FullTextParserError> { - let xpath_tag = tag.unwrap_or("*"); - - let xpath = &format!("//{}", xpath_tag); - let node_vec = Util::evaluate_xpath(context, xpath, false)?; - for mut node in node_vec { - if let Err(err) = node.set_attribute(attribute, value) { - log::warn!("Failed to set attribute '{}' on node: {}", attribute, err); - } - } - Ok(()) - } - fn repair_urls( context: &Context, xpath: &str, @@ -580,13 +562,21 @@ impl FullTextParser { for mut node in node_vec { if let Some(url) = node.get_attribute(attribute) { let trimmed_url = url.trim(); + + let is_hash_url = url.starts_with('#'); let is_relative_url = url::Url::parse(&url) .err() .map(|err| err == url::ParseError::RelativeUrlWithoutBase) .unwrap_or(false); let is_javascript = trimmed_url.contains("javascript:"); - if is_relative_url { + if !is_hash_url && node.get_name().to_uppercase() == "A" { + _ = node.set_attribute("target", "_blank"); + } + + if is_hash_url { + _ = node.set_attribute(attribute, trimmed_url); + } else if is_relative_url { let completed_url = match article_url.join(trimmed_url) { Ok(joined_url) => joined_url, Err(_) => continue, @@ -697,7 +687,6 @@ impl FullTextParser { _ = Self::fix_lazy_images(context, document); _ = Self::fix_iframe_size(context, "youtube.com"); _ = Self::remove_attribute(context, Some("a"), "onclick"); - _ = Self::add_attribute(context, Some("a"), "target", "_blank"); // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore diff --git a/src/full_text_parser/readability/mod.rs b/src/full_text_parser/readability/mod.rs index 8881215..5205157 100644 --- a/src/full_text_parser/readability/mod.rs +++ b/src/full_text_parser/readability/mod.rs @@ -578,8 +578,6 @@ impl Readability { let text = Util::get_inner_text(&article_content, true); let text_length = text.len(); - //Util::serialize_node(&article_content, "dbg.html"); - if text_length < constants::DEFAULT_CHAR_THRESHOLD { parse_successful = false; diff --git a/src/full_text_parser/readability/tests.rs b/src/full_text_parser/readability/tests.rs index eb37947..1334b42 100644 --- a/src/full_text_parser/readability/tests.rs +++ b/src/full_text_parser/readability/tests.rs @@ -337,10 +337,10 @@ async fn medium_3() { run_test("medium-3").await } -// #[tokio::test] -// async fn mercurial() { -// run_test("mercurial").await -// } +#[tokio::test] +async fn mercurial() { + run_test("mercurial").await +} #[tokio::test] async fn metadata_content_missing() {