[#49] Allow certain reserved characters in the URLs

Problem: The current version of xrefcheck rejects URLs that contain square
brackets and some other special characters, such as angle brackets and
curly brackets, even in query strings, because it requires them to be
percent-encoded first.

Solution: Allow some of the reserved characters, like the brackets, to
be present in the query strings of URLs.

There are two main standards for URL parsing: RFC 3986 and the Web
Hypertext Application Technology Working Group's (WHATWG) URL standard.
Ideally, we want to parse URLs in accordance with the latter, because it
provides a much less ambiguous set of rules for percent-encoding special
characters and is a living standard that is updated continuously.

We allow these characters in the query strings by using the `parseURI`
function from the `uri-bytestring` library with `laxURIParserOptions`.
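For illustration, here is a minimal standalone sketch (not part of this change; the module name, the example URL, and the bindings are made up) contrasting the two predefined parser configurations shipped by `uri-bytestring`, `strictURIParserOptions` and `laxURIParserOptions`, on a URL with unencoded brackets in its query string:

{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE OverloadedStrings #-}

-- Hypothetical demo module, not part of xrefcheck.
module LaxParsingDemo where

import Data.ByteString (ByteString)
import Data.Either (isRight)
import URI.ByteString qualified as URIBS

-- A URL with unencoded square brackets in its query string.
bracketedUrl :: ByteString
bracketedUrl = "https://example.com/?q=[a]&p=b"

main :: IO ()
main = do
  -- Strict (RFC 3986) parsing should reject the unencoded brackets...
  print $ isRight $ URIBS.parseURI URIBS.strictURIParserOptions bracketedUrl  -- expected: False
  -- ...while the lax configuration accepts them in the query string.
  print $ isRight $ URIBS.parseURI URIBS.laxURIParserOptions bracketedUrl     -- expected: True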
Constantine Ter-Matevosian 2022-07-15 01:10:12 +03:00 committed by Sergey Gulin
parent acb2394c9c
commit 80b5edd1c7
4 changed files with 86 additions and 5 deletions

View File

@@ -25,12 +25,14 @@ Unreleased
+ Add the duplication detection & verification result caching algorithm for external references.
* [#82](https://github.com/serokell/xrefcheck/pull/82)
+ Fix the issue of having the lowest level context duplicated, caused by the root's trailing path separator.
* [#31](https://github.com/serokell/xrefcheck/pull/88)
* [#88](https://github.com/serokell/xrefcheck/pull/88)
+ Handle the "429 too many requests" errors & attempt to eliminate them during verification.
* [#128](https://github.com/serokell/xrefcheck/pull/128)
+ Make `ignoreRefs` a required parameter.
* [#129](https://github.com/serokell/xrefcheck/pull/129)
+ Add support for the `id` attribute in anchors
+ Add support for the `id` attribute in anchors.
* [#116](https://github.com/serokell/xrefcheck/pull/116)
+ Allow certain reserved characters to be present in the query strings of the URLs.
0.2.1
==========

View File

@@ -107,6 +107,7 @@ library:
- th-lift-instances
- time
- universum
- uri-bytestring
- yaml
executables:
@@ -149,6 +150,7 @@ tests:
- regex-tdfa
- time
- universum
- modern-uri
- yaml
build-tools:
- hspec-discover

View File

@@ -26,6 +26,9 @@ module Xrefcheck.Verify
, verifyRepo
, verifyReference
, checkExternalResource
-- * URI parsing
, parseUri
) where
import Universum
@@ -59,8 +62,9 @@ import System.FilePath (takeDirectory, (</>), normalise)
import System.FilePath.Glob qualified as Glob
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURI)
import Text.URI (Authority (..), URI (..), mkURIBs)
import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-))
import URI.ByteString qualified as URIBS
import Data.Bits (toIntegralSized)
import Xrefcheck.Config
@@ -444,12 +448,41 @@ verifyReference
givenAnchors
in throwError $ AnchorDoesNotExist anchor similarAnchors
-- | Parse a URI according to RFC 3986, extended by allowing non-encoded
-- `[` and `]` in the query string.
parseUri :: Text -> ExceptT VerifyError IO URI
parseUri link = do
-- There are two main standards for URL parsing: RFC 3986 and the Web
-- Hypertext Application Technology Working Group's URL standard. Ideally,
-- we want to parse URLs in accordance with the latter standard, because
-- it provides a much less ambiguous set of rules for percent-encoding
-- special characters and is a living standard that is updated continuously.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because of
-- its 'laxURIParserOptions' parsing configuration. 'mkURI' from the
-- 'modern-uri' library parses URIs in accordance with RFC 3986 and does
-- not provide a means of customizing the parsing, in contrast with
-- 'parseURI', which accepts a 'URIParserOptions'. One of the predefined
-- configurations of this type, 'strictURIParserOptions', follows
-- RFC 3986, while the other, 'laxURIParserOptions', allows brackets in
-- query strings, which brings us closer to the WHATWG URL standard.
uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link)
& either (const $ throwError ExternalResourceInvalidUri) pure
-- We keep operating on the datatypes from 'modern-uri', which are used by
-- the 'req' library, so the rest of our infrastructure stays unchanged.
-- First we serialize the URI parsed with 'parseURI' into a 'ByteString'
-- with all the necessary special characters percent-encoded, and then
-- call 'mkURIBs' on the result.
mkURIBs (URIBS.serializeURIRef' uri')
& maybe (throwError ExternalResourceInvalidUri) pure
checkExternalResource :: VerifyConfig -> Text -> IO (VerifyResult VerifyError)
checkExternalResource VerifyConfig{..} link
| skipCheck = return mempty
| otherwise = fmap toVerifyRes $ runExceptT $ do
uri <- mkURI link & maybe (throwError ExternalResourceInvalidUri) pure
uri <- parseUri link
case toString <$> uriScheme uri of
Just "http" -> checkHttp uri
Just "https" -> checkHttp uri

View File

@@ -0,0 +1,44 @@
{- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# LANGUAGE QuasiQuotes #-}
module Test.Xrefcheck.URIParsingSpec where
import Universum
import Test.Hspec (Spec, describe, it, shouldReturn)
import Text.URI (URI)
import Text.URI.QQ (uri)
import Xrefcheck.Verify (parseUri, VerifyError (..))
spec :: Spec
spec = do
describe "URI parsing should be successful" $ do
it "Without the special characters in the query strings" do
parseUri' "https://example.com/?q=a&p=b#fragment" `shouldReturn`
Right [uri|https://example.com/?q=a&p=b#fragment|]
parseUri' "https://example.com/path/to/smth?q=a&p=b" `shouldReturn`
Right [uri|https://example.com/path/to/smth?q=a&p=b|]
it "With the special characters in the query strings" do
parseUri' "https://example.com/?q=[a]&<p>={b}#fragment" `shouldReturn`
Right [uri|https://example.com/?q=%5Ba%5D&%3Cp%3E=%7Bb%7D#fragment|]
parseUri' "https://example.com/path/to/smth?q=[a]&<p>={b}" `shouldReturn`
Right [uri|https://example.com/path/to/smth?q=%5Ba%5D&%3Cp%3E=%7Bb%7D|]
describe "URI parsing should be unsuccessful" $ do
it "With the special characters anywhere else" do
parseUri' "https://exa<mple.co>m/?q=a&p=b#fra{g}ment" `shouldReturn`
Left ExternalResourceInvalidUri
parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" `shouldReturn`
Left ExternalResourceInvalidUri
where
parseUri' :: Text -> IO $ Either VerifyError URI
parseUri' = runExceptT . parseUri