Merge pull request #116 from serokell/breakerzirconia/#49-allow-brackets-in-URIs

[#49] Allow certain reserved characters in the URLs
Sereja313 2022-09-06 04:43:09 +10:00 committed by GitHub
commit fd147e6856
4 changed files with 86 additions and 5 deletions

View File

@@ -25,12 +25,14 @@ Unreleased
+ Add the duplication detection & verification result caching algorithm for external references.
* [#82](https://github.com/serokell/xrefcheck/pull/82)
+ Fix the issue of having the lowest level context duplicated, caused by the root's trailing path separator.
* [#31](https://github.com/serokell/xrefcheck/pull/88)
* [#88](https://github.com/serokell/xrefcheck/pull/88)
+ Handle the "429 too many requests" errors & attempt to eliminate them during verification.
* [#128](https://github.com/serokell/xrefcheck/pull/128)
+ Make `ignoreRefs` a required parameter.
* [#129](https://github.com/serokell/xrefcheck/pull/129)
+ Add support for the `id` attribute in anchors
+ Add support for the `id` attribute in anchors.
* [#116](https://github.com/serokell/xrefcheck/pull/116)
+ Allow certain reserved characters to be present in the query strings of the URLs.
0.2.1
==========
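
To make the [#116] entry above concrete, the sketch below (a hypothetical standalone `Main` with an invented example URL, not part of this PR) feeds a link whose query string contains unencoded `[`, `]`, `{` and `}` to the `parseUri` function that this PR exports from `Xrefcheck.Verify`. According to the tests added at the end of this PR, such a link now parses successfully and renders with the reserved characters percent-encoded.

```haskell
{-# LANGUAGE OverloadedStrings #-}

-- Sketch only: a hypothetical driver program, not part of the PR.
import Control.Monad.Except (runExceptT)
import qualified Data.Text.IO as T
import Text.URI (render)

import Xrefcheck.Verify (parseUri)

main :: IO ()
main = do
  -- The query string contains unencoded reserved characters.
  res <- runExceptT (parseUri "https://example.com/?q=[a]&p={b}")
  -- Expected (mirroring the new tests): the URI parses, and renders with the
  -- reserved characters percent-encoded, i.e. ".../?q=%5Ba%5D&p=%7Bb%7D".
  T.putStrLn (either (const "invalid URI") render res)
```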

View File

@@ -107,6 +107,7 @@ library:
- th-lift-instances
- time
- universum
- uri-bytestring
- yaml
executables:
@@ -149,6 +150,7 @@ tests:
- regex-tdfa
- time
- universum
- modern-uri
- yaml
build-tools:
- hspec-discover

View File

@@ -26,6 +26,9 @@ module Xrefcheck.Verify
, verifyRepo
, verifyReference
, checkExternalResource
-- * URI parsing
, parseUri
) where
import Universum
@@ -59,8 +62,9 @@ import System.FilePath (takeDirectory, (</>), normalise)
import System.FilePath.Glob qualified as Glob
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURI)
import Text.URI (Authority (..), URI (..), mkURIBs)
import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-))
import URI.ByteString qualified as URIBS
import Data.Bits (toIntegralSized)
import Xrefcheck.Config
@@ -444,12 +448,41 @@ verifyReference
givenAnchors
in throwError $ AnchorDoesNotExist anchor similarAnchors
-- | Parse a URI according to RFC 3986, extended by allowing certain
-- non-encoded reserved characters (such as `[` and `]`) in the query string.
parseUri :: Text -> ExceptT VerifyError IO URI
parseUri link = do
-- There are two main standards for URL parsing: RFC 3986 and the WHATWG
-- (Web Hypertext Application Technology Working Group) URL standard.
-- Ideally, we want to parse URLs according to the latter, because it
-- provides a much less ambiguous set of rules for percent-encoding
-- special characters and is a living standard that is updated constantly.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because of
-- its 'laxURIParserOptions' parsing configuration. 'mkURI' from the
-- 'modern-uri' library parses URIs strictly according to RFC 3986 and
-- offers no way to customize parsing, in contrast with 'parseURI', which
-- accepts a 'URIParserOptions'. One of the predefined configurations of
-- this type, 'strictURIParserOptions', follows RFC 3986; the other,
-- 'laxURIParserOptions', also allows brackets in queries, which brings us
-- closer to the WHATWG URL standard.
uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link)
& either (const $ throwError ExternalResourceInvalidUri) pure
-- We stick to our existing infrastructure by continuing to operate on the
-- datatypes from 'modern-uri', which are the ones used by the 'req'
-- library. First we serialize the URI parsed with 'parseURI' into a
-- 'ByteString', with all the necessary special characters
-- *percent-encoded*, and then call 'mkURIBs' on it.
mkURIBs (URIBS.serializeURIRef' uri')
& maybe (throwError ExternalResourceInvalidUri) pure
checkExternalResource :: VerifyConfig -> Text -> IO (VerifyResult VerifyError)
checkExternalResource VerifyConfig{..} link
| skipCheck = return mempty
| otherwise = fmap toVerifyRes $ runExceptT $ do
uri <- mkURI link & maybe (throwError ExternalResourceInvalidUri) pure
uri <- parseUri link
case toString <$> uriScheme uri of
Just "http" -> checkHttp uri
Just "https" -> checkHttp uri

View File

@@ -0,0 +1,44 @@
{- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# LANGUAGE QuasiQuotes #-}
module Test.Xrefcheck.URIParsingSpec where
import Universum
import Test.Hspec (Spec, describe, it, shouldReturn)
import Text.URI (URI)
import Text.URI.QQ (uri)
import Xrefcheck.Verify (parseUri, VerifyError (..))
spec :: Spec
spec = do
describe "URI parsing should be successful" $ do
it "Without the special characters in the query strings" do
parseUri' "https://example.com/?q=a&p=b#fragment" `shouldReturn`
Right [uri|https://example.com/?q=a&p=b#fragment|]
parseUri' "https://example.com/path/to/smth?q=a&p=b" `shouldReturn`
Right [uri|https://example.com/path/to/smth?q=a&p=b|]
it "With the special characters in the query strings" do
parseUri' "https://example.com/?q=[a]&<p>={b}#fragment" `shouldReturn`
Right [uri|https://example.com/?q=%5Ba%5D&%3Cp%3E=%7Bb%7D#fragment|]
parseUri' "https://example.com/path/to/smth?q=[a]&<p>={b}" `shouldReturn`
Right [uri|https://example.com/path/to/smth?q=%5Ba%5D&%3Cp%3E=%7Bb%7D|]
describe "URI parsing should be unsuccessful" $ do
it "With the special characters anywhere else" do
parseUri' "https://exa<mple.co>m/?q=a&p=b#fra{g}ment" `shouldReturn`
Left ExternalResourceInvalidUri
parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" `shouldReturn`
Left ExternalResourceInvalidUri
where
parseUri' :: Text -> IO $ Either VerifyError URI
parseUri' = runExceptT . parseUri