diff --git a/CHANGES.md b/CHANGES.md index 9914a41..08ade5d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -25,12 +25,14 @@ Unreleased + Add the duplication detection & verification result caching algorithm for external references. * [#82](https://github.com/serokell/xrefcheck/pull/82) + Fix the issue of having the lowest level context duplicated, caused by the root's trailing path separator. -* [#31](https://github.com/serokell/xrefcheck/pull/88) +* [#88](https://github.com/serokell/xrefcheck/pull/88) + Handle the "429 too many requests" errors & attempt to eliminate them during verification. * [#128](https://github.com/serokell/xrefcheck/pull/128) + Make `ignoreRefs` a required parameter. * [#129](https://github.com/serokell/xrefcheck/pull/129) - + Add support for the `id` attribute in anchors + + Add support for the `id` attribute in anchors. +* [#116](https://github.com/serokell/xrefcheck/pull/116) + + Allow certain reserved characters to be present in the query strings of the URLs. 0.2.1 ========== diff --git a/package.yaml b/package.yaml index e0d8a7b..3ddbb41 100644 --- a/package.yaml +++ b/package.yaml @@ -107,6 +107,7 @@ library: - th-lift-instances - time - universum + - uri-bytestring - yaml executables: @@ -149,6 +150,7 @@ tests: - regex-tdfa - time - universum + - modern-uri - yaml build-tools: - hspec-discover diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 2a6930a..9ff13ed 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -26,6 +26,9 @@ module Xrefcheck.Verify , verifyRepo , verifyReference , checkExternalResource + + -- * URI parsing + , parseUri ) where import Universum @@ -59,8 +62,9 @@ import System.FilePath (takeDirectory, (), normalise) import System.FilePath.Glob qualified as Glob import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift) import Text.Regex.TDFA.Text (Regex, regexec) -import Text.URI (Authority (..), URI (..), mkURI) +import Text.URI (Authority (..), URI (..), mkURIBs) import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) +import URI.ByteString qualified as URIBS import Data.Bits (toIntegralSized) import Xrefcheck.Config @@ -444,12 +448,41 @@ verifyReference givenAnchors in throwError $ AnchorDoesNotExist anchor similarAnchors +-- | Parse URI according to RFC 3986 extended by allowing non-encoded +-- `[` and `]` in query string. +parseUri :: Text -> ExceptT VerifyError IO URI +parseUri link = do + -- There exist two main standards of URL parsing: RFC 3986 and the Web + -- Hypertext Application Technology Working Group's URL standard. Ideally, + -- we want to be able to parse the URLs in accordance with the latter + -- standard, because it provides a much less ambiguous set of rules for + -- percent-encoding special characters, and is essentially a living + -- standard that gets updated constantly. + -- + -- We have chosen the 'uri-bytestring' library for URI parsing because + -- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from + -- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does + -- not provide a means of parsing customization, which contrasts with + -- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined + -- configurations of this type is 'strictURIParserOptions', which follows + -- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets + -- in the queries, which draws us closer to the WHATWG URL standard. + uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link) + & either (const $ throwError ExternalResourceInvalidUri) pure + + -- We stick to our infrastructure by continuing to operate on the datatypes + -- from `modern-uri`, which are used in the 'req' library. First we + -- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString' + -- with all the necessary special characters *percent-encoded*, and then + -- call 'mkURIBs'. + mkURIBs (URIBS.serializeURIRef' uri') + & maybe (throwError ExternalResourceInvalidUri) pure + checkExternalResource :: VerifyConfig -> Text -> IO (VerifyResult VerifyError) checkExternalResource VerifyConfig{..} link | skipCheck = return mempty | otherwise = fmap toVerifyRes $ runExceptT $ do - uri <- mkURI link & maybe (throwError ExternalResourceInvalidUri) pure - + uri <- parseUri link case toString <$> uriScheme uri of Just "http" -> checkHttp uri Just "https" -> checkHttp uri diff --git a/tests/Test/Xrefcheck/URIParsingSpec.hs b/tests/Test/Xrefcheck/URIParsingSpec.hs new file mode 100644 index 0000000..a175500 --- /dev/null +++ b/tests/Test/Xrefcheck/URIParsingSpec.hs @@ -0,0 +1,44 @@ +{- SPDX-FileCopyrightText: 2022 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# LANGUAGE QuasiQuotes #-} + +module Test.Xrefcheck.URIParsingSpec where + +import Universum + +import Test.Hspec (Spec, describe, it, shouldReturn) +import Text.URI (URI) +import Text.URI.QQ (uri) + +import Xrefcheck.Verify (parseUri, VerifyError (..)) + +spec :: Spec +spec = do + describe "URI parsing should be successful" $ do + it "Without the special characters in the query strings" do + parseUri' "https://example.com/?q=a&p=b#fragment" `shouldReturn` + Right [uri|https://example.com/?q=a&p=b#fragment|] + + parseUri' "https://example.com/path/to/smth?q=a&p=b" `shouldReturn` + Right [uri|https://example.com/path/to/smth?q=a&p=b|] + + it "With the special characters in the query strings" do + parseUri' "https://example.com/?q=[a]&

={b}#fragment" `shouldReturn` + Right [uri|https://example.com/?q=%5Ba%5D&%3Cp%3E=%7Bb%7D#fragment|] + + parseUri' "https://example.com/path/to/smth?q=[a]&

={b}" `shouldReturn` + Right [uri|https://example.com/path/to/smth?q=%5Ba%5D&%3Cp%3E=%7Bb%7D|] + + describe "URI parsing should be unsuccessful" $ do + it "With the special characters anywhere else" do + parseUri' "https://exam/?q=a&p=b#fra{g}ment" `shouldReturn` + Left ExternalResourceInvalidUri + + parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" `shouldReturn` + Left ExternalResourceInvalidUri + where + parseUri' :: Text -> IO $ Either VerifyError URI + parseUri' = runExceptT . parseUri