[#270] Handle relative redirects

Problem: Currently, Xrefcheck can follow redirects with an absolute
location link, but it cannot handle relative ones.

Solution: After parsing the location link, obtain the corresponding
absolute link by using the original request one.
This commit is contained in:
Adrián Enríquez 2023-01-26 11:35:57 +01:00
parent 25ec1a21d5
commit 9c2ac77619
No known key found for this signature in database
GPG Key ID: 1D2A049F5866F977
7 changed files with 168 additions and 84 deletions

View File

@ -52,6 +52,8 @@ Unreleased
* [#268](https://github.com/serokell/xrefcheck/pull/268)
+ Added CLI option `--color` that enables ANSI colors in output.
+ Changed the output coloring defaults to show colors when `CI` env variable is `true`.
* [#271](https://github.com/serokell/xrefcheck/pull/271)
+ Now Xrefcheck is able to follow relative redirects.
0.2.2
==========

74
src/Xrefcheck/Data/URI.hs Normal file
View File

@ -0,0 +1,74 @@
{- SPDX-FileCopyrightText: 2023 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# LANGUAGE ExistentialQuantification #-}
module Xrefcheck.Data.URI
( UriParseError (..)
, parseUri
) where
import Universum
import Control.Exception.Safe (handleJust)
import Control.Monad.Except (throwError)
import Text.URI (ParseExceptionBs, URI, mkURIBs)
import URI.ByteString qualified as URIBS
data UriParseError
= UPEInvalid URIBS.URIParseError
| UPEConversion ParseExceptionBs
deriving stock (Show, Eq)
data AnyURIRef = forall a. AnyURIRef (URIBS.URIRef a)
serializeAnyURIRef :: AnyURIRef -> ByteString
serializeAnyURIRef (AnyURIRef uri) = URIBS.serializeURIRef' uri
-- | Parse URI according to RFC 3986 extended by allowing non-encoded
-- `[` and `]` in query string.
--
-- The first parameter indicates whether the parsing should admit relative
-- URIs or not.
parseUri :: Bool -> Text -> ExceptT UriParseError IO URI
parseUri canBeRelative link = do
-- There exist two main standards of URL parsing: RFC 3986 and the Web
-- Hypertext Application Technology Working Group's URL standard. Ideally,
-- we want to be able to parse the URLs in accordance with the latter
-- standard, because it provides a much less ambiguous set of rules for
-- percent-encoding special characters, and is essentially a living
-- standard that gets updated constantly.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because
-- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from
-- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does
-- not provide a means of parsing customization, which contrasts with
-- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined
-- configurations of this type is 'strictURIParserOptions', which follows
-- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets
-- in the queries, which draws us closer to the WHATWG URL standard.
--
-- The 'modern-uri' package can parse an URI deciding if it is absolute or
-- relative depending on the success or failure of the scheme parsing. By
-- contrast, in 'uri-bytestring' it has to be decided beforehand, resulting in
-- different URI types.
uri <- case URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link) of
Left (URIBS.MalformedScheme _) | canBeRelative ->
URIBS.parseRelativeRef URIBS.laxURIParserOptions (encodeUtf8 link)
& either (throwError . UPEInvalid) (pure . AnyURIRef)
Left err -> throwError $ UPEInvalid err
Right uri -> pure $ AnyURIRef uri
-- We stick to our infrastructure by continuing to operate on the datatypes
-- from 'modern-uri', which are used in the 'req' library. First we
-- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString'
-- with all the necessary special characters *percent-encoded*, and then
-- call 'mkURIBs'.
mkURIBs (serializeAnyURIRef uri)
-- Ideally, this exception should never be thrown, as the URI
-- already *percent-encoded* with 'parseURI' from 'uri-bytestring'
-- and 'mkURIBs' is only used to convert to 'URI' type from
-- 'modern-uri' package.
& handleJust fromException (throwError . UPEConversion)

View File

@ -25,9 +25,6 @@ module Xrefcheck.Verify
, verifyRepo
, verifyReference
, checkExternalResource
-- * URI parsing
, parseUri
, reportVerifyErrs
) where
@ -35,7 +32,7 @@ import Universum
import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync)
import Control.Exception (AsyncException (..), throwIO)
import Control.Exception.Safe (handleAsync, handleJust)
import Control.Exception.Safe (handleAsync)
import Control.Monad.Except (MonadError (..))
import Data.Bits (toIntegralSized)
import Data.ByteString qualified as BS
@ -64,12 +61,13 @@ import Network.HTTP.Types.Header (hRetryAfter)
import Network.HTTP.Types.Status (Status, statusCode, statusMessage)
import Text.Interpolation.Nyan
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs, unRText)
import Text.URI (Authority (..), URI (..), relativeTo, render, unRText)
import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-))
import URI.ByteString qualified as URIBS
import Control.Monad.Trans.Except (withExceptT)
import Xrefcheck.Config
import Xrefcheck.Core
import Xrefcheck.Data.URI
import Xrefcheck.Orphans ()
import Xrefcheck.Progress
import Xrefcheck.Scan
@ -129,8 +127,7 @@ data VerifyError
| LinkTargetNotAddedToGit RelPosixLink
| AnchorDoesNotExist Text [Anchor]
| AmbiguousAnchorRef RelPosixLink Text (NonEmpty Anchor)
| ExternalResourceInvalidUri URIBS.URIParseError
| ExternalResourceUriConversionError ParseExceptionBs
| ExternalResourceUriParseError UriParseError
| ExternalResourceInvalidUrl (Maybe Text)
| ExternalResourceUnknownProtocol
| ExternalHttpResourceUnavailable Status
@ -196,12 +193,12 @@ instance Given ColorMode => Buildable VerifyError where
can change silently while the document containing it evolves.
|]
ExternalResourceInvalidUri err ->
ExternalResourceUriParseError (UPEInvalid err) ->
[int||
Invalid URI (#{err})
|]
ExternalResourceUriConversionError err ->
ExternalResourceUriParseError (UPEConversion err) ->
[int||
Invalid URI
#{interpolateIndentF 4 . build $ displayException err}
@ -657,41 +654,6 @@ verifyReference
. aName
in throwError $ AnchorDoesNotExist anchor similarAnchors
-- | Parse URI according to RFC 3986 extended by allowing non-encoded
-- `[` and `]` in query string.
parseUri :: Text -> ExceptT VerifyError IO URI
parseUri link = do
-- There exist two main standards of URL parsing: RFC 3986 and the Web
-- Hypertext Application Technology Working Group's URL standard. Ideally,
-- we want to be able to parse the URLs in accordance with the latter
-- standard, because it provides a much less ambiguous set of rules for
-- percent-encoding special characters, and is essentially a living
-- standard that gets updated constantly.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because
-- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from
-- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does
-- not provide a means of parsing customization, which contrasts with
-- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined
-- configurations of this type is 'strictURIParserOptions', which follows
-- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets
-- in the queries, which draws us closer to the WHATWG URL standard.
uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link)
& either (throwError . ExternalResourceInvalidUri) pure
-- We stick to our infrastructure by continuing to operate on the datatypes
-- from `modern-uri`, which are used in the 'req' library. First we
-- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString'
-- with all the necessary special characters *percent-encoded*, and then
-- call 'mkURIBs'.
mkURIBs (URIBS.serializeURIRef' uri')
-- Ideally, this exception should never be thrown, as the URI
-- already *percent-encoded* with 'parseURI' from 'uri-bytestring'
-- and 'mkURIBs' is only used to convert to 'URI' type from
-- 'modern-uri' package.
& handleJust (fromException @ParseExceptionBs)
(throwError . ExternalResourceUriConversionError)
checkExternalResource :: RedirectChain -> Config -> Text -> ExceptT VerifyError IO ()
checkExternalResource followed config@Config{..} link
| isIgnored = pass
@ -700,7 +662,7 @@ checkExternalResource followed config@Config{..} link
| ncMaxRedirectFollows >= 0 && totalFollowed followed > ncMaxRedirectFollows =
throwError $ RedirectChainLimit $ followed `pushRequest` (RedirectChainLink link)
| otherwise = do
uri <- parseUri link
uri <- ExternalResourceUriParseError `withExceptT` parseUri False link
case toString <$> uriScheme uri of
Just "http" -> checkHttp uri
Just "https" -> checkHttp uri
@ -745,10 +707,8 @@ checkExternalResource followed config@Config{..} link
let maxTime = Time @Second $ unTime ncExternalRefCheckTimeout * timeoutFrac
reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $ \httpErr ->
case interpretErrors uri httpErr of
Left err -> throwError err
Right res -> pure $ Just res
reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $
(Just <$>) <$> interpretErrors uri
case reqRes of
Nothing -> throwError $ ExternalHttpTimeout $ extractHost uri
@ -767,7 +727,7 @@ checkExternalResource followed config@Config{..} link
[ if ncIgnoreAuthFailures -- unauthorized access
then flip elem [403, 401]
else const False
, (405 ==) -- method mismatch
, (405 ==) -- method mismatch
]
interpretErrors uri = \case
@ -777,20 +737,26 @@ checkExternalResource followed config@Config{..} link
HttpExceptionRequest _ exc -> case exc of
StatusCodeException resp _
| isRedirectCode code -> case redirectLocation of
Nothing -> Left $ RedirectMissingLocation $ followed `pushRequest` (RedirectChainLink link)
Just nextLink -> case redirectRule link nextLink code ncExternalRefRedirects of
Nothing -> Right RRDone
Just RedirectRule{..} ->
case rrOutcome of
RROValid -> Right RRDone
RROInvalid -> Left $ RedirectRuleError
(followed `pushRequest` (RedirectChainLink link) `pushRequest` (RedirectChainLink nextLink))
rrOn
RROFollow -> Right $ RRFollow nextLink
| isAllowedErrorCode code -> Right RRDone
Nothing -> throwError $ RedirectMissingLocation $ followed `pushRequest` RedirectChainLink link
Just nextLink -> do
nextUri <- ExternalResourceUriParseError `withExceptT` parseUri True nextLink
nextLinkAbsolute <- case relativeTo nextUri uri of
-- This should not happen because uri has been parsed with `parseUri False`
Nothing -> error "Not an absolute URL exception"
Just absoluteTarget -> pure $ render absoluteTarget
case redirectRule link nextLinkAbsolute code ncExternalRefRedirects of
Nothing -> pure RRDone
Just RedirectRule{..} ->
case rrOutcome of
RROValid -> pure RRDone
RROInvalid -> throwError $ RedirectRuleError
(followed `pushRequest` RedirectChainLink link `pushRequest` RedirectChainLink nextLinkAbsolute)
rrOn
RROFollow -> pure $ RRFollow nextLinkAbsolute
| isAllowedErrorCode code -> pure RRDone
| otherwise -> case statusCode (responseStatus resp) of
429 -> Left $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri)
_ -> Left . ExternalHttpResourceUnavailable $ responseStatus resp
429 -> throwError $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri)
_ -> throwError $ ExternalHttpResourceUnavailable $ responseStatus resp
where
code :: Int
code = statusCode $ responseStatus resp
@ -799,7 +765,7 @@ checkExternalResource followed config@Config{..} link
redirectLocation = fmap decodeUtf8
. lookup "Location"
$ responseHeaders resp
other -> Left . ExternalResourceSomeError $ show other
other -> throwError $ ExternalResourceSomeError $ show other
where
retryAfterInfo :: Response a -> Maybe RetryAfter
retryAfterInfo = readMaybe . decodeUtf8 <=< L.lookup hRetryAfter . responseHeaders

View File

@ -40,6 +40,35 @@ test_redirectRequests = testGroup "Redirect chain tests"
(link "/cycle1")
progress
(VerifyResult [RedirectChainCycle $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4", "/cycle2"]])
, testGroup "Relative redirect"
[ testCase "Host" $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
(link "/relative/host")
progress
(VerifyResult [RedirectChainLimit $ chain ["/relative/host", "/cycle2", "/cycle3"]])
, testCase "Path" $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
(link "/relative/path")
progress
(VerifyResult [RedirectChainLimit $ chain ["/relative/path", "/relative/host", "/cycle2"]])
]
, testCase "Other host redirect" $ withServer otherMockRedirect $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
"http://127.0.0.1:5001/other/host"
progress
(VerifyResult [RedirectChainLimit $ fromList ["http://127.0.0.1:5001/other/host", link "/relative/host", link "/cycle2"]])
, testGroup "Limit"
[ testCase "Takes effect" $ do
setRef <- newIORef mempty
@ -89,19 +118,28 @@ test_redirectRequests = testGroup "Redirect chain tests"
redirectRoute name to = route name $ pure $ toResponse
( "" :: Text
, mkStatus 301 "Permanent redirect"
, M.fromList [(CI.map (decodeUtf8 @Text) hLocation, fmap link $ maybeToList to)]
, M.fromList [(CI.map (decodeUtf8 @Text) hLocation, maybeToList to)]
)
mockRedirect :: IO ()
mockRedirect =
mockRedirect = do
run 5000 do
-- A set of redirect routes that correspond to a broken chain.
redirectRoute "/broken1" $ Just "/broken2"
redirectRoute "/broken2" $ Just "/broken3"
redirectRoute "/broken1" $ Just $ link "/broken2"
redirectRoute "/broken2" $ Just $ link "/broken3"
redirectRoute "/broken3" Nothing
-- A set of redirect routes that correspond to a cycle.
redirectRoute "/cycle1" $ Just "/cycle2"
redirectRoute "/cycle2" $ Just "/cycle3"
redirectRoute "/cycle3" $ Just "/cycle4"
redirectRoute "/cycle4" $ Just "/cycle2"
redirectRoute "/cycle1" $ Just $ link "/cycle2"
redirectRoute "/cycle2" $ Just $ link "/cycle3"
redirectRoute "/cycle3" $ Just $ link "/cycle4"
redirectRoute "/cycle4" $ Just $ link "/cycle2"
-- Relative redirects.
redirectRoute "/relative/host" $ Just "/cycle2"
redirectRoute "/relative/path" $ Just "host"
-- To other host
otherMockRedirect :: IO ()
otherMockRedirect =
run 5001 $ redirectRoute "/other/host" $ Just $ link "/relative/host"

View File

@ -11,9 +11,9 @@ import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit (testCase, (@?=))
import Text.URI (URI)
import Text.URI.QQ (uri)
import URI.ByteString (SchemaError (..), URIParseError (..))
import URI.ByteString qualified as URIBS
import Xrefcheck.Verify (VerifyError (..), parseUri)
import Xrefcheck.Data.URI (UriParseError (..), parseUri)
test_uri :: [TestTree]
test_uri =
@ -36,20 +36,20 @@ test_uri =
, testGroup "URI parsing should be unsuccessful"
[ testCase "With the special characters anywhere else" do
parseUri' "https://exa<mple.co>m/?q=a&p=b#fra{g}ment" >>=
(@?= Left (ExternalResourceInvalidUri MalformedPath))
(@?= Left (UPEInvalid URIBS.MalformedPath))
parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" >>=
(@?= Left (ExternalResourceInvalidUri MalformedPath))
(@?= Left (UPEInvalid URIBS.MalformedPath))
, testCase "With malformed scheme" do
parseUri' "https//example.com/" >>=
(@?= Left (ExternalResourceInvalidUri $ MalformedScheme MissingColon))
(@?= Left (UPEInvalid $ URIBS.MalformedScheme URIBS.MissingColon))
, testCase "With malformed fragment" do
parseUri' "https://example.com/?q=a&p=b#fra{g}ment" >>=
(@?= Left (ExternalResourceInvalidUri MalformedFragment))
(@?= Left (UPEInvalid URIBS.MalformedFragment))
]
]
where
parseUri' :: Text -> IO $ Either VerifyError URI
parseUri' = runExceptT . parseUri
parseUri' :: Text -> IO $ Either UriParseError URI
parseUri' = runExceptT . parseUri False

View File

@ -11,6 +11,7 @@ module Test.Xrefcheck.UtilRequests
, checkLinkAndProgressWithServerDefault
, verifyLinkDefault
, verifyReferenceWithProgressDefault
, withServer
, VerifyLinkTestEntry (..)
) where
@ -31,13 +32,16 @@ import Xrefcheck.System
import Xrefcheck.Util
import Xrefcheck.Verify
withServer :: IO () -> IO () -> IO ()
withServer mock = E.bracket (forkIO mock) killThread . const
checkMultipleLinksWithServer
:: IO ()
-> IORef (S.Set DomainName)
-> [VerifyLinkTestEntry]
-> IO ()
checkMultipleLinksWithServer mock setRef entries =
E.bracket (forkIO mock) killThread $ \_ -> do
withServer mock $ do
forM_ entries $ \VerifyLinkTestEntry {..} ->
checkLinkAndProgress
vlteConfigModifier
@ -55,7 +59,7 @@ checkLinkAndProgressWithServer
-> VerifyResult VerifyError
-> IO ()
checkLinkAndProgressWithServer configModifier setRef mock link progress vrExpectation =
E.bracket (forkIO mock) killThread $ \_ -> do
withServer mock $
checkLinkAndProgress configModifier setRef link progress vrExpectation
checkLinkAndProgress

View File

@ -34,7 +34,7 @@ assert_diff - <<EOF
Permanent redirect found:
-| http://www.commonmark.org
-> https://commonmark.org/
-> https://commonmark.org
^-- stopped before this one
Invalid references dumped, 1 in total.