From e8d79e7f1489a7459c8e6349ded65ce4579c5b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Enr=C3=ADquez?= Date: Tue, 13 Dec 2022 10:20:32 +0100 Subject: [PATCH] [#211] Case insensitive anchors Problem: Some Markdown flavours such as the GitHub one are case insensitive regarding anchors, but our analysis is currently case sensitive and it produces false positives. Solution: Support case-insensitivity depending on the configured Markdown flavour. Apply this also to ambiguous and similar anchors detection. --- CHANGES.md | 3 ++ README.md | 5 ++- src/Xrefcheck/Core.hs | 16 ++++---- src/Xrefcheck/Progress.hs | 12 +++--- src/Xrefcheck/Verify.hs | 20 +++++++--- tests/golden/check-case-sensitivity/a.md | 24 ++++++++++++ .../check-case-sensitivity.bats | 23 +++++++++++ .../check-case-sensitivity/config-github.yaml | 7 ++++ .../check-case-sensitivity/config-gitlab.yaml | 7 ++++ .../check-case-sensitivity/expected1.gold | 30 +++++++++++++++ .../check-case-sensitivity/expected2.gold | 38 +++++++++++++++++++ 11 files changed, 164 insertions(+), 21 deletions(-) create mode 100644 tests/golden/check-case-sensitivity/a.md create mode 100644 tests/golden/check-case-sensitivity/check-case-sensitivity.bats create mode 100644 tests/golden/check-case-sensitivity/config-github.yaml create mode 100644 tests/golden/check-case-sensitivity/config-gitlab.yaml create mode 100644 tests/golden/check-case-sensitivity/expected1.gold create mode 100644 tests/golden/check-case-sensitivity/expected2.gold diff --git a/CHANGES.md b/CHANGES.md index ec492cb..a4d037e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -38,6 +38,9 @@ Unreleased * [#233](https://github.com/serokell/xrefcheck/pull/233) + Now xrefxcheck does not follow redirect links by default. It fails for permanent redirect responses (i.e. 301 and 308) and passes for temporary ones (i.e. 302, 303, 307). +* [#231](https://github.com/serokell/xrefcheck/pull/231) + + Anchor analysis takes now into account the appropriate case-sensitivity depending on + the configured Markdown flavour. 0.2.2 ========== diff --git a/README.md b/README.md index 3517803..73d76c5 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ Run `stack install` to build everything and install the executable. If you wish to use `cabal`, you need to run [`stack2cabal`](https://hackage.haskell.org/package/stack2cabal) first! ### Run on Windows [↑](#xrefcheck) + On Windows, executable requires some dynamic libraries (DLLs). They are shipped together with executable in [releases page](https://github.com/serokell/xrefcheck/releases). If you have built executable from source using `stack install`, @@ -135,7 +136,7 @@ There are several ways to fix this: * If you wish to ignore all http/ftp links, you can use `--mode local-only`. 1. How does `xrefcheck` handle links that require authentication? - * It's common for projects to contains links to protected resources. + * It's common for projects to contain links to protected resources. By default, when `xrefcheck` attempts to verify a link and is faced with a `403 Forbidden` or a `401 Unauthorized`, it assumes the link is valid. * This behavior can be disabled by setting `ignoreAuthFailures: false` in the config file. @@ -160,7 +161,7 @@ There are several ways to fix this: Its features include duplicated URLs detection, specifying allowed HTTP error codes and reporting generation. At the moment of writing, it scans only external references and checking anchors is not possible. * [remark-validate-links](https://github.com/remarkjs/remark-validate-links) and [remark-lint-no-dead-urls](https://github.com/davidtheclark/remark-lint-no-dead-urls) - highly configurable JavaScript solution for checking local and external links respectively. - It is able to check multiple repositores at once if they are gathered in one folder. + It is able to check multiple repositories at once if they are gathered in one folder. Doesn't handle "429 Too Many Requests", so false positives are likely when you have many links to the same domain. * [markdown-link-check](https://github.com/tcort/markdown-link-check) - another checker written in JavaScript, scans one specific file at a time. Supports `mailto:` link resolution. diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index aec017f..5a9d58c 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -15,9 +15,9 @@ import Control.Lens (makeLenses) import Data.Aeson (FromJSON (..), withText) import Data.Char (isAlphaNum) import Data.Char qualified as C +import Data.Default (Default (..)) import Data.DList (DList) import Data.DList qualified as DList -import Data.Default (Default (..)) import Data.List qualified as L import Data.Reflection (Given) import Data.Text qualified as T @@ -40,15 +40,15 @@ import Xrefcheck.Util data Flavor = GitHub | GitLab - deriving stock (Show) + deriving stock (Show, Enum, Bounded) allFlavors :: [Flavor] -allFlavors = [GitHub, GitLab] - where - _exhaustivenessCheck = \case - GitHub -> () - GitLab -> () - -- if you update this, also update the list above +allFlavors = [minBound .. maxBound] + +-- | Whether anchors are case-sensitive for a given Markdown flavour or not. +caseInsensitiveAnchors :: Flavor -> Bool +caseInsensitiveAnchors GitHub = True +caseInsensitiveAnchors GitLab = False instance FromJSON Flavor where parseJSON = withText "flavor" $ \txt -> diff --git a/src/Xrefcheck/Progress.hs b/src/Xrefcheck/Progress.hs index f30a1bd..bf6ef3d 100644 --- a/src/Xrefcheck/Progress.hs +++ b/src/Xrefcheck/Progress.hs @@ -137,7 +137,7 @@ showProgress name width col posixTime Progress{..} = mconcat , status ] where - -- | Each of the following values represents the number of the progress bar cells + -- Each of the following values represents the number of the progress bar cells -- corresponding to the respective "class" of processed references: the valid ones, -- the ones containing an unfixable error (a.k.a. the invalid ones), and the ones -- containing a fixable error. @@ -145,10 +145,10 @@ showProgress name width col posixTime Progress{..} = mconcat -- The current overall number of proccessed errors. done = floor $ (pCurrent % pTotal) * fromIntegral @Int @(Ratio Int) width - -- | The current number of the invalid references. + -- The current number of the invalid references. errsU = ceiling $ (pErrorsUnfixable % pTotal) * fromIntegral @Int @(Ratio Int) width - -- | The current number of (fixable) errors that may be eliminated during further + -- The current number of (fixable) errors that may be eliminated during further -- verification. -- Notice! -- 1. Both this and the previous values use @ceiling@ as the rounding function. @@ -160,13 +160,13 @@ showProgress name width col posixTime Progress{..} = mconcat errsF = min (width - errsU) . ceiling $ (pErrorsFixable % pTotal) * fromIntegral @Int @(Ratio Int) width - -- | The number of valid references. + -- The number of valid references. -- The value is bounded from below by 0 to ensure the number never gets negative. -- This situation is plausible due to the different rounding functions used for each value: -- @floor@ for the minuend @done@, @ceiling@ for the two subtrahends @errsU@ & @errsF@. successful = max 0 $ done - errsU - errsF - -- | The remaining number of references to be verified. + -- The remaining number of references to be verified. remaining = width - successful - errsU - errsF bar @@ -237,7 +237,7 @@ putTextRewrite (Rewrite RewriteCtx{..}) msg = do atomicModifyIORef' rMaxPrintedSize $ \maxPrinted -> (max maxPrinted (length msg), ()) where - -- | The maximum possible difference between two progress text representations, + -- The maximum possible difference between two progress text representations, -- including the timer & the status, is 9 characters. This is a temporary -- solution to the problem of re-printing a smaller string on top of another -- that'll leave some of the trailing characters in the original string diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 0241ec6..1ed1fdb 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -69,11 +69,13 @@ import URI.ByteString qualified as URIBS import Control.Exception.Safe (handleAsync, handleJust) import Data.Bits (toIntegralSized) import Data.List (lookup) +import Data.Text (toCaseFold) import Xrefcheck.Config import Xrefcheck.Core import Xrefcheck.Orphans () import Xrefcheck.Progress import Xrefcheck.Scan +import Xrefcheck.Scanners.Markdown (MarkdownConfig (mcFlavor)) import Xrefcheck.System import Xrefcheck.Util @@ -596,12 +598,17 @@ verifyReference checkDeduplicatedAnchorReference file fileAnchors anchor checkAnchorExists fileAnchors anchor + anchorNameEq = + if caseInsensitiveAnchors . mcFlavor . scMarkdown $ cScanners + then (==) `on` toCaseFold + else (==) + -- Detect a case when original file contains two identical anchors, github -- has added a suffix to the duplicate, and now the original is referrenced - -- such links are pretty fragile and we discourage their use despite -- they are in fact unambiguous. checkAnchorReferenceAmbiguity file fileAnchors anchor = do - let similarAnchors = filter ((== anchor) . aName) fileAnchors + let similarAnchors = filter (anchorNameEq anchor . aName) fileAnchors when (length similarAnchors > 1) $ throwError $ AmbiguousAnchorRef file anchor (Exts.fromList similarAnchors) @@ -612,13 +619,16 @@ verifyReference checkAnchorReferenceAmbiguity file fileAnchors origAnchor checkAnchorExists givenAnchors anchor = - case find ((== anchor) . aName) givenAnchors of + case find (anchorNameEq anchor . aName) givenAnchors of Just _ -> pass Nothing -> let isSimilar = (>= scAnchorSimilarityThreshold cScanners) - similarAnchors = - filter (isSimilar . realToFrac . damerauLevenshteinNorm anchor . aName) - givenAnchors + distance = damerauLevenshteinNorm `on` toCaseFold + similarAnchors = flip filter givenAnchors + $ isSimilar + . realToFrac + . distance anchor + . aName in throwError $ AnchorDoesNotExist anchor similarAnchors -- | Parse URI according to RFC 3986 extended by allowing non-encoded diff --git a/tests/golden/check-case-sensitivity/a.md b/tests/golden/check-case-sensitivity/a.md new file mode 100644 index 0000000..988e693 --- /dev/null +++ b/tests/golden/check-case-sensitivity/a.md @@ -0,0 +1,24 @@ + +# Some header + +Some text + +# Another header + +# Custom header + +# Custom header + +[Mixing case reference](#SomE-HEADer) + +[Mixing case reference](#SomE-HEADr) + +[Reference as it is](#UPPERCASE-NAME) + +[Reference lowered](#uppercase-name) + +[Maybe ambiguous reference](#another-header) \ No newline at end of file diff --git a/tests/golden/check-case-sensitivity/check-case-sensitivity.bats b/tests/golden/check-case-sensitivity/check-case-sensitivity.bats new file mode 100644 index 0000000..cf7a4bf --- /dev/null +++ b/tests/golden/check-case-sensitivity/check-case-sensitivity.bats @@ -0,0 +1,23 @@ +#!/usr/bin/env bats + +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: MPL-2.0 + +load '../helpers/bats-support/load' +load '../helpers/bats-assert/load' +load '../helpers/bats-file/load' +load '../helpers' + + +@test "GitHub anchors: check, ambiguous and similar detection is case-insensitive" { + to_temp xrefcheck -c config-github.yaml + + assert_diff expected1.gold +} + +@test "GitLab anchors: check and ambiguous detection is case-sensitive, but similar detection is not" { + to_temp xrefcheck -c config-gitlab.yaml + + assert_diff expected2.gold +} diff --git a/tests/golden/check-case-sensitivity/config-github.yaml b/tests/golden/check-case-sensitivity/config-github.yaml new file mode 100644 index 0000000..1f920af --- /dev/null +++ b/tests/golden/check-case-sensitivity/config-github.yaml @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub diff --git a/tests/golden/check-case-sensitivity/config-gitlab.yaml b/tests/golden/check-case-sensitivity/config-gitlab.yaml new file mode 100644 index 0000000..0e796f0 --- /dev/null +++ b/tests/golden/check-case-sensitivity/config-gitlab.yaml @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitLab diff --git a/tests/golden/check-case-sensitivity/expected1.gold b/tests/golden/check-case-sensitivity/expected1.gold new file mode 100644 index 0000000..6604e4d --- /dev/null +++ b/tests/golden/check-case-sensitivity/expected1.gold @@ -0,0 +1,30 @@ +=== Invalid references found === + + ➥ In file a.md + bad reference (file-local) at src:18:1-36: + - text: "Mixing case reference" + - link: - + - anchor: SomE-HEADr + + Anchor 'SomE-HEADr' is not present, did you mean: + - some-header (header I) at src:6:1-13 + - another-header (header I) at src:10:1-16 + - custom-header (header I) at src:12:1-43 + - Another-header (hand made) at src:12:3-27 + - custom-header (header I) at src:14:1-43 + + ➥ In file a.md + bad reference (file-local) at src:24:1-44: + - text: "Maybe ambiguous reference" + - link: - + - anchor: another-header + + Ambiguous reference to anchor 'another-header' + In file a.md + It could refer to either: + - another-header (header I) at src:10:1-16 + - Another-header (hand made) at src:12:3-27 + Use of ambiguous anchors is discouraged because the target + can change silently while the document containing it evolves. + +Invalid references dumped, 2 in total. diff --git a/tests/golden/check-case-sensitivity/expected2.gold b/tests/golden/check-case-sensitivity/expected2.gold new file mode 100644 index 0000000..f7c9c58 --- /dev/null +++ b/tests/golden/check-case-sensitivity/expected2.gold @@ -0,0 +1,38 @@ +=== Invalid references found === + + ➥ In file a.md + bad reference (file-local) at src:16:1-37: + - text: "Mixing case reference" + - link: - + - anchor: SomE-HEADer + + Anchor 'SomE-HEADer' is not present, did you mean: + - some-header (header I) at src:6:1-13 + - another-header (header I) at src:10:1-16 + - custom-header (header I) at src:12:1-43 + - Another-header (hand made) at src:12:3-27 + - custom-header (header I) at src:14:1-43 + + ➥ In file a.md + bad reference (file-local) at src:18:1-36: + - text: "Mixing case reference" + - link: - + - anchor: SomE-HEADr + + Anchor 'SomE-HEADr' is not present, did you mean: + - some-header (header I) at src:6:1-13 + - another-header (header I) at src:10:1-16 + - custom-header (header I) at src:12:1-43 + - Another-header (hand made) at src:12:3-27 + - custom-header (header I) at src:14:1-43 + + ➥ In file a.md + bad reference (file-local) at src:22:1-36: + - text: "Reference lowered" + - link: - + - anchor: uppercase-name + + Anchor 'uppercase-name' is not present, did you mean: + - UPPERCASE-NAME (hand made) at src:14:3-27 + +Invalid references dumped, 3 in total.