[#77] Add support for glob patterns to ignored and notScanned

Problem: The `virtualFiles` config option supports glob patterns. On the
other hand, `ignored` only supports exact matches and `notScanned`
matches on prefixes. There is also a bug where `ignored` does not
ignore files if they contain broken xrefcheck annotations.

Solution: Add support for glob patterns to `ignored` and
`notScanned`. Filter ignored files before parsing their contents.
This commit is contained in:
Sergey Gulin 2022-09-04 03:25:02 +10:00
parent fdf2ce27d2
commit 332da5569e
No known key found for this signature in database
GPG Key ID: 67CBDE9BE7E6399B
21 changed files with 279 additions and 58 deletions

View File

@ -22,7 +22,7 @@ steps:
artifact_paths: artifact_paths:
- "result/bin/*" - "result/bin/*"
- command: nix run -f ci.nix xrefcheck-static -c xrefcheck --ignored tests/markdowns --ignored tests/golden/ - command: nix run -f ci.nix xrefcheck-static -c xrefcheck --ignored 'tests/**/*'
label: Xrefcheck itself label: Xrefcheck itself
- label: lint - label: lint

View File

@ -37,6 +37,10 @@ Unreleased
+ Fixed bug with ignoring checks for relative anchors. + Fixed bug with ignoring checks for relative anchors.
* [#132](https://github.com/serokell/xrefcheck/pull/132) * [#132](https://github.com/serokell/xrefcheck/pull/132)
+ Display URL parsing errors. + Display URL parsing errors.
* [#131](https://github.com/serokell/xrefcheck/pull/131)
+ Add support for glob patterns to `ignored` and `notScanned`.
+ Remove support for directory names from `ignored` and `notScanned`.
+ Fix bug with `ignored` not ignoring files with broken xrefcheck annotations.
0.2.1 0.2.1
========== ==========

View File

@ -159,7 +159,7 @@ xrefcheck dump-config -t GitHub
Currently supported options include: Currently supported options include:
* Timeout for checking external references; * Timeout for checking external references;
* List of ignored folders. * List of ignored files.
## Build instructions [](#xrefcheck) ## Build instructions [](#xrefcheck)

View File

@ -37,6 +37,7 @@ import Xrefcheck.Config (VerifyConfig (..))
import Xrefcheck.Core import Xrefcheck.Core
import Xrefcheck.Scan import Xrefcheck.Scan
import Xrefcheck.Util (normaliseWithNoTrailing) import Xrefcheck.Util (normaliseWithNoTrailing)
import Xrefcheck.System (RelGlobPattern (..))
modeReadM :: ReadM VerifyMode modeReadM :: ReadM VerifyMode
modeReadM = eitherReader $ \s -> modeReadM = eitherReader $ \s ->
@ -78,7 +79,7 @@ data Options = Options
} }
data TraversalOptions = TraversalOptions data TraversalOptions = TraversalOptions
{ toIgnored :: [FilePath] { toIgnored :: [RelGlobPattern]
} }
addTraversalOptions :: TraversalConfig -> TraversalOptions -> TraversalConfig addTraversalOptions :: TraversalConfig -> TraversalOptions -> TraversalConfig
@ -115,6 +116,9 @@ type RepoType = Flavor
filepathOption :: Mod OptionFields FilePath -> Parser FilePath filepathOption :: Mod OptionFields FilePath -> Parser FilePath
filepathOption = fmap normaliseWithNoTrailing <$> strOption filepathOption = fmap normaliseWithNoTrailing <$> strOption
-- | Command-line option that is parsed exactly like 'filepathOption' and
-- whose resulting path is then wrapped into a 'RelGlobPattern'.
globOption :: Mod OptionFields FilePath -> Parser RelGlobPattern
globOption mods = RelGlobPattern <$> filepathOption mods
repoTypeReadM :: ReadM RepoType repoTypeReadM :: ReadM RepoType
repoTypeReadM = eitherReader $ \name -> repoTypeReadM = eitherReader $ \name ->
maybeToRight (failureText name) $ L.lookup (map C.toLower name) allRepoTypesNamed maybeToRight (failureText name) $ L.lookup (map C.toLower name) allRepoTypesNamed
@ -174,10 +178,12 @@ optionsParser = do
traversalOptionsParser :: Parser TraversalOptions traversalOptionsParser :: Parser TraversalOptions
traversalOptionsParser = do traversalOptionsParser = do
toIgnored <- many . filepathOption $ toIgnored <- many . globOption $
long "ignored" <> long "ignored" <>
metavar "FILEPATH" <> metavar "GLOB PATTERN" <>
help "Files and folders which we pretend do not exist." help "Files which we pretend do not exist.\
\ Glob patterns that contain wildcards MUST be enclosed\
\ in quotes to avoid being expanded by shell."
return TraversalOptions{..} return TraversalOptions{..}
verifyOptionsParser :: Parser VerifyOptions verifyOptionsParser :: Parser VerifyOptions

View File

@ -28,7 +28,7 @@ import Xrefcheck.Core
import Xrefcheck.Scan import Xrefcheck.Scan
import Xrefcheck.Scanners.Markdown import Xrefcheck.Scanners.Markdown
import Xrefcheck.System (RelGlobPattern, normaliseGlobPattern) import Xrefcheck.System (RelGlobPattern, normaliseGlobPattern)
import Xrefcheck.Util (aesonConfigOption, postfixFields, (-:), normaliseWithNoTrailing) import Xrefcheck.Util (aesonConfigOption, postfixFields, (-:))
import Xrefcheck.Config.Default import Xrefcheck.Config.Default
import Text.Regex.TDFA.Common import Text.Regex.TDFA.Common
@ -53,8 +53,8 @@ data VerifyConfig = VerifyConfig
, vcExternalRefCheckTimeout :: Time Second , vcExternalRefCheckTimeout :: Time Second
, vcVirtualFiles :: [RelGlobPattern] , vcVirtualFiles :: [RelGlobPattern]
-- ^ Files which we pretend do exist. -- ^ Files which we pretend do exist.
, vcNotScanned :: [FilePath] , vcNotScanned :: [RelGlobPattern]
-- ^ Prefixes of files, references in which we should not analyze. -- ^ Files, references in which we should not analyze.
, vcIgnoreRefs :: [Regex] , vcIgnoreRefs :: [Regex]
-- ^ Regular expressions that match external references we should not verify. -- ^ Regular expressions that match external references we should not verify.
, vcCheckLocalhost :: Bool , vcCheckLocalhost :: Bool
@ -72,7 +72,7 @@ normaliseVerifyConfigFilePaths :: VerifyConfig -> VerifyConfig
normaliseVerifyConfigFilePaths vc@VerifyConfig{ vcVirtualFiles, vcNotScanned} normaliseVerifyConfigFilePaths vc@VerifyConfig{ vcVirtualFiles, vcNotScanned}
= vc = vc
{ vcVirtualFiles = map normaliseGlobPattern vcVirtualFiles { vcVirtualFiles = map normaliseGlobPattern vcVirtualFiles
, vcNotScanned = map normaliseWithNoTrailing vcNotScanned , vcNotScanned = map normaliseGlobPattern vcNotScanned
} }
-- | Configs for all the supported scanners. -- | Configs for all the supported scanners.
@ -167,12 +167,12 @@ defConfigText flavor =
GitHub -> GitHub ->
[ ".github/pull_request_template.md" [ ".github/pull_request_template.md"
, ".github/issue_template.md" , ".github/issue_template.md"
, ".github/PULL_REQUEST_TEMPLATE" , ".github/PULL_REQUEST_TEMPLATE/**/*"
, ".github/ISSUE_TEMPLATE" , ".github/ISSUE_TEMPLATE/**/*"
] ]
GitLab -> GitLab ->
[ ".gitlab/merge_request_templates/" [ ".gitlab/merge_request_templates/**/*"
, ".gitlab/issue_templates/" , ".gitlab/issue_templates/**/*"
] ]
, "virtualFiles" -: Right $ case flavor of , "virtualFiles" -: Right $ case flavor of

View File

@ -15,14 +15,14 @@ defConfigUnfilled :: ByteString
defConfigUnfilled = defConfigUnfilled =
[r|# Parameters of repository traversal. [r|# Parameters of repository traversal.
traversal: traversal:
# Files and folders which we pretend do not exist # Glob patterns describing files which we pretend do not exist
# (so they are neither analyzed nor can be referenced). # (so they are neither analyzed nor can be referenced).
ignored: ignored:
# Git files # Git files
- .git - .git/**/*
# Stack files # Stack files
- .stack-work - .stack-work/**/*
# Verification parameters. # Verification parameters.
verification: verification:
@ -34,7 +34,7 @@ verification:
# declaring "Response timeout". # declaring "Response timeout".
externalRefCheckTimeout: 10s externalRefCheckTimeout: 10s
# Prefixes of files, references in which should not be analyzed. # Glob patterns describing the files, references in which should not be analyzed.
notScanned: notScanned:
- :PLACEHOLDER:notScanned: - :PLACEHOLDER:notScanned:

View File

@ -22,24 +22,23 @@ import Universum
import Data.Aeson.TH (deriveFromJSON) import Data.Aeson.TH (deriveFromJSON)
import Data.Foldable qualified as F import Data.Foldable qualified as F
import Data.Map qualified as M import Data.Map qualified as M
import GHC.Err (errorWithoutStackTrace)
import System.Directory (doesDirectoryExist) import System.Directory (doesDirectoryExist)
import System.Directory.Tree qualified as Tree import System.Directory.Tree qualified as Tree
import System.FilePath (dropTrailingPathSeparator, takeDirectory, takeExtension, (</>), equalFilePath) import System.FilePath (dropTrailingPathSeparator, takeDirectory, takeExtension, equalFilePath)
import Xrefcheck.Core import Xrefcheck.Core
import Xrefcheck.Progress import Xrefcheck.Progress
import Xrefcheck.System (readingSystem) import Xrefcheck.System (readingSystem, RelGlobPattern, normaliseGlobPattern, matchesGlobPatterns)
import Xrefcheck.Util (aesonConfigOption, normaliseWithNoTrailing) import Xrefcheck.Util (aesonConfigOption, normaliseWithNoTrailing)
-- | Config of repository traversal. -- | Config of repository traversal.
data TraversalConfig = TraversalConfig data TraversalConfig = TraversalConfig
{ tcIgnored :: [FilePath] { tcIgnored :: [RelGlobPattern]
-- ^ Files and folders, files in which we completely ignore. -- ^ Files and folders, files in which we completely ignore.
} }
normaliseTraversalConfigFilePaths :: TraversalConfig -> TraversalConfig normaliseTraversalConfigFilePaths :: TraversalConfig -> TraversalConfig
normaliseTraversalConfigFilePaths = TraversalConfig . map normaliseWithNoTrailing . tcIgnored normaliseTraversalConfigFilePaths = TraversalConfig . map normaliseGlobPattern . tcIgnored
deriveFromJSON aesonConfigOption ''TraversalConfig deriveFromJSON aesonConfigOption ''TraversalConfig
@ -72,10 +71,8 @@ gatherRepoInfo rw formatsSupport config root = do
_ Tree.:/ repoTree <- liftIO $ Tree.readDirectoryWithL processFile root _ Tree.:/ repoTree <- liftIO $ Tree.readDirectoryWithL processFile root
let fileInfos = map (first normaliseWithNoTrailing) let fileInfos = map (first normaliseWithNoTrailing)
$ filter (\(path, _) -> not $ isIgnored path)
$ dropSndMaybes . F.toList $ dropSndMaybes . F.toList
$ Tree.zipPaths . (location Tree.:/) $ Tree.zipPaths $ location Tree.:/ repoTree
$ filterExcludedDirs root repoTree
return $ RepoInfo (M.fromList fileInfos) return $ RepoInfo (M.fromList fileInfos)
where where
isDirectory = readingSystem . doesDirectoryExist isDirectory = readingSystem . doesDirectoryExist
@ -83,22 +80,12 @@ gatherRepoInfo rw formatsSupport config root = do
processFile file = do processFile file = do
let ext = takeExtension file let ext = takeExtension file
let mscanner = formatsSupport ext let mscanner = formatsSupport ext
forM mscanner $ \scanFile -> scanFile file if isIgnored file
then pure Nothing
else forM mscanner ($ file)
dropSndMaybes l = [(a, b) | (a, Just b) <- l] dropSndMaybes l = [(a, b) | (a, Just b) <- l]
ignored = map (root </>) (tcIgnored config) isIgnored = matchesGlobPatterns root $ tcIgnored config
isIgnored path = any (equalFilePath path) ignored
filterExcludedDirs cur = \case
Tree.Dir name subfiles ->
let subfiles' =
if isIgnored cur
then []
else map visitRec subfiles
visitRec sub = filterExcludedDirs (cur </> Tree.name sub) sub
in Tree.Dir name subfiles'
file@Tree.File{} -> file
Tree.Failed _name err ->
errorWithoutStackTrace $ "Repository traversal failed: " <> show err
-- The context location of the root. -- The context location of the root.
-- This is done by removing the last component from the path. -- This is done by removing the last component from the path.

View File

@ -9,6 +9,7 @@ module Xrefcheck.System
, RelGlobPattern (..) , RelGlobPattern (..)
, normaliseGlobPattern , normaliseGlobPattern
, bindGlobPattern , bindGlobPattern
, matchesGlobPatterns
) where ) where
import Universum import Universum
@ -52,6 +53,14 @@ bindGlobPattern root (RelGlobPattern relPat) = readingSystem $ do
Right pat -> Right pat ->
return pat return pat
-- | Check whether a file matches at least one of the given repo-relative
-- glob patterns.
--
-- Every pattern is bound to @root@ with 'bindGlobPattern' and matched
-- against the canonicalized path of @file@. An empty list of patterns
-- never matches.
matchesGlobPatterns :: FilePath -> [RelGlobPattern] -> FilePath -> Bool
matchesGlobPatterns root globPatterns file = any matchesFile globPatterns
  where
    -- The canonical path does not depend on the pattern being tried, so it
    -- is computed once here and shared by all patterns.
    canonicalFile = readingSystem $ canonicalizePath file

    matchesFile globPattern =
      Glob.match (bindGlobPattern root globPattern) canonicalFile
instance FromJSON RelGlobPattern where instance FromJSON RelGlobPattern where
parseJSON = withText "Repo-relative glob pattern" $ \path -> do parseJSON = withText "Repo-relative glob pattern" $ \path -> do
let spath = toString path let spath = toString path

View File

@ -57,9 +57,8 @@ import Network.HTTP.Req
import Network.HTTP.Types.Header (hRetryAfter) import Network.HTTP.Types.Header (hRetryAfter)
import Network.HTTP.Types.Status (Status, statusCode, statusMessage) import Network.HTTP.Types.Status (Status, statusCode, statusMessage)
import System.Console.Pretty (Style (..), style) import System.Console.Pretty (Style (..), style)
import System.Directory (canonicalizePath, doesDirectoryExist, doesFileExist) import System.Directory (doesDirectoryExist, doesFileExist)
import System.FilePath (takeDirectory, (</>), normalise) import System.FilePath (takeDirectory, (</>), normalise)
import System.FilePath.Glob qualified as Glob
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift) import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.Regex.TDFA.Text (Regex, regexec) import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURIBs, ParseExceptionBs) import Text.URI (Authority (..), URI (..), mkURIBs, ParseExceptionBs)
@ -261,7 +260,7 @@ verifyRepo
= do = do
let toScan = do let toScan = do
(file, fileInfo) <- M.toList repoInfo (file, fileInfo) <- M.toList repoInfo
guard . not $ any ((`isPrefixOf` file) . normalise . (root </>)) vcNotScanned guard . not $ matchesGlobPatterns root vcNotScanned file
ref <- _fiReferences fileInfo ref <- _fiReferences fileInfo
return (file, ref) return (file, ref)
@ -416,12 +415,7 @@ verifyReference
let fileExists = readingSystem $ doesFileExist file let fileExists = readingSystem $ doesFileExist file
let dirExists = readingSystem $ doesDirectoryExist file let dirExists = readingSystem $ doesDirectoryExist file
let cfile = readingSystem $ canonicalizePath file let isVirtual = matchesGlobPatterns root vcVirtualFiles file
let isVirtual = or
[ Glob.match pat cfile
| virtualFile <- vcVirtualFiles
, let pat = bindGlobPattern root virtualFile
]
unless (fileExists || dirExists || isVirtual) $ unless (fileExists || dirExists || isVirtual) $
throwError (LocalFileDoesNotExist file) throwError (LocalFileDoesNotExist file)

View File

@ -1,13 +1,13 @@
# Parameters of repository traversal. # Parameters of repository traversal.
traversal: traversal:
# Files and folders which we pretend do not exist # Glob patterns describing files which we pretend do not exist
# (so they are neither analyzed nor can be referenced). # (so they are neither analyzed nor can be referenced).
ignored: ignored:
# Git files # Git files
- .git - .git/**/*
# Stack files # Stack files
- .stack-work - .stack-work/**/*
# Verification parameters. # Verification parameters.
verification: verification:
@ -19,12 +19,12 @@ verification:
# declaring "Response timeout". # declaring "Response timeout".
externalRefCheckTimeout: 10s externalRefCheckTimeout: 10s
# Prefixes of files, references in which should not be analyzed. # Glob patterns describing the files, references in which should not be analyzed.
notScanned: notScanned:
- .github/pull_request_template.md - .github/pull_request_template.md
- .github/issue_template.md - .github/issue_template.md
- .github/PULL_REQUEST_TEMPLATE - .github/PULL_REQUEST_TEMPLATE/**/*
- .github/ISSUE_TEMPLATE - .github/ISSUE_TEMPLATE/**/*
# Glob patterns describing the files which do not physically exist in the # Glob patterns describing the files which do not physically exist in the
# repository but should be treated as existing nevertheless. # repository but should be treated as existing nevertheless.

View File

@ -10,7 +10,7 @@ load '../helpers'
@test "No redundant slashes" { @test "No redundant slashes" {
run xrefcheck \ run xrefcheck \
--ignored to-ignore \ --ignored to-ignore/* \
--root . --root .
assert_output --partial "All repository links are valid." assert_output --partial "All repository links are valid."
@ -18,7 +18,7 @@ load '../helpers'
@test "Redundant slashes in root and ignored" { @test "Redundant slashes in root and ignored" {
run xrefcheck \ run xrefcheck \
--ignored ./././././././//to-ignore \ --ignored ./././././././//to-ignore/* \
--root ./ --root ./
assert_output --partial "All repository links are valid." assert_output --partial "All repository links are valid."
@ -34,7 +34,7 @@ load '../helpers'
@test "Reduchant slashes in ignored" { @test "Reduchant slashes in ignored" {
run xrefcheck \ run xrefcheck \
--ignored ./././././././//to-ignore \ --ignored ./././././././//to-ignore/* \
--root . --root .
assert_output --partial "All repository links are valid." assert_output --partial "All repository links are valid."

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0
load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers'
@test "Ignore file with broken xrefcheck annotation: full path" {
  # The offending file is named by its exact path, with no wildcards.
  local ignored_file='./to-ignore/inner-directory/broken_annotation.md'

  run xrefcheck --ignored "$ignored_file"

  assert_output --partial "All repository links are valid."
}
@test "Ignore file with broken xrefcheck annotation: glob wildcard" {
  # Kept quoted so the pattern reaches xrefcheck unexpanded by the shell.
  local ignored_pattern='to-ignore/inner-directory/*'

  run xrefcheck --ignored "$ignored_pattern"

  assert_output --partial "All repository links are valid."
}
@test "Ignore file with broken xrefcheck annotation: nested directories with glob wildcard" {
  # Kept quoted so the pattern reaches xrefcheck unexpanded by the shell.
  local ignored_pattern='./**/*'

  run xrefcheck --ignored "$ignored_pattern"

  assert_output --partial "All repository links are valid."
}
@test "Ignore file with broken xrefcheck annotation: config file" {
  # Here the ignored entry comes from a config file rather than from a flag.
  local config_file='./config-ignored.yaml'

  run xrefcheck --config "$config_file"

  assert_output --partial "All repository links are valid."
}
@test "Ignore file with broken xrefcheck annotation: directory, check filure" {
  # A bare directory name is passed; the scan error for the file inside it
  # is still expected in the output.
  local ignored_dir='./to-ignore/inner-directory/'

  run xrefcheck --ignored "$ignored_dir"

  assert_output --partial "Error when scanning ./to-ignore/inner-directory/broken_annotation.md"
}

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored:
- ./to-ignore/inner-directory/broken_annotation.md
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned: []
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,11 @@
<!--
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->
One
<!--xrefcheck: ignore file -->
Two

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0
load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers'
@test "Not scanned: full path" {
  # Config to run xrefcheck with for the exact-path case.
  local config_file='config-full-path.yaml'

  run xrefcheck -c "$config_file"

  assert_output --partial "All repository links are valid."
}
@test "Not scanned: glob wildcard" {
  # Config to run xrefcheck with for the single-wildcard case.
  local config_file='config-wildcard.yaml'

  run xrefcheck -c "$config_file"

  assert_output --partial "All repository links are valid."
}
@test "Not scanned: nested directories with glob wildcard" {
  # Config to run xrefcheck with for the nested-directories case.
  local config_file='config-nested-directories.yaml'

  run xrefcheck -c "$config_file"

  assert_output --partial "All repository links are valid."
}
@test "Not scanned: directory, check failure" {
  # Use a unique temporary file instead of a fixed /tmp path, so concurrent
  # or repeated runs cannot clobber each other's output.
  local actual_output
  actual_output="$(mktemp)"

  # A non-zero status of this pipeline is deliberately tolerated: the test
  # verdict comes from comparing the dumped report with the golden file.
  xrefcheck -c config-directory.yaml \
    | prepare > "$actual_output" || true

  # Record the diff status instead of failing right away, so the temporary
  # file is removed even when the outputs differ.
  # --new-file: treat absent files as empty
  local diff_status=0
  diff "$actual_output" expected.gold \
    --ignore-space-change \
    --ignore-blank-lines \
    --new-file \
    || diff_status=$?

  rm -f "$actual_output"

  [ "$diff_status" -eq 0 ]
}

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- notScanned/inner-directory
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./notScanned/inner-directory/bad-reference.md
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./**/*
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./notScanned/inner-directory/*
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,13 @@
=== Invalid references found ===
➥ In file notScanned/inner-directory/bad-reference.md
bad reference (absolute) at src:7:1-28:
- text: "Bad reference"
- link: /no-file.md
- anchor: -
⛀ File does not exist:
./no-file.md
Invalid references dumped, 1 in total.

View File

@ -0,0 +1,7 @@
<!--
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->
[Bad reference](/no-file.md)