[#77] Add support for glob patterns to ignored and notScanned

Problem: The `virtualFiles` config option supports glob patterns. On the
other hand, `ignored` only supports exact matches and `notScanned`
matches on prefixes. There is also a bug where `ignored` does not
ignore files if they contain broken xrefcheck annotations.

Solution: Add support for glob patterns to `ignored` and
`notScanned`. Filter ignored files before parsing their contents.
This commit is contained in:
Sergey Gulin 2022-09-04 03:25:02 +10:00
parent fdf2ce27d2
commit 332da5569e
No known key found for this signature in database
GPG Key ID: 67CBDE9BE7E6399B
21 changed files with 279 additions and 58 deletions

View File

@ -22,7 +22,7 @@ steps:
artifact_paths:
- "result/bin/*"
- command: nix run -f ci.nix xrefcheck-static -c xrefcheck --ignored tests/markdowns --ignored tests/golden/
- command: nix run -f ci.nix xrefcheck-static -c xrefcheck --ignored 'tests/**/*'
label: Xrefcheck itself
- label: lint

View File

@ -37,6 +37,10 @@ Unreleased
+ Fixed bug with ignoring checks for relative anchors.
* [#132](https://github.com/serokell/xrefcheck/pull/132)
+ Display URL parsing errors.
* [#131](https://github.com/serokell/xrefcheck/pull/131)
+ Add support for glob patterns to `ignored` and `notScanned`.
+ Remove support for directory names from `ignored` and `notScanned`.
+ Fix bug with `ignored` not ignoring files with broken xrefcheck annotations.
0.2.1
==========

View File

@ -159,7 +159,7 @@ xrefcheck dump-config -t GitHub
Currently supported options include:
* Timeout for checking external references;
* List of ignored folders.
* List of ignored files.
## Build instructions [](#xrefcheck)

View File

@ -37,6 +37,7 @@ import Xrefcheck.Config (VerifyConfig (..))
import Xrefcheck.Core
import Xrefcheck.Scan
import Xrefcheck.Util (normaliseWithNoTrailing)
import Xrefcheck.System (RelGlobPattern (..))
modeReadM :: ReadM VerifyMode
modeReadM = eitherReader $ \s ->
@ -78,7 +79,7 @@ data Options = Options
}
data TraversalOptions = TraversalOptions
{ toIgnored :: [FilePath]
{ toIgnored :: [RelGlobPattern]
}
addTraversalOptions :: TraversalConfig -> TraversalOptions -> TraversalConfig
@ -115,6 +116,9 @@ type RepoType = Flavor
-- | Command-line option that reads a file path; the parsed string is passed
-- through 'normaliseWithNoTrailing' before being returned.
filepathOption :: Mod OptionFields FilePath -> Parser FilePath
filepathOption mods = normaliseWithNoTrailing <$> strOption mods
-- | Command-line option that reads a glob pattern: the argument is parsed
-- exactly like 'filepathOption' does and then wrapped in 'RelGlobPattern'.
globOption :: Mod OptionFields FilePath -> Parser RelGlobPattern
globOption mods = RelGlobPattern <$> filepathOption mods
repoTypeReadM :: ReadM RepoType
repoTypeReadM = eitherReader $ \name ->
maybeToRight (failureText name) $ L.lookup (map C.toLower name) allRepoTypesNamed
@ -174,10 +178,12 @@ optionsParser = do
traversalOptionsParser :: Parser TraversalOptions
traversalOptionsParser = do
toIgnored <- many . filepathOption $
toIgnored <- many . globOption $
long "ignored" <>
metavar "FILEPATH" <>
help "Files and folders which we pretend do not exist."
metavar "GLOB PATTERN" <>
help "Files which we pretend do not exist.\
\ Glob patterns that contain wildcards MUST be enclosed\
\ in quotes to avoid being expanded by shell."
return TraversalOptions{..}
verifyOptionsParser :: Parser VerifyOptions

View File

@ -28,7 +28,7 @@ import Xrefcheck.Core
import Xrefcheck.Scan
import Xrefcheck.Scanners.Markdown
import Xrefcheck.System (RelGlobPattern, normaliseGlobPattern)
import Xrefcheck.Util (aesonConfigOption, postfixFields, (-:), normaliseWithNoTrailing)
import Xrefcheck.Util (aesonConfigOption, postfixFields, (-:))
import Xrefcheck.Config.Default
import Text.Regex.TDFA.Common
@ -53,8 +53,8 @@ data VerifyConfig = VerifyConfig
, vcExternalRefCheckTimeout :: Time Second
, vcVirtualFiles :: [RelGlobPattern]
-- ^ Files which we pretend do exist.
, vcNotScanned :: [FilePath]
-- ^ Prefixes of files, references in which we should not analyze.
, vcNotScanned :: [RelGlobPattern]
-- ^ Files, references in which we should not analyze.
, vcIgnoreRefs :: [Regex]
-- ^ Regular expressions that match external references we should not verify.
, vcCheckLocalhost :: Bool
@ -72,7 +72,7 @@ normaliseVerifyConfigFilePaths :: VerifyConfig -> VerifyConfig
normaliseVerifyConfigFilePaths vc@VerifyConfig{ vcVirtualFiles, vcNotScanned}
= vc
{ vcVirtualFiles = map normaliseGlobPattern vcVirtualFiles
, vcNotScanned = map normaliseWithNoTrailing vcNotScanned
, vcNotScanned = map normaliseGlobPattern vcNotScanned
}
-- | Configs for all the supported scanners.
@ -167,12 +167,12 @@ defConfigText flavor =
GitHub ->
[ ".github/pull_request_template.md"
, ".github/issue_template.md"
, ".github/PULL_REQUEST_TEMPLATE"
, ".github/ISSUE_TEMPLATE"
, ".github/PULL_REQUEST_TEMPLATE/**/*"
, ".github/ISSUE_TEMPLATE/**/*"
]
GitLab ->
[ ".gitlab/merge_request_templates/"
, ".gitlab/issue_templates/"
[ ".gitlab/merge_request_templates/**/*"
, ".gitlab/issue_templates/**/*"
]
, "virtualFiles" -: Right $ case flavor of

View File

@ -15,14 +15,14 @@ defConfigUnfilled :: ByteString
defConfigUnfilled =
[r|# Parameters of repository traversal.
traversal:
# Files and folders which we pretend do not exist
# Glob patterns describing files which we pretend do not exist
# (so they are neither analyzed nor can be referenced).
ignored:
# Git files
- .git
- .git/**/*
# Stack files
- .stack-work
- .stack-work/**/*
# Verification parameters.
verification:
@ -34,7 +34,7 @@ verification:
# declaring "Response timeout".
externalRefCheckTimeout: 10s
# Prefixes of files, references in which should not be analyzed.
# Glob patterns describing the files, references in which should not be analyzed.
notScanned:
- :PLACEHOLDER:notScanned:

View File

@ -22,24 +22,23 @@ import Universum
import Data.Aeson.TH (deriveFromJSON)
import Data.Foldable qualified as F
import Data.Map qualified as M
import GHC.Err (errorWithoutStackTrace)
import System.Directory (doesDirectoryExist)
import System.Directory.Tree qualified as Tree
import System.FilePath (dropTrailingPathSeparator, takeDirectory, takeExtension, (</>), equalFilePath)
import System.FilePath (dropTrailingPathSeparator, takeDirectory, takeExtension, equalFilePath)
import Xrefcheck.Core
import Xrefcheck.Progress
import Xrefcheck.System (readingSystem)
import Xrefcheck.System (readingSystem, RelGlobPattern, normaliseGlobPattern, matchesGlobPatterns)
import Xrefcheck.Util (aesonConfigOption, normaliseWithNoTrailing)
-- | Config of repository traversal.
data TraversalConfig = TraversalConfig
{ tcIgnored :: [FilePath]
{ tcIgnored :: [RelGlobPattern]
-- ^ Glob patterns describing files which we completely ignore.
}
normaliseTraversalConfigFilePaths :: TraversalConfig -> TraversalConfig
normaliseTraversalConfigFilePaths = TraversalConfig . map normaliseWithNoTrailing . tcIgnored
normaliseTraversalConfigFilePaths = TraversalConfig . map normaliseGlobPattern . tcIgnored
deriveFromJSON aesonConfigOption ''TraversalConfig
@ -72,10 +71,8 @@ gatherRepoInfo rw formatsSupport config root = do
_ Tree.:/ repoTree <- liftIO $ Tree.readDirectoryWithL processFile root
let fileInfos = map (first normaliseWithNoTrailing)
$ filter (\(path, _) -> not $ isIgnored path)
$ dropSndMaybes . F.toList
$ Tree.zipPaths . (location Tree.:/)
$ filterExcludedDirs root repoTree
$ Tree.zipPaths $ location Tree.:/ repoTree
return $ RepoInfo (M.fromList fileInfos)
where
isDirectory = readingSystem . doesDirectoryExist
@ -83,22 +80,12 @@ gatherRepoInfo rw formatsSupport config root = do
processFile file = do
let ext = takeExtension file
let mscanner = formatsSupport ext
forM mscanner $ \scanFile -> scanFile file
if isIgnored file
then pure Nothing
else forM mscanner ($ file)
dropSndMaybes l = [(a, b) | (a, Just b) <- l]
ignored = map (root </>) (tcIgnored config)
isIgnored path = any (equalFilePath path) ignored
filterExcludedDirs cur = \case
Tree.Dir name subfiles ->
let subfiles' =
if isIgnored cur
then []
else map visitRec subfiles
visitRec sub = filterExcludedDirs (cur </> Tree.name sub) sub
in Tree.Dir name subfiles'
file@Tree.File{} -> file
Tree.Failed _name err ->
errorWithoutStackTrace $ "Repository traversal failed: " <> show err
isIgnored = matchesGlobPatterns root $ tcIgnored config
-- The context location of the root.
-- This is done by removing the last component from the path.

View File

@ -9,6 +9,7 @@ module Xrefcheck.System
, RelGlobPattern (..)
, normaliseGlobPattern
, bindGlobPattern
, matchesGlobPatterns
) where
import Universum
@ -52,6 +53,14 @@ bindGlobPattern root (RelGlobPattern relPat) = readingSystem $ do
Right pat ->
return pat
-- | Check whether a file matches at least one of the given glob patterns.
--
-- Every pattern is bound to @root@ with 'bindGlobPattern' and then matched
-- against the canonicalized path of @file@. An empty pattern list never
-- matches.
--
-- The function is written so that a partial application
-- @matchesGlobPatterns root patterns@ can share the bound patterns between
-- all the files it is later applied to, and so that the file path is
-- canonicalized once per file rather than once per pattern.
matchesGlobPatterns :: FilePath -> [RelGlobPattern] -> FilePath -> Bool
matchesGlobPatterns root globPatterns = \file ->
  let -- Does not depend on the pattern being tried, so compute it only once.
      cFile = readingSystem $ canonicalizePath file
  in or [ Glob.match pat cFile | pat <- boundPatterns ]
  where
    -- Built lazily: patterns after the first match are never bound.
    boundPatterns = map (bindGlobPattern root) globPatterns
instance FromJSON RelGlobPattern where
parseJSON = withText "Repo-relative glob pattern" $ \path -> do
let spath = toString path

View File

@ -57,9 +57,8 @@ import Network.HTTP.Req
import Network.HTTP.Types.Header (hRetryAfter)
import Network.HTTP.Types.Status (Status, statusCode, statusMessage)
import System.Console.Pretty (Style (..), style)
import System.Directory (canonicalizePath, doesDirectoryExist, doesFileExist)
import System.Directory (doesDirectoryExist, doesFileExist)
import System.FilePath (takeDirectory, (</>), normalise)
import System.FilePath.Glob qualified as Glob
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURIBs, ParseExceptionBs)
@ -261,7 +260,7 @@ verifyRepo
= do
let toScan = do
(file, fileInfo) <- M.toList repoInfo
guard . not $ any ((`isPrefixOf` file) . normalise . (root </>)) vcNotScanned
guard . not $ matchesGlobPatterns root vcNotScanned file
ref <- _fiReferences fileInfo
return (file, ref)
@ -416,12 +415,7 @@ verifyReference
let fileExists = readingSystem $ doesFileExist file
let dirExists = readingSystem $ doesDirectoryExist file
let cfile = readingSystem $ canonicalizePath file
let isVirtual = or
[ Glob.match pat cfile
| virtualFile <- vcVirtualFiles
, let pat = bindGlobPattern root virtualFile
]
let isVirtual = matchesGlobPatterns root vcVirtualFiles file
unless (fileExists || dirExists || isVirtual) $
throwError (LocalFileDoesNotExist file)

View File

@ -1,13 +1,13 @@
# Parameters of repository traversal.
traversal:
# Files and folders which we pretend do not exist
# Glob patterns describing files which we pretend do not exist
# (so they are neither analyzed nor can be referenced).
ignored:
# Git files
- .git
- .git/**/*
# Stack files
- .stack-work
- .stack-work/**/*
# Verification parameters.
verification:
@ -19,12 +19,12 @@ verification:
# declaring "Response timeout".
externalRefCheckTimeout: 10s
# Prefixes of files, references in which should not be analyzed.
# Glob patterns describing the files, references in which should not be analyzed.
notScanned:
- .github/pull_request_template.md
- .github/issue_template.md
- .github/PULL_REQUEST_TEMPLATE
- .github/ISSUE_TEMPLATE
- .github/PULL_REQUEST_TEMPLATE/**/*
- .github/ISSUE_TEMPLATE/**/*
# Glob patterns describing the files which do not physically exist in the
# repository but should be treated as existing nevertheless.

View File

@ -10,7 +10,7 @@ load '../helpers'
@test "No redundant slashes" {
run xrefcheck \
--ignored to-ignore \
--ignored to-ignore/* \
--root .
assert_output --partial "All repository links are valid."
@ -18,7 +18,7 @@ load '../helpers'
@test "Redundant slashes in root and ignored" {
run xrefcheck \
--ignored ./././././././//to-ignore \
--ignored ./././././././//to-ignore/* \
--root ./
assert_output --partial "All repository links are valid."
@ -34,7 +34,7 @@ load '../helpers'
@test "Reduchant slashes in ignored" {
run xrefcheck \
--ignored ./././././././//to-ignore \
--ignored ./././././././//to-ignore/* \
--root .
assert_output --partial "All repository links are valid."

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0
load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers'
# Passes the exact path of the file to `--ignored` and expects xrefcheck
# to report overall success.
@test "Ignore file with broken xrefcheck annotation: full path" {
run xrefcheck --ignored ./to-ignore/inner-directory/broken_annotation.md
assert_output --partial "All repository links are valid."
}
# Same check with a single-level `*` wildcard. The pattern is quoted so that
# it reaches xrefcheck unexpanded by the shell.
@test "Ignore file with broken xrefcheck annotation: glob wildcard" {
run xrefcheck --ignored 'to-ignore/inner-directory/*'
assert_output --partial "All repository links are valid."
}
# Same check with the recursive `./**/*` pattern (quoted to avoid shell
# expansion).
@test "Ignore file with broken xrefcheck annotation: nested directories with glob wildcard" {
run xrefcheck --ignored './**/*'
assert_output --partial "All repository links are valid."
}
# Same check, but the settings come from a config file passed via `--config`
# instead of the `--ignored` command-line option.
@test "Ignore file with broken xrefcheck annotation: config file" {
run xrefcheck --config ./config-ignored.yaml
assert_output --partial "All repository links are valid."
}
# Negative case: a bare directory path is given to `--ignored`. The test
# expects the file inside that directory to still be scanned, i.e. the
# scan error for it must appear in the output.
@test "Ignore file with broken xrefcheck annotation: directory, check filure" {
run xrefcheck --ignored ./to-ignore/inner-directory/
assert_output --partial "Error when scanning ./to-ignore/inner-directory/broken_annotation.md"
}

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored:
- ./to-ignore/inner-directory/broken_annotation.md
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned: []
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,11 @@
<!--
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->
One
<!--xrefcheck: ignore file -->
Two

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0
load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers'
# Runs xrefcheck with config-full-path.yaml and expects overall success.
@test "Not scanned: full path" {
run xrefcheck -c config-full-path.yaml
assert_output --partial "All repository links are valid."
}
# Runs xrefcheck with config-wildcard.yaml and expects overall success.
@test "Not scanned: glob wildcard" {
run xrefcheck -c config-wildcard.yaml
assert_output --partial "All repository links are valid."
}
# Runs xrefcheck with config-nested-directories.yaml and expects overall
# success.
@test "Not scanned: nested directories with glob wildcard" {
run xrefcheck -c config-nested-directories.yaml
assert_output --partial "All repository links are valid."
}
# Negative case: runs xrefcheck with config-directory.yaml and compares its
# report with expected.gold.
#
# The output is piped through `prepare` (a helper not defined in this file;
# presumably it normalises the output - confirm in ../helpers) into a
# temporary file. `|| true` keeps a non-zero exit status of that pipeline
# from aborting the test, so the verdict comes from the `diff` below, which
# ignores whitespace-amount changes and blank lines.
@test "Not scanned: directory, check failure" {
xrefcheck -c config-directory.yaml \
| prepare > /tmp/check-notScanned.test || true
diff /tmp/check-notScanned.test expected.gold \
--ignore-space-change \
--ignore-blank-lines \
--new-file # treat absent files as empty
rm /tmp/check-notScanned.test
}

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- notScanned/inner-directory
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./notScanned/inner-directory/bad-reference.md
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./**/*
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: Unlicense
traversal:
ignored: []
verification:
anchorSimilarityThreshold: 0.5
externalRefCheckTimeout: 10s
notScanned:
- ./notScanned/inner-directory/*
virtualFiles: []
ignoreRefs: []
checkLocalhost: true
ignoreAuthFailures: true
defaultRetryAfter: 30s
maxRetries: 3
scanners:
markdown:
flavor: GitHub

View File

@ -0,0 +1,13 @@
=== Invalid references found ===
➥ In file notScanned/inner-directory/bad-reference.md
bad reference (absolute) at src:7:1-28:
- text: "Bad reference"
- link: /no-file.md
- anchor: -
⛀ File does not exist:
./no-file.md
Invalid references dumped, 1 in total.

View File

@ -0,0 +1,7 @@
<!--
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->
[Bad reference](/no-file.md)