[INT-31] Implement cross-references verifier

This commit is contained in:
martoon 2018-10-11 00:15:08 +03:00
parent 44d735c607
commit ea83f75643
No known key found for this signature in database
GPG Key ID: FF02288E36C0E4B0
14 changed files with 688 additions and 18 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
dist
dist-*
cabal-dev
*.cabal
*.o
*.hi
*.chi

43
LICENSE
View File

@ -1,21 +1,30 @@
MIT License
Copyright Author name here (c) 2018
Copyright (c) 2018 Serokell
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Author name here nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,2 +1 @@
# crossref-verifier
Check cross-references of repository documents

2
Setup.hs Normal file
View File

@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain

27
exec/Main.hs Normal file
View File

@ -0,0 +1,27 @@
module Main where
import Data.Default (def)
import Fmt (blockListF, build, fmtLn, nameF)
import Crv.Scan
import Crv.Scanners
import Crv.Verify
formats :: FormatsSupport
formats = specificFormatsSupport
[ markdownSupport
]
main :: IO ()
main = do
let root = "../universum"
repoInfo <- gatherRepoInfo formats def root
fmtLn $ nameF "Repository links" $ build repoInfo
verifyRes <- verifyRepo root repoInfo
case verifyErrors verifyRes of
Nothing ->
fmtLn "All repository links are valid"
Just (toList -> errs) -> do
fmtLn $ nameF "Invalid references found" $ blockListF errs
die ""

84
package.yaml Normal file
View File

@ -0,0 +1,84 @@
name: crossref-verifier
version: 0.1.0.0
github: "githubuser/crossref-verifier"
license: MIT
author: "Kostya Ivanov"
maintainer: "martoon.hsk@gmail.com"
copyright: "2018 Martoon"
extra-source-files:
- README.md
description: Please see the README on GitHub at <https://github.com/serokell/crossref-verifier#readme>
default-extensions:
- AllowAmbiguousTypes
- BangPatterns
- ConstraintKinds
- DataKinds
- DefaultSignatures
- DeriveDataTypeable
- DeriveGeneric
- FlexibleContexts
- FlexibleInstances
- FunctionalDependencies
- GeneralizedNewtypeDeriving
- LambdaCase
- MonadFailDesugaring
- MultiParamTypeClasses
- MultiWayIf
- NamedFieldPuns
- OverloadedStrings
- RankNTypes
- RecordWildCards
- ScopedTypeVariables
- StandaloneDeriving
- TemplateHaskell
- TupleSections
- TypeFamilies
- UndecidableInstances
- ViewPatterns
- TypeApplications
- TypeOperators
build-tools:
- autoexporter
ghc-options:
- -Wall
dependencies:
- base-noprelude
- containers
- req
- data-default
- deepseq
- directory-tree
- directory
- filepath
- fmt
- http-client
- http-types
- lens
- loot-prelude
- pretty-terminal
- network-uri
- megaparsec
- mtl
- roman-numerals
- text
library:
source-dirs: src
executables:
crossref-verify:
main: Main.hs
source-dirs: exec
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
dependencies:
- crossref-verifier

17
src/Crv/Config.hs Normal file
View File

@ -0,0 +1,17 @@
module Crv.Config where
import Data.Default (Default (..))
data Config = Config
{ cTraversal :: TraversalConfig
}
instance Default Config where
def = Config def
data TraversalConfig = TraversalConfig
{ tcExcluded :: [FilePath]
}
instance Default TraversalConfig where
def = TraversalConfig [".stack-work"]

151
src/Crv/Core.hs Normal file
View File

@ -0,0 +1,151 @@
-- | Various primitives.
module Crv.Core where
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses, (%=))
import Data.Char (isAlphaNum)
import Data.Default (Default (..))
import qualified Data.Map as M
import qualified Data.Text as T
import Fmt (Buildable (..), blockListF, blockListF', nameF, (+|), (|+))
import System.Console.Pretty (Color (..), Style (..), color, style)
import Text.Numeral.Roman (toRoman)
import Crv.Util ()
-----------------------------------------------------------
-- Types
-----------------------------------------------------------
-- | Full info about a reference.
data Reference = Reference
{ rName :: Text
-- ^ Text displayed as reference.
, rLink :: Text
-- ^ File or site reference points to.
, rAnchor :: Maybe Text
-- ^ Section or custom anchor tag.
} deriving (Show, Generic)
-- | Context of anchor.
data AnchorType
= HeaderAnchor Int
-- ^ Every section header is usually an anchor
| HandAnchor
-- ^ They can be set up manually
| BiblioAnchor
-- ^ Id of entry in bibliography
deriving (Show, Eq, Generic)
-- | A referable anchor.
data Anchor = Anchor
{ aType :: AnchorType
, aName :: Text
} deriving (Show, Generic)
-- | All information regarding a single file we care about.
data FileInfo = FileInfo
{ _fiReferences :: [Reference]
, _fiAnchors :: [Anchor]
} deriving (Show, Generic)
makeLenses ''FileInfo
instance Default FileInfo where
def = FileInfo [] []
newtype RepoInfo = RepoInfo (Map FilePath FileInfo)
deriving (Show)
finaliseFileInfo :: FileInfo -> FileInfo
finaliseFileInfo = execState $ do
fiReferences %= reverse
fiAnchors %= reverse
-----------------------------------------------------------
-- Instances
-----------------------------------------------------------
instance NFData Reference
instance NFData AnchorType
instance NFData Anchor
instance NFData FileInfo
instance Buildable Reference where
build Reference{..} = nameF ("reference (" +| build loc |+ ")") $
blockListF
[ "text: " <> show rName
, "link: " <> rLink
, "anchor: " <> (rAnchor ?: style Faint "-")
]
where
loc = locationType rLink
instance Buildable AnchorType where
build = style Faint . \case
HeaderAnchor l -> color Green ("header " <> toRoman l)
HandAnchor -> color Yellow "hand made"
BiblioAnchor -> color Cyan "biblio"
instance Buildable Anchor where
build (Anchor t a) = a |+ " (" +| t |+ ")"
instance Buildable FileInfo where
build FileInfo{..} =
blockListF
[ nameF "references" $ blockListF _fiReferences
, nameF "anchors" $ blockListF _fiAnchors
]
instance Buildable RepoInfo where
build (RepoInfo m) =
blockListF' "" buildFileReport (M.toList m)
where
buildFileReport (name, info) = mconcat
[ color Cyan $ fromString name <> ":\n"
, build info
, "\n"
]
-----------------------------------------------------------
-- Analysing
-----------------------------------------------------------
-- | Type of reference.
data LocationType
= LocalLoc
-- ^ Reference on this file
| RelativeLoc
-- ^ Reference to a file relative to given one
| AbsoluteLoc
-- ^ Reference to a file relative to the root
| ExternalLoc
-- ^ Reference to a file at outer site
deriving (Show)
instance Buildable LocationType where
build = \case
LocalLoc -> color Green "local"
RelativeLoc -> color Yellow "relative"
AbsoluteLoc -> color Blue "absolute"
ExternalLoc -> color Red "external"
-- | Get type of reference.
locationType :: Text -> LocationType
locationType location = case toString location of
[] -> LocalLoc
'/' : _ -> AbsoluteLoc
'.' : '/' : _ -> RelativeLoc
'.' : '.' : '/' : _ -> RelativeLoc
_ | hasProtocol -> ExternalLoc
_ -> RelativeLoc
where
hasProtocol = "://" `T.isInfixOf` (T.take 10 location)
-- | Convert section header name to an anchor refering it.
headerToAnchor :: Text -> Text
headerToAnchor =
T.filter (\c -> isAlphaNum c || c == '-') .
T.replace " " "-" .
T.replace "+" "-" .
T.toLower

69
src/Crv/Scan.hs Normal file
View File

@ -0,0 +1,69 @@
-- | Generalised repo scanner and analyser.
module Crv.Scan
( Extension
, ScanAction
, FormatsSupport
, RepoInfo (..)
, gatherRepoInfo
, specificFormatsSupport
) where
import qualified Data.Foldable as F
import qualified Data.Map as M
import qualified System.Directory.Tree as Tree
import System.FilePath.Posix (takeDirectory, takeExtension, (</>))
import Crv.Config
import Crv.Core
import Crv.Util ()
-- | File extension, dot included.
type Extension = String
-- | Way to parse a file.
type ScanAction = FilePath -> IO FileInfo
-- | All supported ways to parse a file.
type FormatsSupport = Extension -> Maybe ScanAction
specificFormatsSupport :: [([Extension], ScanAction)] -> FormatsSupport
specificFormatsSupport formats = \ext -> M.lookup ext formatsMap
where
formatsMap = M.fromList
[ (extension, parser)
| (extensions, parser) <- formats
, extension <- extensions
]
gatherRepoInfo
:: MonadIO m
=> FormatsSupport -> TraversalConfig -> FilePath -> m RepoInfo
gatherRepoInfo formatsSupport config root = do
_ Tree.:/ repoTree <- liftIO $ Tree.readDirectoryWithL processFile rootNE
let fileInfos = filter (\(path, _) -> not $ isExcluded path) $
dropSndMaybes . F.toList $
Tree.zipPaths . (dirOfRoot Tree.:/) $
filterExcludedDirs root repoTree
return $ RepoInfo (M.fromList fileInfos)
where
rootNE = if null root then "." else root
dirOfRoot = if null root then "" else takeDirectory root
processFile file = do
let ext = takeExtension file
forM (formatsSupport ext) $ \scanFile ->
scanFile file
dropSndMaybes l = [(a, b) | (a, Just b) <- l]
excluded = map (root </>) (tcExcluded config)
isExcluded path = any (`isPrefixOf` path) excluded
filterExcludedDirs cur = \case
Tree.Dir name subfiles ->
let subfiles' =
if isExcluded cur
then []
else map visitRec subfiles
visitRec sub = filterExcludedDirs (cur </> Tree.name sub) sub
in Tree.Dir name subfiles'
other -> other

1
src/Crv/Scanners.hs Normal file
View File

@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF autoexporter #-}

View File

@ -0,0 +1,117 @@
-- | Markdown documents markdownScanner.
module Crv.Scanners.Markdown
( markdownScanner
, markdownSupport
) where
import Control.Lens ((%=))
import Data.Default (Default (..))
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.IO as LT
import GHC.Conc (par)
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Crv.Core
import Crv.Scan
type MonadFileScan m = (P.MonadParsec () LT.Text m, MonadState FileInfo m)
inBrackets :: MonadFileScan m => LT.Text -> LT.Text -> m Text
inBrackets open close = do
_ <- P.string open
res <- P.takeWhileP (Just "in brackets") (/= LT.head close)
_ <- P.string close
return $ toStrict res
fileInfoParser :: (MonadFileScan m) => m ()
fileInfoParser = loop >> modify finaliseFileInfo
where
loop = do
asum
[ P.try parseReference
, P.try parseBiblioRef
, P.try parseFootnoteRef
, P.try parseHandAnchor
, P.try parseSectionName
, P.try parseBiblioAnchor
, skipQuotes
, skipCodeBlocks
, skipUninteresting
]
unlessM P.atEnd loop
parseReference = do
rName <- inBrackets "[" "]"
rawLink <- inBrackets "(" ")"
let link = if null rawLink then rName else rawLink
let (rLink, rAnchor) = case T.splitOn "#" link of
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
fiReferences %= (Reference{..} :)
parseBiblioRef = do
rName <- inBrackets "[" "]"
anchor <- inBrackets "[" "]"
let rAnchor = Just $ if null anchor then rName else anchor
let rLink = ""
fiReferences %= (Reference{..} :)
parseFootnoteRef = do
rName <- inBrackets "[^" "]"
let rAnchor = Just ("^" <> rName)
rLink = ""
fiReferences %= (Reference{..} :)
parseHandAnchor = do
aName <- inBrackets "<a name=\"" "\"></a>"
let aType = HandAnchor
fiAnchors %= (Anchor{..} :)
parseBiblioAnchor = do
aName <- inBrackets "[" "]:"
let aType = BiblioAnchor
fiAnchors %= (Anchor{..} :)
parseSectionName = do
prefix <- P.takeWhile1P (Just "header prefix") (== '#')
rawName <- P.takeWhileP (Just "header") $ \c -> all (/= c) ['\r', '\n']
_ <- P.eol
let aType = HeaderAnchor (length prefix)
aName = headerToAnchor $
T.dropWhileEnd (\c -> c == '#' || c == ' ') $ T.strip $
toStrict rawName
fiAnchors %= (Anchor{..} :)
skipQuotes = do
_ <- P.char '`'
P.skipMany (void (P.string "\\\\`") <|> void (P.anySingleBut '`'))
_ <- P.char '`'
return ()
skipCodeBlocks = void $ inBrackets "```" "```"
skipUninteresting = do
_ <- P.anySingle
P.skipMany $ P.satisfy (\c -> all (/= c) ['[', '<', '#', '`'])
parseFileInfo :: FilePath -> LT.Text -> FileInfo
parseFileInfo path input =
let outcome = P.parse (execStateT fileInfoParser def) path input
in case outcome of
Left err -> error $ "Failed to parse file " <> show path <>
": " <> show err
Right res -> res
markdownScanner :: ScanAction
markdownScanner path = liftIO $ do
res <- parseFileInfo path <$> LT.readFile path
force res `par` return res
markdownSupport :: ([Extension], ScanAction)
markdownSupport = ([".md"], markdownScanner)

15
src/Crv/Util.hs Normal file
View File

@ -0,0 +1,15 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Crv.Util
( nameF'
) where
import Fmt (Builder, build, fmt, nameF)
import System.Console.Pretty (Pretty (..), Style (Faint))
instance Pretty Builder where
colorize s c = build @Text . colorize s c . fmt
style s = build @Text . style s . fmt
nameF' :: Builder -> Builder -> Builder
nameF' a b = nameF (style Faint a) b

158
src/Crv/Verify.hs Normal file
View File

@ -0,0 +1,158 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# OPTIONS_GHC -Wno-partial-type-signatures #-}
module Crv.Verify
( -- * General verification
VerifyResult (..)
, verifyOk
, verifyErrors
, verifying
, WithReferenceLoc (..)
-- * Cross-references validation
, CrvVerifyError (..)
, verifyRepo
, checkExternalResource
) where
import Control.Monad.Except (ExceptT (..), MonadError (..))
import Data.Default (def)
import qualified Data.Map as M
import Fmt (Buildable (..), listF, (+|), (|+))
import Network.HTTP.Client (HttpException (..), HttpExceptionContent (..), responseStatus)
import Network.HTTP.Req (GET (..), HEAD (..), HttpException (..), NoReqBody (..), ignoreResponse,
parseUrl, req, runReq)
import Network.HTTP.Types.Status (Status, statusCode, statusMessage)
import System.Console.Pretty (Style (Faint), style)
import System.Directory (doesDirectoryExist, doesFileExist)
import System.FilePath.Posix (takeDirectory)
import System.FilePath.Posix ((</>))
import System.Timeout (timeout)
import Crv.Core
-----------------------------------------------------------
-- General verification
-----------------------------------------------------------
newtype VerifyResult e = VerifyResult [e]
deriving (Show, Functor)
deriving instance Semigroup (VerifyResult e)
deriving instance Monoid (VerifyResult e)
instance Buildable e => Buildable (VerifyResult e) where
build vr = case verifyErrors vr of
Nothing -> "ok"
Just errs -> listF errs
verifyOk :: VerifyResult e -> Bool
verifyOk (VerifyResult errors) = null errors
verifyErrors :: VerifyResult e -> Maybe (NonEmpty e)
verifyErrors (VerifyResult errors) = nonEmpty errors
verifying :: Monad m => ExceptT e m () -> m (VerifyResult e)
verifying (ExceptT action) = fmap toVerifyRes action
toVerifyRes :: Either e () -> VerifyResult e
toVerifyRes = VerifyResult . either one (\() -> [])
-----------------------------------------------------------
-- Cross-references validation
-----------------------------------------------------------
data WithReferenceLoc a = WithReferenceLoc
{ wrlFile :: FilePath
, wrlReference :: Reference
, wrlItem :: a
}
instance Buildable a => Buildable (WithReferenceLoc a) where
build WithReferenceLoc{..} =
"In file " +| style Faint wrlFile |+ " " +| wrlReference |+ "\n"
+| wrlItem |+ "\n\n"
data CrvVerifyError
= FileDoesNotExist FilePath
| AnchorDoesNotExist Text [Anchor]
| ExternalResourceInvalidUri Text
| ExternalResourceUnavailable Text Status
| ExternalResourceSomeError Text
deriving (Show)
instance Buildable CrvVerifyError where
build = \case
FileDoesNotExist file ->
"File does not exist: " +| file |+ ""
AnchorDoesNotExist anchor available ->
"Anchor '" +| anchor |+ "' does not exist in the file, available ones: "
+| listF available |+ ""
ExternalResourceInvalidUri link ->
"Bad url: " +| link |+ ", expected 'http' or 'https'"
ExternalResourceUnavailable link status ->
"Resource unavailable: (" +| statusCode status |+ " " +|
decodeUtf8 @Text (statusMessage status) |+ "): " +| link |+ ""
ExternalResourceSomeError err -> build err
verifyRepo :: FilePath
-> RepoInfo
-> IO (VerifyResult $ WithReferenceLoc CrvVerifyError)
verifyRepo root (RepoInfo repoInfo) =
concatForM (M.toList repoInfo) $ \(file, fileInfo) ->
concatForM (_fiReferences fileInfo) $ \ref@Reference{..} ->
fmap (fmap $ WithReferenceLoc file ref) $
case locationType rLink of
LocalLoc -> checkFileRef rAnchor file
RelativeLoc -> checkFileRef rAnchor
(takeDirectory file </> toString rLink)
AbsoluteLoc -> checkFileRef rAnchor (root <> toString rLink)
ExternalLoc -> return mempty -- checkExternalResource rLink
where
checkFileRef mAnchor file = verifying $ do
fileExists <- liftIO $ doesFileExist file
dirExists <- liftIO $ doesDirectoryExist file
unless (fileExists || dirExists) $
throwError (FileDoesNotExist file)
case M.lookup file repoInfo of
Nothing -> pass -- no support for such file, can do nothing
Just referedFileInfo ->
whenJust mAnchor $ \anchor -> do
let fileAnchors = _fiAnchors referedFileInfo
maybe (throwError $ AnchorDoesNotExist anchor fileAnchors) pure $
void $ find ((== anchor) . aName) fileAnchors
checkExternalResource :: Text -> IO (VerifyResult CrvVerifyError)
checkExternalResource link = fmap toVerifyRes $ do
makeRequest HEAD >>= \case
Right () -> return $ Right ()
Left (ExternalResourceUnavailable _ status) | statusCode status == 405
|| statusCode status == 418
-> makeRequest GET
Left err -> pure (Left err)
where
makeRequest :: _ => method -> IO (Either CrvVerifyError ())
makeRequest method = runExceptT $ do
parsedUrl <- parseUrl (encodeUtf8 link)
& maybe (throwError $ ExternalResourceInvalidUri link) pure
let reqLink = case parsedUrl of
Left (url, option) ->
runReq def $ req method url NoReqBody ignoreResponse option
Right (url, option) ->
runReq def $ req method url NoReqBody ignoreResponse option
-- TODO: tunable timeout
mres <- liftIO (timeout 3000000 $ void reqLink) `catch` (throwError . processErrors)
maybe (throwError $ ExternalResourceSomeError "Response timeout") pure mres
processErrors = \case
JsonHttpException _ -> error "External link JSON parse exception"
VanillaHttpException err -> case err of
InvalidUrlException{} -> error "External link URL invalid exception"
HttpExceptionRequest _ exc -> case exc of
StatusCodeException resp _ ->
ExternalResourceUnavailable link (responseStatus resp)
other -> ExternalResourceSomeError $ show other

20
stack.yaml Normal file
View File

@ -0,0 +1,20 @@
resolver: lts-12.0
packages:
- .
extra-deps:
- base-noprelude-4.11.1.0
- megaparsec-7.0.1
- parser-combinators-1.0.0
- cmark-0.5.6
- text-format-0.3.2
- fmt-0.6
- pretty-terminal-0.1.0.0
- roman-numerals-0.5.1.5
- req-1.2.1
- git: https://github.com/serokell/lootbox.git
commit: 34e389808e34f1cfa56f456773682325bed56d17
subdirs:
- code/prelude