[#135] Add support of right in-place annotations

Problem: It can be convenient not only specifying exclusions in the config, but also annotating the excluded thing right in-place. It is about adding comment before the link or the paragraph or even the whole file to ignore this item. Solution: Support annotation as html comment in a "" format, where mode is "link" or "paragraph" or "file". Modify `nodeExtractInfo` from `Xrefcheck.Scanners.Markdown` just to skip a node in AST while parsing. Take into account that "ignore file" can only be at the top of the file or right after the license. In markdown terms: either the first node must be HTML_BLOCK with "" content or the first is HTML_BLOCK comment (smth between "") and the second is HTML_BLOCK with "ignore file" content. Also take into account that "ignore link" must be followed by a link. Strictly speaking, there is either LINK after "ignore link" or TEXT and LINK (if there is some text before the link).
2024-08-16 09:00:45 +03:00 · 2020-11-10 23:38:28 +03:00 · 2020-11-10 23:38:28 +03:00 · ecff1dc342
commit ecff1dc342
parent 40640124db
2 changed files with 191 additions and 46 deletions
--- a/package.yaml
+++ b/package.yaml
@ -52,6 +52,7 @@ default-extensions:
 ghc-options:
  - -Wall
  - -Wincomplete-record-updates
+  - -Wincomplete-uni-patterns

 dependencies:
  - aeson
@ -87,6 +88,7 @@ dependencies:
  - text-metrics
  - th-lift-instances
  - th-utilities
+  - transformers
  - name: universum
    mixin: [(Universum as Prelude), (Universum.Unsafe as Unsafe)]
  - yaml
--- a/src/Xrefcheck/Scanners/Markdown.hs
+++ b/src/Xrefcheck/Scanners/Markdown.hs
@ -14,12 +14,13 @@ module Xrefcheck.Scanners.Markdown

 import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode)
 import Control.Lens ((%=))
+import Control.Monad.Trans.Except (Except, runExcept, throwE)
 import qualified Data.ByteString.Lazy as BSL
+import Data.Char (isSpace)
 import Data.Default (Default (..))
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as LT
 import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
-import GHC.Conc (par)

 import Xrefcheck.Core
 import Xrefcheck.Scan
@ -49,54 +50,196 @@ nodeExtractText = mconcat . map extractText . nodeFlatten
        CODE t -> t
        _ -> ""

-nodeExtractInfo :: Node -> ExceptT Text Identity FileInfo
-nodeExtractInfo docNode = fmap finaliseFileInfo $ execStateT (loop docNode) def
-  where
-    loop node@(Node pos ty subs) = case ty of
-        DOCUMENT ->
-            mapM_ loop subs
-        PARAGRAPH ->
-            mapM_ loop subs
-        HEADING lvl ->
-            let text = nodeExtractText node
-                aType = HeaderAnchor lvl
-                aName = headerToAnchor text
-                aPos = toPosition pos
-            in fiAnchors %= (Anchor{..} :)
-        LIST _ ->
-            mapM_ loop subs
-        ITEM ->
-            mapM_ loop subs
-        HTML_INLINE htmlText -> do
-            let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" htmlText
-            whenJust mName $ \aName -> do
-                let aType = HandAnchor
-                    aPos = toPosition pos
-                fiAnchors %= (Anchor{..} :)
-        LINK url _ -> do
-            let rName = nodeExtractText node
-                rPos = toPosition pos
-                link = if null url then rName else url
-            let (rLink, rAnchor) = case T.splitOn "#" link of
-                    [t]    -> (t, Nothing)
-                    t : ts -> (t, Just $ T.intercalate "#" ts)
-                    []     -> error "impossible"
-            fiReferences %= (Reference{..} :)
-        _ -> pass
+data IgnoreMode
+    = Link
+    | Paragraph
+    | File
+    deriving Eq

-parseFileInfo :: FilePath -> LT.Text -> FileInfo
-parseFileInfo path input =
-    let outcome = runIdentity . runExceptT $
-                  nodeExtractInfo $ commonmarkToNode [] [] $ toStrict input
-    in case outcome of
-        Left err  -> error $ "Failed to parse file " <> show path <>
-                             ": " <> show err
-        Right res -> res
+nodeExtractInfo :: Node -> Except Text FileInfo
+nodeExtractInfo (Node _ _ docNodes) =
+    if checkIgnoreFile docNodes
+    then return def
+    else finaliseFileInfo <$> extractionResult
+    where
+        extractionResult :: Except Text FileInfo
+        extractionResult =
+            execStateT (loop docNodes Nothing) def
+
+        loop :: [Node] -> Maybe IgnoreMode -> StateT FileInfo (Except Text) ()
+        loop [] _ = pass
+        loop (node@(Node pos ty subs) : nodes) toIgnore
+            | toIgnore == Just File = returnError toIgnore "" pos
+            | toIgnore == Just Link = do
+                let (Node startPos _ _) = maybe defNode id $ safeHead subs
+                let mNext = case ty of
+                        PARAGRAPH -> afterIgnoredLink subs <> Just nodes
+                        TEXT txt | null (dropWhile isSpace $ T.unpack txt) -> afterIgnoredLink nodes
+                        SOFTBREAK -> afterIgnoredLink nodes
+                        _ -> afterIgnoredLink (node : nodes)
+                case mNext of
+                    Just next -> loop next Nothing
+                    Nothing   -> returnError toIgnore "" startPos
+            | toIgnore == Just Paragraph =
+                case ty of
+                    PARAGRAPH -> loop nodes Nothing
+                    _         -> returnError toIgnore (prettyType ty) pos
+            | otherwise = 
+                case ty of
+                    HTML_BLOCK _ -> processHtmlNode node pos nodes toIgnore
+                    HEADING lvl -> do
+                        let aType = HeaderAnchor lvl
+                        let aName = headerToAnchor $ nodeExtractText node
+                        let aPos = toPosition pos
+                        fiAnchors %= (Anchor{..} :)
+                        loop nodes toIgnore
+                    HTML_INLINE htmlText -> do
+                        let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" htmlText
+                        whenJust mName $ \aName -> do
+                            let aType = HandAnchor
+                                aPos = toPosition pos
+                            fiAnchors %= (Anchor{..} :)
+                        processHtmlNode node pos nodes toIgnore
+                    LINK url _ -> do
+                        let rName = nodeExtractText node
+                            rPos = toPosition pos
+                            link = if null url then rName else url
+                        let (rLink, rAnchor) = case T.splitOn "#" link of
+                                [t]    -> (t, Nothing)
+                                t : ts -> (t, Just $ T.intercalate "#" ts)
+                                []     -> error "impossible"
+                        fiReferences %= (Reference{..} :)
+                        loop nodes toIgnore
+                    _ -> loop (subs ++ nodes) toIgnore
+
+        defNode :: Node
+        defNode = Node Nothing DOCUMENT [] -- hard-coded default Node
+
+        getCommentContent :: Node -> Maybe Text
+        getCommentContent node = do
+            txt <- getHTMLText node
+            T.stripSuffix "-->" =<< T.stripPrefix "<!--" (T.strip txt)
+            where
+                getHTMLText :: Node -> Maybe Text
+                getHTMLText (Node _ (HTML_BLOCK txt) _) = Just txt
+                getHTMLText (Node _ (HTML_INLINE txt) _) = Just txt
+                getHTMLText _ = Nothing
+
+        getXrefcheckContent :: Node -> Maybe Text
+        getXrefcheckContent node = 
+            let notStripped = T.stripPrefix "xrefcheck:" . T.strip =<<
+                                 getCommentContent node
+            in T.strip <$> notStripped
+
+        getIgnoreMode :: Node -> Maybe IgnoreMode
+        getIgnoreMode node =
+            let mContent = getXrefcheckContent node
+
+                textToMode :: [Text] -> Maybe IgnoreMode
+                textToMode ("ignore" : [x])
+                    | x == "link"      = return Link
+                    | x == "paragraph" = return Paragraph
+                    | x == "file"      = return File
+                    | otherwise        = Nothing
+                textToMode _           = Nothing
+            in textToMode . words =<< mContent
+
+        isComment :: Node -> Bool
+        isComment = isJust . getCommentContent
+
+        isIgnoreFile :: Node -> Bool
+        isIgnoreFile = (Just File ==) . getIgnoreMode
+
+        checkIgnoreFile :: [Node] -> Bool
+        checkIgnoreFile nodes =
+            let isSimpleComment :: Node -> Bool
+                isSimpleComment node = isComment node && not (isIgnoreFile node)
+
+                mIgnoreFile = safeHead $ dropWhile isSimpleComment nodes
+            in maybe False isIgnoreFile mIgnoreFile
+
+        isLink :: Node -> Bool
+        isLink (Node _ (LINK _ _) _) = True
+        isLink _ = False
+
+        isText :: Node -> Bool
+        isText (Node _ (TEXT _) _) = True
+        isText _ = False
+
+        afterIgnoredLink :: [Node] -> Maybe [Node]
+        afterIgnoredLink (fNode : nodes)
+            | isLink fNode = return nodes
+            | sNode : nodes' <- nodes =
+                if isText fNode && isLink sNode
+                then return nodes'
+                else Nothing
+            | otherwise = Nothing
+        afterIgnoredLink _ = Nothing
+
+        prettyPos :: Maybe PosInfo -> Text
+        prettyPos pos = 
+            let posToText :: Position -> Text
+                posToText (Position mPos) = fromMaybe "" mPos
+            in "(" <> (posToText $ toPosition pos) <> ")"
+        
+        prettyType :: NodeType -> Text
+        prettyType ty =
+            let mType = safeHead $ words $ show ty
+            in maybe "" id mType
+
+        fileMsg :: Text
+        fileMsg =
+            "\"ignore file\" must be at the top of \
+            \markdown or right after comments at the top"
+
+        linkMsg :: Text
+        linkMsg = "expected a LINK after \"ignore link\" "
+
+        paragraphMsg :: Text -> Text
+        paragraphMsg txt = unwords
+            [ "expected a PARAGRAPH after \
+               \\"ignore paragraph\", but found"
+            , txt
+            , ""
+            ]
+
+        unrecognisedMsg :: Text -> Text
+        unrecognisedMsg txt = unwords
+            [ "unrecognised option"
+            , "\"" <> txt <> "\""
+            , "perhaps you meant \
+               \<\"ignore link\"|\"ignore paragraph\"|\"ignore file\"> "
+            ]
+        
+        returnError :: Maybe IgnoreMode -> Text -> Maybe PosInfo -> StateT FileInfo (Except Text) ()
+        returnError mode txt pos =
+            let errMsg = case mode of
+                    Just Link      -> linkMsg
+                    Just Paragraph -> paragraphMsg txt
+                    Just File      -> fileMsg
+                    Nothing        -> unrecognisedMsg txt
+                posInfo = prettyPos pos
+            in lift $ throwE $ errMsg <> posInfo
+
+        processHtmlNode :: Node -> Maybe PosInfo -> [Node] -> Maybe IgnoreMode -> StateT FileInfo (Except Text) ()
+        processHtmlNode node pos nodes toIgnore = do
+            let xrefcheckContent = getXrefcheckContent node
+            case xrefcheckContent of
+                Just content -> maybe (returnError Nothing content pos)
+                    (loop nodes . pure) $ getIgnoreMode node
+                Nothing -> loop nodes toIgnore
+
+parseFileInfo :: LT.Text -> Either Text FileInfo
+parseFileInfo input = runExcept $ nodeExtractInfo $
+    commonmarkToNode [] [] $ toStrict input

 markdownScanner :: ScanAction
-markdownScanner path = liftIO $ do
-    res <- parseFileInfo path . decodeUtf8 <$> BSL.readFile path
-    force res `par` return res
+markdownScanner path = do
+    errOrInfo <- parseFileInfo . decodeUtf8 <$> BSL.readFile path
+    case errOrInfo of
+        Left errTxt -> do
+            die $ "Error when scanning " <> path <> ": " <> T.unpack errTxt
+        Right fileInfo -> return fileInfo

 markdownSupport :: ([Extension], ScanAction)
 markdownSupport = ([".md"], markdownScanner)