Replace transcript parser with cmark

We don’t need a very rich parser for transcripts, but we _do_ need to reliably identify fenced code blocks, and that implies a number of subtle cases. Using a polished CommonMark parser/printer handles those subtleties for us. I chose `cmark` for a few reasons: - it’s a wrapper around `libcmark`, which is the reference implementation of CommonMark, so it should be correct; - it provides both a parser and a printer (unlike MMark); and - it is extremely fast (about 20x faster than MMark), so the fact that our home-rolled parser got to skip over everything that’s not a block isn’t an issue.). This only _partially_ uses the `cmark` printer. I think it should use it entirely, but for the cases where we do streaming output (processing UCM commands, etc.) it’s a more involved change. So I think it should be handled separately.
2024-08-15 21:40:50 +03:00 · 2024-07-10 08:11:59 -06:00 · 2024-07-10 08:11:59 -06:00 · 9e2fa2bbe7
commit 9e2fa2bbe7
parent 2591ade890
3 changed files with 56 additions and 81 deletions
--- a/unison-cli/package.yaml
+++ b/unison-cli/package.yaml
@ -19,6 +19,7 @@ dependencies:
  - base
  - bytes
  - bytestring
+  - cmark
  - co-log-core
  - code-page
  - concurrent-output
--- a/unison-cli/src/Unison/Codebase/TranscriptParser.hs
+++ b/unison-cli/src/Unison/Codebase/TranscriptParser.hs
@ -18,6 +18,7 @@ module Unison.Codebase.TranscriptParser
  )
 where

+import CMark qualified
 import Control.Lens (use, (?~))
 import Crypto.Random qualified as Random
 import Data.Aeson qualified as Aeson
@ -121,12 +122,14 @@ instance Show APIRequest where
  show (GetRequest txt) = "GET " <> Text.unpack txt
  show (APIComment txt) = "-- " <> Text.unpack txt

+pattern CMarkCodeBlock :: (Maybe CMark.PosInfo) -> Text -> Text -> CMark.Node
+pattern CMarkCodeBlock pos info body = CMark.Node pos (CMark.CODE_BLOCK info body) []
+
 data Stanza
  = Ucm Hidden ExpectingError [UcmLine]
  | Unison Hidden ExpectingError (Maybe ScratchFileName) Text
  | API [APIRequest]
-  | UnprocessedFence FenceType Text
-  | Unfenced Text
+  | UnprocessedBlock CMark.Node

 instance Show UcmLine where
  show = \case
@ -138,43 +141,34 @@ instance Show UcmLine where
        UcmContextProject projectAndBranch -> Text.unpack (into @Text projectAndBranch)

 instance Show Stanza where
-  show s = case s of
+  show s = (<> "\n") . Text.unpack . CMark.nodeToCommonmark [] Nothing $ stanzaToNode s
+
+stanzaToNode :: Stanza -> CMark.Node
+stanzaToNode =
+  \case
    Ucm _ _ cmds ->
-      unlines
-        [ "```ucm",
-          foldl (\x y -> x ++ show y) "" cmds,
-          "```"
-        ]
+      CMarkCodeBlock Nothing "ucm" . Text.pack $
+        foldl (\x y -> x ++ show y) "" cmds
    Unison _hide _ fname txt ->
-      unlines
-        [ "```unison",
-          case fname of
-            Nothing -> Text.unpack txt <> "```\n"
-            Just fname ->
-              unlines
-                [ "---",
-                  "title: " <> Text.unpack fname,
-                  "---",
-                  Text.unpack txt,
-                  "```",
-                  ""
-                ]
-        ]
+      CMarkCodeBlock Nothing "unison" . Text.pack $
+        unlines
+          [ case fname of
+              Nothing -> Text.unpack txt
+              Just fname ->
+                unlines
+                  [ "---",
+                    "title: " <> Text.unpack fname,
+                    "---",
+                    Text.unpack txt
+                  ]
+          ]
    API apiRequests ->
-      "```api\n"
-        <> ( apiRequests
-               & fmap show
-               & unlines
-           )
-        <> "```\n"
-    UnprocessedFence typ txt ->
-      unlines
-        [ "```" <> Text.unpack typ,
-          Text.unpack txt,
-          "```",
-          ""
-        ]
-    Unfenced txt -> Text.unpack txt
+      CMarkCodeBlock Nothing "api" . Text.pack $
+        ( apiRequests
+            & fmap show
+            & unlines
+        )
+    UnprocessedBlock node -> node

 parseFile :: FilePath -> IO (Either TranscriptError [Stanza])
 parseFile filePath = do
@ -186,7 +180,7 @@ parseFile filePath = do
    else pure . Left . TranscriptParseError . Text.pack $ filePath <> " does not exist"

 parse :: String -> Text -> Either TranscriptError [Stanza]
-parse srcName txt = case P.parse (stanzas <* P.eof) srcName txt of
+parse srcName txt = case stanzas srcName txt of
  Right a -> Right a
  Left e -> Left . TranscriptParseError . Text.pack . P.errorBundlePretty $ e

@ -337,7 +331,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
              for (reverse scratchFileUpdates) \(fp, contents) -> do
                let fenceDescription = "unison:added-by-ucm " <> fp
                -- Output blocks for any scratch file updates the ucm block triggered.
-                Q.undequeue inputQueue (UnprocessedFence fenceDescription contents, Nothing)
+                Q.undequeue inputQueue (UnprocessedBlock $ CMarkCodeBlock Nothing fenceDescription contents, Nothing)
            awaitInput
          -- ucm command to run
          Just (Just ucmLine) -> do
@ -420,10 +414,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
                      ++ "."
                  IO.hFlush IO.stdout
                case s of
-                  Unfenced _ -> do
-                    liftIO (output $ show s)
-                    awaitInput
-                  UnprocessedFence _ _ -> do
+                  UnprocessedBlock _ -> do
                    liftIO (output $ show s)
                    awaitInput
                  Unison hide errOk filename txt -> do
@ -593,8 +584,12 @@ transcriptFailure out msg = do

 type P = P.Parsec Void Text

-stanzas :: P [Stanza]
-stanzas = P.many (fenced <|> unfenced)
+stanzas :: String -> Text -> Either (P.ParseErrorBundle Text Void) [Stanza]
+stanzas srcName = (\(CMark.Node _ _DOCUMENT blocks) -> traverse stanzaFromBlock blocks) . CMark.commonmarkToNode []
+  where
+    stanzaFromBlock block = case block of
+      CMarkCodeBlock _ info body -> fromMaybe (UnprocessedBlock block) <$> P.parse (fenced info) srcName body
+      _ -> pure $ UnprocessedBlock block

 ucmLine :: P UcmLine
 ucmLine = ucmCommand <|> ucmComment
@ -636,18 +631,21 @@ apiRequest = do
      spaces
      pure (APIComment comment)

-fenced :: P Stanza
-fenced = do
-  fence
+-- | Produce the correct parser for the code block based on the provided info string.
+fenced :: Text -> P (Maybe Stanza)
+fenced info = do
+  body <- P.getInput
+  P.setInput info
  fenceType <- lineToken (word "ucm" <|> word "unison" <|> word "api" <|> language)
  stanza <-
    case fenceType of
      "ucm" -> do
        hide <- hidden
        err <- expectingError
+        P.setInput body
        _ <- spaces
        cmds <- many ucmLine
-        pure $ Ucm hide err cmds
+        pure . pure $ Ucm hide err cmds
      "unison" ->
        do
          -- todo: this has to be more interesting
@ -657,44 +655,17 @@ fenced = do
          hide <- lineToken hidden
          err <- lineToken expectingError
          fileName <- optional untilSpace1
-          blob <- spaces *> untilFence
-          pure $ Unison hide err fileName blob
+          P.setInput body
+          blob <- spaces *> (Text.init <$> P.getInput)
+          pure . pure $ Unison hide err fileName blob
      "api" -> do
+        P.setInput body
        _ <- spaces
        apiRequests <- many apiRequest
-        pure $ API apiRequests
-      _ -> UnprocessedFence fenceType <$> untilFence
-  fence
+        pure . pure $ API apiRequests
+      _ -> pure Nothing
  pure stanza

-- Three backticks, consumes trailing spaces too
-- ```
-fence :: P ()
-fence = P.try $ do void (word "```"); spaces
-
-- Parses up until next fence
-unfenced :: P Stanza
-unfenced = Unfenced <$> untilFence
-
-untilFence :: P Text
-untilFence = do
-  _ <- P.lookAhead (P.takeP Nothing 1)
-  go mempty
-  where
-    go :: Seq Text -> P Text
-    go !acc = do
-      f <- P.lookAhead (P.optional fence)
-      case f of
-        Nothing -> do
-          oneOrTwoBackticks <- optional (word' "``" <|> word' "`")
-          let start = fromMaybe "" oneOrTwoBackticks
-          txt <- P.takeWhileP (Just "unfenced") (/= '`')
-          eof <- P.lookAhead (P.optional P.eof)
-          case eof of
-            Just _ -> pure $ fold (acc <> pure txt)
-            Nothing -> go (acc <> pure start <> pure txt)
-        Just _ -> pure $ fold acc
-
 word' :: Text -> P Text
 word' txt = P.try $ do
  chs <- P.takeP (Just $ show txt) (Text.length txt)
--- a/unison-cli/unison-cli.cabal
+++ b/unison-cli/unison-cli.cabal
@ -196,6 +196,7 @@ library
    , base
    , bytes
    , bytestring
+    , cmark
    , co-log-core
    , code-page
    , concurrent-output
@ -337,6 +338,7 @@ executable transcripts
    , base
    , bytes
    , bytestring
+    , cmark
    , co-log-core
    , code-page
    , concurrent-output
@ -485,6 +487,7 @@ test-suite cli-tests
    , base
    , bytes
    , bytestring
+    , cmark
    , co-log-core
    , code-page
    , concurrent-output