Replace transcript parser with cmark

We don’t need a very rich parser for transcripts, but we _do_ need to
reliably identify fenced code blocks, and that implies a number of
subtle cases. Using a polished CommonMark parser/printer handles those
subtleties for us.

I chose `cmark` for a few reasons:
- it’s a wrapper around `libcmark`, which is the reference
  implementation of CommonMark, so it should be correct;
- it provides both a parser and a printer (unlike MMark); and
- it is extremely fast (about 20x faster than MMark), so the fact that
  our home-rolled parser got to skip over everything that’s not a block
  isn’t an issue.).

This only _partially_ uses the `cmark` printer. I think it should use it
entirely, but for the cases where we do streaming output (processing UCM
commands, etc.) it’s a more involved change. So I think it should be
handled separately.
This commit is contained in:
Greg Pfeil 2024-07-10 08:11:59 -06:00
parent 2591ade890
commit 9e2fa2bbe7
No known key found for this signature in database
GPG Key ID: 1193ACD196ED61F2
3 changed files with 56 additions and 81 deletions

View File

@ -19,6 +19,7 @@ dependencies:
- base
- bytes
- bytestring
- cmark
- co-log-core
- code-page
- concurrent-output

View File

@ -18,6 +18,7 @@ module Unison.Codebase.TranscriptParser
)
where
import CMark qualified
import Control.Lens (use, (?~))
import Crypto.Random qualified as Random
import Data.Aeson qualified as Aeson
@ -121,12 +122,14 @@ instance Show APIRequest where
show (GetRequest txt) = "GET " <> Text.unpack txt
show (APIComment txt) = "-- " <> Text.unpack txt
pattern CMarkCodeBlock :: (Maybe CMark.PosInfo) -> Text -> Text -> CMark.Node
pattern CMarkCodeBlock pos info body = CMark.Node pos (CMark.CODE_BLOCK info body) []
data Stanza
= Ucm Hidden ExpectingError [UcmLine]
| Unison Hidden ExpectingError (Maybe ScratchFileName) Text
| API [APIRequest]
| UnprocessedFence FenceType Text
| Unfenced Text
| UnprocessedBlock CMark.Node
instance Show UcmLine where
show = \case
@ -138,43 +141,34 @@ instance Show UcmLine where
UcmContextProject projectAndBranch -> Text.unpack (into @Text projectAndBranch)
instance Show Stanza where
show s = case s of
show s = (<> "\n") . Text.unpack . CMark.nodeToCommonmark [] Nothing $ stanzaToNode s
stanzaToNode :: Stanza -> CMark.Node
stanzaToNode =
\case
Ucm _ _ cmds ->
unlines
[ "```ucm",
foldl (\x y -> x ++ show y) "" cmds,
"```"
]
CMarkCodeBlock Nothing "ucm" . Text.pack $
foldl (\x y -> x ++ show y) "" cmds
Unison _hide _ fname txt ->
unlines
[ "```unison",
case fname of
Nothing -> Text.unpack txt <> "```\n"
Just fname ->
unlines
[ "---",
"title: " <> Text.unpack fname,
"---",
Text.unpack txt,
"```",
""
]
]
CMarkCodeBlock Nothing "unison" . Text.pack $
unlines
[ case fname of
Nothing -> Text.unpack txt
Just fname ->
unlines
[ "---",
"title: " <> Text.unpack fname,
"---",
Text.unpack txt
]
]
API apiRequests ->
"```api\n"
<> ( apiRequests
& fmap show
& unlines
)
<> "```\n"
UnprocessedFence typ txt ->
unlines
[ "```" <> Text.unpack typ,
Text.unpack txt,
"```",
""
]
Unfenced txt -> Text.unpack txt
CMarkCodeBlock Nothing "api" . Text.pack $
( apiRequests
& fmap show
& unlines
)
UnprocessedBlock node -> node
parseFile :: FilePath -> IO (Either TranscriptError [Stanza])
parseFile filePath = do
@ -186,7 +180,7 @@ parseFile filePath = do
else pure . Left . TranscriptParseError . Text.pack $ filePath <> " does not exist"
parse :: String -> Text -> Either TranscriptError [Stanza]
parse srcName txt = case P.parse (stanzas <* P.eof) srcName txt of
parse srcName txt = case stanzas srcName txt of
Right a -> Right a
Left e -> Left . TranscriptParseError . Text.pack . P.errorBundlePretty $ e
@ -337,7 +331,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
for (reverse scratchFileUpdates) \(fp, contents) -> do
let fenceDescription = "unison:added-by-ucm " <> fp
-- Output blocks for any scratch file updates the ucm block triggered.
Q.undequeue inputQueue (UnprocessedFence fenceDescription contents, Nothing)
Q.undequeue inputQueue (UnprocessedBlock $ CMarkCodeBlock Nothing fenceDescription contents, Nothing)
awaitInput
-- ucm command to run
Just (Just ucmLine) -> do
@ -420,10 +414,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
++ "."
IO.hFlush IO.stdout
case s of
Unfenced _ -> do
liftIO (output $ show s)
awaitInput
UnprocessedFence _ _ -> do
UnprocessedBlock _ -> do
liftIO (output $ show s)
awaitInput
Unison hide errOk filename txt -> do
@ -593,8 +584,12 @@ transcriptFailure out msg = do
type P = P.Parsec Void Text
stanzas :: P [Stanza]
stanzas = P.many (fenced <|> unfenced)
stanzas :: String -> Text -> Either (P.ParseErrorBundle Text Void) [Stanza]
stanzas srcName = (\(CMark.Node _ _DOCUMENT blocks) -> traverse stanzaFromBlock blocks) . CMark.commonmarkToNode []
where
stanzaFromBlock block = case block of
CMarkCodeBlock _ info body -> fromMaybe (UnprocessedBlock block) <$> P.parse (fenced info) srcName body
_ -> pure $ UnprocessedBlock block
ucmLine :: P UcmLine
ucmLine = ucmCommand <|> ucmComment
@ -636,18 +631,21 @@ apiRequest = do
spaces
pure (APIComment comment)
fenced :: P Stanza
fenced = do
fence
-- | Produce the correct parser for the code block based on the provided info string.
fenced :: Text -> P (Maybe Stanza)
fenced info = do
body <- P.getInput
P.setInput info
fenceType <- lineToken (word "ucm" <|> word "unison" <|> word "api" <|> language)
stanza <-
case fenceType of
"ucm" -> do
hide <- hidden
err <- expectingError
P.setInput body
_ <- spaces
cmds <- many ucmLine
pure $ Ucm hide err cmds
pure . pure $ Ucm hide err cmds
"unison" ->
do
-- todo: this has to be more interesting
@ -657,44 +655,17 @@ fenced = do
hide <- lineToken hidden
err <- lineToken expectingError
fileName <- optional untilSpace1
blob <- spaces *> untilFence
pure $ Unison hide err fileName blob
P.setInput body
blob <- spaces *> (Text.init <$> P.getInput)
pure . pure $ Unison hide err fileName blob
"api" -> do
P.setInput body
_ <- spaces
apiRequests <- many apiRequest
pure $ API apiRequests
_ -> UnprocessedFence fenceType <$> untilFence
fence
pure . pure $ API apiRequests
_ -> pure Nothing
pure stanza
-- Three backticks, consumes trailing spaces too
-- ```
fence :: P ()
fence = P.try $ do void (word "```"); spaces
-- Parses up until next fence
unfenced :: P Stanza
unfenced = Unfenced <$> untilFence
untilFence :: P Text
untilFence = do
_ <- P.lookAhead (P.takeP Nothing 1)
go mempty
where
go :: Seq Text -> P Text
go !acc = do
f <- P.lookAhead (P.optional fence)
case f of
Nothing -> do
oneOrTwoBackticks <- optional (word' "``" <|> word' "`")
let start = fromMaybe "" oneOrTwoBackticks
txt <- P.takeWhileP (Just "unfenced") (/= '`')
eof <- P.lookAhead (P.optional P.eof)
case eof of
Just _ -> pure $ fold (acc <> pure txt)
Nothing -> go (acc <> pure start <> pure txt)
Just _ -> pure $ fold acc
word' :: Text -> P Text
word' txt = P.try $ do
chs <- P.takeP (Just $ show txt) (Text.length txt)

View File

@ -196,6 +196,7 @@ library
, base
, bytes
, bytestring
, cmark
, co-log-core
, code-page
, concurrent-output
@ -337,6 +338,7 @@ executable transcripts
, base
, bytes
, bytestring
, cmark
, co-log-core
, code-page
, concurrent-output
@ -485,6 +487,7 @@ test-suite cli-tests
, base
, bytes
, bytestring
, cmark
, co-log-core
, code-page
, concurrent-output