From 9e2fa2bbe74e72b2e751986f83cbead4b3848754 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 10 Jul 2024 08:11:59 -0600 Subject: [PATCH] Replace transcript parser with `cmark` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don’t need a very rich parser for transcripts, but we _do_ need to reliably identify fenced code blocks, and that implies a number of subtle cases. Using a polished CommonMark parser/printer handles those subtleties for us. I chose `cmark` for a few reasons: - it’s a wrapper around `libcmark`, which is the reference implementation of CommonMark, so it should be correct; - it provides both a parser and a printer (unlike MMark); and - it is extremely fast (about 20x faster than MMark), so the fact that our home-rolled parser got to skip over everything that’s not a block isn’t an issue.). This only _partially_ uses the `cmark` printer. I think it should use it entirely, but for the cases where we do streaming output (processing UCM commands, etc.) it’s a more involved change. So I think it should be handled separately. --- unison-cli/package.yaml | 1 + .../src/Unison/Codebase/TranscriptParser.hs | 133 +++++++----------- unison-cli/unison-cli.cabal | 3 + 3 files changed, 56 insertions(+), 81 deletions(-) diff --git a/unison-cli/package.yaml b/unison-cli/package.yaml index d64ed16ae..23b18fa9d 100644 --- a/unison-cli/package.yaml +++ b/unison-cli/package.yaml @@ -19,6 +19,7 @@ dependencies: - base - bytes - bytestring + - cmark - co-log-core - code-page - concurrent-output diff --git a/unison-cli/src/Unison/Codebase/TranscriptParser.hs b/unison-cli/src/Unison/Codebase/TranscriptParser.hs index b9e82f7ed..ebabe7b4d 100644 --- a/unison-cli/src/Unison/Codebase/TranscriptParser.hs +++ b/unison-cli/src/Unison/Codebase/TranscriptParser.hs @@ -18,6 +18,7 @@ module Unison.Codebase.TranscriptParser ) where +import CMark qualified import Control.Lens (use, (?~)) import Crypto.Random qualified as Random import Data.Aeson qualified as Aeson @@ -121,12 +122,14 @@ instance Show APIRequest where show (GetRequest txt) = "GET " <> Text.unpack txt show (APIComment txt) = "-- " <> Text.unpack txt +pattern CMarkCodeBlock :: (Maybe CMark.PosInfo) -> Text -> Text -> CMark.Node +pattern CMarkCodeBlock pos info body = CMark.Node pos (CMark.CODE_BLOCK info body) [] + data Stanza = Ucm Hidden ExpectingError [UcmLine] | Unison Hidden ExpectingError (Maybe ScratchFileName) Text | API [APIRequest] - | UnprocessedFence FenceType Text - | Unfenced Text + | UnprocessedBlock CMark.Node instance Show UcmLine where show = \case @@ -138,43 +141,34 @@ instance Show UcmLine where UcmContextProject projectAndBranch -> Text.unpack (into @Text projectAndBranch) instance Show Stanza where - show s = case s of + show s = (<> "\n") . Text.unpack . CMark.nodeToCommonmark [] Nothing $ stanzaToNode s + +stanzaToNode :: Stanza -> CMark.Node +stanzaToNode = + \case Ucm _ _ cmds -> - unlines - [ "```ucm", - foldl (\x y -> x ++ show y) "" cmds, - "```" - ] + CMarkCodeBlock Nothing "ucm" . Text.pack $ + foldl (\x y -> x ++ show y) "" cmds Unison _hide _ fname txt -> - unlines - [ "```unison", - case fname of - Nothing -> Text.unpack txt <> "```\n" - Just fname -> - unlines - [ "---", - "title: " <> Text.unpack fname, - "---", - Text.unpack txt, - "```", - "" - ] - ] + CMarkCodeBlock Nothing "unison" . Text.pack $ + unlines + [ case fname of + Nothing -> Text.unpack txt + Just fname -> + unlines + [ "---", + "title: " <> Text.unpack fname, + "---", + Text.unpack txt + ] + ] API apiRequests -> - "```api\n" - <> ( apiRequests - & fmap show - & unlines - ) - <> "```\n" - UnprocessedFence typ txt -> - unlines - [ "```" <> Text.unpack typ, - Text.unpack txt, - "```", - "" - ] - Unfenced txt -> Text.unpack txt + CMarkCodeBlock Nothing "api" . Text.pack $ + ( apiRequests + & fmap show + & unlines + ) + UnprocessedBlock node -> node parseFile :: FilePath -> IO (Either TranscriptError [Stanza]) parseFile filePath = do @@ -186,7 +180,7 @@ parseFile filePath = do else pure . Left . TranscriptParseError . Text.pack $ filePath <> " does not exist" parse :: String -> Text -> Either TranscriptError [Stanza] -parse srcName txt = case P.parse (stanzas <* P.eof) srcName txt of +parse srcName txt = case stanzas srcName txt of Right a -> Right a Left e -> Left . TranscriptParseError . Text.pack . P.errorBundlePretty $ e @@ -337,7 +331,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion for (reverse scratchFileUpdates) \(fp, contents) -> do let fenceDescription = "unison:added-by-ucm " <> fp -- Output blocks for any scratch file updates the ucm block triggered. - Q.undequeue inputQueue (UnprocessedFence fenceDescription contents, Nothing) + Q.undequeue inputQueue (UnprocessedBlock $ CMarkCodeBlock Nothing fenceDescription contents, Nothing) awaitInput -- ucm command to run Just (Just ucmLine) -> do @@ -420,10 +414,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion ++ "." IO.hFlush IO.stdout case s of - Unfenced _ -> do - liftIO (output $ show s) - awaitInput - UnprocessedFence _ _ -> do + UnprocessedBlock _ -> do liftIO (output $ show s) awaitInput Unison hide errOk filename txt -> do @@ -593,8 +584,12 @@ transcriptFailure out msg = do type P = P.Parsec Void Text -stanzas :: P [Stanza] -stanzas = P.many (fenced <|> unfenced) +stanzas :: String -> Text -> Either (P.ParseErrorBundle Text Void) [Stanza] +stanzas srcName = (\(CMark.Node _ _DOCUMENT blocks) -> traverse stanzaFromBlock blocks) . CMark.commonmarkToNode [] + where + stanzaFromBlock block = case block of + CMarkCodeBlock _ info body -> fromMaybe (UnprocessedBlock block) <$> P.parse (fenced info) srcName body + _ -> pure $ UnprocessedBlock block ucmLine :: P UcmLine ucmLine = ucmCommand <|> ucmComment @@ -636,18 +631,21 @@ apiRequest = do spaces pure (APIComment comment) -fenced :: P Stanza -fenced = do - fence +-- | Produce the correct parser for the code block based on the provided info string. +fenced :: Text -> P (Maybe Stanza) +fenced info = do + body <- P.getInput + P.setInput info fenceType <- lineToken (word "ucm" <|> word "unison" <|> word "api" <|> language) stanza <- case fenceType of "ucm" -> do hide <- hidden err <- expectingError + P.setInput body _ <- spaces cmds <- many ucmLine - pure $ Ucm hide err cmds + pure . pure $ Ucm hide err cmds "unison" -> do -- todo: this has to be more interesting @@ -657,44 +655,17 @@ fenced = do hide <- lineToken hidden err <- lineToken expectingError fileName <- optional untilSpace1 - blob <- spaces *> untilFence - pure $ Unison hide err fileName blob + P.setInput body + blob <- spaces *> (Text.init <$> P.getInput) + pure . pure $ Unison hide err fileName blob "api" -> do + P.setInput body _ <- spaces apiRequests <- many apiRequest - pure $ API apiRequests - _ -> UnprocessedFence fenceType <$> untilFence - fence + pure . pure $ API apiRequests + _ -> pure Nothing pure stanza --- Three backticks, consumes trailing spaces too --- ``` -fence :: P () -fence = P.try $ do void (word "```"); spaces - --- Parses up until next fence -unfenced :: P Stanza -unfenced = Unfenced <$> untilFence - -untilFence :: P Text -untilFence = do - _ <- P.lookAhead (P.takeP Nothing 1) - go mempty - where - go :: Seq Text -> P Text - go !acc = do - f <- P.lookAhead (P.optional fence) - case f of - Nothing -> do - oneOrTwoBackticks <- optional (word' "``" <|> word' "`") - let start = fromMaybe "" oneOrTwoBackticks - txt <- P.takeWhileP (Just "unfenced") (/= '`') - eof <- P.lookAhead (P.optional P.eof) - case eof of - Just _ -> pure $ fold (acc <> pure txt) - Nothing -> go (acc <> pure start <> pure txt) - Just _ -> pure $ fold acc - word' :: Text -> P Text word' txt = P.try $ do chs <- P.takeP (Just $ show txt) (Text.length txt) diff --git a/unison-cli/unison-cli.cabal b/unison-cli/unison-cli.cabal index d0b19db6a..dc7b8f2b8 100644 --- a/unison-cli/unison-cli.cabal +++ b/unison-cli/unison-cli.cabal @@ -196,6 +196,7 @@ library , base , bytes , bytestring + , cmark , co-log-core , code-page , concurrent-output @@ -337,6 +338,7 @@ executable transcripts , base , bytes , bytestring + , cmark , co-log-core , code-page , concurrent-output @@ -485,6 +487,7 @@ test-suite cli-tests , base , bytes , bytestring + , cmark , co-log-core , code-page , concurrent-output