mirror of
https://github.com/unisonweb/unison.git
synced 2024-08-15 21:40:50 +03:00
Replace transcript parser with cmark
We don’t need a very rich parser for transcripts, but we _do_ need to reliably identify fenced code blocks, and that implies a number of subtle cases. Using a polished CommonMark parser/printer handles those subtleties for us. I chose `cmark` for a few reasons: - it’s a wrapper around `libcmark`, which is the reference implementation of CommonMark, so it should be correct; - it provides both a parser and a printer (unlike MMark); and - it is extremely fast (about 20x faster than MMark), so the fact that our home-rolled parser got to skip over everything that’s not a block isn’t an issue.). This only _partially_ uses the `cmark` printer. I think it should use it entirely, but for the cases where we do streaming output (processing UCM commands, etc.) it’s a more involved change. So I think it should be handled separately.
This commit is contained in:
parent
2591ade890
commit
9e2fa2bbe7
@ -19,6 +19,7 @@ dependencies:
|
||||
- base
|
||||
- bytes
|
||||
- bytestring
|
||||
- cmark
|
||||
- co-log-core
|
||||
- code-page
|
||||
- concurrent-output
|
||||
|
@ -18,6 +18,7 @@ module Unison.Codebase.TranscriptParser
|
||||
)
|
||||
where
|
||||
|
||||
import CMark qualified
|
||||
import Control.Lens (use, (?~))
|
||||
import Crypto.Random qualified as Random
|
||||
import Data.Aeson qualified as Aeson
|
||||
@ -121,12 +122,14 @@ instance Show APIRequest where
|
||||
show (GetRequest txt) = "GET " <> Text.unpack txt
|
||||
show (APIComment txt) = "-- " <> Text.unpack txt
|
||||
|
||||
pattern CMarkCodeBlock :: (Maybe CMark.PosInfo) -> Text -> Text -> CMark.Node
|
||||
pattern CMarkCodeBlock pos info body = CMark.Node pos (CMark.CODE_BLOCK info body) []
|
||||
|
||||
data Stanza
|
||||
= Ucm Hidden ExpectingError [UcmLine]
|
||||
| Unison Hidden ExpectingError (Maybe ScratchFileName) Text
|
||||
| API [APIRequest]
|
||||
| UnprocessedFence FenceType Text
|
||||
| Unfenced Text
|
||||
| UnprocessedBlock CMark.Node
|
||||
|
||||
instance Show UcmLine where
|
||||
show = \case
|
||||
@ -138,43 +141,34 @@ instance Show UcmLine where
|
||||
UcmContextProject projectAndBranch -> Text.unpack (into @Text projectAndBranch)
|
||||
|
||||
instance Show Stanza where
|
||||
show s = case s of
|
||||
show s = (<> "\n") . Text.unpack . CMark.nodeToCommonmark [] Nothing $ stanzaToNode s
|
||||
|
||||
stanzaToNode :: Stanza -> CMark.Node
|
||||
stanzaToNode =
|
||||
\case
|
||||
Ucm _ _ cmds ->
|
||||
unlines
|
||||
[ "```ucm",
|
||||
foldl (\x y -> x ++ show y) "" cmds,
|
||||
"```"
|
||||
]
|
||||
CMarkCodeBlock Nothing "ucm" . Text.pack $
|
||||
foldl (\x y -> x ++ show y) "" cmds
|
||||
Unison _hide _ fname txt ->
|
||||
unlines
|
||||
[ "```unison",
|
||||
case fname of
|
||||
Nothing -> Text.unpack txt <> "```\n"
|
||||
Just fname ->
|
||||
unlines
|
||||
[ "---",
|
||||
"title: " <> Text.unpack fname,
|
||||
"---",
|
||||
Text.unpack txt,
|
||||
"```",
|
||||
""
|
||||
]
|
||||
]
|
||||
CMarkCodeBlock Nothing "unison" . Text.pack $
|
||||
unlines
|
||||
[ case fname of
|
||||
Nothing -> Text.unpack txt
|
||||
Just fname ->
|
||||
unlines
|
||||
[ "---",
|
||||
"title: " <> Text.unpack fname,
|
||||
"---",
|
||||
Text.unpack txt
|
||||
]
|
||||
]
|
||||
API apiRequests ->
|
||||
"```api\n"
|
||||
<> ( apiRequests
|
||||
& fmap show
|
||||
& unlines
|
||||
)
|
||||
<> "```\n"
|
||||
UnprocessedFence typ txt ->
|
||||
unlines
|
||||
[ "```" <> Text.unpack typ,
|
||||
Text.unpack txt,
|
||||
"```",
|
||||
""
|
||||
]
|
||||
Unfenced txt -> Text.unpack txt
|
||||
CMarkCodeBlock Nothing "api" . Text.pack $
|
||||
( apiRequests
|
||||
& fmap show
|
||||
& unlines
|
||||
)
|
||||
UnprocessedBlock node -> node
|
||||
|
||||
parseFile :: FilePath -> IO (Either TranscriptError [Stanza])
|
||||
parseFile filePath = do
|
||||
@ -186,7 +180,7 @@ parseFile filePath = do
|
||||
else pure . Left . TranscriptParseError . Text.pack $ filePath <> " does not exist"
|
||||
|
||||
parse :: String -> Text -> Either TranscriptError [Stanza]
|
||||
parse srcName txt = case P.parse (stanzas <* P.eof) srcName txt of
|
||||
parse srcName txt = case stanzas srcName txt of
|
||||
Right a -> Right a
|
||||
Left e -> Left . TranscriptParseError . Text.pack . P.errorBundlePretty $ e
|
||||
|
||||
@ -337,7 +331,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
|
||||
for (reverse scratchFileUpdates) \(fp, contents) -> do
|
||||
let fenceDescription = "unison:added-by-ucm " <> fp
|
||||
-- Output blocks for any scratch file updates the ucm block triggered.
|
||||
Q.undequeue inputQueue (UnprocessedFence fenceDescription contents, Nothing)
|
||||
Q.undequeue inputQueue (UnprocessedBlock $ CMarkCodeBlock Nothing fenceDescription contents, Nothing)
|
||||
awaitInput
|
||||
-- ucm command to run
|
||||
Just (Just ucmLine) -> do
|
||||
@ -420,10 +414,7 @@ run verbosity dir stanzas codebase runtime sbRuntime nRuntime config ucmVersion
|
||||
++ "."
|
||||
IO.hFlush IO.stdout
|
||||
case s of
|
||||
Unfenced _ -> do
|
||||
liftIO (output $ show s)
|
||||
awaitInput
|
||||
UnprocessedFence _ _ -> do
|
||||
UnprocessedBlock _ -> do
|
||||
liftIO (output $ show s)
|
||||
awaitInput
|
||||
Unison hide errOk filename txt -> do
|
||||
@ -593,8 +584,12 @@ transcriptFailure out msg = do
|
||||
|
||||
type P = P.Parsec Void Text
|
||||
|
||||
stanzas :: P [Stanza]
|
||||
stanzas = P.many (fenced <|> unfenced)
|
||||
stanzas :: String -> Text -> Either (P.ParseErrorBundle Text Void) [Stanza]
|
||||
stanzas srcName = (\(CMark.Node _ _DOCUMENT blocks) -> traverse stanzaFromBlock blocks) . CMark.commonmarkToNode []
|
||||
where
|
||||
stanzaFromBlock block = case block of
|
||||
CMarkCodeBlock _ info body -> fromMaybe (UnprocessedBlock block) <$> P.parse (fenced info) srcName body
|
||||
_ -> pure $ UnprocessedBlock block
|
||||
|
||||
ucmLine :: P UcmLine
|
||||
ucmLine = ucmCommand <|> ucmComment
|
||||
@ -636,18 +631,21 @@ apiRequest = do
|
||||
spaces
|
||||
pure (APIComment comment)
|
||||
|
||||
fenced :: P Stanza
|
||||
fenced = do
|
||||
fence
|
||||
-- | Produce the correct parser for the code block based on the provided info string.
|
||||
fenced :: Text -> P (Maybe Stanza)
|
||||
fenced info = do
|
||||
body <- P.getInput
|
||||
P.setInput info
|
||||
fenceType <- lineToken (word "ucm" <|> word "unison" <|> word "api" <|> language)
|
||||
stanza <-
|
||||
case fenceType of
|
||||
"ucm" -> do
|
||||
hide <- hidden
|
||||
err <- expectingError
|
||||
P.setInput body
|
||||
_ <- spaces
|
||||
cmds <- many ucmLine
|
||||
pure $ Ucm hide err cmds
|
||||
pure . pure $ Ucm hide err cmds
|
||||
"unison" ->
|
||||
do
|
||||
-- todo: this has to be more interesting
|
||||
@ -657,44 +655,17 @@ fenced = do
|
||||
hide <- lineToken hidden
|
||||
err <- lineToken expectingError
|
||||
fileName <- optional untilSpace1
|
||||
blob <- spaces *> untilFence
|
||||
pure $ Unison hide err fileName blob
|
||||
P.setInput body
|
||||
blob <- spaces *> (Text.init <$> P.getInput)
|
||||
pure . pure $ Unison hide err fileName blob
|
||||
"api" -> do
|
||||
P.setInput body
|
||||
_ <- spaces
|
||||
apiRequests <- many apiRequest
|
||||
pure $ API apiRequests
|
||||
_ -> UnprocessedFence fenceType <$> untilFence
|
||||
fence
|
||||
pure . pure $ API apiRequests
|
||||
_ -> pure Nothing
|
||||
pure stanza
|
||||
|
||||
-- Three backticks, consumes trailing spaces too
|
||||
-- ```
|
||||
fence :: P ()
|
||||
fence = P.try $ do void (word "```"); spaces
|
||||
|
||||
-- Parses up until next fence
|
||||
unfenced :: P Stanza
|
||||
unfenced = Unfenced <$> untilFence
|
||||
|
||||
untilFence :: P Text
|
||||
untilFence = do
|
||||
_ <- P.lookAhead (P.takeP Nothing 1)
|
||||
go mempty
|
||||
where
|
||||
go :: Seq Text -> P Text
|
||||
go !acc = do
|
||||
f <- P.lookAhead (P.optional fence)
|
||||
case f of
|
||||
Nothing -> do
|
||||
oneOrTwoBackticks <- optional (word' "``" <|> word' "`")
|
||||
let start = fromMaybe "" oneOrTwoBackticks
|
||||
txt <- P.takeWhileP (Just "unfenced") (/= '`')
|
||||
eof <- P.lookAhead (P.optional P.eof)
|
||||
case eof of
|
||||
Just _ -> pure $ fold (acc <> pure txt)
|
||||
Nothing -> go (acc <> pure start <> pure txt)
|
||||
Just _ -> pure $ fold acc
|
||||
|
||||
word' :: Text -> P Text
|
||||
word' txt = P.try $ do
|
||||
chs <- P.takeP (Just $ show txt) (Text.length txt)
|
||||
|
@ -196,6 +196,7 @@ library
|
||||
, base
|
||||
, bytes
|
||||
, bytestring
|
||||
, cmark
|
||||
, co-log-core
|
||||
, code-page
|
||||
, concurrent-output
|
||||
@ -337,6 +338,7 @@ executable transcripts
|
||||
, base
|
||||
, bytes
|
||||
, bytestring
|
||||
, cmark
|
||||
, co-log-core
|
||||
, code-page
|
||||
, concurrent-output
|
||||
@ -485,6 +487,7 @@ test-suite cli-tests
|
||||
, base
|
||||
, bytes
|
||||
, bytestring
|
||||
, cmark
|
||||
, co-log-core
|
||||
, code-page
|
||||
, concurrent-output
|
||||
|
Loading…
Reference in New Issue
Block a user