delete old doc support from the lexer and parser

Mitchell Rosen 2024-06-21 16:24:41 -04:00
parent 2010b57e2f
commit 5ca1ef7829
2 changed files with 0 additions and 396 deletions

@@ -19,11 +19,8 @@ import Data.List qualified as List
import Data.List.Extra qualified as List.Extra
import Data.List.NonEmpty (NonEmpty ((:|)))
import Data.List.NonEmpty qualified as NonEmpty
import Data.Maybe qualified as Maybe
import Data.Sequence qualified as Sequence
import Data.Set qualified as Set
import Data.Text qualified as Text
import Data.Tuple.Extra qualified as TupleE
import Text.Megaparsec qualified as P
import U.Core.ABT qualified as ABT
import Unison.ABT qualified as ABT
@@ -56,7 +53,6 @@ import Unison.Type (Type)
import Unison.Type qualified as Type
import Unison.Typechecker.Components qualified as Components
import Unison.Util.Bytes qualified as Bytes
import Unison.Util.List (intercalateMapWith, quenchRuns)
import Unison.Var (Var)
import Unison.Var qualified as Var
import Prelude hiding (and, or, seq)
@@ -129,15 +125,6 @@ termLink' = do
      | Set.size s == 1 -> pure $ const (Set.findMin s) <$> id
      | otherwise -> customFailure $ UnknownTerm id s

link' :: (Monad m, Var v) => P v m (Either (L.Token Reference) (L.Token Referent))
link' = do
  id <- hqPrefixId
  ns <- asks names
  case (Names.lookupHQTerm Names.IncludeSuffixes (L.payload id) ns, Names.lookupHQType Names.IncludeSuffixes (L.payload id) ns) of
    (s, s2) | Set.size s == 1 && Set.null s2 -> pure . Right $ const (Set.findMin s) <$> id
    (s, s2) | Set.size s2 == 1 && Set.null s -> pure . Left $ const (Set.findMin s2) <$> id
    (s, s2) -> customFailure $ UnknownId id s s2

link :: (Monad m, Var v) => TermP v m
link = termLink <|> typeLink
  where
@@ -452,7 +439,6 @@ termLeaf =
      delayQuote,
      (snd <$> delayBlock),
      bang,
      docBlock,
      doc2Block <&> \(spanAnn, trm) -> trm {ABT.annotation = ABT.annotation trm <> spanAnn}
    ]
@@ -610,66 +596,6 @@ doc2Block = do
        pure $ (spanAnn, Term.apps' f [addDelay tm])
      _ -> regular

docBlock :: (Monad m, Var v) => TermP v m
docBlock = do
  openTok <- openBlockWith "[:"
  segs <- many segment
  closeTok <- closeBlock
  let a = ann openTok <> ann closeTok
  pure . docNormalize $ Term.app a (Term.constructor a (ConstructorReference DD.docRef DD.docJoinId)) (Term.list a segs)
  where
    segment = blob <|> linky
    blob = do
      s <- string
      pure $
        Term.app
          (ann s)
          (Term.constructor (ann s) (ConstructorReference DD.docRef DD.docBlobId))
          (Term.text (ann s) (L.payload s))
    linky = asum [include, signature, evaluate, source, link]
    include = do
      _ <- P.try (reserved "include")
      hashQualifiedPrefixTerm
    signature = do
      _ <- P.try (reserved "signature")
      tok <- termLink'
      pure $
        Term.app
          (ann tok)
          (Term.constructor (ann tok) (ConstructorReference DD.docRef DD.docSignatureId))
          (Term.termLink (ann tok) (L.payload tok))
    evaluate = do
      _ <- P.try (reserved "evaluate")
      tok <- termLink'
      pure $
        Term.app
          (ann tok)
          (Term.constructor (ann tok) (ConstructorReference DD.docRef DD.docEvaluateId))
          (Term.termLink (ann tok) (L.payload tok))
    source = do
      _ <- P.try (reserved "source")
      l <- link''
      pure $
        Term.app
          (ann l)
          (Term.constructor (ann l) (ConstructorReference DD.docRef DD.docSourceId))
          l
    link'' = either ty t <$> link'
      where
        t tok =
          Term.app
            (ann tok)
            (Term.constructor (ann tok) (ConstructorReference DD.linkRef DD.linkTermId))
            (Term.termLink (ann tok) (L.payload tok))
        ty tok =
          Term.app
            (ann tok)
            (Term.constructor (ann tok) (ConstructorReference DD.linkRef DD.linkTypeId))
            (Term.typeLink (ann tok) (L.payload tok))
    link = d <$> link''
      where
        d tm = Term.app (ann tm) (Term.constructor (ann tm) (ConstructorReference DD.docRef DD.docLinkId)) tm
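
-- For reference, the surface syntax this parser accepted looked roughly like
-- the following (a hedged reconstruction from the combinators above, not from
-- a spec; the names List.map, example, and otherDoc are illustrative):
--
--   [: Some explanatory text. See @List.map for a link,
--      @[signature] List.map for its type, @[source] List.map for its code,
--      @[evaluate] example to run it, and @[include] otherDoc to splice in
--      another doc. :]
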
-- Used by unbreakParas within docNormalize. Doc literals are a joined sequence
-- of segments. This type describes a property of a segment.
data UnbreakCase
@@ -682,298 +608,6 @@ data UnbreakCase
  StartsUnindented
  deriving (Eq, Show)

-- Doc literal normalization
--
-- This normalization allows the pretty-printer and doc display code to do
-- indenting, and to do line-wrap of paragraphs, but without the inserted
-- newlines being then frozen into the text for ever more over subsequent
-- edit/update cycles.
--
-- The alternative would be to stop line-wrapping docs on view/display by adding
-- newlines in the pretty-printer, and instead leave wrapping to the
-- terminal/editor. Might be worth considering if this code ends up being
-- too buggy and fragile to maintain. Maybe display could add newlines,
-- and view could refrain from doing so.
--
-- Operates on the text of the Blobs within a doc (as parsed by docBlock):
--   - reduces the whitespace after all newlines so that at least one of the
--     non-initial lines has zero indent (important because the pretty-printer
--     adds indenting when displaying doc literals)
--   - removes trailing whitespace from each line
--   - removes newlines between any sequence of non-empty zero-indent lines
--     (i.e. undoes line-breaking within paragraphs).
--
-- Should be understood in tandem with Util.Pretty.paragraphyText, which
-- outputs doc text for display/edit/view.
-- See also unison-src/transcripts/doc-formatting.md.
--
-- There is some heuristic/approximate logic in here - see the comment flagged
-- with ** below.
--
-- This function is a bit painful - it's trying to act on a sequence of lines,
-- but that sequence is split up between the various blobs in the doc, which
-- are separated by the elements tracking things like @[source] etc. It
-- would be simplified if the doc representation were something like
-- [Either Char EnrichedElement].
--
-- This function has some tracing which you can enable by deleting some calls to
-- 'const id' below.
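--
-- A hedged worked example of the first and third steps (illustrative, not
-- taken from the original tests): given a blob whose text is
--
--   "First line\n    second line\n    third line\n"
--
-- unindenting drops the common four-space indent from the non-initial lines,
-- and the newlines between the resulting non-empty zero-indent lines then
-- become spaces, giving
--
--   "First line second line third line\n"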
docNormalize :: (Ord v, Show v) => Term v a -> Term v a
docNormalize tm = case tm of
  -- This pattern is just `DD.DocJoin seqs`, but exploded in order to grab
  -- the annotations. The aim is just to map `normalize` over it.
  a@(Term.App' c@(Term.Constructor' (ConstructorReference DD.DocRef DD.DocJoinId)) s@(Term.List' seqs)) ->
    join
      (ABT.annotation a)
      (ABT.annotation c)
      (ABT.annotation s)
      (normalize seqs)
    where
  _ -> error $ "unexpected doc structure: " ++ show tm
  where
    normalize =
      Sequence.fromList
        . (map TupleE.fst3)
        . (tracing "after unbreakParas")
        . unbreakParas
        . (tracing "after full preprocess")
        . preProcess
        . (tracing "after unindent")
        . unIndent
        . (tracing "initial parse")
        . miniPreProcess
    preProcess xs =
      zip3
        seqs
        (lineStarteds $ Sequence.fromList seqs)
        (followingLines $ Sequence.fromList seqs)
      where
        seqs = map fst xs
    miniPreProcess seqs = zip (toList seqs) (lineStarteds seqs)
    unIndent ::
      (Ord v) =>
      [(Term v a, UnbreakCase)] ->
      [(Term v a, UnbreakCase)]
    unIndent tms = map go tms
      where
        go (b, previous) =
          ((mapBlob $ (reduceIndent includeFirst minIndent)) b, previous)
          where
            -- Since previous was calculated before unindenting, it will often be wrongly
            -- StartsIndented instead of StartsUnindented - but that's OK just for the test
            -- below. And we'll recalculate it later in preProcess.
            includeFirst = previous == LineEnds
        concatenatedBlobs :: Text
        concatenatedBlobs = mconcat (toList (fmap (getBlob . fst) tms))
        getBlob (DD.DocBlob txt) = txt
        getBlob _ = "."
        -- Note we exclude the first line when calculating the minimum indent - the lexer
        -- already stripped leading spaces from it, and anyway it would have been sharing
        -- its line with the [: and maybe other stuff.
        nonInitialNonEmptyLines =
          filter (not . Text.null) $
            map Text.stripEnd $
              drop 1 $
                Text.lines
                  concatenatedBlobs
        minIndent =
          minimumOrZero $
            map
              (Text.length . (Text.takeWhile Char.isSpace))
              nonInitialNonEmptyLines
        minimumOrZero xs = if length xs == 0 then 0 else minimum xs
    reduceIndent :: Bool -> Int -> Text -> Text
    reduceIndent includeFirst n t =
      fixup $
        Text.unlines $
          mapExceptFirst reduceLineIndent onFirst $
            Text.lines t
      where
        onFirst = if includeFirst then reduceLineIndent else id
        reduceLineIndent l = result
          where
            currentIndent = Text.length $ (Text.takeWhile Char.isSpace) l
            remainder = (Text.dropWhile Char.isSpace) l
            newIndent = maximum [0, currentIndent - n]
            result = Text.replicate newIndent " " `mappend` remainder
        -- unlines . lines adds a trailing newline if one was not present: undo that.
        fixup = if Text.takeEnd 1 t == "\n" then id else Text.dropEnd 1
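        -- A hedged worked example (illustrative, not from the original tests):
        -- with includeFirst = False the first line keeps its indent and each
        -- later line loses up to n leading spaces:
        --
        --   reduceIndent False 4 "  foo\n      bar\n" == "  foo\n  bar\n"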
    -- Remove newlines between any sequence of non-empty zero-indent lines.
    -- This is made more complicated by Doc elements (e.g. links) which break up a
    -- blob but don't break a line of output text**. We sometimes need to refer back to the
    -- previous blob to see whether a newline is between two zero-indented lines.
    -- For example...
    --   "This link to @foo makes it harder to see\n
    --    that the newline should be removed."
    -- Whether an element does this (breaks a blob but not a line of output text) really
    -- depends on some things we don't know here: does an @[include] target doc occupy
    -- just one line or several; whether this doc is going to be viewed or displayed.
    -- So we'll get it wrong sometimes. The impact of this is that we may sometimes
    -- misjudge whether a newline is separating two non-indented lines, and should therefore
    -- be removed.
    unbreakParas ::
      (Show v, Ord v) =>
      [(Term v a, UnbreakCase, Bool)] ->
      [(Term v a, UnbreakCase, Bool)]
    unbreakParas = map go
      where
        -- 'candidate' means 'candidate to be joined with an adjacent line as part of a
        -- paragraph'.
        go (b, previous, nextIsCandidate) =
          (mapBlob go b, previous, nextIsCandidate)
          where
            go txt = if Text.null txt then txt else tr result'
              where
                tr =
                  const id $
                    trace $
                      "\nprocessElement on blob "
                        ++ (show txt)
                        ++ ", result' = "
                        ++ (show result')
                        ++ ", lines: "
                        ++ (show ls)
                        ++ ", candidates = "
                        ++ (show candidates)
                        ++ ", previous = "
                        ++ (show previous)
                        ++ ", firstIsCandidate = "
                        ++ (show firstIsCandidate)
                        ++ "\n\n"
                -- remove trailing whitespace
                -- ls is non-empty thanks to the Text.null check above
                -- Don't cut the last line's trailing whitespace - there's an assumption here
                -- that it's followed by something which will put more text on the same line.
                ls = mapExceptLast Text.stripEnd id $ Text.lines txt
                -- Work out which lines are candidates to be joined as part of a paragraph, i.e.
                -- are not indented.
                candidate l = case Text.uncons l of
                  Just (initial, _) -> not . Char.isSpace $ initial
                  Nothing -> False -- empty line
                -- The segment of this blob that runs up to the first newline may not itself
                -- be the start of a line of the doc - for example if it's preceded by a link.
                -- So work out whether the line of which it is a part is a candidate.
                firstIsCandidate = case previous of
                  LineEnds -> candidate (head ls)
                  StartsIndented -> False
                  StartsUnindented -> True
                candidates = firstIsCandidate : (tail (map candidate ls))
                result = mconcat $ intercalateMapWith sep fst (zip ls candidates)
                sep (_, candidate1) (_, candidate2) =
                  if candidate1 && candidate2 then " " else "\n"
                -- Text.lines forgets whether there was a trailing newline.
                -- If there was one, then either add it back or convert it to a space.
                result' =
                  if (Text.takeEnd 1 txt) == "\n"
                    then
                      if (last candidates) && nextIsCandidate
                        then result `Text.append` " "
                        else result `Text.append` "\n"
                    else result
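                -- A hedged worked example (illustrative): for a blob "one\ntwo\n"
                -- with previous == LineEnds and nextIsCandidate == False, both
                -- lines are candidates, so the inner newline becomes a space and
                -- the trailing one is kept: the result is "one two\n".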
    -- A list whose entries match those of tms. `Nothing` is used for elements
    -- which just continue a line, and so need to be ignored when looking back
    -- for how the last line started. Otherwise describes whether the last
    -- line of this entry is indented (or maybe terminated by a newline.)
    -- A value of `Nothing` protects ensuing text from having its leading
    -- whitespace removed by `unindent`.
    -- Note that some elements render over multiple lines when displayed.
    -- See test2 in transcript doc-formatting.md for an example of how
    -- this looks when there is whitespace immediately following @[source]
    -- or @[evaluate].
    lastLines :: (Show v) => Sequence.Seq (Term v a) -> [Maybe UnbreakCase]
    lastLines tms = (flip fmap) (toList tms) $ \case
      DD.DocBlob txt -> unbreakCase txt
      DD.DocLink _ -> Nothing
      DD.DocSource _ -> Nothing
      DD.DocSignature _ -> Nothing
      DD.DocEvaluate _ -> Nothing
      Term.Var' _ -> Nothing -- @[include]
      e@_ -> error ("unexpected doc element: " ++ show e)
    -- Work out whether the last line of this blob is indented (or maybe
    -- terminated by a newline.)
    unbreakCase :: Text -> Maybe UnbreakCase
    unbreakCase txt =
      let (startAndNewline, afterNewline) = Text.breakOnEnd "\n" txt
       in if Text.null startAndNewline
            then Nothing
            else
              if Text.null afterNewline
                then Just LineEnds
                else
                  if Char.isSpace (Text.head afterNewline)
                    then Just StartsIndented
                    else Just StartsUnindented
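    -- Hedged examples (illustrative): unbreakCase "no newline" == Nothing;
    -- unbreakCase "ends with\n" == Just LineEnds;
    -- unbreakCase "a\n  b" == Just StartsIndented;
    -- unbreakCase "a\nb" == Just StartsUnindented.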
    -- A list whose entries match those of tms. Describes how the current
    -- line started (the line including the start of this entry) - or LineEnds
    -- if this entry is starting a line itself.
    -- Calculated as the UnbreakCase of the previous entry that included a newline.
    -- Really there's a function of type (a -> Bool) -> a -> [a] -> [a] in here
    -- fighting to break free - overwriting elements that are 'shadowed' by
    -- a preceding element for which the predicate is true, with a copy of
    -- that element. (See the sketch at the end of this definition.)
    lineStarteds :: (Show v) => Sequence.Seq (Term v a) -> [UnbreakCase]
    lineStarteds tms = tr $ quenchRuns LineEnds StartsUnindented $ xs''
      where
        tr =
          const id $
            trace $
              "lineStarteds: xs = "
                ++ (show xs)
                ++ ", xss = "
                ++ (show xss)
                ++ ", xs' = "
                ++ (show xs')
                ++ ", xs'' = "
                ++ (show xs'')
                ++ "\n\n"
        -- Make sure there's a Just at the start of the list so we always find
        -- one when searching back.
        -- Example: xs = [J1,N2,J3]
        xs :: [Maybe UnbreakCase]
        xs = Just LineEnds : (lastLines tms)
        -- Example: xss = [[J1],[J1,N2],[J1,N2,J3]]
        xss :: [[Maybe UnbreakCase]]
        xss = drop 1 $ List.inits xs
        -- Example: after each step of the map...
        --   [[J1],[N2,J1],[J3,N2,J1]]   -- after reverse
        --   [Just J1, Just J1, Just J3] -- after find
        --   ...
        --   result = [1,1,3]
        xs' =
          map (Maybe.fromJust . Maybe.fromJust . (List.find isJust) . reverse) xss
        xs'' = List.Extra.dropEnd 1 xs'
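        -- A sketch of that combinator (hypothetical - it does not exist in the
        -- codebase): each element not satisfying the predicate is overwritten
        -- by the nearest preceding element that does satisfy it (or by the
        -- seed):
        --
        --   shadow :: (a -> Bool) -> a -> [a] -> [a]
        --   shadow p seed = go seed
        --     where
        --       go _ [] = []
        --       go prev (x : xs)
        --         | p x = x : go x xs
        --         | otherwise = prev : go prev xs
        --
        -- With p = isJust and seed = Just LineEnds, shadow over [J1,N2,J3]
        -- gives [J1,J1,J3] - matching the [1,1,3] example above.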
    -- For each element, can it be a line-continuation of a preceding blob?
    continuesLine :: Sequence.Seq (Term v a) -> [Bool]
    continuesLine tms = (flip fmap) (toList tms) \case
      DD.DocBlob _ -> False -- value doesn't matter - you don't get adjacent blobs
      DD.DocLink _ -> True
      DD.DocSource _ -> False
      DD.DocSignature _ -> False
      DD.DocEvaluate _ -> False
      Term.Var' _ -> False -- @[include]
      _ -> error ("unexpected doc element" ++ show tm)
    -- A list whose entries match those of tms. Can the subsequent entry be a
    -- line continuation of this one?
    followingLines tms = drop 1 ((continuesLine tms) ++ [False])
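    -- A hedged worked example (illustrative): for elements [blob, link, blob],
    -- continuesLine gives [False, True, False], so followingLines gives
    -- [True, False, False] - i.e. only the first blob is followed by an
    -- element that could continue its line.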
    mapExceptFirst :: (a -> b) -> (a -> b) -> [a] -> [b]
    mapExceptFirst fRest fFirst = \case
      [] -> []
      x : rest -> (fFirst x) : (map fRest rest)
    mapExceptLast fRest fLast = reverse . (mapExceptFirst fRest fLast) . reverse
    tracing :: (Show a) => [Char] -> a -> a
    tracing when x =
      (const id $ trace ("at " ++ when ++ ": " ++ (show x) ++ "\n")) x
    blob aa ac at txt =
      Term.app aa (Term.constructor ac (ConstructorReference DD.docRef DD.docBlobId)) (Term.text at txt)
    join aa ac as segs =
      Term.app aa (Term.constructor ac (ConstructorReference DD.docRef DD.docJoinId)) (Term.list' as segs)
    mapBlob :: (Ord v) => (Text -> Text) -> Term v a -> Term v a
    -- this pattern is just `DD.DocBlob txt` but exploded to capture the annotations as well
    mapBlob f (aa@(Term.App' ac@(Term.Constructor' (ConstructorReference DD.DocRef DD.DocBlobId)) at@(Term.Text' txt))) =
      blob (ABT.annotation aa) (ABT.annotation ac) (ABT.annotation at) (f txt)
    mapBlob _ t = t

delayQuote :: (Monad m, Var v) => TermP v m
delayQuote = P.label "quote" do
  start <- reserved "'"

@@ -406,7 +406,6 @@ lexemes' eof =
    toks :: P [Token Lexeme]
    toks =
      doc2
        <|> doc
        <|> token numeric
        <|> token character
        <|> reserved
@@ -835,35 +834,6 @@ lexemes' eof =
      where
        final = last ts

    doc :: P [Token Lexeme]
    doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space)
      where
        open = token'' (\t _ _ -> t) $ tok (Open <$> lit "[:")
        close = tok (Close <$ lit ":]")
        at = lit "@"
        -- this removes some trailing whitespace from the final textual segment
        fixup [] = []
        fixup (Token (Textual (reverse -> txt)) start stop : []) =
          [Token (Textual txt') start stop]
          where
            txt' = reverse (dropWhile (\c -> isSpace c && not (c == '\n')) txt)
        fixup (h : t) = h : fixup t
        body :: P [Token Lexeme]
        body = txt <+> (atk <|> pure [])
          where
            ch = (":]" <$ lit "\\:]") <|> ("@" <$ lit "\\@") <|> (pure <$> P.anySingle)
            txt = tok (Textual . join <$> P.manyTill ch (P.lookAhead sep))
            sep = void at <|> void close
            ref = at *> (tok identifierLexemeP <|> docTyp)
            atk = (ref <|> docTyp) <+> body
            docTyp = do
              _ <- lit "["
              typ <- tok (P.manyTill P.anySingle (P.lookAhead (lit "]")))
              _ <- lit "]" *> CP.space
              t <- tok identifierLexemeP
              pure $ (fmap Reserved <$> typ) <> t
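            -- A hedged example (reconstructed from the grammar above; foo is
            -- an illustrative name): the literal
            --
            --   [: see @foo and @[signature] foo :]
            --
            -- would lex roughly as Open "[:", Textual "see ",
            -- <identifier foo>, Textual " and ", Reserved "signature",
            -- <identifier foo>, and finally Close.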

    blank =
      separated wordySep do
        _ <- char '_'