Add error resilient parser for LSP (#655)

See waspls/src/Wasp/Backend/ConcreteParser/Internal.hs for description of the parser
Craig McIlwrath 2022-07-26 15:14:19 -04:00 committed by GitHub
parent 0de9997198
commit 879f8e4225
14 changed files with 1945 additions and 3 deletions


@ -0,0 +1,101 @@
module Wasp.Backend.ConcreteParser
( -- * Concrete parser
-- | This module contains functions for converting lists of "Token"s into
-- concrete syntax trees. These trees represent any Wasp source file completely,
-- including whitespace and comments, even if the source file is invalid.
parseCST,
parseCSTExpression,
)
where
import Wasp.Backend.ConcreteParser.Internal
import Wasp.Backend.ConcreteSyntax (SyntaxKind (..), SyntaxNode)
import Wasp.Backend.ParseError
import Wasp.Backend.Token (Token)
import qualified Wasp.Backend.Token as T
-- | Parse a list of tokens representing a Wasp program into a concrete syntax
-- tree.
--
-- See "SyntaxNode" for a description of what a concrete syntax tree contains.
parseCST :: [Token] -> ([ParseError], [SyntaxNode])
parseCST tokens = parse tokens (root <> eof)
-- | Parse a list of tokens into a concrete syntax tree for a wasp expression.
-- This is mainly used for testing.
--
-- See "SyntaxNode" for a description of what a concrete syntax tree contains.
parseCSTExpression :: [Token] -> ([ParseError], [SyntaxNode])
parseCSTExpression tokens = parse tokens (expr <> eof)
root :: GrammarRule
root = Program <$$> stmts
stmts :: GrammarRule
stmts = eof <|> (stmt <> stmts)
stmt :: GrammarRule
stmt =
Decl
<$$> (T.Identifier `as` DeclType)
<> (T.Identifier `as` DeclName)
<> expr
{- ORMOLU_DISABLE -}
expr :: GrammarRule
expr =
Dict <$$> listLike lcurly dictEntry comma rcurly
<|> List <$$> listLike lsquare expr comma rsquare
-- Note that we don't check the number of tuple elements here: this is left to
-- the next phase of parsing.
<|> Tuple <$$> listLike lparen expr comma rparen
<|> quoter
<|> int
<|> double
<|> string
<|> (T.Identifier `as` Var)
<|> kwTrue
<|> kwFalse
<|> extImport
{- ORMOLU_ENABLE -}
dictEntry :: GrammarRule
dictEntry = DictEntry <$$> (T.Identifier `as` DictKey) <> colon <> expr
quoter :: GrammarRule
quoter = Quoter <$$> lquote <> quoterTail
where
quoterTail = rquote <|> (quoted <> quoterTail)
extImport :: GrammarRule
extImport =
ExtImport
<$$> kwImport <> extImportName <> kwFrom <> (T.String `as` ExtImportPath)
where
extImportName :: GrammarRule
extImportName =
lcurly <> (T.Identifier `as` ExtImportField) <> rcurly
<|> (T.Identifier `as` ExtImportModule)
-- | @listLike open value sep close@ parses list like structures in the form:
--
-- @open (value (sep value)* sep?)? close@
--
-- In other words, a possibly empty list of values with an optional trailing
-- separator.
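--
-- For instance (informal), @listLike lcurly dictEntry comma rcurly@ accepts
-- each of the following token sequences:
--
-- @
-- { }
-- { key: 1 }
-- { key: 1, other: 2 }
-- { key: 1, }
-- @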
listLike ::
-- | start grammar rule
GrammarRule ->
-- | value grammar rule
GrammarRule ->
-- | separator grammar rule
GrammarRule ->
-- | end grammar rule
GrammarRule ->
GrammarRule
listLike open value sep close =
open <> listLikeTail
where
listLikeTail :: GrammarRule
listLikeTail =
close <|> (value <> ((sep <> listLikeTail) <|> close))


@ -0,0 +1,774 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
module Wasp.Backend.ConcreteParser.Internal
( -- * Internal parsing library
GrammarRule,
parse,
-- * Combinators
-- | The fixity of the operators ('(<>)', '`as`', '(<$$>)', and '(<|>)') was
-- chosen so that using the operators requires as few parentheses as possible:
--
-- >>> Program <$$> lparen <> rparen <|> string
-- Alternative (Group Program (Chain (Token LParen) (Token RParen))) (Token String)
as,
(<$$>),
perhaps,
(<|>),
-- * Primitive grammar rules
-- | These grammar rules (besides `eof`) consume a single token of the kind
-- matching their name. The syntax node they produce has an "S.Token" kind,
-- unless otherwise specified.
lparen,
rparen,
lsquare,
rsquare,
lcurly,
rcurly,
comma,
colon,
kwImport,
kwFrom,
kwTrue,
kwFalse,
string,
int,
double,
lquote,
rquote,
quoted,
-- | A primitive for "T.Identifier" is not included: this should always be
-- done with an `as`, e.g. @"T.Identifier" ``as`` "S.DictKey"@
eof,
-- * Error Recovery
-- | This parser has built-in automatic error recovery. Currently, error
-- recovery occurs when an error is produced inside of a group rule. Recovery
-- is based on the possible tokens that can follow the current rule.
--
-- First, we define the follow set of a grammar rule @R@ in the grammar
-- @R <> S@ to be the set of token kinds that @S@ can accept as its first
-- token. For example, @"let"@ in @"let" <> ("=" <|> "+=")@ has the follow
-- set @["=", "+="]@.
--
-- We can then describe the basic mechanism as follows:
--
-- For a "GrammarRule" @label '<$$>' first '<>' next@, if an error occurs
-- while parsing @first@, check if the token it errored on is in the follow
-- set for the group.
--
-- If yes, then there is a following rule that can accept this token. So
-- just continue parsing. Otherwise, no following rule has a use for the
-- token, so discard it.
--
-- In either case, we log an error to report at the end of parsing.
--
-- As an example, consider the grammar rules:
--
-- @
-- stmts = eof <|> (stmt <> stmts)
-- stmt = Stmt <$$> "let" <> identifier <> "=" <> expr <> ";"
-- @
--
-- With the example source text:
--
-- @
-- let x = 2
-- let y = 4;
-- @
--
-- A parse error will occur at the beginning of line 2, when @let@ is found
-- instead of the expected @;@.
--
-- First, we find the follow set of the @Stmt@ group. From the definition of
-- @stmts@, we can see @stmt@ is followed by @stmts@. @stmts@ can start with
-- either @eof@ or @stmt@ (which in turn can start with @"let"@). Hence, the
-- follow set is @[eof, "let"]@.
--
-- Since the "let" token the parser erroed on is in this set, it does not
-- get discarded by the group. This leaves it available for the call to
-- @stmt@, which can parse successfully because of this, resulting in the
-- following parse tree and error:
--
-- @
-- [ Stmt ["let", "x", "=", "2", error]
-- , Stmt ["let", "y", "=", "4", ";"]
-- ]
--
-- [Unexpected "let" at line 2 column 1: expected ";"]
-- @
--
-- But if we look at a more complex situation, with nested groups, we need
-- to introduce another concept to produce correct parses. For example:
--
-- @
-- list = List <$$> '[' <> expr <> ']'
-- tuple = Tuple <$$> '(' <> expr <> ')'
-- expr = list <|> tuple <|> number
-- @
--
-- with source:
--
-- @
-- [(2 ]
-- @
--
-- If we were to apply the same logic as before, we would find the follow
-- set of the @2@ to be @[")"]@. Since @]@ is not in this set, we discard
-- the right square bracket. But what we want to do is leave it so that
-- the @list@ rule can use it as its closing bracket.
--
-- To handle this scenario, we introduce an unwinding behavior to travel up
-- through group rules until an appropriate point is found to recover. We
-- do this first by defining the extended follow set of a rule @R@ as
-- the union of the follow sets of @R@ and its ancestor rules.
--
-- For example, we find the follow set and extended follow set of the @2@ in
-- the example code. To start, we write out the tree for grammar rules that
-- are used:
--
-- @
-- List <$$> '[' <> (Tuple <$$> '(' <> number <> ')') <> ']'
-- @
--
-- Now, we see the follow set of the @2@ is just @[')']@. Then, the follow
-- sets of its ancestors (@Tuple<$$>@ and @List<$$>@) are @[']']@ and @[]@,
-- respectively. So, the extended follow set is @[')', ']']@.
--
-- Unwinding starts when:
--
-- (1) The next token __is not__ in the follow set of the current rule
-- (2) The next token __is__ in the extended follow set of the current rule
--
-- While unwinding, the parser will stop at each group ancestor of the
-- original rule to determine whether to continue unwinding. Unwinding stops
-- when a group rule @S@ is reached such that the follow set of @S@ includes
-- the next token. At this point, the same thing happens as in the first
-- description of error recovery.
--
-- The example rules and source would produce the following parse tree and
-- errors:
--
-- @
-- [List ["[", Tuple ["(", "2"], "]"]]
--
-- [Unexpected "]" at line 1 column 9: expected expression]
-- @
--
-- ** Error Recovery Rules
--
-- A description of the full error recovery process:
--
-- 1. While parsing @first@ in @label '<$$>' first '<>' next@, an
-- unexpected token is encountered.
-- 2. The follow set and extended follow set for the group node are
-- calculated.
-- 3. If the unexpected token is in the follow set, continue parsing normally.
-- 4. If the unexpected token is not in the follow set, but is in the extended
-- follow set, begin unwinding:
-- a. Stop at each group rule ancestor @S@ and compute the follow set of
-- @S@.
-- b. If the follow set of @S@ contains the unexpected token, stop
-- unwinding and resume parsing.
-- c. Otherwise, keep unwinding.
-- 5. If the unexpected token is not in the extended follow set, discard the
-- token and continue parsing.
-- * Future improvements
-- |
-- __TODO #1:__
--
-- Move error recovery from the group rule to the '<>' rules.
--
-- __TODO #2:__
--
-- The current error recovery and the upgraded version described in TODO #1
-- can do what feels like the wrong thing in some cases. For example,
-- consider the following invalid wasp code:
--
-- @
-- route TestRoute { path: "/", to:
--
-- page TestPage { fn: import TestPage from "@ext/TestPage.js" }
-- @
--
-- To a human, this should parse as two declaration statements. But the
-- current system identifies @page@ as a variable for the dictionary entry
-- @to:@. Then it treats @TestPage@ as the key for another entry, and
-- complains that it found @{@ instead of @:@.
--
-- This is due to the unwinding behavior on errors being very short-sighted:
-- it only cares whether the next non-white token will succeed.
--
-- A possible fix to this is to explore all possible (or many possible)
-- unwinding stopping points and figuring out which one results in the
-- fewest future parse errors.
--
-- In case this has performance issues with having to explore lots of possible
-- parse trees, we could limit how many branches we check, similar to optimizing
-- DFS for state tree searches (like a chess solver).
--
-- __TODO #3:__
--
-- Change wasp language a bit to make the start of a declaration unambiguous.
-- This would make it much easier for the parser to recover from an unfinished
-- declaration.
--
-- Ideas for this:
--
-- (1) Make all type names reserved keywords in the grammar
-- (2) Distinguish between type names and variable names based on the case
-- of the first letter (lowercase = type, uppercase = variable)
--
-- __TODO #4:__
--
-- When the next token is not in the follow set of the rule, try discarding
-- multiple tokens, stopping when we do find a token in the follow set.
)
where
import Control.Monad (unless)
import Control.Monad.Except (ExceptT, MonadError (catchError, throwError), runExceptT)
import Control.Monad.State.Strict (State, gets, modify, runState)
import Data.Maybe (fromMaybe)
import Wasp.Backend.ConcreteSyntax (SyntaxKind, SyntaxNode (SyntaxNode, snodeChildren, snodeKind, snodeWidth))
import qualified Wasp.Backend.ConcreteSyntax as S
import Wasp.Backend.ParseError
import Wasp.Backend.Token (Token (tokenKind, tokenWidth), TokenKind, tokenKindIsTrivia)
import qualified Wasp.Backend.Token as T
import Wasp.Backend.TokenSet (TokenSet)
import qualified Wasp.Backend.TokenSet as TokenSet
-- | The type of a grammar rule. Use combinators and primitives to build more
-- complex grammars. Run with `parse`, which supports reasonable automatic error
-- recovery.
--
-- Every rule begins parsing by consuming trivia tokens at the beginning of the
-- input and creating a syntax node for each trivia token.
data GrammarRule
= -- | Always succeeds, consuming only trivia tokens.
Succeed
| -- | Checks if the first non-trivia token is the expected token
-- kind. If it is, it consumes the token and outputs a syntax node. Otherwise,
-- it does not consume the token and throws a parse error exception.
--
-- If a "SyntaxKind" is specified, use it to label the syntax node created.
Token TokenKind (Maybe SyntaxKind)
| -- | Succeeds if the input is empty after consuming trivia tokens at the
-- beginning of the input.
--
-- If it fails, it will report an error at the first non-trivia token while
-- also consuming all remaining input.
EOF
| -- | Run the inner grammar rule and wrap its resulting nodes in a node
-- labeled by the given syntax kind. If it fails, apply the error recovery
-- strategy (this is currently the only place where error recovery is
-- performed).
--
-- This is the only rule that adds structure and depth into the CST.
Group SyntaxKind GrammarRule
| -- | Run the first grammar rule. If it succeeds, run the second rule.
Chain GrammarRule GrammarRule
| -- | Run the grammar rule that can accept the next token in the input as its
-- first token.
--
-- Invariant: There can be no overlap in which token kinds each parser can
-- accept. The parser will use a Haskell 'error' in this case.
--
-- TODO: Remove the invariant and use backtracking to try each rule
Alternative GrammarRule GrammarRule
deriving (Eq, Ord, Show)
-- | @rule1 '<>' rule2@ is a parser that runs first @rule1@, then @rule2@.
-- Equivalent to @Chain rule1 rule2@.
instance Semigroup GrammarRule where
-- "Succeed" is identity of <>. The second pattern does affect the result tree,
-- but only by changing where trivia tokens are placed in the tree, which is
-- not an issue.
Succeed <> rule = rule
rule <> Succeed = rule
first <> second = Chain first second
-- | @mempty@ is a parser that matches an empty string, or a string of only
-- trivia tokens.
instance Monoid GrammarRule where
mempty = Succeed
-- | Run a "GrammarRule" on an input of tokens. Returns a tuple @(errors, nodes)@.
parse :: [Token] -> GrammarRule -> ([ParseError], [SyntaxNode])
parse tokens grammar =
let initialState =
ParseState
{ pstateInput = tokens,
pstateCurrentLevelNodes = [],
pstateErrors = [],
pstateNextTokenOffset = 0,
pstatePrevTokenOffset = 0,
pstateAcceptableNextTokens = []
}
in case runState (runExceptT (parseRule grammar)) initialState of
(Left err, state) -> case err of
Unwind -> error "Unwind at top-level GrammarRule (impossible, this is a bug in Wasp.Backend.ConcreteParser.Internal)"
ParseError perr ->
let errs = pstateErrors state ++ [perr]
newNode = makeErrorNode perr False
nodes = pstateCurrentLevelNodes state ++ [newNode]
in (errs, nodes)
(_, state) -> (pstateErrors state, pstateCurrentLevelNodes state)
data ParseState = ParseState
{ -- | Remaining tokens to be parsed.
pstateInput :: [Token],
-- | "SyntaxNode"s processed so far at the current level of the CST. An
-- initial top-level is created when the parser is started and each "Group"
-- rule creates a new level.
pstateCurrentLevelNodes :: [SyntaxNode],
-- | Errors collected that will be reported.
pstateErrors :: [ParseError],
-- | 0-based offset from the beginning of the source file, in characters,
-- of the start of the next token (@head . `pstateInput`@).
pstateNextTokenOffset :: !Int,
-- | 0-based offset from the beginning of the source file, in characters,
-- of the start of the last consumed token
pstatePrevTokenOffset :: !Int,
-- | Stack of acceptable next tokens, where entries to the stack are added
-- when we enter a group rule for parsing.
--
-- The first token set in this list is the current follow set and the union
-- of all of the token sets is the extended follow set.
pstateAcceptableNextTokens :: [TokenSet]
}
deriving (Eq, Show, Ord)
data ParseException
= -- | @ParseError parseError@ is used as a signal when an unexpected token is
-- encountered. This kind of exception is caught by error recovery points
-- and added to the output list of errors.
--
-- This may be converted into "Unwind" if error recovery determines unwinding
-- should occur.
ParseError ParseError
| -- | @Unwind@ is used as a signal during error recovery. It is used
-- to implement the unwinding behavior described at the top of the module.
--
-- Essentially, it means "I don't know how to parse this, but I know someone
-- above me can. Can you? If not, pass it along."
Unwind
type ParserM a = ExceptT ParseException (State ParseState) a
parseRule :: GrammarRule -> ParserM ()
parseRule Succeed = parseSucceed
parseRule (Token expectedKind maybeLabel) = parseToken expectedKind maybeLabel
parseRule EOF = parseEOF
parseRule (Group label inner) = parseGroup label inner
parseRule (Chain first second) = parseChain first second
parseRule (Alternative left right) = parseAlternative left right
parseSucceed :: ParserM ()
parseSucceed = consumeTrivia
parseToken :: TokenKind -> Maybe SyntaxKind -> ParserM ()
parseToken expectedKind maybeSyntaxKind =
(consumeTrivia >> peek) >>= \case
Just nextToken | tokenKind nextToken == expectedKind -> do
advance
let newNodeKind = fromMaybe (S.Token expectedKind) maybeSyntaxKind
let newNode =
SyntaxNode
{ snodeKind = newNodeKind,
snodeWidth = tokenWidth nextToken,
snodeChildren = []
}
pushNode newNode
nextToken -> throwParseError nextToken (TokenSet.fromKind expectedKind)
parseEOF :: ParserM ()
parseEOF =
(consumeTrivia >> peek) >>= \case
Just nextToken -> do
nextTokenOffset <- gets pstateNextTokenOffset
advance
let nextTokenKind = tokenKind nextToken
-- TODO: consider reporting all token kinds here? not sure
_tokenKinds <- collectUntilEOF
throwError $
ParseError $
UnexpectedToken
(Region nextTokenOffset (nextTokenOffset + tokenWidth nextToken))
nextTokenKind
TokenSet.fromEOF
where
collectUntilEOF :: ParserM [TokenKind]
collectUntilEOF =
peek >>= \case
Just next -> advance >> ((tokenKind next :) <$> collectUntilEOF)
Nothing -> pure []
Nothing -> do
return ()
parseGroup :: SyntaxKind -> GrammarRule -> ParserM ()
parseGroup label inner = do
consumeTrivia
currentLevelNodes <- gets pstateCurrentLevelNodes
modify (\s -> s {pstateCurrentLevelNodes = []})
maybeInnerParserError <- tryError $ parseRule inner
childNodes <- gets pstateCurrentLevelNodes
modify (\s -> s {pstateCurrentLevelNodes = currentLevelNodes})
case maybeInnerParserError of
Nothing -> pushGroupNode childNodes
Just (ParseError parseError) -> handleInnerParseError childNodes parseError
Just Unwind -> handleInnerUnwind childNodes
where
handleInnerParseError :: [SyntaxNode] -> ParseError -> ParserM ()
handleInnerParseError childNodes parseError = do
-- ASSUMPTION: The token errored on has not been consumed
--
-- (1) Check whether the token kind we errored on can be accepted by any
-- following rule.
--
-- (2) Report the error and create the group node containing the error and
-- what it parsed before the error.
--
-- (3) Handle the error:
-- (3a) If it can not be accepted by a following rule, discard the error
-- token.
-- (3b) If unwinding is required to accept the token, begin unwinding.
-- (3c) Otherwise, the error can be handled without unwinding, so do
-- nothing and continue parsing.
-- (1)
let errorTokenKind = case parseError of
UnexpectedEOF _ _ -> Nothing
UnexpectedToken _ k _ -> Just k
errorCanBeHandled <- canAnyFollowingRuleAcceptTokenKind errorTokenKind
errorCanBeHandledWithoutUnwind <- canImmediateFollowingRuleAcceptTokenKind errorTokenKind
-- (2)
logError parseError
let nodeForErr = makeErrorNode parseError errorCanBeHandled
pushGroupNode $ childNodes ++ [nodeForErr]
-- (3)
if not errorCanBeHandled
then -- (3a)
advance
else -- (3b)
unless errorCanBeHandledWithoutUnwind $ throwError Unwind
handleInnerUnwind :: [SyntaxNode] -> ParserM ()
handleInnerUnwind childNodes = do
errorTokenKind <- fmap tokenKind <$> peek
-- When unwinding, check if this is the level where we can parse the error
-- token. If it is, then stop unwinding. Otherwise, we continue unwinding.
errorCanBeHandledWithoutUnwind <- canAnyFollowingRuleAcceptTokenKind errorTokenKind
pushGroupNode childNodes
unless errorCanBeHandledWithoutUnwind $ throwError Unwind
pushGroupNode :: [SyntaxNode] -> ParserM ()
pushGroupNode children =
let width = sum $ map snodeWidth children
in pushNode $
SyntaxNode
{ snodeKind = label,
snodeWidth = width,
snodeChildren = children
}
parseChain :: GrammarRule -> GrammarRule -> ParserM ()
parseChain first second = do
consumeTrivia
-- First, figure out following set for @second@
pushFollowingTokens $ getValidFirstTokens second
maybeError <- tryError (parseRule first)
-- Make sure to remove following set now that we're leaving this "Chain"
popFollowingTokens
-- Run second (or throw error if first failed)
case maybeError of
Nothing -> parseRule second
Just err -> throwError err
parseAlternative :: GrammarRule -> GrammarRule -> ParserM ()
parseAlternative left right
| leftFirstTokens `TokenSet.intersection` rightFirstTokens /= TokenSet.empty =
error $
unlines
[ "[WARNING] Alternative grammar rule has two rules that can accept the same first token."
++ "This will result in the second rule never running on these tokens.",
"\tLeft Tokens:",
"\t" ++ show leftFirstTokens,
"\n\tRight Tokens:",
"\t" ++ show rightFirstTokens
]
| otherwise = do
consumeTrivia
nextToken <- peek
let nextKind = tokenKind <$> nextToken
if nextKind `willSucceedIn` left
then parseRule left
else
if nextKind `willSucceedIn` right
then parseRule right
else throwParseError nextToken (leftFirstTokens `TokenSet.union` rightFirstTokens)
where
leftFirstTokens = getValidFirstTokens left
rightFirstTokens = getValidFirstTokens right
-- | Advance past trivia tokens, making syntax nodes for each token
consumeTrivia :: ParserM ()
consumeTrivia =
peek >>= \case
Nothing -> pure ()
Just nextToken
-- Consume any whitespace
| tokenKindIsTrivia (tokenKind nextToken) -> do
advance
let newNode =
SyntaxNode
{ snodeKind = S.Token (tokenKind nextToken),
snodeWidth = tokenWidth nextToken,
snodeChildren = []
}
pushNode newNode
consumeTrivia
| otherwise -> pure ()
-- | Peek the immediate next token in input (including trivia)
peek :: ParserM (Maybe Token)
peek =
gets pstateInput >>= \case
[] -> return Nothing
tok : _ -> return (Just tok)
-- | Advance past the next token in the input, updating offsets.
advance :: ParserM ()
advance =
gets pstateInput >>= \case
[] -> modify (\s -> s {pstatePrevTokenOffset = pstateNextTokenOffset s})
(consumed : remaining) -> do
newLastOffset <- gets pstateNextTokenOffset
let newNextOffset = newLastOffset + tokenWidth consumed
modify (\s -> s {pstateInput = remaining, pstatePrevTokenOffset = newLastOffset, pstateNextTokenOffset = newNextOffset})
-- | @throwParseError actual expected@ throws an error.
--
-- Assumes @actual@ has not been `advance`d past yet (and does not advance past
-- it).
throwParseError :: Maybe Token -> TokenSet -> ParserM ()
throwParseError Nothing eset = do
offset <- gets pstateNextTokenOffset
throwError $ ParseError $ UnexpectedEOF offset eset
throwParseError (Just actual) expectedTokens = do
offset <- gets pstateNextTokenOffset
throwError $ ParseError $ UnexpectedToken (Region offset (offset + tokenWidth actual)) (tokenKind actual) expectedTokens
-- | Make an error node. Error nodes should only be produced when catching and
-- pushing errors to the state.
makeErrorNode ::
-- | Parse error that will be used to calculate the width of the error node.
ParseError ->
-- | If "True", give the error node a width of 0. Otherwise, use the
-- width of the error item.
Bool ->
SyntaxNode
makeErrorNode (UnexpectedEOF _ _) _ =
SyntaxNode
{ snodeKind = S.Error,
snodeWidth = 0,
snodeChildren = []
}
makeErrorNode (UnexpectedToken (Region so eo) _ _) zeroWidth =
SyntaxNode
{ snodeKind = S.Error,
snodeWidth = if zeroWidth then 0 else eo - so,
snodeChildren = []
}
-- | Add a node to the output.
pushNode :: SyntaxNode -> ParserM ()
pushNode node = modify (\s -> s {pstateCurrentLevelNodes = pstateCurrentLevelNodes s ++ [node]})
-- | Add an error to the output (does not throw the error).
logError :: ParseError -> ParserM ()
logError err = modify (\s -> s {pstateErrors = pstateErrors s ++ [err]})
-- | Add an "TokenSet" to the following stack.
pushFollowingTokens :: TokenSet -> ParserM ()
pushFollowingTokens eset = modify (\s -> s {pstateAcceptableNextTokens = eset : pstateAcceptableNextTokens s})
-- | Remove the last added "TokenSet" from the following stack.
popFollowingTokens :: ParserM ()
popFollowingTokens = modify (\s -> s {pstateAcceptableNextTokens = drop 1 $ pstateAcceptableNextTokens s})
-- | Check if a "TokenKind" (or end of file) is one of the first possible tokens
-- in a "GrammarRule".
willSucceedIn :: Maybe TokenKind -> GrammarRule -> Bool
_ `willSucceedIn` Succeed = True
k `willSucceedIn` grammarRule = k `TokenSet.member` getValidFirstTokens grammarRule
-- | Find the "TokenSet" of "TokenKind"s that the "GrammarRule" would succeed on
-- as its first input.
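--
-- A small illustration (informal): for the statement rule from
-- "Wasp.Backend.ConcreteParser",
--
-- @
-- Decl <$$> (T.Identifier `as` DeclType) <> (T.Identifier `as` DeclName) <> expr
-- @
--
-- the result is the singleton set containing "T.Identifier": the chain's first
-- sub-rule can never succeed on empty input, so the later rules contribute
-- nothing.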
getValidFirstTokens :: GrammarRule -> TokenSet
getValidFirstTokens grammarRule = fst $ go grammarRule TokenSet.empty
where
-- Returns (firstTokens, can succeed with no input?)
go :: GrammarRule -> TokenSet -> (TokenSet, Bool)
go Succeed eset = (eset, True)
go (Token tkind _) eset = (tkind `TokenSet.insertKind` eset, False)
go EOF eset = (TokenSet.insertEof eset, False)
go (Group _ p) eset = go p eset
go (Chain first second) eset =
let (eset', firstCanBeEmpty) = go first eset
in if firstCanBeEmpty
then go second eset'
else (eset', False)
go (Alternative left right) eset =
let (eset', leftCanBeEmpty) = go left eset
(eset'', rightCanBeEmpty) = go right eset'
in (eset'', leftCanBeEmpty || rightCanBeEmpty)
-- | Check if any following rule can accept the given "TokenKind". If this is
-- true, then unwinding at the current location will succeed in recovering from
-- an error.
canAnyFollowingRuleAcceptTokenKind :: Maybe TokenKind -> ParserM Bool
canAnyFollowingRuleAcceptTokenKind nextTokenKind = do
followingTokens <- gets pstateAcceptableNextTokens
return $ any (TokenSet.member nextTokenKind) followingTokens
-- | Check if the next rule can accept the given "TokenKind". If this is true,
-- then an error can be recovered from at the current context with no unwinding.
--
-- Pre-condition: @pstateAcceptableNextTokens@ is not empty.
canImmediateFollowingRuleAcceptTokenKind :: Maybe TokenKind -> ParserM Bool
canImmediateFollowingRuleAcceptTokenKind nextTokenKind = do
followingTokens <- gets pstateAcceptableNextTokens
return $ case followingTokens of
[] -> error "canImmediateFollowingRuleAcceptTokenKind called with empty pstateAcceptableNextTokens. This is a parser library error."
(nextTokens : _) -> nextTokenKind `TokenSet.member` nextTokens
infix 9 `as`
-- | Matches a single token of the specified "TokenKind" and produce an output
-- node with the given "SyntaxKind".
--
-- __Example:__
--
-- >>> parse [T.Token T.Identifier 3] $ T.Identifier `as` S.DeclName
-- ([],[SyntaxNode {snodeKind = DeclName, snodeWidth = 3, snodeChildren = []}])
as :: TokenKind -> SyntaxKind -> GrammarRule
tkind `as` skind = Token tkind (Just skind)
infixr 5 <$$>
-- | @syntaxKind '<$$>' rule@ (pronounced group) runs a grammar rule and places the
-- nodes that grammar rule adds to the
-- output in a parent node.
--
-- __Example:__
--
-- >>> parse [T.Token T.Identifier 3, T.Token T.Int 2] $ S.Decl <$$> identifier <> int
-- ([],[SyntaxNode {snodeKind = Decl, snodeWidth = 5, snodeChildren = [SyntaxNode {snodeKind = Token Identifier, snodeWidth = 3, snodeChildren = []},SyntaxNode {snodeKind = Token Int, snodeWidth = 2, snodeChildren = []}]}])
(<$$>) :: SyntaxKind -> GrammarRule -> GrammarRule
label <$$> grammarRule = Group label grammarRule
-- | Run a grammar rule or consume no input.
--
-- __Example:__
--
-- >>> parse [T.Token T.Int 1, T.Token T.Int 1] $ int <> perhaps comma <> int
-- ([],[SyntaxNode {snodeKind = Token Int, snodeWidth = 1, snodeChildren = []},SyntaxNode {snodeKind = Token Int, snodeWidth = 1, snodeChildren = []}])
perhaps :: GrammarRule -> GrammarRule
perhaps grammarRule = grammarRule <|> Succeed
infixr 4 <|>
-- | Run a grammar, choosing either the left or the right rule.
--
-- __Example:__
--
-- >>> let number = int <|> double
-- >>> parse [T.Token T.Int 2] number
-- ([],[SyntaxNode {snodeKind = Token Int, snodeWidth = 2, snodeChildren = []}])
-- >>> parse [T.Token T.Double 3] number
-- ([],[SyntaxNode {snodeKind = Token Double, snodeWidth = 3, snodeChildren = []}])
(<|>) :: GrammarRule -> GrammarRule -> GrammarRule
left <|> right = Alternative left right
-- Primitive grammar rules
lparen :: GrammarRule
lparen = Token T.LParen Nothing
rparen :: GrammarRule
rparen = Token T.RParen Nothing
lsquare :: GrammarRule
lsquare = Token T.LSquare Nothing
rsquare :: GrammarRule
rsquare = Token T.RSquare Nothing
lcurly :: GrammarRule
lcurly = Token T.LCurly Nothing
rcurly :: GrammarRule
rcurly = Token T.RCurly Nothing
comma :: GrammarRule
comma = Token T.Comma Nothing
colon :: GrammarRule
colon = Token T.Colon Nothing
kwImport :: GrammarRule
kwImport = Token T.KwImport Nothing
kwFrom :: GrammarRule
kwFrom = Token T.KwFrom Nothing
-- | Produces a "S.BoolTrue" syntax node
kwTrue :: GrammarRule
kwTrue = T.KwTrue `as` S.BoolTrue
-- | Produces a "S.BoolFalse" syntax node
kwFalse :: GrammarRule
kwFalse = T.KwFalse `as` S.BoolFalse
-- | Produces a "S.String" syntax node
string :: GrammarRule
string = T.String `as` S.String
-- | Produces a "S.Int" syntax node
int :: GrammarRule
int = T.Int `as` S.Int
-- | Produces a "S.Double" syntax node
double :: GrammarRule
double = T.Double `as` S.Double
lquote :: GrammarRule
lquote = Token T.LQuote Nothing
rquote :: GrammarRule
rquote = Token T.RQuote Nothing
quoted :: GrammarRule
quoted = Token T.Quoted Nothing
-- | Succeeds when the parser is at the end of the file.
eof :: GrammarRule
eof = EOF
-- UTILITIES:
tryError :: MonadError e m => m () -> m (Maybe e)
tryError m = (m >> return Nothing) `catchError` (return . Just)


@ -0,0 +1,89 @@
{-# LANGUAGE DeriveGeneric #-}
module Wasp.Backend.ConcreteSyntax
( -- * Concrete Syntax
-- | This data structure for storing syntax information is inspired by the
-- rust analyzer's parsing library rowan (https://github.com/rust-analyzer/rowan).
SyntaxNode (..),
SyntaxKind (..),
cstPrettyPrint,
syntaxKindIsTrivia,
)
where
import Control.DeepSeq (NFData)
import Data.Aeson (ToJSON)
import Data.List (foldl', intercalate)
import GHC.Generics (Generic)
import Wasp.Backend.Token (TokenKind, tokenKindIsTrivia)
-- | The kind of a "SyntaxNode".
data SyntaxKind
= Token !TokenKind
| Program
| Decl
| DeclType
| DeclName
| Dict
| DictEntry
| DictKey
| List
| Tuple
| ExtImport
| ExtImportModule
| ExtImportField
| ExtImportPath
| String
| Int
| Double
| BoolTrue
| BoolFalse
| Var
| Quoter
| Error
deriving (Eq, Ord, Show, Generic)
instance NFData SyntaxKind
instance ToJSON SyntaxKind
-- | A node in a concrete syntax tree. We use such a loosely typed system so
-- that we can store all pieces of source syntax in the tree, including
-- comments, whitespace, and unexpected tokens/characters.
--
-- Later, this CST is processed into an AST, a more strongly typed format that
-- leaves out all of the trivia tokens.
data SyntaxNode = SyntaxNode
{ snodeKind :: !SyntaxKind,
-- | Width of this "SyntaxNode". For nodes with children, this is the sum
-- of their children's widths. For nodes without children, this is the
-- width of the source text covered by the node.
snodeWidth :: !Int,
snodeChildren :: [SyntaxNode]
}
deriving (Eq, Ord, Show, Generic)
instance NFData SyntaxNode
instance ToJSON SyntaxNode
-- | Pretty print a concrete syntax tree. Shows tree of syntax kinds and their
-- offsets in the source file.
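--
-- A small worked example (illustrative):
--
-- >>> putStrLn $ cstPrettyPrint $ SyntaxNode Decl 5 [SyntaxNode DeclType 3 [], SyntaxNode DeclName 2 []]
-- Decl@0..5
--   DeclType@0..3
--   DeclName@3..5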
cstPrettyPrint :: SyntaxNode -> String
cstPrettyPrint node = go 0 "" node
where
go :: Int -> String -> SyntaxNode -> String
go offset indent pnode =
let nodeTxt = indent ++ show (snodeKind pnode) ++ "@" ++ show offset ++ ".." ++ show (offset + snodeWidth pnode)
childrenTxt =
fst $
foldl'
(\(strs, o) child -> (strs ++ [go o (indent ++ " ") child], o + snodeWidth child))
([], offset)
(snodeChildren pnode)
in intercalate "\n" (nodeTxt : childrenTxt)
syntaxKindIsTrivia :: SyntaxKind -> Bool
syntaxKindIsTrivia (Token k) = tokenKindIsTrivia k
syntaxKindIsTrivia _ = False


@ -0,0 +1,6 @@
module Wasp.Backend.Lexer
( Wasp.Backend.Lexer.Lexer.lex,
)
where
import Wasp.Backend.Lexer.Lexer (lex)


@ -0,0 +1,142 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GeneralisedNewtypeDeriving #-}
module Wasp.Backend.Lexer.Internal
( -- * Lexer monad code
Lexer,
runLexer,
initialLexState,
LexState,
LexerStartCode (..),
LexInput (..),
getInput,
updateInput,
setStartCode,
getStartCode,
-- * Alex utilities
AlexInput,
startCodeToInt,
alexGetByte,
alexInputPrevChar,
beginQuoter,
lexQuoterEndTag,
createToken,
)
where
import Codec.Binary.UTF8.String (encodeChar)
import Control.Monad.State.Strict (MonadState, State, evalState, get, modify)
import Data.Word (Word8)
import Wasp.Backend.Token (Token (..), TokenKind)
import qualified Wasp.Backend.Token as T
-- LEXER MONAD CODE
newtype Lexer a = Lexer {unLexer :: State LexState a}
deriving (Functor, Applicative, Monad, MonadState LexState)
runLexer :: Lexer a -> LexState -> a
runLexer lexer state = evalState (unLexer lexer) state
initialLexState :: String -> LexState
initialLexState source =
LexState
{ lstateInput = LexInput '\n' [] source,
lstateStartCode = DefaultStartCode
}
data LexState = LexState
{ lstateInput :: LexInput,
lstateStartCode :: LexerStartCode
}
getInput :: Lexer LexInput
getInput = lstateInput <$> get
updateInput :: Int -> Lexer ()
updateInput !consumed = do
(LexInput _ _ remaining) <- getInput
let newInput =
let (prevChar : remaining') = drop (consumed - 1) remaining
in LexInput prevChar [] remaining'
modify $ \s -> s {lstateInput = newInput}
getStartCode :: Lexer LexerStartCode
getStartCode = lstateStartCode <$> get
setStartCode :: LexerStartCode -> Lexer ()
setStartCode startCode = modify $ \s -> s {lstateStartCode = startCode}
-- | A representation of the lexer's start code: https://www.haskell.org/alex/doc/html/alex-files.html#startcodes
data LexerStartCode
= -- | For a start code @DefaultStartCode@, the lexer is in start code <0>
DefaultStartCode
| -- | For a start code @QuoterStartCode tag@, the lexer is in start code <quoter> with opening tag @tag@
QuoterStartCode String
deriving (Show)
-- | The type of the input given to the parser/lexer.
--
-- An input @(prevChar, bs, remainingSource)@ represents:
-- - @prevChar@ The previous character, successfully consumed by the lexer.
-- - @bs@ The yet unconsumed UTF8 bytes of the current character being lexed.
-- - @remainingSource@ The remaining source to be lexed and parsed
-- (including the character currently being lexed as the first char in it).
data LexInput = LexInput Char [Word8] String
-- ALEX UTILITIES
type AlexInput = LexInput
-- | Convert the ParserState's start code to an int for Alex to use
startCodeToInt :: Int -> LexerStartCode -> Int
startCodeToInt _ DefaultStartCode = 0
startCodeToInt quoter (QuoterStartCode _) = quoter
-- | Required by Alex.
--
-- This function is taken from the Alex basic wrapper.
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (LexInput prevChar (b : bs) remainingSource) = Just (b, LexInput prevChar bs remainingSource)
alexGetByte (LexInput _ [] []) = Nothing
alexGetByte (LexInput _ [] (currChar : remainingSource)) = case encodeChar currChar of
(b : bs) -> Just (b, LexInput currChar bs remainingSource)
[] -> Nothing
-- | Required by Alex.
--
-- This function is taken from the Alex basic wrapper.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (LexInput prevChar _ _) = prevChar
-- | Takes a lexeme like "{=json" and sets the quoter start code
beginQuoter :: String -> Lexer Token
beginQuoter leftQuoteTag = do
let tag = drop 2 leftQuoteTag
setStartCode (QuoterStartCode tag)
createToken T.LQuote leftQuoteTag
-- | Takes a lexeme like "json=}" and either ends a quoter or add quoted text to
-- the quoter
lexQuoterEndTag :: String -> Lexer Token
lexQuoterEndTag rightQuoteTag = do
startCode <- getStartCode
case startCode of
DefaultStartCode -> error "impossible: lexQuoterEndTag with DefaultStartCode"
QuoterStartCode startTag | startTag == tag -> do
setStartCode DefaultStartCode
createToken T.RQuote rightQuoteTag
_ -> do
createToken T.Quoted rightQuoteTag
where
tag = take (length rightQuoteTag - 2) rightQuoteTag
-- | Makes an action that creates a token from a "TokenKind"
createToken :: TokenKind -> (String -> Lexer Token)
createToken kind lexeme =
return $
Token
{ tokenKind = kind,
tokenWidth = length lexeme
}


@ -0,0 +1,108 @@
{
-- This file is processed by Alex (https://www.haskell.org/alex/) and generates
-- the module `Wasp.Backend.Lexer.Lexer`
module Wasp.Backend.Lexer.Lexer
( Wasp.Backend.Lexer.Lexer.lex
) where
import Wasp.Backend.Lexer.Internal
import Wasp.Backend.Token (Token)
import qualified Wasp.Backend.Token as T
}
-- Character set aliases
$space = [\ \t\f\v\r] -- Non-newline whitespace
$digit = 0-9
$alpha = [a-zA-Z]
$identstart = [_$alpha]
$ident = [_$alpha$digit]
$any = [.$white]
-- Regular expression aliases
-- matches string-literal on a single line, from https://stackoverflow.com/a/9260547/3902376
@string = \"([^\\\"]|\\.)*\"
@double = "-"? $digit+ "." $digit+
@integer = "-"? $digit+
@ident = $identstart $ident* "'"*
@linecomment = "//" [^\n\r]*
-- Based on https://stackoverflow.com/a/16165598/1509394 .
@blockcomment = "/*" (("*"[^\/]) | [^\*] | $white)* "*/"
tokens :-
<0> $space+ { createToken T.White }
<0> \n { createToken T.Newline }
<0> @linecomment { createToken T.Comment }
<0> @blockcomment { createToken T.Comment }
-- Quoter rules:
-- Uses Alex start codes to lex quoted characters with different rules:
-- - On "{=tag", enter <quoter> start code and make a TLQuote token
-- - While in <quoter>, if "tag=}" is seen
-- - If this closing tag matches the opening, enter <0> and make a TRQuote token
-- - Otherwise, stay in <quoter> and make a TQuoted token
-- - Otherwise, take one character at a time and make a TQuoted token
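-- For example (informal), the input "{=json a b json=}" lexes to
--   LQuote "{=json", Quoted " ", Quoted "a", Quoted " ", Quoted "b",
--   Quoted " ", RQuote "json=}"
-- because inside <quoter> everything except the matching closing tag is
-- consumed one character at a time.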
<0> "{=" @ident { beginQuoter }
<quoter> @ident "=}" { lexQuoterEndTag }
<quoter> $any { createToken T.Quoted }
-- Simple tokens
<0> "(" { createToken T.LParen }
<0> ")" { createToken T.RParen }
<0> "[" { createToken T.LSquare }
<0> "]" { createToken T.RSquare }
<0> "{" { createToken T.LCurly }
<0> "}" { createToken T.RCurly }
<0> "," { createToken T.Comma }
<0> ":" { createToken T.Colon }
<0> "import" { createToken T.KwImport }
<0> "from" { createToken T.KwFrom }
<0> "true" { createToken T.KwTrue }
<0> "false" { createToken T.KwFalse }
-- Strings, numbers, identifiers
<0> @string { createToken T.String }
<0> @double { createToken T.Double }
<0> @integer { createToken T.Int }
<0> @ident { createToken T.Identifier }
{
-- | Lexes a single token from the input, returning "Nothing" if the lexer has
-- reached the end of the input. This function uses a continuation passing style
-- so that this function and its consumer can benefit from tail call
-- optimization.
--
-- This function internally calls `alexScan`, which is a function generated by
-- Alex responsible for doing actual lexing/scanning.
lexOne :: (Maybe Token -> Lexer a) -> Lexer a
lexOne continue = do
input@(LexInput _ _ remaining) <- getInput
startCodeInt <- startCodeToInt quoter <$> getStartCode
case alexScan input startCodeInt of
AlexError (LexInput _ _ (c:_)) -> do
token <- createToken T.Error [c]
updateInput 1
continue (Just token)
AlexError (LexInput c _ []) -> do
token <- createToken T.Error [c]
updateInput 1
continue (Just token)
AlexSkip _ _ ->
error "AlexSkip is impossible: lexer should not skip any input"
AlexToken _ numChars makeToken -> do
let lexeme = take numChars remaining
token <- makeToken lexeme
updateInput numChars
continue (Just token)
AlexEOF -> continue Nothing
-- | @lex source@ lexes all of @source@ into "Token"s.
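--
-- For example (informal): @lex "app Test"@ yields three tokens: an Identifier
-- of width 3, a White of width 1, and an Identifier of width 4.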
lex :: String -> [Token]
lex source = runLexer (lexOne continue) $ initialLexState source
where
continue :: Maybe Token -> Lexer [Token]
continue Nothing = return []
-- This is written awkwardly like this so it is a tail call to @lexOne@
continue (Just tok) = (tok:) <$> lexOne continue
}


@ -0,0 +1,111 @@
{-# LANGUAGE DeriveGeneric #-}
module Wasp.Backend.ParseError
( -- * Parse error
ParseError (..),
-- * Display functions
showError,
showErrorMessage,
errorRegion,
-- * Source positions
Region (..),
SourcePos (..),
offsetToSourcePos,
)
where
import Control.DeepSeq (NFData)
import Data.List (intercalate)
import GHC.Generics (Generic)
import Wasp.Backend.Token (TokenKind)
import qualified Wasp.Backend.Token as T
import Wasp.Backend.TokenSet (TokenSet)
import qualified Wasp.Backend.TokenSet as TokenSet
data ParseError
= UnexpectedToken !Region !TokenKind TokenSet
| UnexpectedEOF !Int TokenSet
deriving (Eq, Ord, Show, Generic)
instance NFData ParseError
data Region = Region !Int !Int deriving (Eq, Ord, Show, Generic)
instance NFData Region
data SourcePos = SourcePos !Int !Int deriving (Eq, Ord, Show, Generic)
instance NFData SourcePos
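-- | Convert a 0-based character offset in @source@ into a 1-based line/column
-- "SourcePos". A small worked example (illustrative):
--
-- >>> offsetToSourcePos "ab\ncd" 3
-- SourcePos 2 1
--
-- (offset 3 is the first character after the newline: line 2, column 1).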
offsetToSourcePos :: String -> Int -> SourcePos
offsetToSourcePos source targetOffset = reach 0 (SourcePos 1 1) source
where
reach :: Int -> SourcePos -> String -> SourcePos
reach o (SourcePos l c) remaining
| o == targetOffset = SourcePos l c
| [] <- remaining = SourcePos l c
| ('\n' : remaining') <- remaining =
let sp' = SourcePos (l + 1) 1
in reach (o + 1) sp' remaining'
| (_ : remaining') <- remaining =
let sp' = SourcePos l (c + 1)
in reach (o + 1) sp' remaining'
showError :: String -> ParseError -> String
showError source msg =
let (Region so eo) = errorRegion msg
start = offsetToSourcePos source so
end = offsetToSourcePos source eo
in "Parse error at " ++ showRegion start end ++ " (" ++ show so ++ ".." ++ show eo ++ ")\n " ++ showErrorMessage msg
errorRegion :: ParseError -> Region
errorRegion (UnexpectedEOF o _) = Region o o
errorRegion (UnexpectedToken rgn _ _) = rgn
showErrorMessage :: ParseError -> String
showErrorMessage (UnexpectedEOF _ expecteds) =
"Unexpected end of file, " ++ showExpected expecteds
showErrorMessage (UnexpectedToken _ actual expecteds) =
"Unexpected token " ++ showTokenKind actual ++ ", " ++ showExpected expecteds
showExpected :: TokenSet -> String
showExpected expecteds = "expected one of " ++ showExpecteds expecteds
showExpecteds :: TokenSet -> String
showExpecteds expecteds =
let kindStrs = map showTokenKind $ TokenSet.toList expecteds
eofStrs = if TokenSet.eofMember expecteds then ["<eof>"] else []
in intercalate "," (kindStrs ++ eofStrs)
showTokenKind :: TokenKind -> String
showTokenKind T.White = "<whitespace>"
showTokenKind T.Newline = "\\n"
showTokenKind T.Comment = "<comment>"
showTokenKind T.LParen = "'('"
showTokenKind T.RParen = "')'"
showTokenKind T.LSquare = "'['"
showTokenKind T.RSquare = "']'"
showTokenKind T.LCurly = "'{'"
showTokenKind T.RCurly = "'}'"
showTokenKind T.Comma = "','"
showTokenKind T.Colon = "':'"
showTokenKind T.KwImport = "'import'"
showTokenKind T.KwFrom = "'from'"
showTokenKind T.KwTrue = "'true'"
showTokenKind T.KwFalse = "'false'"
showTokenKind T.String = "<string>"
showTokenKind T.Int = "<number>"
showTokenKind T.Double = "<number>"
showTokenKind T.LQuote = "'{='"
showTokenKind T.RQuote = "'=}'"
showTokenKind T.Quoted = "<any>" -- Should be impossible, hard to prove though
showTokenKind T.Identifier = "<identifier>"
showTokenKind T.Error = "<error>"
showRegion :: SourcePos -> SourcePos -> String
showRegion start@(SourcePos sl sc) end@(SourcePos el ec)
| start == end = show sl ++ ":" ++ show sc
| sl == el = show sl ++ ":" ++ show sc ++ "-" ++ show ec
| otherwise = show sl ++ ":" ++ show sc ++ "-" ++ show el ++ ":" ++ show ec


@ -0,0 +1,81 @@
{-# LANGUAGE DeriveGeneric #-}
module Wasp.Backend.Token
( Token (..),
TokenKind (..),
tokenKindIsTrivia,
)
where
import Control.DeepSeq (NFData)
import Data.Aeson (ToJSON)
import GHC.Generics (Generic)
-- | The kind of token
--
-- This makes no distinction between value-containing tokens and atomic tokens
-- (i.e. "String" and "KwImport") to make kind comparison easier. To recover the
-- value associated with a token, you need the context provided by a "Token".
data TokenKind
= White
| -- | Newlines (only \n) are separated from whitespace to accommodate line-break
-- based error recovery, if that is ever implemented in the future
Newline
| Comment
| LParen
| RParen
| LSquare
| RSquare
| LCurly
| RCurly
| Comma
| Colon
| KwImport
| KwFrom
| KwTrue
| KwFalse
| String
| Int
| Double
| -- | "{= <identifier>"
LQuote
| -- | "<identifier> =}"
RQuote
| Quoted
| -- | Non-keyword identifier
Identifier
| -- | Kind for unexpected characters
Error
deriving (Eq, Ord, Show, Generic)
instance NFData TokenKind
instance ToJSON TokenKind
-- | A token representing a span of text from the source. For value-containing
-- tokens, the value can be recovered using @tokenText@.
data Token = Token
{ tokenKind :: !TokenKind,
-- | The width of the text representing this token. The source position is
-- not stored; only the width. Later, offsets into the source are computed
-- from an entire tree of tokens, and source positions are determined on
-- demand (for example, when an offset is going to be displayed in an error
-- message).
--
-- This is guaranteed to be equivalent to @length . tokenText@, but is
-- stored explicitly since it is accessed frequently.
tokenWidth :: !Int
}
deriving (Eq, Show, Ord, Generic)
instance NFData Token
instance ToJSON Token
-- | Check if a "TokenKind" is trivia (a token kind that does not affect the
-- parse structure, namely whitespace and comments)
tokenKindIsTrivia :: TokenKind -> Bool
tokenKindIsTrivia White = True
tokenKindIsTrivia Newline = True
tokenKindIsTrivia Comment = True
tokenKindIsTrivia _ = False


@ -0,0 +1,110 @@
{-# LANGUAGE DeriveGeneric #-}
module Wasp.Backend.TokenSet
( -- * TokenSet
-- | A set of "TokenKind"s and possibly EOF (which is not a "TokenKind").
--
-- In functions on a "TokenSet", a @Maybe "TokenKind"@ is either @Nothing@,
-- representing EOF, or @Just k@, representing the kind @k@.
TokenSet,
-- * Membership predicates
member,
eofMember,
kindMember,
-- * Operations
insert,
insertKind,
insertEof,
union,
intersection,
-- * Constructors
empty,
singleton,
fromEOF,
fromKind,
fromList,
-- * Destructors
toList,
)
where
import Control.DeepSeq (NFData)
import Data.Set (Set)
import qualified Data.Set as Set
import GHC.Generics (Generic)
import Wasp.Backend.Token (TokenKind)
data TokenSet = TokenSet
{ -- | Check if EOF is part of the "TokenSet"
eofMember :: !Bool,
kindSet :: Set TokenKind
}
deriving (Eq, Ord, Show, Generic)
instance NFData TokenSet
-- | Check if a "TokenKind" or EOF is part of the "TokenSet".
member :: Maybe TokenKind -> TokenSet -> Bool
member Nothing set = eofMember set
member (Just k) set = kindMember k set
-- | Check if a "TokenKind" is part of the "TokenSet".
kindMember :: TokenKind -> TokenSet -> Bool
kindMember k set = k `Set.member` kindSet set
-- | Insert a "TokenKind" or EOF into a "TokenSet".
insert :: Maybe TokenKind -> TokenSet -> TokenSet
insert Nothing set = insertEof set
insert (Just k) set = insertKind k set
-- | Insert EOF into a "TokenSet".
insertEof :: TokenSet -> TokenSet
insertEof set = set {eofMember = True}
-- | Insert a "TokenKind" into a "TokenSet".
insertKind :: TokenKind -> TokenSet -> TokenSet
insertKind k set = set {kindSet = Set.insert k (kindSet set)}
-- | Get the union of two "TokenSet"s.
union :: TokenSet -> TokenSet -> TokenSet
union left right =
let unionEofMember = eofMember left || eofMember right
unionKindSet = kindSet left `Set.union` kindSet right
in TokenSet {eofMember = unionEofMember, kindSet = unionKindSet}
-- | Get the intersection of two "TokenSet"s.
intersection :: TokenSet -> TokenSet -> TokenSet
intersection left right =
let intersectionEofMember = eofMember left && eofMember right
intersectionKindSet = kindSet left `Set.intersection` kindSet right
in TokenSet {eofMember = intersectionEofMember, kindSet = intersectionKindSet}
-- | The empty "TokenSet".
empty :: TokenSet
empty = TokenSet {eofMember = False, kindSet = Set.empty}
-- | Create a "TokenSet" containing a single "TokenKind" or EOF.
singleton :: Maybe TokenKind -> TokenSet
singleton Nothing = fromEOF
singleton (Just k) = fromKind k
-- | Create a "TokenSet" containing just EOF.
fromEOF :: TokenSet
fromEOF = TokenSet {eofMember = True, kindSet = Set.empty}
-- | Create a "TokenSet" containing a single "TokenKind".
fromKind :: TokenKind -> TokenSet
fromKind k = TokenSet {eofMember = False, kindSet = Set.singleton k}
-- | Create a "TokenSet" from a list of "TokenKind"s.
fromList :: [TokenKind] -> TokenSet
fromList ks = TokenSet {eofMember = False, kindSet = Set.fromList ks}
-- | Get a list of all "TokenKind"s in a "TokenSet".
toList :: TokenSet -> [TokenKind]
toList set = Set.toList (kindSet set)
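-- An informal example (using "TokenKind" constructors from "Wasp.Backend.Token"):
-- @fromList [Comma] `union` fromEOF@ contains the comma kind and EOF, so
-- @member Nothing@ on it returns True, while @kindMember RCurly@ returns False.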


@ -0,0 +1,5 @@
{-# OPTIONS_GHC -F -pgmF tasty-discover -optF --modules=*Test.hs #-}
-- -optF --modules=*Test.hs tells tasty-discover to pick up only files that match *Test.hs.
-- I did not do this for any strong reason so we can remove it in the future if we figure out
-- it is too restrictive.

waspls/test/TestUtil.hs

@ -0,0 +1,39 @@
module TestUtil
( printDiff,
shouldBeWithDiff,
Diffable (toLines),
)
where
import Data.Algorithm.Diff (Diff, PolyDiff (Both, First, Second), getDiff)
import Data.List (intercalate)
import Test.Tasty.Hspec (Expectation, expectationFailure)
shouldBeWithDiff :: (Eq a, Diffable a) => a -> a -> Expectation
shouldBeWithDiff actual expected
| actual == expected = pure ()
| otherwise =
expectationFailure $
"Actual is not expected\n"
++ printDiff (getDiff (toLines expected) (toLines actual))
++ "\ESC[31m"
printDiff :: [Diff String] -> String
printDiff diffs = intercalate "\n" $ map printDiffLine diffs
where
printDiffLine :: Diff String -> String
printDiffLine (First s) = " \ESC[31m- " ++ s
printDiffLine (Second s) = " \ESC[36m+ " ++ s
printDiffLine (Both s _) = " \ESC[0m " ++ s
class Diffable a where
toLines :: a -> [String]
instance Diffable Char where
toLines c = [[c]]
instance Diffable a => Diffable [a] where
toLines xs = concatMap toLines xs
instance (Diffable a, Diffable b) => Diffable (a, b) where
toLines (a, b) = toLines a ++ toLines b


@ -0,0 +1,271 @@
{-# LANGUAGE FlexibleInstances #-}
{-# OPTIONS_GHC -Wno-orphans #-}
module Wasp.Backend.ConcreteParserTest where
import Control.DeepSeq (deepseq)
import Test.Tasty.Hspec (Spec, describe, it)
import Test.Tasty.QuickCheck
import TestUtil
import Wasp.Backend.ConcreteParser
import Wasp.Backend.ConcreteSyntax
import qualified Wasp.Backend.Lexer as L
import Wasp.Backend.ParseError
import qualified Wasp.Backend.Token as T
import qualified Wasp.Backend.TokenSet as TokenSet
token :: T.TokenKind -> String -> T.Token
token kind text = T.Token {T.tokenKind = kind, T.tokenWidth = length text}
node :: SyntaxKind -> Int -> [SyntaxNode] -> SyntaxNode
node kind width children = SyntaxNode {snodeKind = kind, snodeWidth = width, snodeChildren = children}
spec_ParseCSTExpression :: Spec
spec_ParseCSTExpression = do
it "Parses identifiers" $ do
parseCSTExpression [token T.Identifier "foo"]
`shouldBeWithDiff` ([], [node Var 3 []])
it "Parses double literals" $ do
parseCSTExpression [token T.Double "0.5"]
`shouldBeWithDiff` ([], [node Double 3 []])
it "Parses int literals" $ do
parseCSTExpression [token T.Int "0.5"]
`shouldBeWithDiff` ([], [node Int 3 []])
it "Parses true literals" $ do
parseCSTExpression [token T.KwTrue "true"]
`shouldBeWithDiff` ([], [node BoolTrue 4 []])
it "Parses false literals" $ do
parseCSTExpression [token T.KwFalse "false"]
`shouldBeWithDiff` ([], [node BoolFalse 5 []])
it "Parses external imports" $ do
parseCSTExpression [token T.KwImport "import", token T.Identifier "main", token T.KwFrom "from", token T.String "\"@ext/main.js\""]
`shouldBeWithDiff` ( [],
[ node
ExtImport
28
[ node (Token T.KwImport) 6 [],
node ExtImportModule 4 [],
node (Token T.KwFrom) 4 [],
node ExtImportPath 14 []
]
]
)
it "Parses quoted expressions" $ do
parseCSTExpression [token T.LQuote "{=txt", token T.Quoted "h", token T.Quoted "e", token T.Quoted "llo", token T.RQuote "txt=}"]
`shouldBeWithDiff` ( [],
[ node
Quoter
15
[ node (Token T.LQuote) 5 [],
node (Token T.Quoted) 1 [],
node (Token T.Quoted) 1 [],
node (Token T.Quoted) 3 [],
node (Token T.RQuote) 5 []
]
]
)
describe "Lists" $ do
it "Parses empty lists" $ do
parseCSTExpression [token T.LSquare "[", token T.RSquare "]"]
`shouldBeWithDiff` ([], [node List 2 [node (Token T.LSquare) 1 [], node (Token T.RSquare) 1 []]])
it "Parses list with items" $ do
parseCSTExpression [token T.LSquare "[", token T.Identifier "foo", token T.Comma ",", token T.Int "3", token T.RSquare "]"]
`shouldBeWithDiff` ( [],
[ node
List
7
[ node (Token T.LSquare) 1 [],
node Var 3 [],
node (Token T.Comma) 1 [],
node Int 1 [],
node (Token T.RSquare) 1 []
]
]
)
it "Parses lists with a trailing comma" $ do
parseCSTExpression [token T.LSquare "[", token T.Int "1", token T.Comma ",", token T.RSquare "]"]
`shouldBeWithDiff` ( [],
[ node
List
4
[ node (Token T.LSquare) 1 [],
node Int 1 [],
node (Token T.Comma) 1 [],
node (Token T.RSquare) 1 []
]
]
)
spec_ConcreteParser :: Spec
spec_ConcreteParser =
describe "Wasp.Backend.ConcreteParser" $ do
it "works on a simple example" $ do
let tokens =
[ token T.Identifier "app",
token T.White " ",
token T.Identifier "Test",
token T.White " ",
token T.LCurly "{",
token T.Newline "\n",
token T.White " ",
token T.Comment "// this is the title of the app",
token T.Newline "\n",
token T.White " ",
token T.Identifier "title",
token T.Colon ":",
token T.String "\"hello world\"",
token T.Newline "\n",
token T.RCurly "}"
]
let errors = []
let tree =
node
Program
68
[ node
Decl
68
[ node DeclType 3 [],
node (Token T.White) 1 [],
node DeclName 4 [],
node (Token T.White) 1 [],
node
Dict
59
[ node (Token T.LCurly) 1 [],
node (Token T.Newline) 1 [],
node (Token T.White) 2 [],
node (Token T.Comment) 31 [],
node (Token T.Newline) 1 [],
node (Token T.White) 2 [],
node
DictEntry
19
[ node DictKey 5 [],
node (Token T.Colon) 1 [],
node String 13 []
],
node (Token T.Newline) 1 [],
node (Token T.RCurly) 1 []
]
]
]
let (actualErrors, actualTrees) = parseCST tokens
actualTrees `shouldBeWithDiff` [tree]
actualErrors `shouldBeWithDiff` errors
it "works for incomplete source" $ do
let tokens =
[ token T.Identifier "app",
token T.Identifier "Test",
token T.LCurly "{",
token T.Identifier "title",
token T.Colon ":"
]
let errors = [UnexpectedEOF 14 $ TokenSet.fromList [T.LParen, T.LCurly, T.LSquare, T.KwImport, T.KwTrue, T.KwFalse, T.String, T.Double, T.Int, T.Identifier, T.LQuote]]
let tree =
node
Program
14
[ node
Decl
14
[ node DeclType 3 [],
node DeclName 4 [],
node
Dict
7
[ node (Token T.LCurly) 1 [],
node
DictEntry
6
[ node DictKey 5 [],
node (Token T.Colon) 1 [],
node Error 0 []
]
]
]
]
parseCST tokens `shouldBeWithDiff` (errors, [tree])
it "works when incomplete with closing bracket" $ do
let tokens =
[ token T.Identifier "route",
token T.White " ",
token T.Identifier "TestRoute",
token T.White " ",
token T.LCurly "{",
token T.Newline "\n",
token T.White " ",
token T.Identifier "path",
token T.Colon ":",
token T.White " ",
token T.String "\"/\"",
token T.Comma ",",
token T.Newline "\n",
token T.White " ",
token T.Identifier "to",
token T.Colon ":",
token T.White " ",
token T.Newline "\n",
token T.RCurly "}"
]
let errors =
[ UnexpectedToken (Region 38 39) T.RCurly $ TokenSet.fromList [T.LParen, T.LCurly, T.LSquare, T.KwImport, T.KwTrue, T.KwFalse, T.String, T.Double, T.Int, T.Identifier, T.LQuote]
]
let tree =
node
Program
39
[ node
Decl
39
[ node DeclType 5 [],
node (Token T.White) 1 [],
node DeclName 9 [],
node (Token T.White) 1 [],
node
Dict
23
[ node (Token T.LCurly) 1 [],
node (Token T.Newline) 1 [],
node (Token T.White) 2 [],
node
DictEntry
9
[ node DictKey 4 [],
node (Token T.Colon) 1 [],
node (Token T.White) 1 [],
node String 3 []
],
node (Token T.Comma) 1 [],
node (Token T.Newline) 1 [],
node (Token T.White) 2 [],
node
DictEntry
5
[ node DictKey 2 [],
node (Token T.Colon) 1 [],
node (Token T.White) 1 [],
node (Token T.Newline) 1 [],
node Error 0 []
],
node (Token T.RCurly) 1 []
]
]
]
parseCST tokens `shouldBeWithDiff` (errors, [tree])
-- DEPENDS ON: LexerTest#never fails to lex
-- TODO: remove dependency by making arbitrary instance for Token
--
-- The point of this test is to ensure some sort of parse tree is always built
it "never fails to parse" $
property $ \source -> parseCST (L.lex source) `deepseq` True
instance Diffable SyntaxNode where
toLines n = lines $ cstPrettyPrint n
instance Diffable ParseError where
toLines err = ["At " ++ show (errorRegion err), " " ++ showErrorMessage err]


@ -0,0 +1,66 @@
{-# OPTIONS_GHC -Wno-orphans #-}
module Wasp.Backend.LexerTest where
import Control.DeepSeq (deepseq)
import Data.List (isInfixOf)
import Test.Tasty.Hspec
import Test.Tasty.QuickCheck
import TestUtil
import qualified Wasp.Backend.Lexer as L
import Wasp.Backend.Token
token :: TokenKind -> String -> Token
token kind text = Token {tokenKind = kind, tokenWidth = length text}
spec_Lexer :: Spec
spec_Lexer = do
it "works for a simple example" $ do
let source = "app Test { name: \"hello world\" }\n\n\n // this is a comment\nimport from"
let expected =
[ token Identifier "app",
token White " ",
token Identifier "Test",
token White " ",
token LCurly "{",
token White " ",
token Identifier "name",
token Colon ":",
token White " ",
token String "\"hello world\"",
token White " ",
token RCurly "}",
token Newline "\n",
token Newline "\n",
token Newline "\n",
token White " ",
token Comment "// this is a comment",
token Newline "\n",
token KwImport "import",
token White " ",
token KwFrom "from"
]
L.lex source `shouldBeWithDiff` expected
it "only makes Quoted tokens after an LQuote and before an RQuote" $ do
property $ \inner ->
if "txt=}" `isInfixOf` inner
then discard
else case L.lex $ "{=txt " ++ inner ++ " txt=}" of
[] -> False
open : rest
| tokenKind open /= LQuote -> False
| otherwise ->
all ((== Quoted) . tokenKind) (init rest)
&& (tokenKind (last rest) == RQuote)
it "never fails to lex" $ do
-- @deepseq@ is used to force evaluation of the tokens, making sure an
-- error never occurs (there are calls to @error@ in the lexer that we
-- need to make sure never actually happen)
property $ \source -> L.lex source `deepseq` True
instance Diffable Token where
toLines tok =
[ show (tokenKind tok) ++ "[" ++ show (tokenWidth tok) ++ "]"
]


@ -42,14 +42,26 @@ library
import: common-all
exposed-modules:
Wasp.LSP.Server
Wasp.LSP.Core
Wasp.LSP.Handlers
Wasp.Backend.Token
Wasp.Backend.Lexer
Wasp.Backend.TokenSet
Wasp.Backend.ParseError
Wasp.Backend.ConcreteSyntax
Wasp.Backend.ConcreteParser
Wasp.Backend.ConcreteParser.Internal
other-modules:
Paths_waspls
Wasp.LSP.Core
Wasp.LSP.Handlers
Wasp.Backend.Lexer.Internal
Wasp.Backend.Lexer.Lexer
hs-source-dirs:
src
build-tool-depends:
, alex:alex
build-depends:
, base ^>=4.14.3.0
, waspc
, lsp ^>=1.4.0.0
, lsp-types ^>=1.4.0.1
, containers ^>=0.6.5.1
@ -60,7 +72,34 @@ library
, hslogger ^>=1.3.1.0
, aeson ^>=1.5.6
, lens ^>=5.1
, waspc
, aeson-pretty ^>= 0.8
, utf8-string ^>= 1.0.2
-- 'array' is used by code generated by Alex for src/Wasp/Backend/Lexer/Lexer.x
, array ^>= 0.5.4
, deepseq
test-suite waspls-test
import: common-all, common-exe
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: TastyDiscoverDriver.hs
build-tool-depends:
, tasty-discover:tasty-discover
build-depends:
, base
, waspls
, deepseq
, containers ^>= 0.6.5
, Diff ^>= 0.4.1
, QuickCheck ^>= 2.14
, tasty ^>= 1.4.2
-- tasty-hspec 1.1.7 introduces breaking changes, which is why we have < 1.1.7 .
, tasty-hspec >= 1.1 && < 1.1.7
, tasty-quickcheck ^>= 0.10
other-modules:
TestUtil
Wasp.Backend.LexerTest
Wasp.Backend.ConcreteParserTest
executable waspls
import: common-all, common-exe