Split off tokens and new layout in a separate modules

This commit is contained in:
Iavor Diatchki 2021-04-29 11:17:34 -07:00
parent 76429aaffa
commit 7727197800
7 changed files with 349 additions and 114 deletions

View File

@ -81,6 +81,8 @@ library
Exposed-modules: Cryptol.Parser,
Cryptol.Parser.Lexer,
Cryptol.Parser.Token,
Cryptol.Parser.Layout,
Cryptol.Parser.AST,
Cryptol.Parser.Position,
Cryptol.Parser.Names,

View File

@ -35,6 +35,7 @@ import Control.Monad(liftM2,msum)
import Cryptol.Parser.AST
import Cryptol.Parser.Position
import Cryptol.Parser.LexerUtils hiding (mkIdent)
import Cryptol.Parser.Token
import Cryptol.Parser.ParserUtils
import Cryptol.Parser.Unlit(PreProc(..), guessPreProc)
import Cryptol.Utils.Ident(paramInstModName)

View File

@ -0,0 +1,216 @@
{-# Language BlockArguments #-}
{-# Language OverloadedStrings #-}
module Cryptol.Parser.Layout where
import Cryptol.Parser.Position
import Cryptol.Parser.Token
{-
We assume the existence of an explicit EOF token at the end of the input. This token is *less* indented
than all other tokens (i.e., it is at column 0)
Explicit Layout Blocks
* The symbols `(`, `{`, and `[` start an explicit layout block.
* While in an explicit layout block we pass through tokens, except:
- We may start new implicit or explicit layout blocks
- We terminate the current layout block if we encounter the matching
closing symbol `)`, `}`, `]`
Implicit Layout Blocks
* The keywords `where`, `private`, and `parameter` start an implicit
layout block.
* The layout block starts at the column of the *following* token and we
insert "virtual start block" between the current and the following tokens.
* While in an implicit layout block:
- We may start new implicit or explicit layout blocks
- We insert a "virtual separator" before tokens starting at the same
column as the layout block, EXCEPT:
* we do not insert a separator if the previous token was a
"documentation comment"
* we do not insert a separator before the first token in the block
- The implicit layout block is ended by one of the tokens
`,`, `)`, `}`, `]`, or a token than is less indented that the
block's column.
- When an implicit layout block ends, we insert a "virtual end block"
token just before the token that caused the block to end.
Examples:
f = x where x = 0x1 -- end implicit layout by layout
g = 0x3 -- (`g` is less indented than `x`)
f (x where x = 2) -- end implicit layout by `)`
[ x where x = 2, 3 ] -- end implicit layout by `,`
module A where -- two implicit layout blocks with the
private -- *same* indentation (`where` and `private`)
x = 0x2
-}
layout :: Bool -> [Located Token] -> [Located Token]
layout isMod ts0
-- Star an implicit layout block at the top of the module
| let t = head ts0
blockCol = col (from (srcRange t))
, isMod && tokenType (thing t) /= KW KW_module =
go [ Virtual blockCol ] blockCol True ts0
| otherwise =
go [] 0 False ts0
where
{- State parameters for `go`:
stack:
The stack of implicit and explicit blocks
lastVirt:
The indentation of the outer most implicit block, or 0 if none.
This can be computed from the stack but we cache
it here as we need to check it on each token.
noVirtSep:
Do not emit a virtual separator even if token matches block alignment.
This is enabled at the beginning of a block, or after a doc string,
or if we just emitted a separtor, but have not yet consumed the
next token.
tokens:
remaining tokens to process
-}
go stack lastVirt noVirtSep tokens
-- End implicit layout due to indentation. If the outermost block
-- is a lyout block we just end it. If the outermost block is an
-- explicit layout block we report a lexical error.
| col curLoc < lastVirt =
endImplictBlock
-- End implicit layout block due to a symbol
| Virtual {} <- curBlock, endsLayout curTokTy =
endImplictBlock
-- Insert a virtual separator
| Virtual {} <- curBlock
, col curLoc == lastVirt && not noVirtSep =
virt curRange VSemi : go stack lastVirt True tokens
-- Start a new implicit layout. Advances token position.
| startsLayout curTokTy = startImplicitBlock
-- Start a paren block. Advances token position
| Just close <- startsParenBlock curTokTy =
curTok : go (Explicit close : stack) lastVirt False advanceTokens
-- End a paren block. Advances token position
| Explicit close <- curBlock, close == curTokTy =
curTok : go popStack lastVirt False advanceTokens
-- Disable virtual separator after doc string. Advances token position
| White DocStr <- curTokTy =
curTok : go stack lastVirt True advanceTokens
-- Check to see if we are done. Note that if we got here, implicit layout
-- blocks should have already been closed, as `EOF` is less indented than
-- all other tokens
| EOF <- curTokTy =
[curTok]
-- Any other token, just emit. Advances token position
| otherwise =
curTok : go stack lastVirt False advanceTokens
where
curTok : advanceTokens = tokens
curTokTy = tokenType (thing curTok)
curRange = srcRange curTok
curLoc = from curRange
curBlock : popStack = stack
startImplicitBlock =
let nextRng = srcRange (head advanceTokens)
nextLoc = from nextRng
blockCol = col nextLoc
in curTok
: virt nextRng VCurlyL
: go (Virtual blockCol : stack) blockCol True advanceTokens
endImplictBlock =
case curBlock of
Virtual {} -> go popStack newVirt False tokens
where newVirt = case [ n | Virtual n <- popStack ] of
n : _ -> n
_ -> 0
Explicit c -> errTok curRange (InvalidIndentation c) : advanceTokens
--------------------------------------------------------------------------------
data Block =
Virtual Int -- ^ Virtual layout block
| Explicit TokenT -- ^ An explicit layout block, expecting this ending token.
deriving (Show)
-- | These tokens start an implicit layout block
startsLayout :: TokenT -> Bool
startsLayout ty =
case ty of
KW KW_where -> True
KW KW_private -> True
KW KW_parameter -> True
_ -> False
-- | These tokens end an implicit layout block
endsLayout :: TokenT -> Bool
endsLayout ty =
case ty of
Sym Comma -> True
Sym BracketR -> True
Sym ParenR -> True
Sym CurlyR -> True
_ -> False
-- | These tokens start an explicit "paren" layout block.
-- If so, the result contains the corresponding closing paren.
startsParenBlock :: TokenT -> Maybe TokenT
startsParenBlock ty =
case ty of
Sym BracketL -> Just (Sym BracketR)
Sym ParenL -> Just (Sym ParenR)
Sym CurlyL -> Just (Sym CurlyR)
_ -> Nothing
--------------------------------------------------------------------------------
-- | Make a virtual token of the given type
virt :: Range -> TokenV -> Located Token
virt rng x = Located { srcRange = rng { to = from rng }, thing = t }
where
t = Token (Virt x)
case x of
VCurlyL -> "beginning of layout block"
VCurlyR -> "end of layout block"
VSemi -> "layout block separator"
errTok :: Range -> TokenErr -> Located Token
errTok rng x = Located { srcRange = rng { to = from rng }, thing = t }
where
t = Token { tokenType = Err x, tokenText = "" }

View File

@ -21,6 +21,7 @@ module Cryptol.Parser.Lexer
) where
import Cryptol.Parser.Position
import Cryptol.Parser.Token
import Cryptol.Parser.LexerUtils
import Cryptol.Parser.Unlit(unLit)
import Data.Text (Text)

View File

@ -6,18 +6,10 @@
-- Stability : provisional
-- Portability : portable
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE BlockArguments #-}
module Cryptol.Parser.LexerUtils where
import Cryptol.Parser.Position
import Cryptol.Parser.Unlit(PreProc(None))
import Cryptol.Utils.PP
import Cryptol.Utils.Panic
import Control.Monad(guard)
import Data.Char(toLower,generalCategory,isAscii,ord,isSpace,
isAlphaNum,isAlpha)
@ -27,8 +19,12 @@ import qualified Data.Text as T
import qualified Data.Text.Read as T
import Data.Word(Word8)
import GHC.Generics (Generic)
import Control.DeepSeq
import Cryptol.Utils.Panic
import Cryptol.Parser.Position
import Cryptol.Parser.Token
import Cryptol.Parser.Unlit(PreProc(None))
data Config = Config
{ cfgSource :: !FilePath -- ^ File that we are working on
@ -464,109 +460,6 @@ virt cfg pos x = Located { srcRange = Range
--------------------------------------------------------------------------------
data Token = Token { tokenType :: !TokenT, tokenText :: !Text }
deriving (Show, Generic, NFData)
-- | Virtual tokens, inserted by layout processing.
data TokenV = VCurlyL| VCurlyR | VSemi
deriving (Eq, Show, Generic, NFData)
data TokenW = BlockComment | LineComment | Space | DocStr
deriving (Eq, Show, Generic, NFData)
data TokenKW = KW_else
| KW_extern
| KW_fin
| KW_if
| KW_private
| KW_include
| KW_inf
| KW_lg2
| KW_lengthFromThen
| KW_lengthFromThenTo
| KW_max
| KW_min
| KW_module
| KW_submodule
| KW_newtype
| KW_pragma
| KW_property
| KW_then
| KW_type
| KW_where
| KW_let
| KW_x
| KW_import
| KW_as
| KW_hiding
| KW_infixl
| KW_infixr
| KW_infix
| KW_primitive
| KW_parameter
| KW_constraint
| KW_Prop
deriving (Eq, Show, Generic, NFData)
-- | The named operators are a special case for parsing types, and 'Other' is
-- used for all other cases that lexed as an operator.
data TokenOp = Plus | Minus | Mul | Div | Exp | Mod
| Equal | LEQ | GEQ
| Complement | Hash | At
| Other [T.Text] T.Text
deriving (Eq, Show, Generic, NFData)
data TokenSym = Bar
| ArrL | ArrR | FatArrR
| Lambda
| EqDef
| Comma
| Semi
| Dot
| DotDot
| DotDotDot
| DotDotLt
| Colon
| BackTick
| ParenL | ParenR
| BracketL | BracketR
| CurlyL | CurlyR
| TriL | TriR
| Lt
| Underscore
deriving (Eq, Show, Generic, NFData)
data TokenErr = UnterminatedComment
| UnterminatedString
| UnterminatedChar
| InvalidString
| InvalidChar
| LexicalError
| MalformedLiteral
| MalformedSelector
deriving (Eq, Show, Generic, NFData)
data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int
deriving (Eq, Show, Generic, NFData)
data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits
| Frac !Rational !Int -- ^ value, base.
| ChrLit !Char -- ^ character literal
| Ident ![T.Text] !T.Text -- ^ (qualified) identifier
| StrLit !String -- ^ string literal
| Selector !SelectorType -- ^ .hello or .123
| KW !TokenKW -- ^ keyword
| Op !TokenOp -- ^ operator
| Sym !TokenSym -- ^ symbol
| Virt !TokenV -- ^ virtual token (for layout)
| White !TokenW -- ^ white space token
| Err !TokenErr -- ^ error token
| EOF
deriving (Eq, Show, Generic, NFData)
instance PP Token where
ppPrec _ (Token _ s) = text (T.unpack s)
-- | Collapse characters into a single Word8, identifying ASCII, and classes of
-- unicode. This came from:
--

View File

@ -35,7 +35,7 @@ import Prelude.Compat
import Cryptol.Parser.AST
import Cryptol.Parser.Lexer
import Cryptol.Parser.LexerUtils(SelectorType(..))
import Cryptol.Parser.Token(SelectorType(..))
import Cryptol.Parser.Position
import Cryptol.Parser.Utils (translateExprToNumT,widthIdent)
import Cryptol.Utils.Ident(packModName,packIdent,modNameChunks)
@ -81,6 +81,12 @@ lexerP k = P $ \cfg p s ->
T.unpack (tokenText it)
MalformedSelector -> "malformed selector: " ++
T.unpack (tokenText it)
InvalidIndentation c -> "invalid indentation, expected " ++
case c of
Sym CurlyR -> "}"
Sym ParenR -> ")"
Sym BracketR -> "]"
_ -> show c -- basically panic
]
where it = thing t

116
src/Cryptol/Parser/Token.hs Normal file
View File

@ -0,0 +1,116 @@
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
module Cryptol.Parser.Token where
import Data.Text(Text)
import qualified Data.Text as Text
import Control.DeepSeq
import GHC.Generics
import Cryptol.Utils.PP
data Token = Token { tokenType :: !TokenT, tokenText :: !Text }
deriving (Show, Generic, NFData)
-- | Virtual tokens, inserted by layout processing.
data TokenV = VCurlyL| VCurlyR | VSemi
deriving (Eq, Show, Generic, NFData)
data TokenW = BlockComment | LineComment | Space | DocStr
deriving (Eq, Show, Generic, NFData)
data TokenKW = KW_else
| KW_extern
| KW_fin
| KW_if
| KW_private
| KW_include
| KW_inf
| KW_lg2
| KW_lengthFromThen
| KW_lengthFromThenTo
| KW_max
| KW_min
| KW_module
| KW_submodule
| KW_newtype
| KW_pragma
| KW_property
| KW_then
| KW_type
| KW_where
| KW_let
| KW_x
| KW_import
| KW_as
| KW_hiding
| KW_infixl
| KW_infixr
| KW_infix
| KW_primitive
| KW_parameter
| KW_constraint
| KW_Prop
deriving (Eq, Show, Generic, NFData)
-- | The named operators are a special case for parsing types, and 'Other' is
-- used for all other cases that lexed as an operator.
data TokenOp = Plus | Minus | Mul | Div | Exp | Mod
| Equal | LEQ | GEQ
| Complement | Hash | At
| Other [Text] Text
deriving (Eq, Show, Generic, NFData)
data TokenSym = Bar
| ArrL | ArrR | FatArrR
| Lambda
| EqDef
| Comma
| Semi
| Dot
| DotDot
| DotDotDot
| DotDotLt
| Colon
| BackTick
| ParenL | ParenR
| BracketL | BracketR
| CurlyL | CurlyR
| TriL | TriR
| Lt
| Underscore
deriving (Eq, Show, Generic, NFData)
data TokenErr = UnterminatedComment
| UnterminatedString
| UnterminatedChar
| InvalidString
| InvalidChar
| LexicalError
| MalformedLiteral
| MalformedSelector
| InvalidIndentation TokenT -- expected closing paren
deriving (Eq, Show, Generic, NFData)
data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int
deriving (Eq, Show, Generic, NFData)
data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits
| Frac !Rational !Int -- ^ value, base.
| ChrLit !Char -- ^ character literal
| Ident ![Text] !Text -- ^ (qualified) identifier
| StrLit !String -- ^ string literal
| Selector !SelectorType -- ^ .hello or .123
| KW !TokenKW -- ^ keyword
| Op !TokenOp -- ^ operator
| Sym !TokenSym -- ^ symbol
| Virt !TokenV -- ^ virtual token (for layout)
| White !TokenW -- ^ white space token
| Err !TokenErr -- ^ error token
| EOF
deriving (Eq, Show, Generic, NFData)
instance PP Token where
ppPrec _ (Token _ s) = text (Text.unpack s)