From 7727197800e74b5d6a63ff703a9fb7372aa6c202 Mon Sep 17 00:00:00 2001 From: Iavor Diatchki Date: Thu, 29 Apr 2021 11:17:34 -0700 Subject: [PATCH] Split off tokens and new layout in a separate modules --- cryptol.cabal | 2 + src/Cryptol/Parser.y | 1 + src/Cryptol/Parser/Layout.hs | 216 ++++++++++++++++++++++++++++++ src/Cryptol/Parser/Lexer.x | 1 + src/Cryptol/Parser/LexerUtils.hs | 119 +--------------- src/Cryptol/Parser/ParserUtils.hs | 8 +- src/Cryptol/Parser/Token.hs | 116 ++++++++++++++++ 7 files changed, 349 insertions(+), 114 deletions(-) create mode 100644 src/Cryptol/Parser/Layout.hs create mode 100644 src/Cryptol/Parser/Token.hs diff --git a/cryptol.cabal b/cryptol.cabal index df420792..8e68eb27 100644 --- a/cryptol.cabal +++ b/cryptol.cabal @@ -81,6 +81,8 @@ library Exposed-modules: Cryptol.Parser, Cryptol.Parser.Lexer, + Cryptol.Parser.Token, + Cryptol.Parser.Layout, Cryptol.Parser.AST, Cryptol.Parser.Position, Cryptol.Parser.Names, diff --git a/src/Cryptol/Parser.y b/src/Cryptol/Parser.y index dfc98293..a9fb04d1 100644 --- a/src/Cryptol/Parser.y +++ b/src/Cryptol/Parser.y @@ -35,6 +35,7 @@ import Control.Monad(liftM2,msum) import Cryptol.Parser.AST import Cryptol.Parser.Position import Cryptol.Parser.LexerUtils hiding (mkIdent) +import Cryptol.Parser.Token import Cryptol.Parser.ParserUtils import Cryptol.Parser.Unlit(PreProc(..), guessPreProc) import Cryptol.Utils.Ident(paramInstModName) diff --git a/src/Cryptol/Parser/Layout.hs b/src/Cryptol/Parser/Layout.hs new file mode 100644 index 00000000..40dc2bab --- /dev/null +++ b/src/Cryptol/Parser/Layout.hs @@ -0,0 +1,216 @@ +{-# Language BlockArguments #-} +{-# Language OverloadedStrings #-} +module Cryptol.Parser.Layout where + +import Cryptol.Parser.Position +import Cryptol.Parser.Token + +{- + +We assume the existence of an explicit EOF token at the end of the input. This token is *less* indented +than all other tokens (i.e., it is at column 0) + +Explicit Layout Blocks + + * The symbols `(`, `{`, and `[` start an explicit layout block. + * While in an explicit layout block we pass through tokens, except: + - We may start new implicit or explicit layout blocks + - We terminate the current layout block if we encounter the matching + closing symbol `)`, `}`, `]` + +Implicit Layout Blocks + + * The keywords `where`, `private`, and `parameter` start an implicit + layout block. + * The layout block starts at the column of the *following* token and we + insert "virtual start block" between the current and the following tokens. + * While in an implicit layout block: + - We may start new implicit or explicit layout blocks + - We insert a "virtual separator" before tokens starting at the same + column as the layout block, EXCEPT: + * we do not insert a separator if the previous token was a + "documentation comment" + * we do not insert a separator before the first token in the block + + - The implicit layout block is ended by one of the tokens + `,`, `)`, `}`, `]`, or a token than is less indented that the + block's column. + - When an implicit layout block ends, we insert a "virtual end block" + token just before the token that caused the block to end. + +Examples: + +f = x where x = 0x1 -- end implicit layout by layout +g = 0x3 -- (`g` is less indented than `x`) + +f (x where x = 2) -- end implicit layout by `)` + +[ x where x = 2, 3 ] -- end implicit layout by `,` + +module A where -- two implicit layout blocks with the +private -- *same* indentation (`where` and `private`) +x = 0x2 +-} + + +layout :: Bool -> [Located Token] -> [Located Token] +layout isMod ts0 + + -- Star an implicit layout block at the top of the module + | let t = head ts0 + blockCol = col (from (srcRange t)) + , isMod && tokenType (thing t) /= KW KW_module = + go [ Virtual blockCol ] blockCol True ts0 + + | otherwise = + go [] 0 False ts0 + + where + + {- State parameters for `go`: + + stack: + The stack of implicit and explicit blocks + + lastVirt: + The indentation of the outer most implicit block, or 0 if none. + This can be computed from the stack but we cache + it here as we need to check it on each token. + + noVirtSep: + Do not emit a virtual separator even if token matches block alignment. + This is enabled at the beginning of a block, or after a doc string, + or if we just emitted a separtor, but have not yet consumed the + next token. + + tokens: + remaining tokens to process + -} + + go stack lastVirt noVirtSep tokens + + -- End implicit layout due to indentation. If the outermost block + -- is a lyout block we just end it. If the outermost block is an + -- explicit layout block we report a lexical error. + | col curLoc < lastVirt = + endImplictBlock + + -- End implicit layout block due to a symbol + | Virtual {} <- curBlock, endsLayout curTokTy = + endImplictBlock + + -- Insert a virtual separator + | Virtual {} <- curBlock + , col curLoc == lastVirt && not noVirtSep = + virt curRange VSemi : go stack lastVirt True tokens + + -- Start a new implicit layout. Advances token position. + | startsLayout curTokTy = startImplicitBlock + + -- Start a paren block. Advances token position + | Just close <- startsParenBlock curTokTy = + curTok : go (Explicit close : stack) lastVirt False advanceTokens + + -- End a paren block. Advances token position + | Explicit close <- curBlock, close == curTokTy = + curTok : go popStack lastVirt False advanceTokens + + -- Disable virtual separator after doc string. Advances token position + | White DocStr <- curTokTy = + curTok : go stack lastVirt True advanceTokens + + -- Check to see if we are done. Note that if we got here, implicit layout + -- blocks should have already been closed, as `EOF` is less indented than + -- all other tokens + | EOF <- curTokTy = + [curTok] + + -- Any other token, just emit. Advances token position + | otherwise = + curTok : go stack lastVirt False advanceTokens + + where + curTok : advanceTokens = tokens + curTokTy = tokenType (thing curTok) + curRange = srcRange curTok + curLoc = from curRange + + curBlock : popStack = stack + + + startImplicitBlock = + let nextRng = srcRange (head advanceTokens) + nextLoc = from nextRng + blockCol = col nextLoc + in curTok + : virt nextRng VCurlyL + : go (Virtual blockCol : stack) blockCol True advanceTokens + + + endImplictBlock = + case curBlock of + Virtual {} -> go popStack newVirt False tokens + where newVirt = case [ n | Virtual n <- popStack ] of + n : _ -> n + _ -> 0 + + Explicit c -> errTok curRange (InvalidIndentation c) : advanceTokens + + +-------------------------------------------------------------------------------- + +data Block = + Virtual Int -- ^ Virtual layout block + | Explicit TokenT -- ^ An explicit layout block, expecting this ending token. + deriving (Show) + +-- | These tokens start an implicit layout block +startsLayout :: TokenT -> Bool +startsLayout ty = + case ty of + KW KW_where -> True + KW KW_private -> True + KW KW_parameter -> True + _ -> False + +-- | These tokens end an implicit layout block +endsLayout :: TokenT -> Bool +endsLayout ty = + case ty of + Sym Comma -> True + Sym BracketR -> True + Sym ParenR -> True + Sym CurlyR -> True + _ -> False + +-- | These tokens start an explicit "paren" layout block. +-- If so, the result contains the corresponding closing paren. +startsParenBlock :: TokenT -> Maybe TokenT +startsParenBlock ty = + case ty of + Sym BracketL -> Just (Sym BracketR) + Sym ParenL -> Just (Sym ParenR) + Sym CurlyL -> Just (Sym CurlyR) + _ -> Nothing + + +-------------------------------------------------------------------------------- + +-- | Make a virtual token of the given type +virt :: Range -> TokenV -> Located Token +virt rng x = Located { srcRange = rng { to = from rng }, thing = t } + where + t = Token (Virt x) + case x of + VCurlyL -> "beginning of layout block" + VCurlyR -> "end of layout block" + VSemi -> "layout block separator" + +errTok :: Range -> TokenErr -> Located Token +errTok rng x = Located { srcRange = rng { to = from rng }, thing = t } + where + t = Token { tokenType = Err x, tokenText = "" } + + + + diff --git a/src/Cryptol/Parser/Lexer.x b/src/Cryptol/Parser/Lexer.x index 36eba127..9bd3a080 100644 --- a/src/Cryptol/Parser/Lexer.x +++ b/src/Cryptol/Parser/Lexer.x @@ -21,6 +21,7 @@ module Cryptol.Parser.Lexer ) where import Cryptol.Parser.Position +import Cryptol.Parser.Token import Cryptol.Parser.LexerUtils import Cryptol.Parser.Unlit(unLit) import Data.Text (Text) diff --git a/src/Cryptol/Parser/LexerUtils.hs b/src/Cryptol/Parser/LexerUtils.hs index 7b18c515..15f52763 100644 --- a/src/Cryptol/Parser/LexerUtils.hs +++ b/src/Cryptol/Parser/LexerUtils.hs @@ -6,18 +6,10 @@ -- Stability : provisional -- Portability : portable -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} -{-# LANGUAGE BlockArguments #-} module Cryptol.Parser.LexerUtils where -import Cryptol.Parser.Position -import Cryptol.Parser.Unlit(PreProc(None)) -import Cryptol.Utils.PP -import Cryptol.Utils.Panic - import Control.Monad(guard) import Data.Char(toLower,generalCategory,isAscii,ord,isSpace, isAlphaNum,isAlpha) @@ -27,8 +19,12 @@ import qualified Data.Text as T import qualified Data.Text.Read as T import Data.Word(Word8) -import GHC.Generics (Generic) -import Control.DeepSeq +import Cryptol.Utils.Panic +import Cryptol.Parser.Position +import Cryptol.Parser.Token +import Cryptol.Parser.Unlit(PreProc(None)) + + data Config = Config { cfgSource :: !FilePath -- ^ File that we are working on @@ -464,109 +460,6 @@ virt cfg pos x = Located { srcRange = Range -------------------------------------------------------------------------------- -data Token = Token { tokenType :: !TokenT, tokenText :: !Text } - deriving (Show, Generic, NFData) - --- | Virtual tokens, inserted by layout processing. -data TokenV = VCurlyL| VCurlyR | VSemi - deriving (Eq, Show, Generic, NFData) - -data TokenW = BlockComment | LineComment | Space | DocStr - deriving (Eq, Show, Generic, NFData) - -data TokenKW = KW_else - | KW_extern - | KW_fin - | KW_if - | KW_private - | KW_include - | KW_inf - | KW_lg2 - | KW_lengthFromThen - | KW_lengthFromThenTo - | KW_max - | KW_min - | KW_module - | KW_submodule - | KW_newtype - | KW_pragma - | KW_property - | KW_then - | KW_type - | KW_where - | KW_let - | KW_x - | KW_import - | KW_as - | KW_hiding - | KW_infixl - | KW_infixr - | KW_infix - | KW_primitive - | KW_parameter - | KW_constraint - | KW_Prop - deriving (Eq, Show, Generic, NFData) - --- | The named operators are a special case for parsing types, and 'Other' is --- used for all other cases that lexed as an operator. -data TokenOp = Plus | Minus | Mul | Div | Exp | Mod - | Equal | LEQ | GEQ - | Complement | Hash | At - | Other [T.Text] T.Text - deriving (Eq, Show, Generic, NFData) - -data TokenSym = Bar - | ArrL | ArrR | FatArrR - | Lambda - | EqDef - | Comma - | Semi - | Dot - | DotDot - | DotDotDot - | DotDotLt - | Colon - | BackTick - | ParenL | ParenR - | BracketL | BracketR - | CurlyL | CurlyR - | TriL | TriR - | Lt - | Underscore - deriving (Eq, Show, Generic, NFData) - -data TokenErr = UnterminatedComment - | UnterminatedString - | UnterminatedChar - | InvalidString - | InvalidChar - | LexicalError - | MalformedLiteral - | MalformedSelector - deriving (Eq, Show, Generic, NFData) - -data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int - deriving (Eq, Show, Generic, NFData) - -data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits - | Frac !Rational !Int -- ^ value, base. - | ChrLit !Char -- ^ character literal - | Ident ![T.Text] !T.Text -- ^ (qualified) identifier - | StrLit !String -- ^ string literal - | Selector !SelectorType -- ^ .hello or .123 - | KW !TokenKW -- ^ keyword - | Op !TokenOp -- ^ operator - | Sym !TokenSym -- ^ symbol - | Virt !TokenV -- ^ virtual token (for layout) - | White !TokenW -- ^ white space token - | Err !TokenErr -- ^ error token - | EOF - deriving (Eq, Show, Generic, NFData) - -instance PP Token where - ppPrec _ (Token _ s) = text (T.unpack s) - -- | Collapse characters into a single Word8, identifying ASCII, and classes of -- unicode. This came from: -- diff --git a/src/Cryptol/Parser/ParserUtils.hs b/src/Cryptol/Parser/ParserUtils.hs index 0f1ebda3..4b32a872 100644 --- a/src/Cryptol/Parser/ParserUtils.hs +++ b/src/Cryptol/Parser/ParserUtils.hs @@ -35,7 +35,7 @@ import Prelude.Compat import Cryptol.Parser.AST import Cryptol.Parser.Lexer -import Cryptol.Parser.LexerUtils(SelectorType(..)) +import Cryptol.Parser.Token(SelectorType(..)) import Cryptol.Parser.Position import Cryptol.Parser.Utils (translateExprToNumT,widthIdent) import Cryptol.Utils.Ident(packModName,packIdent,modNameChunks) @@ -81,6 +81,12 @@ lexerP k = P $ \cfg p s -> T.unpack (tokenText it) MalformedSelector -> "malformed selector: " ++ T.unpack (tokenText it) + InvalidIndentation c -> "invalid indentation, expected " ++ + case c of + Sym CurlyR -> "}" + Sym ParenR -> ")" + Sym BracketR -> "]" + _ -> show c -- basically panic ] where it = thing t diff --git a/src/Cryptol/Parser/Token.hs b/src/Cryptol/Parser/Token.hs new file mode 100644 index 00000000..280f8157 --- /dev/null +++ b/src/Cryptol/Parser/Token.hs @@ -0,0 +1,116 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DeriveGeneric #-} +module Cryptol.Parser.Token where + +import Data.Text(Text) +import qualified Data.Text as Text +import Control.DeepSeq +import GHC.Generics + +import Cryptol.Utils.PP + +data Token = Token { tokenType :: !TokenT, tokenText :: !Text } + deriving (Show, Generic, NFData) + +-- | Virtual tokens, inserted by layout processing. +data TokenV = VCurlyL| VCurlyR | VSemi + deriving (Eq, Show, Generic, NFData) + +data TokenW = BlockComment | LineComment | Space | DocStr + deriving (Eq, Show, Generic, NFData) + +data TokenKW = KW_else + | KW_extern + | KW_fin + | KW_if + | KW_private + | KW_include + | KW_inf + | KW_lg2 + | KW_lengthFromThen + | KW_lengthFromThenTo + | KW_max + | KW_min + | KW_module + | KW_submodule + | KW_newtype + | KW_pragma + | KW_property + | KW_then + | KW_type + | KW_where + | KW_let + | KW_x + | KW_import + | KW_as + | KW_hiding + | KW_infixl + | KW_infixr + | KW_infix + | KW_primitive + | KW_parameter + | KW_constraint + | KW_Prop + deriving (Eq, Show, Generic, NFData) + +-- | The named operators are a special case for parsing types, and 'Other' is +-- used for all other cases that lexed as an operator. +data TokenOp = Plus | Minus | Mul | Div | Exp | Mod + | Equal | LEQ | GEQ + | Complement | Hash | At + | Other [Text] Text + deriving (Eq, Show, Generic, NFData) + +data TokenSym = Bar + | ArrL | ArrR | FatArrR + | Lambda + | EqDef + | Comma + | Semi + | Dot + | DotDot + | DotDotDot + | DotDotLt + | Colon + | BackTick + | ParenL | ParenR + | BracketL | BracketR + | CurlyL | CurlyR + | TriL | TriR + | Lt + | Underscore + deriving (Eq, Show, Generic, NFData) + +data TokenErr = UnterminatedComment + | UnterminatedString + | UnterminatedChar + | InvalidString + | InvalidChar + | LexicalError + | MalformedLiteral + | MalformedSelector + | InvalidIndentation TokenT -- expected closing paren + deriving (Eq, Show, Generic, NFData) + +data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int + deriving (Eq, Show, Generic, NFData) + +data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits + | Frac !Rational !Int -- ^ value, base. + | ChrLit !Char -- ^ character literal + | Ident ![Text] !Text -- ^ (qualified) identifier + | StrLit !String -- ^ string literal + | Selector !SelectorType -- ^ .hello or .123 + | KW !TokenKW -- ^ keyword + | Op !TokenOp -- ^ operator + | Sym !TokenSym -- ^ symbol + | Virt !TokenV -- ^ virtual token (for layout) + | White !TokenW -- ^ white space token + | Err !TokenErr -- ^ error token + | EOF + deriving (Eq, Show, Generic, NFData) + +instance PP Token where + ppPrec _ (Token _ s) = text (Text.unpack s) + +