Split off tokens and new layout in a separate modules

2024-08-17 01:41:09 +03:00 · 2021-04-29 11:17:34 -07:00 · 2021-04-29 11:17:34 -07:00 · 7727197800
commit 7727197800
parent 76429aaffa
7 changed files with 349 additions and 114 deletions
--- a/cryptol.cabal
+++ b/cryptol.cabal
@ -81,6 +81,8 @@ library

  Exposed-modules:     Cryptol.Parser,
                       Cryptol.Parser.Lexer,
+                       Cryptol.Parser.Token,
+                       Cryptol.Parser.Layout,
                       Cryptol.Parser.AST,
                       Cryptol.Parser.Position,
                       Cryptol.Parser.Names,
--- a/src/Cryptol/Parser.y
+++ b/src/Cryptol/Parser.y
@ -35,6 +35,7 @@ import           Control.Monad(liftM2,msum)
 import Cryptol.Parser.AST
 import Cryptol.Parser.Position
 import Cryptol.Parser.LexerUtils hiding (mkIdent)
+import Cryptol.Parser.Token
 import Cryptol.Parser.ParserUtils
 import Cryptol.Parser.Unlit(PreProc(..), guessPreProc)
 import Cryptol.Utils.Ident(paramInstModName)
--- a/src/Cryptol/Parser/Layout.hs
+++ b/src/Cryptol/Parser/Layout.hs
@ -0,0 +1,216 @@
+{-# Language BlockArguments #-}
+{-# Language OverloadedStrings #-}
+module Cryptol.Parser.Layout where
+
+import Cryptol.Parser.Position
+import Cryptol.Parser.Token
+
+{-
+
+We assume the existence of an explicit EOF token at the end of the input.  This token is *less* indented
+than all other tokens (i.e., it is at column 0)
+
+Explicit Layout Blocks
+
+  * The symbols `(`, `{`, and `[` start an explicit layout block.
+  * While in an explicit layout block we pass through tokens, except:
+      - We may start new implicit or explicit layout blocks
+      - We terminate the current layout block if we encounter the matching
+        closing symbol `)`, `}`, `]`
+
+Implicit Layout Blocks
+
+  * The keywords `where`, `private`, and `parameter` start an implicit
+    layout block.
+  * The layout block starts at the column of the *following* token and we
+    insert "virtual start block" between the current and the following tokens.
+  * While in an implicit layout block:
+    - We may start new implicit or explicit layout blocks
+    - We insert a "virtual separator" before tokens starting at the same
+      column as the layout block, EXCEPT:
+        * we do not insert a separator if the previous token was a
+          "documentation comment"
+        * we do not insert a separator before the first token in the block
+
+    - The implicit layout block is ended by one of the tokens
+          `,`, `)`, `}`, `]`, or a token than is less indented that the
+          block's column.
+    - When an implicit layout block ends, we insert a "virtual end block"
+      token just before the token that caused the block to end.
+
+Examples:
+
+f = x where x = 0x1         -- end implicit layout by layout
+g = 0x3                     -- (`g` is less indented than `x`)
+
+f (x where x = 2)           -- end implicit layout by `)`
+
+[ x where x = 2, 3 ]        -- end implicit layout by `,`
+
+module A where              -- two implicit layout blocks with the
+private                     -- *same* indentation (`where` and `private`)
+x = 0x2
+-}
+
+
+layout :: Bool -> [Located Token] -> [Located Token]
+layout isMod ts0
+
+  -- Star an implicit layout block at the top of the module
+  | let t         = head ts0
+        blockCol  = col (from (srcRange t))
+  , isMod && tokenType (thing t) /= KW KW_module =
+    go [ Virtual blockCol ] blockCol True ts0
+
+  | otherwise =
+    go [] 0 False ts0
+
+  where
+
+  {- State parameters for `go`:
+
+       stack:
+          The stack of implicit and explicit blocks
+
+       lastVirt:
+          The indentation of the outer most implicit block, or 0 if none.
+          This can be computed from the stack but we cache
+          it here as we need to check it on each token.
+
+       noVirtSep:
+          Do not emit a virtual separator even if token matches block alignment.
+          This is enabled at the beginning of a block, or after a doc string,
+          or if we just emitted a separtor, but have not yet consumed the
+          next token.
+
+       tokens:
+          remaining tokens to process
+  -}
+
+  go stack lastVirt noVirtSep tokens
+
+    -- End implicit layout due to indentation.  If the outermost block
+    -- is a lyout block we just end it.   If the outermost block is an
+    -- explicit layout block we report a lexical error.
+    | col curLoc < lastVirt =
+      endImplictBlock
+
+    -- End implicit layout block due to a symbol
+    | Virtual {} <- curBlock, endsLayout curTokTy =
+      endImplictBlock
+
+    -- Insert a virtual separator
+    | Virtual {} <- curBlock
+    , col curLoc == lastVirt && not noVirtSep =
+      virt curRange VSemi : go stack lastVirt True tokens
+
+    -- Start a new implicit layout. Advances token position.
+    | startsLayout curTokTy = startImplicitBlock
+
+    -- Start a paren block.  Advances token position
+    | Just close <- startsParenBlock curTokTy =
+      curTok : go (Explicit close : stack) lastVirt False advanceTokens
+
+    -- End a paren block. Advances token position
+    | Explicit close <- curBlock, close == curTokTy =
+      curTok : go popStack lastVirt False advanceTokens
+
+    -- Disable virtual separator after doc string. Advances token position
+    | White DocStr <- curTokTy =
+      curTok : go stack lastVirt True advanceTokens
+
+    -- Check to see if we are done.  Note that if we got here, implicit layout
+    -- blocks should have already been closed, as `EOF` is less indented than
+    -- all other tokens
+    | EOF <- curTokTy =
+      [curTok]
+
+    -- Any other token, just emit.  Advances token position
+    | otherwise =
+      curTok : go stack lastVirt False advanceTokens
+
+    where
+    curTok : advanceTokens = tokens
+    curTokTy               = tokenType (thing curTok)
+    curRange               = srcRange curTok
+    curLoc                 = from curRange
+
+    curBlock : popStack = stack
+
+
+    startImplicitBlock =
+      let nextRng  = srcRange (head advanceTokens)
+          nextLoc  = from nextRng
+          blockCol = col nextLoc
+      in curTok
+       : virt nextRng VCurlyL
+       : go (Virtual blockCol : stack) blockCol True advanceTokens
+
+
+    endImplictBlock =
+      case curBlock of
+        Virtual {} -> go popStack newVirt False tokens
+          where newVirt = case [ n | Virtual n <- popStack ] of
+                            n : _ -> n
+                            _     -> 0
+
+        Explicit c -> errTok curRange (InvalidIndentation c) : advanceTokens
+
+
+--------------------------------------------------------------------------------
+
+data Block =
+    Virtual Int     -- ^ Virtual layout block
+  | Explicit TokenT -- ^ An explicit layout block, expecting this ending token.
+    deriving (Show)
+
+-- | These tokens start an implicit layout block
+startsLayout :: TokenT -> Bool
+startsLayout ty =
+  case ty of
+    KW KW_where       -> True
+    KW KW_private     -> True
+    KW KW_parameter   -> True
+    _                 -> False
+
+-- | These tokens end an implicit layout block
+endsLayout :: TokenT -> Bool
+endsLayout ty =
+  case ty of
+    Sym Comma    -> True
+    Sym BracketR -> True
+    Sym ParenR   -> True
+    Sym CurlyR   -> True
+    _            -> False
+
+-- | These tokens start an explicit "paren" layout block.
+-- If so, the result contains the corresponding closing paren.
+startsParenBlock :: TokenT -> Maybe TokenT
+startsParenBlock ty =
+  case ty of
+    Sym BracketL -> Just (Sym BracketR)
+    Sym ParenL   -> Just (Sym ParenR)
+    Sym CurlyL   -> Just (Sym CurlyR)
+    _            -> Nothing
+
+
+--------------------------------------------------------------------------------
+
+-- | Make a virtual token of the given type
+virt :: Range -> TokenV -> Located Token
+virt rng x = Located { srcRange = rng { to = from rng }, thing = t }
+  where
+  t = Token (Virt x)
+      case x of
+        VCurlyL -> "beginning of layout block"
+        VCurlyR -> "end of layout block"
+        VSemi   -> "layout block separator"
+
+errTok :: Range -> TokenErr -> Located Token
+errTok rng x = Located { srcRange = rng { to = from rng }, thing = t }
+  where
+  t = Token { tokenType = Err x, tokenText = "" }
+
+
+
+
--- a/src/Cryptol/Parser/Lexer.x
+++ b/src/Cryptol/Parser/Lexer.x
@ -21,6 +21,7 @@ module Cryptol.Parser.Lexer
  ) where

 import Cryptol.Parser.Position
+import Cryptol.Parser.Token
 import Cryptol.Parser.LexerUtils
 import Cryptol.Parser.Unlit(unLit)
 import Data.Text (Text)
--- a/src/Cryptol/Parser/LexerUtils.hs
+++ b/src/Cryptol/Parser/LexerUtils.hs
@ -6,18 +6,10 @@
 -- Stability   :  provisional
 -- Portability :  portable

-{-# LANGUAGE DeriveAnyClass #-}
-{-# LANGUAGE DeriveGeneric #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE PatternGuards #-}
-{-# LANGUAGE BlockArguments #-}
 module Cryptol.Parser.LexerUtils where

-import Cryptol.Parser.Position
-import Cryptol.Parser.Unlit(PreProc(None))
-import Cryptol.Utils.PP
-import Cryptol.Utils.Panic
-
 import           Control.Monad(guard)
 import           Data.Char(toLower,generalCategory,isAscii,ord,isSpace,
                                                            isAlphaNum,isAlpha)
@ -27,8 +19,12 @@ import qualified Data.Text as T
 import qualified Data.Text.Read as T
 import           Data.Word(Word8)

-import GHC.Generics (Generic)
-import Control.DeepSeq
+import Cryptol.Utils.Panic
+import Cryptol.Parser.Position
+import Cryptol.Parser.Token
+import Cryptol.Parser.Unlit(PreProc(None))
+
+

 data Config = Config
  { cfgSource      :: !FilePath     -- ^ File that we are working on
@ -464,109 +460,6 @@ virt cfg pos x = Located { srcRange = Range

 --------------------------------------------------------------------------------

-data Token    = Token { tokenType :: !TokenT, tokenText :: !Text }
-                deriving (Show, Generic, NFData)
-
-- | Virtual tokens, inserted by layout processing.
-data TokenV   = VCurlyL| VCurlyR | VSemi
-                deriving (Eq, Show, Generic, NFData)
-
-data TokenW   = BlockComment | LineComment | Space | DocStr
-                deriving (Eq, Show, Generic, NFData)
-
-data TokenKW  = KW_else
-              | KW_extern
-              | KW_fin
-              | KW_if
-              | KW_private
-              | KW_include
-              | KW_inf
-              | KW_lg2
-              | KW_lengthFromThen
-              | KW_lengthFromThenTo
-              | KW_max
-              | KW_min
-              | KW_module
-              | KW_submodule
-              | KW_newtype
-              | KW_pragma
-              | KW_property
-              | KW_then
-              | KW_type
-              | KW_where
-              | KW_let
-              | KW_x
-              | KW_import
-              | KW_as
-              | KW_hiding
-              | KW_infixl
-              | KW_infixr
-              | KW_infix
-              | KW_primitive
-              | KW_parameter
-              | KW_constraint
-              | KW_Prop
-                deriving (Eq, Show, Generic, NFData)
-
-- | The named operators are a special case for parsing types, and 'Other' is
-- used for all other cases that lexed as an operator.
-data TokenOp  = Plus | Minus | Mul | Div | Exp | Mod
-              | Equal | LEQ | GEQ
-              | Complement | Hash | At
-              | Other [T.Text] T.Text
-                deriving (Eq, Show, Generic, NFData)
-
-data TokenSym = Bar
-              | ArrL | ArrR | FatArrR
-              | Lambda
-              | EqDef
-              | Comma
-              | Semi
-              | Dot
-              | DotDot
-              | DotDotDot
-              | DotDotLt
-              | Colon
-              | BackTick
-              | ParenL   | ParenR
-              | BracketL | BracketR
-              | CurlyL   | CurlyR
-              | TriL     | TriR
-              | Lt
-              | Underscore
-                deriving (Eq, Show, Generic, NFData)
-
-data TokenErr = UnterminatedComment
-              | UnterminatedString
-              | UnterminatedChar
-              | InvalidString
-              | InvalidChar
-              | LexicalError
-              | MalformedLiteral
-              | MalformedSelector
-                deriving (Eq, Show, Generic, NFData)
-
-data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int
-                deriving (Eq, Show, Generic, NFData)
-
-data TokenT   = Num !Integer !Int !Int   -- ^ value, base, number of digits
-              | Frac !Rational !Int      -- ^ value, base.
-              | ChrLit  !Char         -- ^ character literal
-              | Ident ![T.Text] !T.Text -- ^ (qualified) identifier
-              | StrLit !String         -- ^ string literal
-              | Selector !SelectorType  -- ^ .hello or .123
-              | KW    !TokenKW         -- ^ keyword
-              | Op    !TokenOp         -- ^ operator
-              | Sym   !TokenSym        -- ^ symbol
-              | Virt  !TokenV          -- ^ virtual token (for layout)
-              | White !TokenW          -- ^ white space token
-              | Err   !TokenErr        -- ^ error token
-              | EOF
-                deriving (Eq, Show, Generic, NFData)
-
-instance PP Token where
-  ppPrec _ (Token _ s) = text (T.unpack s)
-
 -- | Collapse characters into a single Word8, identifying ASCII, and classes of
 -- unicode.  This came from:
 --
--- a/src/Cryptol/Parser/ParserUtils.hs
+++ b/src/Cryptol/Parser/ParserUtils.hs
@ -35,7 +35,7 @@ import Prelude.Compat

 import Cryptol.Parser.AST
 import Cryptol.Parser.Lexer
-import Cryptol.Parser.LexerUtils(SelectorType(..))
+import Cryptol.Parser.Token(SelectorType(..))
 import Cryptol.Parser.Position
 import Cryptol.Parser.Utils (translateExprToNumT,widthIdent)
 import Cryptol.Utils.Ident(packModName,packIdent,modNameChunks)
@ -81,6 +81,12 @@ lexerP k = P $ \cfg p s ->
                                    T.unpack (tokenText it)
           MalformedSelector   -> "malformed selector: " ++
                                    T.unpack (tokenText it)
+           InvalidIndentation c -> "invalid indentation, expected " ++
+              case c of
+                Sym CurlyR    -> "}"
+                Sym ParenR    -> ")"
+                Sym BracketR  -> "]"
+                _             -> show c -- basically panic
        ]
      where it = thing t

--- a/src/Cryptol/Parser/Token.hs
+++ b/src/Cryptol/Parser/Token.hs
@ -0,0 +1,116 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveGeneric #-}
+module Cryptol.Parser.Token where
+
+import Data.Text(Text)
+import qualified Data.Text as Text
+import Control.DeepSeq
+import GHC.Generics
+
+import Cryptol.Utils.PP
+
+data Token    = Token { tokenType :: !TokenT, tokenText :: !Text }
+                deriving (Show, Generic, NFData)
+
+-- | Virtual tokens, inserted by layout processing.
+data TokenV   = VCurlyL| VCurlyR | VSemi
+                deriving (Eq, Show, Generic, NFData)
+
+data TokenW   = BlockComment | LineComment | Space | DocStr
+                deriving (Eq, Show, Generic, NFData)
+
+data TokenKW  = KW_else
+              | KW_extern
+              | KW_fin
+              | KW_if
+              | KW_private
+              | KW_include
+              | KW_inf
+              | KW_lg2
+              | KW_lengthFromThen
+              | KW_lengthFromThenTo
+              | KW_max
+              | KW_min
+              | KW_module
+              | KW_submodule
+              | KW_newtype
+              | KW_pragma
+              | KW_property
+              | KW_then
+              | KW_type
+              | KW_where
+              | KW_let
+              | KW_x
+              | KW_import
+              | KW_as
+              | KW_hiding
+              | KW_infixl
+              | KW_infixr
+              | KW_infix
+              | KW_primitive
+              | KW_parameter
+              | KW_constraint
+              | KW_Prop
+                deriving (Eq, Show, Generic, NFData)
+
+-- | The named operators are a special case for parsing types, and 'Other' is
+-- used for all other cases that lexed as an operator.
+data TokenOp  = Plus | Minus | Mul | Div | Exp | Mod
+              | Equal | LEQ | GEQ
+              | Complement | Hash | At
+              | Other [Text] Text
+                deriving (Eq, Show, Generic, NFData)
+
+data TokenSym = Bar
+              | ArrL | ArrR | FatArrR
+              | Lambda
+              | EqDef
+              | Comma
+              | Semi
+              | Dot
+              | DotDot
+              | DotDotDot
+              | DotDotLt
+              | Colon
+              | BackTick
+              | ParenL   | ParenR
+              | BracketL | BracketR
+              | CurlyL   | CurlyR
+              | TriL     | TriR
+              | Lt
+              | Underscore
+                deriving (Eq, Show, Generic, NFData)
+
+data TokenErr = UnterminatedComment
+              | UnterminatedString
+              | UnterminatedChar
+              | InvalidString
+              | InvalidChar
+              | LexicalError
+              | MalformedLiteral
+              | MalformedSelector
+              | InvalidIndentation TokenT -- expected closing paren
+                deriving (Eq, Show, Generic, NFData)
+
+data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int
+                deriving (Eq, Show, Generic, NFData)
+
+data TokenT   = Num !Integer !Int !Int    -- ^ value, base, number of digits
+              | Frac !Rational !Int       -- ^ value, base.
+              | ChrLit  !Char             -- ^ character literal
+              | Ident ![Text] !Text       -- ^ (qualified) identifier
+              | StrLit !String            -- ^ string literal
+              | Selector !SelectorType    -- ^ .hello or .123
+              | KW    !TokenKW            -- ^ keyword
+              | Op    !TokenOp            -- ^ operator
+              | Sym   !TokenSym           -- ^ symbol
+              | Virt  !TokenV             -- ^ virtual token (for layout)
+              | White !TokenW             -- ^ white space token
+              | Err   !TokenErr           -- ^ error token
+              | EOF
+                deriving (Eq, Show, Generic, NFData)
+
+instance PP Token where
+  ppPrec _ (Token _ s) = text (Text.unpack s)
+
+