megaparsec/Text/Parsec/Token.hs

-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Parsec.Token
-- Copyright   :  (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
-- License     :  BSD-style (see the LICENSE file)
-- 
-- Maintainer  :  derek.a.elkins@gmail.com
-- Stability   :  provisional
-- Portability :  non-portable (uses local universal quantification: PolymorphicComponents)
-- 
-- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
-- for a description of how to use it.
-- 
-----------------------------------------------------------------------------

{-# LANGUAGE PolymorphicComponents #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}

module Text.Parsec.Token
    ( LanguageDef
    , GenLanguageDef (..)
    , TokenParser
    , GenTokenParser (..)
    , makeTokenParser
    ) where

import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
import Data.List ( nub, sort )
import Control.Monad.Identity
import Text.Parsec.Prim
import Text.Parsec.Char
import Text.Parsec.Combinator

-----------------------------------------------------------
-- Language Definition
-----------------------------------------------------------

type LanguageDef st = GenLanguageDef String st Identity

-- | The @GenLanguageDef@ type is a record that contains all parameterizable
-- features of the 'Text.Parsec.Token' module. The module 'Text.Parsec.Language'
-- contains some default definitions.

data GenLanguageDef s u m
    = LanguageDef { 
    
    -- | Describes the start of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"\/*\". 

    commentStart   :: String,

    -- | Describes the end of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"*\/\". 

    commentEnd     :: String,

    -- | Describes the start of a line comment. Use the empty string if the
    -- language doesn't support line comments. For example \"\/\/\". 

    commentLine    :: String,

    -- | Set to 'True' if the language supports nested block comments. 

    nestedComments :: Bool,

    -- | This parser should accept any start characters of identifiers. For
    -- example @letter \<|> char \"_\"@. 

    identStart     :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of identifiers.
    -- For example @alphaNum \<|> char \"_\"@. 

    identLetter    :: ParsecT s u m Char,

    -- | This parser should accept any start characters of operators. For
    -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@ 

    opStart        :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of operators.
    -- Note that this parser should even be defined if the language doesn't
    -- support user-defined operators, or otherwise the 'reservedOp'
    -- parser won't work correctly. 

    opLetter       :: ParsecT s u m Char,

    -- | The list of reserved identifiers. 

    reservedNames  :: [String],

    -- | The list of reserved operators. 

    reservedOpNames:: [String],

    -- | Set to 'True' if the language is case sensitive. 

    caseSensitive  :: Bool

    }

-----------------------------------------------------------
-- A first class module: TokenParser
-----------------------------------------------------------

type TokenParser st = GenTokenParser String st Identity

-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.

data GenTokenParser s u m
    = TokenParser {

        -- | This lexeme parser parses a legal identifier. Returns the identifier
        -- string. This parser will fail on identifiers that are reserved
        -- words. Legal identifier (start) characters and reserved words are
        -- defined in the 'LanguageDef' that is passed to
        -- 'makeTokenParser'. An @identifier@ is treated as
        -- a single token using 'try'.

        identifier       :: ParsecT s u m String,
        
        -- | The lexeme parser @reserved name@ parses @symbol 
        -- name@, but it also checks that the @name@ is not a prefix of a
        -- valid identifier. A @reserved@ word is treated as a single token
        -- using 'try'. 

        reserved         :: String -> ParsecT s u m (),

        -- | This lexeme parser parses a legal operator. Returns the name of the
        -- operator. This parser will fail on any operators that are reserved
        -- operators. Legal operator (start) characters and reserved operators
        -- are defined in the 'LanguageDef' that is passed to
        -- 'makeTokenParser'. An @operator@ is treated as a
        -- single token using 'try'. 

        operator         :: ParsecT s u m String,

        -- |The lexeme parser @reservedOp name@ parses @symbol
        -- name@, but it also checks that the @name@ is not a prefix of a
        -- valid operator. A @reservedOp@ is treated as a single token using
        -- 'try'. 

        reservedOp       :: String -> ParsecT s u m (),


        -- | This lexeme parser parses a single literal character. Returns the
        -- literal character value. This parsers deals correctly with escape
        -- sequences. The literal character is parsed according to the grammar
        -- rules defined in the Haskell report (which matches most programming
        -- languages quite closely). 

        charLiteral      :: ParsecT s u m Char,

        -- | This lexeme parser parses a literal string. Returns the literal
        -- string value. This parsers deals correctly with escape sequences and
        -- gaps. The literal string is parsed according to the grammar rules
        -- defined in the Haskell report (which matches most programming
        -- languages quite closely). 

        stringLiteral    :: ParsecT s u m String,

        -- | This lexeme parser parses a natural number (a positive whole
        -- number). Returns the value of the number. The number can be
        -- specified in 'decimal', 'hexadecimal' or
        -- 'octal'. The number is parsed according to the grammar
        -- rules in the Haskell report. 

        natural          :: ParsecT s u m Integer,

        -- | This lexeme parser parses an integer (a whole number). This parser
        -- is like 'natural' except that it can be prefixed with
        -- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
        -- number can be specified in 'decimal', 'hexadecimal'
        -- or 'octal'. The number is parsed according
        -- to the grammar rules in the Haskell report. 
        
        integer          :: ParsecT s u m Integer,

        -- | This lexeme parser parses a floating point value. Returns the value
        -- of the number. The number is parsed according to the grammar rules
        -- defined in the Haskell report. 

        float            :: ParsecT s u m Double,

        -- | This lexeme parser parses either 'natural' or a 'float'.
        -- Returns the value of the number. This parsers deals with
        -- any overlap in the grammar rules for naturals and floats. The number
        -- is parsed according to the grammar rules defined in the Haskell report. 

        naturalOrFloat   :: ParsecT s u m (Either Integer Double),

        -- | Parses a positive whole number in the decimal system. Returns the
        -- value of the number. 

        decimal          :: ParsecT s u m Integer,

        -- | Parses a positive whole number in the hexadecimal system. The number
        -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
        -- number. 

        hexadecimal      :: ParsecT s u m Integer,

        -- | Parses a positive whole number in the octal system. The number
        -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
        -- number. 

        octal            :: ParsecT s u m Integer,

        -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
        -- trailing white space. 

        symbol           :: String -> ParsecT s u m String,

        -- | @lexeme p@ first applies parser @p@ and than the 'whiteSpace'
        -- parser, returning the value of @p@. Every lexical
        -- token (lexeme) is defined using @lexeme@, this way every parse
        -- starts at a point without white space. Parsers that use @lexeme@ are
        -- called /lexeme/ parsers in this document.
        -- 
        -- The only point where the 'whiteSpace' parser should be
        -- called explicitly is the start of the main parser in order to skip
        -- any leading white space.
        --
        -- >    mainParser  = do{ whiteSpace
        -- >                     ; ds <- many (lexeme digit)
        -- >                     ; eof
        -- >                     ; return (sum ds)
        -- >                     }

        lexeme           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Parses any white space. White space consists of /zero/ or more
        -- occurrences of a 'space', a line comment or a block (multi
        -- line) comment. Block comments may be nested. How comments are
        -- started and ended is defined in the 'LanguageDef'
        -- that is passed to 'makeTokenParser'. 

        whiteSpace       :: ParsecT s u m (),

        -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
        -- returning the value of @p@.

        parens           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
        -- \'}\'), returning the value of @p@. 

        braces           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
        -- and \'>\'), returning the value of @p@. 

        angles           :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
        -- and \']\'), returning the value of @p@. 

        brackets         :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | DEPRECATED: Use 'brackets'.

        squares          :: forall a. ParsecT s u m a -> ParsecT s u m a,

        -- | Lexeme parser |semi| parses the character \';\' and skips any
        -- trailing white space. Returns the string \";\". 

        semi             :: ParsecT s u m String,

        -- | Lexeme parser @comma@ parses the character \',\' and skips any
        -- trailing white space. Returns the string \",\". 

        comma            :: ParsecT s u m String,

        -- | Lexeme parser @colon@ parses the character \':\' and skips any
        -- trailing white space. Returns the string \":\". 

        colon            :: ParsecT s u m String,

        -- | Lexeme parser @dot@ parses the character \'.\' and skips any
        -- trailing white space. Returns the string \".\". 

        dot              :: ParsecT s u m String,

        -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
        -- separated by 'semi'. Returns a list of values returned by
        -- @p@.

        semiSep          :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
        -- separated by 'semi'. Returns a list of values returned by @p@. 

        semiSep1         :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
        -- @p@ separated by 'comma'. Returns a list of values returned
        -- by @p@. 

        commaSep         :: forall a . ParsecT s u m a -> ParsecT s u m [a],

        -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
        -- @p@ separated by 'comma'. Returns a list of values returned
        -- by @p@. 

        commaSep1        :: forall a . ParsecT s u m a -> ParsecT s u m [a]
    }

-----------------------------------------------------------
-- Given a LanguageDef, create a token parser.
-----------------------------------------------------------

-- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
-- record that contains lexical parsers that are
-- defined using the definitions in the @language@ record.
--
-- The use of this function is quite stylized - one imports the
-- appropiate language definition and selects the lexical parsers that
-- are needed from the resulting 'GenTokenParser'.
--
-- >  module Main where
-- >
-- >  import Text.Parsec
-- >  import qualified Text.Parsec.Token as P
-- >  import Text.Parsec.Language (haskellDef)
-- >
-- >  -- The parser
-- >  ...
-- >
-- >  expr  =   parens expr
-- >        <|> identifier
-- >        <|> ...
-- >       
-- >
-- >  -- The lexer
-- >  lexer       = P.makeTokenParser haskellDef    
-- >      
-- >  parens      = P.parens lexer
-- >  braces      = P.braces lexer
-- >  identifier  = P.identifier lexer
-- >  reserved    = P.reserved lexer
-- >  ...

makeTokenParser :: (Stream s m Char)
                => GenLanguageDef s u m -> GenTokenParser s u m
makeTokenParser languageDef
    = TokenParser{ identifier = identifier
                 , reserved = reserved
                 , operator = operator
                 , reservedOp = reservedOp

                 , charLiteral = charLiteral
                 , stringLiteral = stringLiteral
                 , natural = natural
                 , integer = integer
                 , float = float
                 , naturalOrFloat = naturalOrFloat
                 , decimal = decimal
                 , hexadecimal = hexadecimal
                 , octal = octal

                 , symbol = symbol
                 , lexeme = lexeme
                 , whiteSpace = whiteSpace

                 , parens = parens
                 , braces = braces
                 , angles = angles
                 , brackets = brackets
                 , squares = brackets
                 , semi = semi
                 , comma = comma
                 , colon = colon
                 , dot = dot
                 , semiSep = semiSep
                 , semiSep1 = semiSep1
                 , commaSep = commaSep
                 , commaSep1 = commaSep1
                 }
    where

    -----------------------------------------------------------
    -- Bracketing
    -----------------------------------------------------------
    parens p        = between (symbol "(") (symbol ")") p
    braces p        = between (symbol "{") (symbol "}") p
    angles p        = between (symbol "<") (symbol ">") p
    brackets p      = between (symbol "[") (symbol "]") p

    semi            = symbol ";"
    comma           = symbol ","
    dot             = symbol "."
    colon           = symbol ":"

    commaSep p      = sepBy p comma
    semiSep p       = sepBy p semi

    commaSep1 p     = sepBy1 p comma
    semiSep1 p      = sepBy1 p semi


    -----------------------------------------------------------
    -- Chars & Strings
    -----------------------------------------------------------
    charLiteral     = lexeme (between (char '\'')
                                      (char '\'' <?> "end of character")
                                      characterChar )
                    <?> "character"

    characterChar   = charLetter <|> charEscape
                    <?> "literal character"

    charEscape      = do{ char '\\'; escapeCode }
    charLetter      = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))


    stringLiteral   = lexeme (
                      do{ str <- between (char '"')
                                         (char '"' <?> "end of string")
                                         (many stringChar)
                        ; return (foldr (maybe id (:)) "" str)
                        }
                      <?> "literal string")

    stringChar      =   do{ c <- stringLetter; return (Just c) }
                    <|> stringEscape
                    <?> "string character"

    stringLetter    = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))

    stringEscape    = do{ char '\\'
                        ;     do{ escapeGap  ; return Nothing }
                          <|> do{ escapeEmpty; return Nothing }
                          <|> do{ esc <- escapeCode; return (Just esc) }
                        }

    escapeEmpty     = char '&'
    escapeGap       = do{ many1 space
                        ; char '\\' <?> "end of string gap"
                        }


    -- escape codes
    escapeCode      = charEsc <|> charNum <|> charAscii <|> charControl
                    <?> "escape code"

    charControl     = do{ char '^'
                        ; code <- upper
                        ; return (toEnum (fromEnum code - fromEnum 'A'))
                        }

    charNum         = do{ code <- decimal
                                  <|> do{ char 'o'; number 8 octDigit }
                                  <|> do{ char 'x'; number 16 hexDigit }
                        ; return (toEnum (fromInteger code))
                        }

    charEsc         = choice (map parseEsc escMap)
                    where
                      parseEsc (c,code)     = do{ char c; return code }

    charAscii       = choice (map parseAscii asciiMap)
                    where
                      parseAscii (asc,code) = try (do{ string asc; return code })


    -- escape code tables
    escMap          = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
    asciiMap        = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)

    ascii2codes     = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
                       "FS","GS","RS","US","SP"]
    ascii3codes     = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
                       "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
                       "CAN","SUB","ESC","DEL"]

    ascii2          = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
                       '\EM','\FS','\GS','\RS','\US','\SP']
    ascii3          = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
                       '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
                       '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']


    -----------------------------------------------------------
    -- Numbers
    -----------------------------------------------------------
    naturalOrFloat  = lexeme (natFloat) <?> "number"

    float           = lexeme floating   <?> "float"
    integer         = lexeme int        <?> "integer"
    natural         = lexeme nat        <?> "natural"


    -- floats
    floating        = do{ n <- decimal
                        ; fractExponent n
                        }


    natFloat        = do{ char '0'
                        ; zeroNumFloat
                        }
                      <|> decimalFloat

    zeroNumFloat    =  do{ n <- hexadecimal <|> octal
                         ; return (Left n)
                         }
                    <|> decimalFloat
                    <|> fractFloat 0
                    <|> return (Left 0)

    decimalFloat    = do{ n <- decimal
                        ; option (Left n)
                                 (fractFloat n)
                        }

    fractFloat n    = do{ f <- fractExponent n
                        ; return (Right f)
                        }

    fractExponent n = do{ fract <- fraction
                        ; expo  <- option 1.0 exponent'
                        ; return ((fromInteger n + fract)*expo)
                        }
                    <|>
                      do{ expo <- exponent'
                        ; return ((fromInteger n)*expo)
                        }

    fraction        = do{ char '.'
                        ; digits <- many1 digit <?> "fraction"
                        ; return (foldr op 0.0 digits)
                        }
                      <?> "fraction"
                    where
                      op d f    = (f + fromIntegral (digitToInt d))/10.0

    exponent'       = do{ oneOf "eE"
                        ; f <- sign
                        ; e <- decimal <?> "exponent"
                        ; return (power (f e))
                        }
                      <?> "exponent"
                    where
                       power e  | e < 0      = 1.0/power(-e)
                                | otherwise  = fromInteger (10^e)


    -- integers and naturals
    int             = do{ f <- lexeme sign
                        ; n <- nat
                        ; return (f n)
                        }

    sign            =   (char '-' >> return negate)
                    <|> (char '+' >> return id)
                    <|> return id

    nat             = zeroNumber <|> decimal

    zeroNumber      = do{ char '0'
                        ; hexadecimal <|> octal <|> decimal <|> return 0
                        }
                      <?> ""

    decimal         = number 10 digit
    hexadecimal     = do{ oneOf "xX"; number 16 hexDigit }
    octal           = do{ oneOf "oO"; number 8 octDigit  }

    number base baseDigit
        = do{ digits <- many1 baseDigit
            ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
            ; seq n (return n)
            }

    -----------------------------------------------------------
    -- Operators & reserved ops
    -----------------------------------------------------------
    reservedOp name =
        lexeme $ try $
        do{ string name
          ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
          }

    operator =
        lexeme $ try $
        do{ name <- oper
          ; if (isReservedOp name)
             then unexpected ("reserved operator " ++ show name)
             else return name
          }

    oper =
        do{ c <- (opStart languageDef)
          ; cs <- many (opLetter languageDef)
          ; return (c:cs)
          }
        <?> "operator"

    isReservedOp name =
        isReserved (sort (reservedOpNames languageDef)) name


    -----------------------------------------------------------
    -- Identifiers & Reserved words
    -----------------------------------------------------------
    reserved name =
        lexeme $ try $
        do{ caseString name
          ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
          }

    caseString name
        | caseSensitive languageDef  = string name
        | otherwise               = do{ walk name; return name }
        where
          walk []     = return ()
          walk (c:cs) = do{ caseChar c <?> msg; walk cs }

          caseChar c  | isAlpha c  = char (toLower c) <|> char (toUpper c)
                      | otherwise  = char c

          msg         = show name


    identifier =
        lexeme $ try $
        do{ name <- ident
          ; if (isReservedName name)
             then unexpected ("reserved word " ++ show name)
             else return name
          }


    ident
        = do{ c <- identStart languageDef
            ; cs <- many (identLetter languageDef)
            ; return (c:cs)
            }
        <?> "identifier"

    isReservedName name
        = isReserved theReservedNames caseName
        where
          caseName      | caseSensitive languageDef  = name
                        | otherwise               = map toLower name


    isReserved names name
        = scan names
        where
          scan []       = False
          scan (r:rs)   = case (compare r name) of
                            LT  -> scan rs
                            EQ  -> True
                            GT  -> False

    theReservedNames
        | caseSensitive languageDef  = sortedNames
        | otherwise               = map (map toLower) sortedNames
        where
          sortedNames   = sort (reservedNames languageDef)


    -----------------------------------------------------------
    -- White space & symbols
    -----------------------------------------------------------
    symbol name
        = lexeme (string name)

    lexeme p
        = do{ x <- p; whiteSpace; return x  }


    --whiteSpace
    whiteSpace
        | noLine && noMulti  = skipMany (simpleSpace <?> "")
        | noLine             = skipMany (simpleSpace <|> multiLineComment <?> "")
        | noMulti            = skipMany (simpleSpace <|> oneLineComment <?> "")
        | otherwise          = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
        where
          noLine  = null (commentLine languageDef)
          noMulti = null (commentStart languageDef)


    simpleSpace =
        skipMany1 (satisfy isSpace)

    oneLineComment =
        do{ try (string (commentLine languageDef))
          ; skipMany (satisfy (/= '\n'))
          ; return ()
          }

    multiLineComment =
        do { try (string (commentStart languageDef))
           ; inComment
           }

    inComment
        | nestedComments languageDef  = inCommentMulti
        | otherwise                = inCommentSingle

    inCommentMulti
        =   do{ try (string (commentEnd languageDef)) ; return () }
        <|> do{ multiLineComment                     ; inCommentMulti }
        <|> do{ skipMany1 (noneOf startEnd)          ; inCommentMulti }
        <|> do{ oneOf startEnd                       ; inCommentMulti }
        <?> "end of comment"
        where
          startEnd   = nub (commentEnd languageDef ++ commentStart languageDef)

    inCommentSingle
        =   do{ try (string (commentEnd languageDef)); return () }
        <|> do{ skipMany1 (noneOf startEnd)         ; inCommentSingle }
        <|> do{ oneOf startEnd                      ; inCommentSingle }
        <?> "end of comment"
        where
          startEnd   = nub (commentEnd languageDef ++ commentStart languageDef)