megaparsec/Text/MegaParsec/Token.hs
2015-07-29 14:44:58 +06:00

720 lines
25 KiB
Haskell
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- |
-- Module : Text.MegaParsec.Token
-- Copyright : © 19992001 Daan Leijen, © 2007 Paolo Martini, © 2015 MegaParsec contributors
-- License : BSD3
--
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
-- Stability : experimental
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
-- for a description of how to use it.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
module Text.MegaParsec.Token
( LanguageDef
, GenLanguageDef (..)
, TokenParser
, GenTokenParser (..)
, makeTokenParser )
where
import Data.Char (isAlpha, toLower, toUpper, isSpace, digitToInt)
import Data.List (nub, sort)
import Control.Monad.Identity
import Text.MegaParsec.Prim
import Text.MegaParsec.Char
import Text.MegaParsec.Combinator
-----------------------------------------------------------
-- Language Definition
-----------------------------------------------------------
type LanguageDef st = GenLanguageDef String st Identity
-- | The @GenLanguageDef@ type is a record that contains all parameterizable
-- features of the "Text.Parsec.Token" module. The module "Text.Parsec.Language"
-- contains some default definitions.
data GenLanguageDef s u m
= LanguageDef {
-- | Describes the start of a block comment. Use the empty string if the
-- language doesn't support block comments. For example \"\/*\".
commentStart :: String,
-- | Describes the end of a block comment. Use the empty string if the
-- language doesn't support block comments. For example \"*\/\".
commentEnd :: String,
-- | Describes the start of a line comment. Use the empty string if the
-- language doesn't support line comments. For example \"\/\/\".
commentLine :: String,
-- | Set to 'True' if the language supports nested block comments.
nestedComments :: Bool,
-- | This parser should accept any start characters of identifiers. For
-- example @letter \<|> char \'_\'@.
identStart :: ParsecT s u m Char,
-- | This parser should accept any legal tail characters of identifiers.
-- For example @alphaNum \<|> char \'_\'@.
identLetter :: ParsecT s u m Char,
-- | This parser should accept any start characters of operators. For
-- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
opStart :: ParsecT s u m Char,
-- | This parser should accept any legal tail characters of operators.
-- Note that this parser should even be defined if the language doesn't
-- support user-defined operators, or otherwise the 'reservedOp'
-- parser won't work correctly.
opLetter :: ParsecT s u m Char,
-- | The list of reserved identifiers.
reservedNames :: [String],
-- | The list of reserved operators.
reservedOpNames:: [String],
-- | Set to 'True' if the language is case sensitive.
caseSensitive :: Bool
}
-----------------------------------------------------------
-- A first class module: TokenParser
-----------------------------------------------------------
type TokenParser st = GenTokenParser String st Identity
-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.
data GenTokenParser s u m
= TokenParser {
-- | This lexeme parser parses a legal identifier. Returns the identifier
-- string. This parser will fail on identifiers that are reserved
-- words. Legal identifier (start) characters and reserved words are
-- defined in the 'LanguageDef' that is passed to
-- 'makeTokenParser'. An @identifier@ is treated as
-- a single token using 'try'.
identifier :: ParsecT s u m String,
-- | The lexeme parser @reserved name@ parses @symbol
-- name@, but it also checks that the @name@ is not a prefix of a
-- valid identifier. A @reserved@ word is treated as a single token
-- using 'try'.
reserved :: String -> ParsecT s u m (),
-- | This lexeme parser parses a legal operator. Returns the name of the
-- operator. This parser will fail on any operators that are reserved
-- operators. Legal operator (start) characters and reserved operators
-- are defined in the 'LanguageDef' that is passed to
-- 'makeTokenParser'. An @operator@ is treated as a
-- single token using 'try'.
operator :: ParsecT s u m String,
-- |The lexeme parser @reservedOp name@ parses @symbol
-- name@, but it also checks that the @name@ is not a prefix of a
-- valid operator. A @reservedOp@ is treated as a single token using
-- 'try'.
reservedOp :: String -> ParsecT s u m (),
-- | This lexeme parser parses a single literal character. Returns the
-- literal character value. This parsers deals correctly with escape
-- sequences. The literal character is parsed according to the grammar
-- rules defined in the Haskell report (which matches most programming
-- languages quite closely).
charLiteral :: ParsecT s u m Char,
-- | This lexeme parser parses a literal string. Returns the literal
-- string value. This parsers deals correctly with escape sequences and
-- gaps. The literal string is parsed according to the grammar rules
-- defined in the Haskell report (which matches most programming
-- languages quite closely).
stringLiteral :: ParsecT s u m String,
-- | This lexeme parser parses a natural number (a positive whole
-- number). Returns the value of the number. The number can be
-- specified in 'decimal', 'hexadecimal' or
-- 'octal'. The number is parsed according to the grammar
-- rules in the Haskell report.
natural :: ParsecT s u m Integer,
-- | This lexeme parser parses an integer (a whole number). This parser
-- is like 'natural' except that it can be prefixed with
-- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
-- number can be specified in 'decimal', 'hexadecimal'
-- or 'octal'. The number is parsed according
-- to the grammar rules in the Haskell report.
integer :: ParsecT s u m Integer,
-- | This lexeme parser parses a floating point value. Returns the value
-- of the number. The number is parsed according to the grammar rules
-- defined in the Haskell report.
float :: ParsecT s u m Double,
-- | This lexeme parser parses either 'natural' or a 'float'.
-- Returns the value of the number. This parsers deals with
-- any overlap in the grammar rules for naturals and floats. The number
-- is parsed according to the grammar rules defined in the Haskell report.
naturalOrFloat :: ParsecT s u m (Either Integer Double),
-- | Parses a positive whole number in the decimal system. Returns the
-- value of the number.
decimal :: ParsecT s u m Integer,
-- | Parses a positive whole number in the hexadecimal system. The number
-- should be prefixed with \"0x\" or \"0X\". Returns the value of the
-- number.
hexadecimal :: ParsecT s u m Integer,
-- | Parses a positive whole number in the octal system. The number
-- should be prefixed with \"0o\" or \"0O\". Returns the value of the
-- number.
octal :: ParsecT s u m Integer,
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
-- trailing white space.
symbol :: String -> ParsecT s u m String,
-- | @lexeme p@ first applies parser @p@ and than the 'whiteSpace'
-- parser, returning the value of @p@. Every lexical
-- token (lexeme) is defined using @lexeme@, this way every parse
-- starts at a point without white space. Parsers that use @lexeme@ are
-- called /lexeme/ parsers in this document.
--
-- The only point where the 'whiteSpace' parser should be
-- called explicitly is the start of the main parser in order to skip
-- any leading white space.
--
-- > mainParser = do{ whiteSpace
-- > ; ds <- many (lexeme digit)
-- > ; eof
-- > ; return (sum ds)
-- > }
lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | Parses any white space. White space consists of /zero/ or more
-- occurrences of a 'space', a line comment or a block (multi
-- line) comment. Block comments may be nested. How comments are
-- started and ended is defined in the 'LanguageDef'
-- that is passed to 'makeTokenParser'.
whiteSpace :: ParsecT s u m (),
-- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
-- returning the value of @p@.
parens :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
-- \'}\'), returning the value of @p@.
braces :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
-- and \'>\'), returning the value of @p@.
angles :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
-- and \']\'), returning the value of @p@.
brackets :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | DEPRECATED: Use 'brackets'.
squares :: forall a. ParsecT s u m a -> ParsecT s u m a,
-- | Lexeme parser |semi| parses the character \';\' and skips any
-- trailing white space. Returns the string \";\".
semi :: ParsecT s u m String,
-- | Lexeme parser @comma@ parses the character \',\' and skips any
-- trailing white space. Returns the string \",\".
comma :: ParsecT s u m String,
-- | Lexeme parser @colon@ parses the character \':\' and skips any
-- trailing white space. Returns the string \":\".
colon :: ParsecT s u m String,
-- | Lexeme parser @dot@ parses the character \'.\' and skips any
-- trailing white space. Returns the string \".\".
dot :: ParsecT s u m String,
-- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
-- separated by 'semi'. Returns a list of values returned by
-- @p@.
semiSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
-- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
-- separated by 'semi'. Returns a list of values returned by @p@.
semiSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a],
-- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
-- @p@ separated by 'comma'. Returns a list of values returned
-- by @p@.
commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
-- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
-- @p@ separated by 'comma'. Returns a list of values returned
-- by @p@.
commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
}
-----------------------------------------------------------
-- Given a LanguageDef, create a token parser.
-----------------------------------------------------------
-- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
-- record that contains lexical parsers that are
-- defined using the definitions in the @language@ record.
--
-- The use of this function is quite stylized - one imports the
-- appropiate language definition and selects the lexical parsers that
-- are needed from the resulting 'GenTokenParser'.
--
-- > module Main where
-- >
-- > import Text.Parsec
-- > import qualified Text.Parsec.Token as P
-- > import Text.Parsec.Language (haskellDef)
-- >
-- > -- The parser
-- > ...
-- >
-- > expr = parens expr
-- > <|> identifier
-- > <|> ...
-- >
-- >
-- > -- The lexer
-- > lexer = P.makeTokenParser haskellDef
-- >
-- > parens = P.parens lexer
-- > braces = P.braces lexer
-- > identifier = P.identifier lexer
-- > reserved = P.reserved lexer
-- > ...
makeTokenParser :: (Stream s m Char)
=> GenLanguageDef s u m -> GenTokenParser s u m
makeTokenParser languageDef
= TokenParser{ identifier = identifier
, reserved = reserved
, operator = operator
, reservedOp = reservedOp
, charLiteral = charLiteral
, stringLiteral = stringLiteral
, natural = natural
, integer = integer
, float = float
, naturalOrFloat = naturalOrFloat
, decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, symbol = symbol
, lexeme = lexeme
, whiteSpace = whiteSpace
, parens = parens
, braces = braces
, angles = angles
, brackets = brackets
, squares = brackets
, semi = semi
, comma = comma
, colon = colon
, dot = dot
, semiSep = semiSep
, semiSep1 = semiSep1
, commaSep = commaSep
, commaSep1 = commaSep1
}
where
-----------------------------------------------------------
-- Bracketing
-----------------------------------------------------------
parens p = between (symbol "(") (symbol ")") p
braces p = between (symbol "{") (symbol "}") p
angles p = between (symbol "<") (symbol ">") p
brackets p = between (symbol "[") (symbol "]") p
semi = symbol ";"
comma = symbol ","
dot = symbol "."
colon = symbol ":"
commaSep p = sepBy p comma
semiSep p = sepBy p semi
commaSep1 p = sepBy1 p comma
semiSep1 p = sepBy1 p semi
-----------------------------------------------------------
-- Chars & Strings
-----------------------------------------------------------
charLiteral = lexeme (between (char '\'')
(char '\'' <?> "end of character")
characterChar )
<?> "character"
characterChar = charLetter <|> charEscape
<?> "literal character"
charEscape = do{ char '\\'; escapeCode }
charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
stringLiteral = lexeme (
do{ str <- between (char '"')
(char '"' <?> "end of string")
(many stringChar)
; return (foldr (maybe id (:)) "" str)
}
<?> "literal string")
stringChar = do{ c <- stringLetter; return (Just c) }
<|> stringEscape
<?> "string character"
stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
stringEscape = do{ char '\\'
; do{ escapeGap ; return Nothing }
<|> do{ escapeEmpty; return Nothing }
<|> do{ esc <- escapeCode; return (Just esc) }
}
escapeEmpty = char '&'
escapeGap = do{ many1 space
; char '\\' <?> "end of string gap"
}
-- escape codes
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
<?> "escape code"
charControl = do{ char '^'
; code <- upper
; return (toEnum (fromEnum code - fromEnum 'A' + 1))
}
charNum = do{ code <- decimal
<|> do{ char 'o'; number 8 octDigit }
<|> do{ char 'x'; number 16 hexDigit }
; return (toEnum (fromInteger code))
}
charEsc = choice (map parseEsc escMap)
where
parseEsc (c,code) = do{ char c; return code }
charAscii = choice (map parseAscii asciiMap)
where
parseAscii (asc,code) = try (do{ string asc; return code })
-- escape code tables
escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
"FS","GS","RS","US","SP"]
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
"CAN","SUB","ESC","DEL"]
ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
'\EM','\FS','\GS','\RS','\US','\SP']
ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
'\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
'\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']
-----------------------------------------------------------
-- Numbers
-----------------------------------------------------------
naturalOrFloat = lexeme (natFloat) <?> "number"
float = lexeme floating <?> "float"
integer = lexeme int <?> "integer"
natural = lexeme nat <?> "natural"
-- floats
floating = do{ n <- decimal
; fractExponent n
}
natFloat = do{ char '0'
; zeroNumFloat
}
<|> decimalFloat
zeroNumFloat = do{ n <- hexadecimal <|> octal
; return (Left n)
}
<|> decimalFloat
<|> fractFloat 0
<|> return (Left 0)
decimalFloat = do{ n <- decimal
; option (Left n)
(fractFloat n)
}
fractFloat n = do{ f <- fractExponent n
; return (Right f)
}
fractExponent n = do{ fract <- fraction
; expo <- option 1.0 exponent'
; return ((fromInteger n + fract)*expo)
}
<|>
do{ expo <- exponent'
; return ((fromInteger n)*expo)
}
fraction = do{ char '.'
; digits <- many1 digit <?> "fraction"
; return (foldr op 0.0 digits)
}
<?> "fraction"
where
op d f = (f + fromIntegral (digitToInt d))/10.0
exponent' = do{ oneOf "eE"
; f <- sign
; e <- decimal <?> "exponent"
; return (power (f e))
}
<?> "exponent"
where
power e | e < 0 = 1.0/power(-e)
| otherwise = fromInteger (10^e)
-- integers and naturals
int = do{ f <- lexeme sign
; n <- nat
; return (f n)
}
sign = (char '-' >> return negate)
<|> (char '+' >> return id)
<|> return id
nat = zeroNumber <|> decimal
zeroNumber = do{ char '0'
; hexadecimal <|> octal <|> decimal <|> return 0
}
<?> ""
decimal = number 10 digit
hexadecimal = do{ oneOf "xX"; number 16 hexDigit }
octal = do{ oneOf "oO"; number 8 octDigit }
number base baseDigit
= do{ digits <- many1 baseDigit
; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
; seq n (return n)
}
-----------------------------------------------------------
-- Operators & reserved ops
-----------------------------------------------------------
reservedOp name =
lexeme $ try $
do{ string name
; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
}
operator =
lexeme $ try $
do{ name <- oper
; if (isReservedOp name)
then unexpected ("reserved operator " ++ show name)
else return name
}
oper =
do{ c <- (opStart languageDef)
; cs <- many (opLetter languageDef)
; return (c:cs)
}
<?> "operator"
isReservedOp name =
isReserved (sort (reservedOpNames languageDef)) name
-----------------------------------------------------------
-- Identifiers & Reserved words
-----------------------------------------------------------
reserved name =
lexeme $ try $
do{ caseString name
; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
}
caseString name
| caseSensitive languageDef = string name
| otherwise = do{ walk name; return name }
where
walk [] = return ()
walk (c:cs) = do{ caseChar c <?> msg; walk cs }
caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c)
| otherwise = char c
msg = show name
identifier =
lexeme $ try $
do{ name <- ident
; if (isReservedName name)
then unexpected ("reserved word " ++ show name)
else return name
}
ident
= do{ c <- identStart languageDef
; cs <- many (identLetter languageDef)
; return (c:cs)
}
<?> "identifier"
isReservedName name
= isReserved theReservedNames caseName
where
caseName | caseSensitive languageDef = name
| otherwise = map toLower name
isReserved names name
= scan names
where
scan [] = False
scan (r:rs) = case (compare r name) of
LT -> scan rs
EQ -> True
GT -> False
theReservedNames
| caseSensitive languageDef = sort reserved
| otherwise = sort . map (map toLower) $ reserved
where
reserved = reservedNames languageDef
-----------------------------------------------------------
-- White space & symbols
-----------------------------------------------------------
symbol name
= lexeme (string name)
lexeme p
= do{ x <- p; whiteSpace; return x }
--whiteSpace
whiteSpace
| noLine && noMulti = skipMany (simpleSpace <?> "")
| noLine = skipMany (simpleSpace <|> multiLineComment <?> "")
| noMulti = skipMany (simpleSpace <|> oneLineComment <?> "")
| otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
where
noLine = null (commentLine languageDef)
noMulti = null (commentStart languageDef)
simpleSpace =
skipMany1 (satisfy isSpace)
oneLineComment =
do{ try (string (commentLine languageDef))
; skipMany (satisfy (/= '\n'))
; return ()
}
multiLineComment =
do { try (string (commentStart languageDef))
; inComment
}
inComment
| nestedComments languageDef = inCommentMulti
| otherwise = inCommentSingle
inCommentMulti
= do{ try (string (commentEnd languageDef)) ; return () }
<|> do{ multiLineComment ; inCommentMulti }
<|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti }
<|> do{ oneOf startEnd ; inCommentMulti }
<?> "end of comment"
where
startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
inCommentSingle
= do{ try (string (commentEnd languageDef)); return () }
<|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle }
<|> do{ oneOf startEnd ; inCommentSingle }
<?> "end of comment"
where
startEnd = nub (commentEnd languageDef ++ commentStart languageDef)