megaparsec/Text/Parsec/Token.hs

475 lines
16 KiB
Haskell
Raw Normal View History

2008-01-13 20:53:15 +03:00
-----------------------------------------------------------------------------
-- |
-- Module : Text.Parsec.Token
-- Copyright : (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
-- License : BSD-style (see the LICENSE file)
--
2008-01-20 07:44:41 +03:00
-- Maintainer : derek.a.elkins@gmail.com
2008-01-13 20:53:15 +03:00
-- Stability : provisional
2008-01-20 09:39:18 +03:00
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
2008-01-13 20:53:15 +03:00
--
-- A helper module to parse lexical elements (tokens).
--
-----------------------------------------------------------------------------
2008-01-20 09:39:18 +03:00
{-# LANGUAGE PolymorphicComponents #-}
2008-01-13 20:53:15 +03:00
module Text.Parsec.Token
( LanguageDef
, GenLanguageDef (..)
, TokenParser
, GenTokenParser (..)
, makeTokenParser
) where
import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
import Data.List ( nub, sort )
import Control.Monad.Identity
import Text.Parsec.Prim
import Text.Parsec.Char
import Text.Parsec.Combinator
-----------------------------------------------------------
-- Language Definition
-----------------------------------------------------------
type LanguageDef st = GenLanguageDef String st Identity
data GenLanguageDef s u m
= LanguageDef
{ commentStart :: String
, commentEnd :: String
, commentLine :: String
, nestedComments :: Bool
, identStart :: ParsecT s u m Char
, identLetter :: ParsecT s u m Char
, opStart :: ParsecT s u m Char
, opLetter :: ParsecT s u m Char
, reservedNames :: [String]
, reservedOpNames:: [String]
, caseSensitive :: Bool
}
-----------------------------------------------------------
-- A first class module: TokenParser
-----------------------------------------------------------
type TokenParser st = GenTokenParser String st Identity
data GenTokenParser s u m
= TokenParser
{ identifier :: ParsecT s u m String
, reserved :: String -> ParsecT s u m ()
, operator :: ParsecT s u m String
, reservedOp :: String -> ParsecT s u m ()
, charLiteral :: ParsecT s u m Char
, stringLiteral :: ParsecT s u m String
, natural :: ParsecT s u m Integer
, integer :: ParsecT s u m Integer
, float :: ParsecT s u m Double
, naturalOrFloat :: ParsecT s u m (Either Integer Double)
, decimal :: ParsecT s u m Integer
, hexadecimal :: ParsecT s u m Integer
, octal :: ParsecT s u m Integer
, symbol :: String -> ParsecT s u m String
, lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
, whiteSpace :: ParsecT s u m ()
, parens :: forall a. ParsecT s u m a -> ParsecT s u m a
, braces :: forall a. ParsecT s u m a -> ParsecT s u m a
, angles :: forall a. ParsecT s u m a -> ParsecT s u m a
, brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
-- "squares" is deprecated
, squares :: forall a. ParsecT s u m a -> ParsecT s u m a
, semi :: ParsecT s u m String
, comma :: ParsecT s u m String
, colon :: ParsecT s u m String
, dot :: ParsecT s u m String
, semiSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
, semiSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
, commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
, commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
}
-----------------------------------------------------------
-- Given a LanguageDef, create a token parser.
-----------------------------------------------------------
makeTokenParser :: (Stream s m Char)
=> GenLanguageDef s u m -> GenTokenParser s u m
makeTokenParser languageDef
= TokenParser{ identifier = identifier
, reserved = reserved
, operator = operator
, reservedOp = reservedOp
, charLiteral = charLiteral
, stringLiteral = stringLiteral
, natural = natural
, integer = integer
, float = float
, naturalOrFloat = naturalOrFloat
, decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, symbol = symbol
, lexeme = lexeme
, whiteSpace = whiteSpace
, parens = parens
, braces = braces
, angles = angles
, brackets = brackets
, squares = brackets
, semi = semi
, comma = comma
, colon = colon
, dot = dot
, semiSep = semiSep
, semiSep1 = semiSep1
, commaSep = commaSep
, commaSep1 = commaSep1
}
where
-----------------------------------------------------------
-- Bracketing
-----------------------------------------------------------
parens p = between (symbol "(") (symbol ")") p
braces p = between (symbol "{") (symbol "}") p
angles p = between (symbol "<") (symbol ">") p
brackets p = between (symbol "[") (symbol "]") p
semi = symbol ";"
comma = symbol ","
dot = symbol "."
colon = symbol ":"
commaSep p = sepBy p comma
semiSep p = sepBy p semi
commaSep1 p = sepBy1 p comma
semiSep1 p = sepBy1 p semi
-----------------------------------------------------------
-- Chars & Strings
-----------------------------------------------------------
charLiteral = lexeme (between (char '\'')
(char '\'' <?> "end of character")
characterChar )
<?> "character"
characterChar = charLetter <|> charEscape
<?> "literal character"
charEscape = do{ char '\\'; escapeCode }
charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
stringLiteral = lexeme (
do{ str <- between (char '"')
(char '"' <?> "end of string")
(many stringChar)
; return (foldr (maybe id (:)) "" str)
}
<?> "literal string")
stringChar = do{ c <- stringLetter; return (Just c) }
<|> stringEscape
<?> "string character"
stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
stringEscape = do{ char '\\'
; do{ escapeGap ; return Nothing }
<|> do{ escapeEmpty; return Nothing }
<|> do{ esc <- escapeCode; return (Just esc) }
}
escapeEmpty = char '&'
escapeGap = do{ many1 space
; char '\\' <?> "end of string gap"
}
-- escape codes
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
<?> "escape code"
charControl = do{ char '^'
; code <- upper
; return (toEnum (fromEnum code - fromEnum 'A'))
}
charNum = do{ code <- decimal
<|> do{ char 'o'; number 8 octDigit }
<|> do{ char 'x'; number 16 hexDigit }
; return (toEnum (fromInteger code))
}
charEsc = choice (map parseEsc escMap)
where
parseEsc (c,code) = do{ char c; return code }
charAscii = choice (map parseAscii asciiMap)
where
parseAscii (asc,code) = try (do{ string asc; return code })
-- escape code tables
escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
"FS","GS","RS","US","SP"]
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
"CAN","SUB","ESC","DEL"]
ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
'\EM','\FS','\GS','\RS','\US','\SP']
ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
'\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
'\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']
-----------------------------------------------------------
-- Numbers
-----------------------------------------------------------
naturalOrFloat = lexeme (natFloat) <?> "number"
float = lexeme floating <?> "float"
integer = lexeme int <?> "integer"
natural = lexeme nat <?> "natural"
-- floats
floating = do{ n <- decimal
; fractExponent n
}
natFloat = do{ char '0'
; zeroNumFloat
}
<|> decimalFloat
zeroNumFloat = do{ n <- hexadecimal <|> octal
; return (Left n)
}
<|> decimalFloat
<|> fractFloat 0
<|> return (Left 0)
decimalFloat = do{ n <- decimal
; option (Left n)
(fractFloat n)
}
fractFloat n = do{ f <- fractExponent n
; return (Right f)
}
fractExponent n = do{ fract <- fraction
; expo <- option 1.0 exponent'
; return ((fromInteger n + fract)*expo)
}
<|>
do{ expo <- exponent'
; return ((fromInteger n)*expo)
}
fraction = do{ char '.'
; digits <- many1 digit <?> "fraction"
; return (foldr op 0.0 digits)
}
<?> "fraction"
where
op d f = (f + fromIntegral (digitToInt d))/10.0
exponent' = do{ oneOf "eE"
; f <- sign
; e <- decimal <?> "exponent"
; return (power (f e))
}
<?> "exponent"
where
power e | e < 0 = 1.0/power(-e)
| otherwise = fromInteger (10^e)
-- integers and naturals
int = do{ f <- lexeme sign
; n <- nat
; return (f n)
}
sign = (char '-' >> return negate)
<|> (char '+' >> return id)
<|> return id
nat = zeroNumber <|> decimal
zeroNumber = do{ char '0'
; hexadecimal <|> octal <|> decimal <|> return 0
}
<?> ""
decimal = number 10 digit
hexadecimal = do{ oneOf "xX"; number 16 hexDigit }
octal = do{ oneOf "oO"; number 8 octDigit }
number base baseDigit
= do{ digits <- many1 baseDigit
; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
; seq n (return n)
}
-----------------------------------------------------------
-- Operators & reserved ops
-----------------------------------------------------------
reservedOp name =
lexeme $ try $
do{ string name
; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
}
operator =
lexeme $ try $
do{ name <- oper
; if (isReservedOp name)
then unexpected ("reserved operator " ++ show name)
else return name
}
oper =
do{ c <- (opStart languageDef)
; cs <- many (opLetter languageDef)
; return (c:cs)
}
<?> "operator"
isReservedOp name =
isReserved (sort (reservedOpNames languageDef)) name
-----------------------------------------------------------
-- Identifiers & Reserved words
-----------------------------------------------------------
reserved name =
lexeme $ try $
do{ caseString name
; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
}
caseString name
| caseSensitive languageDef = string name
| otherwise = do{ walk name; return name }
where
walk [] = return ()
walk (c:cs) = do{ caseChar c <?> msg; walk cs }
caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c)
| otherwise = char c
msg = show name
identifier =
lexeme $ try $
do{ name <- ident
; if (isReservedName name)
then unexpected ("reserved word " ++ show name)
else return name
}
ident
= do{ c <- identStart languageDef
; cs <- many (identLetter languageDef)
; return (c:cs)
}
<?> "identifier"
isReservedName name
= isReserved theReservedNames caseName
where
caseName | caseSensitive languageDef = name
| otherwise = map toLower name
isReserved names name
= scan names
where
scan [] = False
scan (r:rs) = case (compare r name) of
LT -> scan rs
EQ -> True
GT -> False
theReservedNames
| caseSensitive languageDef = sortedNames
| otherwise = map (map toLower) sortedNames
where
sortedNames = sort (reservedNames languageDef)
-----------------------------------------------------------
-- White space & symbols
-----------------------------------------------------------
symbol name
= lexeme (string name)
lexeme p
= do{ x <- p; whiteSpace; return x }
--whiteSpace
whiteSpace
| noLine && noMulti = skipMany (simpleSpace <?> "")
| noLine = skipMany (simpleSpace <|> multiLineComment <?> "")
| noMulti = skipMany (simpleSpace <|> oneLineComment <?> "")
| otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
where
noLine = null (commentLine languageDef)
noMulti = null (commentStart languageDef)
simpleSpace =
skipMany1 (satisfy isSpace)
oneLineComment =
do{ try (string (commentLine languageDef))
; skipMany (satisfy (/= '\n'))
; return ()
}
multiLineComment =
do { try (string (commentStart languageDef))
; inComment
}
inComment
| nestedComments languageDef = inCommentMulti
| otherwise = inCommentSingle
inCommentMulti
= do{ try (string (commentEnd languageDef)) ; return () }
<|> do{ multiLineComment ; inCommentMulti }
<|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti }
<|> do{ oneOf startEnd ; inCommentMulti }
<?> "end of comment"
where
startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
inCommentSingle
= do{ try (string (commentEnd languageDef)); return () }
<|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle }
<|> do{ oneOf startEnd ; inCommentSingle }
<?> "end of comment"
where
startEnd = nub (commentEnd languageDef ++ commentStart languageDef)