2008-01-13 20:53:15 +03:00
|
|
|
|
-- |
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- Module : Text.Megaparsec.Lexer
|
2015-08-01 19:24:45 +03:00
|
|
|
|
-- Copyright : © 2015 Megaparsec contributors
|
2015-07-30 19:20:37 +03:00
|
|
|
|
-- © 2007 Paolo Martini
|
|
|
|
|
-- © 1999–2001 Daan Leijen
|
2015-07-28 16:32:19 +03:00
|
|
|
|
-- License : BSD3
|
|
|
|
|
--
|
|
|
|
|
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
|
2015-07-29 11:38:32 +03:00
|
|
|
|
-- Stability : experimental
|
2008-01-20 09:39:18 +03:00
|
|
|
|
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
|
2015-07-28 16:32:19 +03:00
|
|
|
|
--
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- High-level parsers to help you write your lexer. The module doesn't
|
|
|
|
|
-- impose how you should write your parser, but certain approaches may be
|
|
|
|
|
-- more elegant than others. Especially important theme is parsing of write
|
|
|
|
|
-- space, comments and indentation.
|
|
|
|
|
--
|
|
|
|
|
-- This module is supposed to be imported qualified:
|
|
|
|
|
--
|
|
|
|
|
-- > import qualified Text.Megaparsec.Lexer as L
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-30 13:00:07 +03:00
|
|
|
|
module Text.Megaparsec.Lexer
|
2015-09-08 14:34:02 +03:00
|
|
|
|
( -- * White space and indentation
|
|
|
|
|
space
|
|
|
|
|
, lexeme
|
|
|
|
|
, symbol
|
|
|
|
|
, symbol'
|
|
|
|
|
, indentGuard
|
2015-09-03 10:35:22 +03:00
|
|
|
|
, skipLineComment
|
|
|
|
|
, skipBlockComment
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- * Character and string literals
|
|
|
|
|
, charLiteral
|
|
|
|
|
-- * Numbers
|
|
|
|
|
, integer
|
|
|
|
|
, decimal
|
|
|
|
|
, hexadecimal
|
|
|
|
|
, octal
|
|
|
|
|
, float
|
|
|
|
|
, number
|
|
|
|
|
, signed )
|
2015-07-28 16:32:19 +03:00
|
|
|
|
where
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
import Control.Applicative ((<|>), some)
|
2015-08-01 17:39:20 +03:00
|
|
|
|
import Control.Monad (void)
|
2015-09-08 14:34:02 +03:00
|
|
|
|
import Data.Char (readLitChar)
|
|
|
|
|
import Data.Maybe (listToMaybe)
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-01 19:24:45 +03:00
|
|
|
|
import Text.Megaparsec.Combinator
|
2015-09-02 16:27:48 +03:00
|
|
|
|
import Text.Megaparsec.Pos
|
|
|
|
|
import Text.Megaparsec.Prim
|
2015-09-08 14:34:02 +03:00
|
|
|
|
import Text.Megaparsec.ShowToken
|
2015-09-02 16:27:48 +03:00
|
|
|
|
import qualified Text.Megaparsec.Char as C
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- White space and indentation
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | @space spaceChar lineComment blockComment@ produces parser that can
|
|
|
|
|
-- parse white space in general. It's expected that you create such a parser
|
|
|
|
|
-- once and pass it to many other function in this module as needed (it's
|
|
|
|
|
-- usually called @spaceConsumer@ in doc-strings here).
|
|
|
|
|
--
|
|
|
|
|
-- @spaceChar@ is used to parse trivial space characters. You can use
|
|
|
|
|
-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
|
|
|
|
|
-- your own parser (if you don't want automatically consume newlines, for
|
|
|
|
|
-- example).
|
|
|
|
|
--
|
|
|
|
|
-- @lineComment@ is used to parse line comments. You can use
|
|
|
|
|
-- 'skipLineComment' if you don't need anything special.
|
|
|
|
|
--
|
|
|
|
|
-- @blockComment@ is used to parse block (multi-line) comments. You can use
|
|
|
|
|
-- 'skipBlockComment' if you don't need anything special.
|
|
|
|
|
--
|
|
|
|
|
-- Parsing of white space is important part of any parser. We propose scheme
|
|
|
|
|
-- where every lexeme should consume all trailing white space, but not
|
|
|
|
|
-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve
|
|
|
|
|
-- this. You only need to call 'space' “manually” to consume any white space
|
|
|
|
|
-- before the first lexeme (at the beginning of file).
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
space :: MonadParsec s m Char => m () -> m () -> m () -> m ()
|
2015-09-08 14:34:02 +03:00
|
|
|
|
space ch line block = hidden . skipMany $ choice [ch, line, block]
|
|
|
|
|
|
|
|
|
|
-- | This is wrapper for lexemes. Typical usage is to supply first argument
|
|
|
|
|
-- (parser that consumes white space, probably defined via 'space') and use
|
|
|
|
|
-- resulting function to wrap parsers for every lexeme.
|
|
|
|
|
--
|
|
|
|
|
-- > lexeme = L.lexeme spaceConsumer
|
|
|
|
|
-- > integer = lexeme L.integer
|
2015-09-02 16:27:48 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
lexeme :: MonadParsec s m Char => m () -> m a -> m a
|
2015-09-08 14:34:02 +03:00
|
|
|
|
lexeme spc p = p <* spc
|
2015-09-02 16:27:48 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
|
|
|
|
|
-- first argument (parser that consumes white space, probably defined via
|
|
|
|
|
-- 'space') and then you can use the resulting function to parse strings:
|
|
|
|
|
--
|
|
|
|
|
-- > symbol = L.symbol spaceConsumer
|
|
|
|
|
-- >
|
|
|
|
|
-- > parens = between (symbol "(") (symbol ")")
|
|
|
|
|
-- > braces = between (symbol "{") (symbol "}")
|
|
|
|
|
-- > angles = between (symbol "<") (symbol ">")
|
|
|
|
|
-- > brackets = between (symbol "[") (symbol "]")
|
|
|
|
|
-- > semicolon = symbol ";"
|
|
|
|
|
-- > comma = symbol ","
|
|
|
|
|
-- > colon = symbol ":"
|
|
|
|
|
-- > dot = symbol "."
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
symbol :: MonadParsec s m Char => m () -> String -> m String
|
2015-09-08 14:34:02 +03:00
|
|
|
|
symbol spc = lexeme spc . C.string
|
|
|
|
|
|
|
|
|
|
-- | Case-insensitive version of 'symbol'. This may be helpful if you're
|
|
|
|
|
-- working with case-insensitive languages.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
symbol' :: MonadParsec s m Char => m () -> String -> m String
|
2015-09-08 14:34:02 +03:00
|
|
|
|
symbol' spc = lexeme spc . C.string'
|
|
|
|
|
|
|
|
|
|
-- | @indentGuard spaceConsumer test@ first consumes all white space
|
|
|
|
|
-- (indentation) with @spaceConsumer@ parser, then it checks column
|
|
|
|
|
-- position. It should satisfy supplied predicate @test@, otherwise the
|
|
|
|
|
-- parser fails with error message “incorrect indentation”. On success
|
|
|
|
|
-- current column position is returned.
|
|
|
|
|
--
|
|
|
|
|
-- When you want to parse block of indentation first run this parser with
|
2015-09-13 18:00:22 +03:00
|
|
|
|
-- predicate like @(> 1)@ — this will make sure you have some
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- indentation. Use returned value to check indentation on every subsequent
|
|
|
|
|
-- line according to syntax of your language.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
indentGuard :: MonadParsec s m Char => m () -> (Int -> Bool) -> m Int
|
2015-09-08 14:34:02 +03:00
|
|
|
|
indentGuard spc p = do
|
|
|
|
|
spc
|
|
|
|
|
pos <- sourceColumn <$> getPosition
|
|
|
|
|
if p pos
|
|
|
|
|
then return pos
|
|
|
|
|
else fail "incorrect indentation"
|
2015-09-03 10:35:22 +03:00
|
|
|
|
|
|
|
|
|
-- | Given comment prefix this function returns parser that skips line
|
|
|
|
|
-- comments. Note that it stops just before newline character but doesn't
|
|
|
|
|
-- consume the newline. Newline is either supposed to be consumed by 'space'
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- parser or picked up manually.
|
2015-09-03 10:35:22 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
skipLineComment :: MonadParsec s m Char => String -> m ()
|
2015-09-13 15:51:15 +03:00
|
|
|
|
skipLineComment prefix = p >> void (manyTill C.anyChar n)
|
|
|
|
|
where p = try $ C.string prefix
|
|
|
|
|
n = lookAhead C.newline
|
2015-09-03 10:35:22 +03:00
|
|
|
|
|
|
|
|
|
-- | @skipBlockComment start end@ skips non-nested block comment starting
|
|
|
|
|
-- with @start@ and ending with @end@.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
skipBlockComment :: MonadParsec s m Char => String -> String -> m ()
|
2015-09-13 15:51:15 +03:00
|
|
|
|
skipBlockComment start end = p >> void (manyTill C.anyChar n)
|
|
|
|
|
where p = try $ C.string start
|
|
|
|
|
n = try $ C.string end
|
2015-07-28 16:32:19 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- Character and string literals
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | The lexeme parser parses a single literal character without
|
|
|
|
|
-- quotes. Purpose of this parser is to help with parsing of commonly used
|
|
|
|
|
-- escape sequences. It's your responsibility to take care of character
|
|
|
|
|
-- literal syntax in your language (surround it with single quotes or
|
|
|
|
|
-- similar).
|
2008-01-22 08:14:30 +03:00
|
|
|
|
--
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- The literal character is parsed according to the grammar rules defined in
|
|
|
|
|
-- the Haskell report.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
--
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- Note that you can use this parser as a building block to parse various
|
|
|
|
|
-- string literals:
|
|
|
|
|
--
|
|
|
|
|
-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
charLiteral :: MonadParsec s m Char => m Char
|
2015-09-08 14:34:02 +03:00
|
|
|
|
charLiteral = label "literal character" $ do
|
|
|
|
|
r@(x:_) <- lookAhead $ count' 1 8 C.anyChar
|
|
|
|
|
case listToMaybe (readLitChar r) of
|
|
|
|
|
Just (c, r') -> count (length r - length r') C.anyChar >> return c
|
|
|
|
|
Nothing -> unexpected (showToken x)
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- Numbers
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | Parse an integer without sign in decimal representation (according to
|
|
|
|
|
-- format of integer literals described in Haskell report).
|
|
|
|
|
--
|
|
|
|
|
-- If you need to parse signed integers, see 'signed' combinator.
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
integer :: MonadParsec s m Char => m Integer
|
2015-09-08 14:34:02 +03:00
|
|
|
|
integer = decimal <?> "integer"
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label,
|
2015-09-22 12:09:40 +03:00
|
|
|
|
-- while this parser is labeled with “decimal integer”.
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
decimal :: MonadParsec s m Char => m Integer
|
2015-09-22 12:09:40 +03:00
|
|
|
|
decimal = nump "" C.digitChar <?> "decimal integer"
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | Parse an integer in hexadecimal representation. Representation of
|
2015-09-09 11:15:39 +03:00
|
|
|
|
-- hexadecimal number is expected to be according to Haskell report except
|
|
|
|
|
-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is
|
|
|
|
|
-- reponsibility of the programmer to parse correct prefix before parsing
|
|
|
|
|
-- the number itself.
|
|
|
|
|
--
|
|
|
|
|
-- For example you can make it conform to Haskell report like this:
|
|
|
|
|
--
|
|
|
|
|
-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
hexadecimal :: MonadParsec s m Char => m Integer
|
2015-09-22 12:09:40 +03:00
|
|
|
|
hexadecimal = nump "0x" C.hexDigitChar <?> "hexadecimal integer"
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | Parse an integer in octal representation. Representation of octal
|
2015-09-09 11:15:39 +03:00
|
|
|
|
-- number is expected to be according to Haskell report except for the fact
|
|
|
|
|
-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility
|
|
|
|
|
-- of the programmer to parse correct prefix before parsing the number
|
|
|
|
|
-- itself.
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
octal :: MonadParsec s m Char => m Integer
|
2015-09-22 12:09:40 +03:00
|
|
|
|
octal = nump "0o" C.octDigitChar <?> "octal integer"
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then
|
|
|
|
|
-- prepends @prefix@ to returned value and tries to interpret the result as
|
|
|
|
|
-- an integer according to Haskell syntax.
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
nump :: MonadParsec s m Char => String -> m Char -> m Integer
|
2015-09-08 14:34:02 +03:00
|
|
|
|
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | Parse a floating point value without sign. Representation of floating
|
|
|
|
|
-- point value is expected to be according to Haskell report.
|
|
|
|
|
--
|
|
|
|
|
-- If you need to parse signed floats, see 'signed' combinator.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
float :: MonadParsec s m Char => m Double
|
2015-09-08 14:34:02 +03:00
|
|
|
|
float = label "float" $ read <$> f
|
|
|
|
|
where f = do
|
|
|
|
|
d <- some C.digitChar
|
|
|
|
|
rest <- fraction <|> fExp
|
|
|
|
|
return $ d ++ rest
|
|
|
|
|
|
|
|
|
|
-- | This is a helper for 'float' parser. It parses fractional part of
|
|
|
|
|
-- floating point number, that is, dot and everything after it.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
fraction :: MonadParsec s m Char => m String
|
2015-09-08 14:34:02 +03:00
|
|
|
|
fraction = do
|
|
|
|
|
void $ C.char '.'
|
|
|
|
|
d <- some C.digitChar
|
|
|
|
|
e <- option "" fExp
|
|
|
|
|
return $ '.' : d ++ e
|
|
|
|
|
|
|
|
|
|
-- | This helper parses exponent of floating point numbers.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
fExp :: MonadParsec s m Char => m String
|
2015-09-08 14:34:02 +03:00
|
|
|
|
fExp = do
|
|
|
|
|
expChar <- C.char' 'e'
|
|
|
|
|
signStr <- option "" (pure <$> choice (C.char <$> "+-"))
|
|
|
|
|
d <- some C.digitChar
|
|
|
|
|
return $ expChar : signStr ++ d
|
|
|
|
|
|
|
|
|
|
-- | Parse a number: either integer or floating point. The parser can handle
|
|
|
|
|
-- overlapping grammars graciously.
|
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
number :: MonadParsec s m Char => m (Either Integer Double)
|
2015-09-08 14:34:02 +03:00
|
|
|
|
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
|
|
|
|
|
|
|
|
|
|
-- | @signed space p@ parser parses optional sign, then if there is a sign
|
|
|
|
|
-- it will consume optional white space (using @space@ parser), then it runs
|
|
|
|
|
-- parser @p@ which should return a number. Sign of the number is changed
|
|
|
|
|
-- according to previously parsed sign.
|
|
|
|
|
--
|
|
|
|
|
-- For example, to parse signed integer you can write:
|
|
|
|
|
--
|
|
|
|
|
-- > lexeme = L.lexeme spaceConsumer
|
|
|
|
|
-- > integer = lexeme L.integer
|
|
|
|
|
-- > signedInteger = signed spaceConsumer integer
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
signed :: (MonadParsec s m Char, Num a) => m () -> m a -> m a
|
2015-09-08 14:34:02 +03:00
|
|
|
|
signed spc p = ($) <$> option id (lexeme spc sign) <*> p
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-08 14:34:02 +03:00
|
|
|
|
-- | Parse a sign and return either 'id' or 'negate' according to parsed
|
|
|
|
|
-- sign.
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
2015-09-18 12:33:44 +03:00
|
|
|
|
sign :: (MonadParsec s m Char, Num a) => m (a -> a)
|
2015-09-08 14:34:02 +03:00
|
|
|
|
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)
|