-- | -- Module : Text.Megaparsec.Lexer -- Copyright : © 2015 Megaparsec contributors -- © 2007 Paolo Martini -- © 1999–2001 Daan Leijen -- License : BSD3 -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : non-portable -- -- High-level parsers to help you write your lexer. The module doesn't -- impose how you should write your parser, but certain approaches may be -- more elegant than others. Especially important theme is parsing of write -- space, comments and indentation. -- -- This module is intended to be imported qualified: -- -- > import qualified Text.Megaparsec.Lexer as L module Text.Megaparsec.Lexer ( -- * White space and indentation space , lexeme , symbol , symbol' , indentGuard , skipLineComment , skipBlockComment -- * Character and string literals , charLiteral -- * Numbers , integer , decimal , hexadecimal , octal , float , number , signed ) where import Control.Applicative ((<|>), some) import Control.Monad (void) import Data.Char (readLitChar) import Data.Maybe (listToMaybe) import Text.Megaparsec.Combinator import Text.Megaparsec.Pos import Text.Megaparsec.Prim import Text.Megaparsec.ShowToken import qualified Text.Megaparsec.Char as C -- White space and indentation -- | @space spaceChar lineComment blockComment@ produces parser that can -- parse white space in general. It's expected that you create such a parser -- once and pass it to other functions in this module as needed (when you -- see @spaceConsumer@ in documentation, usually it means that something -- like 'space' is expected there). -- -- @spaceChar@ is used to parse trivial space characters. You can use -- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as -- your own parser (if you don't want automatically consume newlines, for -- example). -- -- @lineComment@ is used to parse line comments. You can use -- 'skipLineComment' if you don't need anything special. -- -- @blockComment@ is used to parse block (multi-line) comments. You can use -- 'skipBlockComment' if you don't need anything special. -- -- Parsing of white space is an important part of any parser. We propose a -- convention where every lexeme parser assumes no spaces before the lexeme -- and consumes all spaces after the lexeme; this is what the 'lexeme' -- combinator does, and so it's enough to wrap every lexeme parser with -- 'lexeme' to achieve this. Note that you'll need to call 'space' manually -- to consume any white space before the first lexeme (i.e. at the beginning -- of the file). space :: MonadParsec s m Char => m () -- ^ A parser for a space character (e.g. 'C.spaceChar') -> m () -- ^ A parser for a line comment (e.g. 'skipLineComment') -> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment') -> m () space ch line block = hidden . skipMany $ choice [ch, line, block] -- | This is wrapper for lexemes. Typical usage is to supply first argument -- (parser that consumes white space, probably defined via 'space') and use -- the resulting function to wrap parsers for every lexeme. -- -- > lexeme = L.lexeme spaceConsumer -- > integer = lexeme L.integer lexeme :: MonadParsec s m Char => m () -> m a -> m a lexeme spc p = p <* spc -- | This is a helper to parse symbols, i.e. verbatim strings. You pass the -- first argument (parser that consumes white space, probably defined via -- 'space') and then you can use the resulting function to parse strings: -- -- > symbol = L.symbol spaceConsumer -- > -- > parens = between (symbol "(") (symbol ")") -- > braces = between (symbol "{") (symbol "}") -- > angles = between (symbol "<") (symbol ">") -- > brackets = between (symbol "[") (symbol "]") -- > semicolon = symbol ";" -- > comma = symbol "," -- > colon = symbol ":" -- > dot = symbol "." symbol :: MonadParsec s m Char => m () -> String -> m String symbol spc = lexeme spc . C.string -- | Case-insensitive version of 'symbol'. This may be helpful if you're -- working with case-insensitive languages. symbol' :: MonadParsec s m Char => m () -> String -> m String symbol' spc = lexeme spc . C.string' -- | @indentGuard spaceConsumer test@ first consumes all white space -- (indentation) with @spaceConsumer@ parser, then it checks column -- position. It should satisfy supplied predicate @test@, otherwise the -- parser fails with error message “incorrect indentation”. On success -- current column position is returned. -- -- When you want to parse block of indentation first run this parser with -- predicate like @(> 1)@ — this will make sure you have some -- indentation. Use returned value to check indentation on every subsequent -- line according to syntax of your language. indentGuard :: MonadParsec s m Char => m () -> (Int -> Bool) -> m Int indentGuard spc p = do spc pos <- sourceColumn <$> getPosition if p pos then return pos else fail "incorrect indentation" -- | Given comment prefix this function returns parser that skips line -- comments. Note that it stops just before newline character but doesn't -- consume the newline. Newline is either supposed to be consumed by 'space' -- parser or picked up manually. skipLineComment :: MonadParsec s m Char => String -> m () skipLineComment prefix = p >> void (manyTill C.anyChar n) where p = try $ C.string prefix n = lookAhead C.newline -- | @skipBlockComment start end@ skips non-nested block comment starting -- with @start@ and ending with @end@. skipBlockComment :: MonadParsec s m Char => String -> String -> m () skipBlockComment start end = p >> void (manyTill C.anyChar n) where p = try $ C.string start n = try $ C.string end -- Character and string literals -- | The lexeme parser parses a single literal character without -- quotes. Purpose of this parser is to help with parsing of commonly used -- escape sequences. It's your responsibility to take care of character -- literal syntax in your language (by surrounding it with single quotes or -- similar). -- -- The literal character is parsed according to the grammar rules defined in -- the Haskell report. -- -- Note that you can use this parser as a building block to parse various -- string literals: -- -- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"') charLiteral :: MonadParsec s m Char => m Char charLiteral = label "literal character" $ do r@(x:_) <- lookAhead $ count' 1 8 C.anyChar case listToMaybe (readLitChar r) of Just (c, r') -> count (length r - length r') C.anyChar >> return c Nothing -> unexpected (showToken x) -- Numbers -- | Parse an integer without sign in decimal representation (according to -- format of integer literals described in Haskell report). -- -- If you need to parse signed integers, see 'signed' combinator. integer :: MonadParsec s m Char => m Integer integer = decimal "integer" -- | The same as 'integer', but 'integer' is 'label'ed with “integer” label, -- while this parser is labeled with “decimal integer”. decimal :: MonadParsec s m Char => m Integer decimal = nump "" C.digitChar "decimal integer" -- | Parse an integer in hexadecimal representation. Representation of -- hexadecimal number is expected to be according to Haskell report except -- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is -- reponsibility of the programmer to parse correct prefix before parsing -- the number itself. -- -- For example you can make it conform to Haskell report like this: -- -- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal hexadecimal :: MonadParsec s m Char => m Integer hexadecimal = nump "0x" C.hexDigitChar "hexadecimal integer" -- | Parse an integer in octal representation. Representation of octal -- number is expected to be according to Haskell report except for the fact -- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility -- of the programmer to parse correct prefix before parsing the number -- itself. octal :: MonadParsec s m Char => m Integer octal = nump "0o" C.octDigitChar "octal integer" -- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then -- prepends @prefix@ to returned value and tries to interpret the result as -- an integer according to Haskell syntax. nump :: MonadParsec s m Char => String -> m Char -> m Integer nump prefix baseDigit = read . (prefix ++) <$> some baseDigit -- | Parse a floating point value without sign. Representation of floating -- point value is expected to be according to Haskell report. -- -- If you need to parse signed floats, see 'signed'. float :: MonadParsec s m Char => m Double float = label "float" $ read <$> f where f = do d <- some C.digitChar rest <- fraction <|> fExp return $ d ++ rest -- | This is a helper for 'float' parser. It parses fractional part of -- floating point number, that is, dot and everything after it. fraction :: MonadParsec s m Char => m String fraction = do void $ C.char '.' d <- some C.digitChar e <- option "" fExp return $ '.' : d ++ e -- | This helper parses exponent of floating point numbers. fExp :: MonadParsec s m Char => m String fExp = do expChar <- C.char' 'e' signStr <- option "" (pure <$> choice (C.char <$> "+-")) d <- some C.digitChar return $ expChar : signStr ++ d -- | Parse a number: either integer or floating point. The parser can handle -- overlapping grammars graciously. number :: MonadParsec s m Char => m (Either Integer Double) number = (Right <$> try float) <|> (Left <$> integer) "number" -- | @signed space p@ parser parses optional sign, then if there is a sign -- it will consume optional white space (using @space@ parser), then it runs -- parser @p@ which should return a number. Sign of the number is changed -- according to previously parsed sign. -- -- For example, to parse signed integer you can write: -- -- > lexeme = L.lexeme spaceConsumer -- > integer = lexeme L.integer -- > signedInteger = signed spaceConsumer integer signed :: (MonadParsec s m Char, Num a) => m () -> m a -> m a signed spc p = ($) <$> option id (lexeme spc sign) <*> p -- | Parse a sign and return either 'id' or 'negate' according to parsed -- sign. sign :: (MonadParsec s m Char, Num a) => m (a -> a) sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)