megaparsec/Text/Megaparsec/Lexer.hs

-- |
-- Module      :  Text.Megaparsec.Lexer
-- Copyright   :  © 2015 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  BSD3
--
-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- High-level parsers to help you write your lexer. The module doesn't
-- impose how you should write your parser, but certain approaches may be
-- more elegant than others. An especially important theme is parsing of
-- white space, comments and indentation.
--
-- This module is intended to be imported qualified:
--
-- > import qualified Text.Megaparsec.Lexer as L

module Text.Megaparsec.Lexer
  ( -- * White space and indentation
    space
  , lexeme
  , symbol
  , symbol'
  , indentGuard
  , skipLineComment
  , skipBlockComment
    -- * Character and string literals
  , charLiteral
    -- * Numbers
  , integer
  , decimal
  , hexadecimal
  , octal
  , float
  , number
  , signed )
where

import Control.Applicative ((<|>), some)
import Control.Monad (void)
import Data.Char (readLitChar)
import Data.Maybe (listToMaybe)

import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken
import qualified Text.Megaparsec.Char as C

-- White space and indentation

-- | @space spaceChar lineComment blockComment@ builds a parser that skips
-- white space in the broad sense: plain space characters as well as line
-- and block comments. The intended usage is to define such a parser once
-- and hand it to the other functions of this module wherever their
-- documentation mentions @spaceConsumer@.
--
-- @spaceChar@ consumes a trivial space character. 'C.spaceChar' from
-- "Text.Megaparsec.Char" is a reasonable choice, but you can supply your
-- own parser (for instance, if you don't want to consume newlines
-- automatically).
--
-- @lineComment@ consumes a line comment; 'skipLineComment' covers the
-- common case.
--
-- @blockComment@ consumes a block (multi-line) comment; 'skipBlockComment'
-- covers the common case.
--
-- The convention proposed by this module is that every lexeme parser
-- assumes there is no white space in front of the lexeme and consumes all
-- white space after it. Wrapping a parser with 'lexeme' gives exactly that
-- behavior. White space before the very first lexeme (i.e. at the beginning
-- of the file) is not covered by this convention, so you need to run
-- 'space' manually there.

space :: MonadParsec s m Char
      => m () -- ^ A parser for a space character (e.g. 'C.spaceChar')
      -> m () -- ^ A parser for a line comment (e.g. 'skipLineComment')
      -> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment')
      -> m ()
space ch line block = hidden (skipMany trivia)
  where -- One piece of skippable input; alternatives are tried in this order.
        trivia = choice [ch, line, block]

-- | This is a wrapper for lexemes. It runs the given parser and then the
-- white space consumer, returning the result of the former. Typical usage
-- is to supply the first argument (a parser that consumes white space,
-- probably defined via 'space') and use the resulting function to wrap the
-- parser of every lexeme.
--
-- > lexeme  = L.lexeme spaceConsumer
-- > integer = lexeme L.integer

lexeme :: MonadParsec s m Char => m () -> m a -> m a
lexeme spc p = do
  result <- p
  spc
  return result

-- | A helper for parsing symbols, i.e. verbatim strings. The given string
-- is matched with 'C.string' and trailing white space is then consumed via
-- 'lexeme'. Supply the first argument (a parser that consumes white space,
-- probably defined via 'space') and use the resulting function to parse
-- strings:
--
-- > symbol    = L.symbol spaceConsumer
-- >
-- > parens    = between (symbol "(") (symbol ")")
-- > braces    = between (symbol "{") (symbol "}")
-- > angles    = between (symbol "<") (symbol ">")
-- > brackets  = between (symbol "[") (symbol "]")
-- > semicolon = symbol ";"
-- > comma     = symbol ","
-- > colon     = symbol ":"
-- > dot       = symbol "."

symbol :: MonadParsec s m Char => m () -> String -> m String
symbol spc str = lexeme spc (C.string str)

-- | Case-insensitive version of 'symbol': the string is matched with
-- 'C.string'' instead of 'C.string'. This may be helpful if you're working
-- with case-insensitive languages.

symbol' :: MonadParsec s m Char => m () -> String -> m String
symbol' spc str = lexeme spc (C.string' str)

-- | @indentGuard spaceConsumer test@ first consumes all white space
-- (indentation) with the @spaceConsumer@ parser and then inspects the
-- current column position. If the column satisfies the predicate @test@,
-- it is returned; otherwise the parser fails with the error message
-- “incorrect indentation”.
--
-- When you want to parse a block of indentation, first run this parser
-- with a predicate like @(> 1)@ — this will make sure you have some
-- indentation. Use the returned value to check indentation on every
-- subsequent line according to the syntax of your language.

indentGuard :: MonadParsec s m Char => m () -> (Int -> Bool) -> m Int
indentGuard spc p = spc >> fmap sourceColumn getPosition >>= verify
  where verify col
          | p col     = return col
          | otherwise = fail "incorrect indentation"

-- | Given a comment prefix this function returns a parser that skips line
-- comments. Note that it stops just before the newline character but
-- doesn't consume the newline. The newline is either supposed to be
-- consumed by the 'space' parser or picked up manually.
--
-- A comment that runs up to the end of input (i.e. the last line of a file
-- that has no trailing newline) is accepted as well.

skipLineComment :: MonadParsec s m Char => String -> m ()
skipLineComment prefix = p >> void (manyTill C.anyChar n)
  where -- 'try' so that a partially matched multi-character prefix does
        -- not leave input consumed.
        p = try $ C.string prefix
        -- Terminator: a newline that is only peeked at (never consumed),
        -- or end of input. Without the 'eof' alternative a comment on the
        -- final line with no trailing newline would make the parser fail,
        -- since neither 'C.anyChar' nor the newline check can succeed
        -- there.
        n = lookAhead (void C.newline) <|> eof

-- | @skipBlockComment start end@ skips a non-nested block comment that
-- starts with @start@ and ends with @end@. Both delimiters are matched
-- under 'try'.

skipBlockComment :: MonadParsec s m Char => String -> String -> m ()
skipBlockComment start end = opening >> void (manyTill C.anyChar closing)
  where opening = try (C.string start)
        closing = try (C.string end)

-- Character and string literals

-- | This parser parses a single literal character without quotes. Its
-- purpose is to help with parsing of commonly used escape sequences. It's
-- your responsibility to take care of the character literal syntax of your
-- language (by surrounding it with single quotes or similar).
--
-- The literal character is parsed according to the grammar rules defined in
-- the Haskell report: the input is decoded with 'readLitChar'.
--
-- Note that you can use this parser as a building block to parse various
-- string literals:
--
-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')

charLiteral :: MonadParsec s m Char => m Char
charLiteral = label "literal character" $ do
  -- Peek at the upcoming characters (at most 8 are requested) without
  -- consuming them; the decoder below decides how many belong to the
  -- literal.
  r@(x:_) <- lookAhead $ count' 1 8 C.anyChar
  case listToMaybe (readLitChar r) of
    Nothing      -> unexpected (showToken x)
    Just (c, r') -> do
      -- Consume exactly as many characters as 'readLitChar' used up.
      let used = length r - length r'
      _ <- count used C.anyChar
      return c

-- Numbers

-- | Parse an unsigned integer in decimal representation (according to the
-- format of integer literals described in the Haskell report). This is
-- 'decimal' labeled with “integer”.
--
-- If you need to parse signed integers, see the 'signed' combinator.

integer :: MonadParsec s m Char => m Integer
integer = label "integer" decimal

-- | The same as 'integer', except for the label: 'integer' is 'label'ed
-- with “integer”, while this parser is labeled with “decimal integer”.

decimal :: MonadParsec s m Char => m Integer
decimal = label "decimal integer" (nump "" C.digitChar)

-- | Parse an integer in hexadecimal representation. The representation is
-- expected to be according to the Haskell report, except that this parser
-- doesn't parse the “0x” or “0X” prefix. It is the responsibility of the
-- programmer to parse the correct prefix before parsing the number itself.
--
-- For example you can make it conform to the Haskell report like this:
--
-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal

hexadecimal :: MonadParsec s m Char => m Integer
hexadecimal = label "hexadecimal integer" (nump "0x" C.hexDigitChar)

-- | Parse an integer in octal representation. The representation is
-- expected to be according to the Haskell report, except that this parser
-- doesn't parse the “0o” or “0O” prefix. It is the responsibility of the
-- programmer to parse the correct prefix before parsing the number itself.

octal :: MonadParsec s m Char => m Integer
octal = label "octal integer" (nump "0o" C.octDigitChar)

-- | @nump prefix p@ parses /one/ or more characters with the parser @p@,
-- prepends @prefix@ to the collected string and interprets the result as an
-- integer according to Haskell syntax (via 'read').

nump :: MonadParsec s m Char => String -> m Char -> m Integer
nump prefix baseDigit = do
  digits <- some baseDigit
  return (read (prefix ++ digits))

-- | Parse a floating point value without sign. The representation is
-- expected to be according to the Haskell report: one or more digits
-- followed by either a fractional part (see 'fraction') or an exponent
-- (see 'fExp').
--
-- If you need to parse signed floats, see 'signed'.

float :: MonadParsec s m Char => m Double
float = label "float" (read <$> literal)
  where -- Textual form of the number: integral digits plus the mandatory
        -- fractional or exponent part.
        literal = (++) <$> some C.digitChar <*> (fraction <|> fExp)

-- | A helper for the 'float' parser. It parses the fractional part of a
-- floating point number, that is, the dot and everything after it: one or
-- more digits and an optional exponent. The matched text is returned.

fraction :: MonadParsec s m Char => m String
fraction = do
  dot     <- C.char '.'
  digits  <- some C.digitChar
  expPart <- option "" fExp
  return (dot : digits ++ expPart)

-- | This helper parses the exponent of a floating point number: the letter
-- “e” (matched with 'C.char''), an optional sign, and one or more digits.
-- The matched text is returned.

fExp :: MonadParsec s m Char => m String
fExp = do
  marker <- C.char' 'e'
  sgn    <- option "" ((: []) <$> choice [C.char '+', C.char '-'])
  digits <- some C.digitChar
  return (marker : sgn ++ digits)

-- | Parse a number: either integer or floating point. The parser handles
-- the overlapping grammars gracefully: 'float' is attempted first under
-- 'try', and 'integer' is used as the fallback.

number :: MonadParsec s m Char => m (Either Integer Double)
number = label "number" $
  fmap Right (try float) <|> fmap Left integer

-- | @signed space p@ parses an optional sign; if there is a sign, it then
-- consumes optional white space (using the @space@ parser). After that it
-- runs the parser @p@, which should return a number. The sign of the number
-- is changed according to the previously parsed sign.
--
-- For example, to parse a signed integer you can write:
--
-- > lexeme        = L.lexeme spaceConsumer
-- > integer       = lexeme L.integer
-- > signedInteger = signed spaceConsumer integer

signed :: (MonadParsec s m Char, Num a) => m () -> m a -> m a
signed spc p = do
  -- Falls back to 'id' when no sign is present.
  applySign <- option id (lexeme spc sign)
  applySign <$> p

-- | Parse a sign character and return the matching transformation: 'id'
-- for “+” and 'negate' for “-”.

sign :: (MonadParsec s m Char, Num a) => m (a -> a)
sign = (id <$ C.char '+') <|> (negate <$ C.char '-')
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
+								-- |
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- Module      :  Text.Megaparsec.Lexer
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								-- Copyright   :  © 2015 Megaparsec contributors
-												cosmetic changes in copyright (headers)

											
										
										
											2015-07-30 19:20:37 +03:00
+								--                © 2007 Paolo Martini
 								--                © 1999–2001 Daan Leijen
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- License     :  BSD3
 								--
 								-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-												refactoring, phase 2

											
										
										
											2015-07-29 11:38:32 +03:00
+								-- Stability   :  experimental
-												refresh values of “Portability” field

‘Text.Megaparsec.Prim’ cannot be considered portable since it uses
multi-parameter type classes and functional dependencies.

Other modules that depend on these non-portable features from
‘Text.Megaparsec.Prim’ should be considered non-portable too.

											
										
										
											2015-09-27 11:46:12 +03:00
+								-- Portability :  non-portable
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								--
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- High-level parsers to help you write your lexer. The module doesn't
 								-- impose how you should write your parser, but certain approaches may be
 								-- more elegant than others. Especially important theme is parsing of write
 								-- space, comments and indentation.
 								--
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- This module is intended to be imported qualified:
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								--
 								-- > import qualified Text.Megaparsec.Lexer as L
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								module Text.Megaparsec.Lexer
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								  ( -- * White space and indentation
 								    space
 								  , lexeme
 								  , symbol
 								  , symbol'
 								  , indentGuard
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , skipLineComment
 								  , skipBlockComment
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								    -- * Character and string literals
 								  , charLiteral
 								    -- * Numbers
 								  , integer
 								  , decimal
 								  , hexadecimal
 								  , octal
 								  , float
 								  , number
 								  , signed )
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								where
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								import Control.Applicative ((<|>), some)
-												re-export ‘(<|>)’, ‘many’, ‘some’, and ‘optional’, fixes #9

These functions are now re-exported from ‘Control.Applicative’
module. ‘many’ and ‘some’ are now part of ‘Alternative’ instance of
‘ParsecT’.

Note that these functions are re-exported only in ‘Text.MegaParsec’
module, but not in ‘Text.MegaParsec.Prim’ to avoid duplication of
floating doc-strings. Others internal modules now just casually import
‘Control.Applicative’ for their needs.

Note that ‘many1’ was renamed to ‘some’, the same is done for other
parsers that had ‘many1’ part in their names (for consistency).

											
										
										
											2015-08-01 17:39:20 +03:00
+								import Control.Monad (void)
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								import Data.Char (readLitChar)
 								import Data.Maybe (listToMaybe)
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								import Text.Megaparsec.Combinator
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								import Text.Megaparsec.Pos
 								import Text.Megaparsec.Prim
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								import Text.Megaparsec.ShowToken
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								import qualified Text.Megaparsec.Char as C
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- White space and indentation
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | @space spaceChar lineComment blockComment@ produces parser that can
 								-- parse white space in general. It's expected that you create such a parser
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- once and pass it to other functions in this module as needed (when you
 								-- see @spaceConsumer@ in documentation, usually it means that something
 								-- like 'space' is expected there).
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								--
 								-- @spaceChar@ is used to parse trivial space characters. You can use
 								-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
 								-- your own parser (if you don't want automatically consume newlines, for
 								-- example).
 								--
 								-- @lineComment@ is used to parse line comments. You can use
 								-- 'skipLineComment' if you don't need anything special.
 								--
 								-- @blockComment@ is used to parse block (multi-line) comments. You can use
 								-- 'skipBlockComment' if you don't need anything special.
 								--
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- Parsing of white space is an important part of any parser. We propose a
 								-- convention where every lexeme parser assumes no spaces before the lexeme
 								-- and consumes all spaces after the lexeme; this is what the 'lexeme'
 								-- combinator does, and so it's enough to wrap every lexeme parser with
 								-- 'lexeme' to achieve this. Note that you'll need to call 'space' manually
 								-- to consume any white space before the first lexeme (i.e. at the beginning
 								-- of the file).
-												even more cosmetic corrections

Make details consistent and fix some minor cosmetic issues.

											
										
										
											2015-09-23 16:46:24 +03:00
+								space :: MonadParsec s m Char
 								      => m () -- ^ A parser for a space character (e.g. 'C.spaceChar')
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								      -> m () -- ^ A parser for a line comment (e.g. 'skipLineComment')
 								      -> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment')
 								      -> m ()
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								space ch line block = hidden . skipMany $ choice [ch, line, block]
 								-- | This is wrapper for lexemes. Typical usage is to supply first argument
 								-- (parser that consumes white space, probably defined via 'space') and use
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- the resulting function to wrap parsers for every lexeme.
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								--
 								-- > lexeme  = L.lexeme spaceConsumer
 								-- > integer = lexeme L.integer
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								lexeme :: MonadParsec s m Char => m () -> m a -> m a
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								lexeme spc p = p <* spc
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
 								-- first argument (parser that consumes white space, probably defined via
 								-- 'space') and then you can use the resulting function to parse strings:
 								--
 								-- > symbol    = L.symbol spaceConsumer
 								-- >
 								-- > parens    = between (symbol "(") (symbol ")")
 								-- > braces    = between (symbol "{") (symbol "}")
 								-- > angles    = between (symbol "<") (symbol ">")
 								-- > brackets  = between (symbol "[") (symbol "]")
 								-- > semicolon = symbol ";"
 								-- > comma     = symbol ","
 								-- > colon     = symbol ":"
 								-- > dot       = symbol "."
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								symbol :: MonadParsec s m Char => m () -> String -> m String
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								symbol spc = lexeme spc . C.string
 								-- | Case-insensitive version of 'symbol'. This may be helpful if you're
 								-- working with case-insensitive languages.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								symbol' :: MonadParsec s m Char => m () -> String -> m String
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								symbol' spc = lexeme spc . C.string'
 								-- | @indentGuard spaceConsumer test@ first consumes all white space
 								-- (indentation) with @spaceConsumer@ parser, then it checks column
 								-- position. It should satisfy supplied predicate @test@, otherwise the
 								-- parser fails with error message “incorrect indentation”. On success
 								-- current column position is returned.
 								--
 								-- When you want to parse block of indentation first run this parser with
-												fix a typo (columns starts from 1)

											
										
										
											2015-09-13 18:00:22 +03:00
+								-- predicate like @(> 1)@ — this will make sure you have some
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- indentation. Use returned value to check indentation on every subsequent
 								-- line according to syntax of your language.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								indentGuard :: MonadParsec s m Char => m () -> (Int -> Bool) -> m Int
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								indentGuard spc p = do
 								  spc
 								  pos <- sourceColumn <$> getPosition
 								  if p pos
 								  then return pos
 								  else fail "incorrect indentation"
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
 								-- | Given comment prefix this function returns parser that skips line
 								-- comments. Note that it stops just before newline character but doesn't
 								-- consume the newline. Newline is either supposed to be consumed by 'space'
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- parser or picked up manually.
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								skipLineComment :: MonadParsec s m Char => String -> m ()
-												fixed ‘skipLineComment’ and ‘skipBlockComment’

Multi-character staring/ending sequences should be wrapped with
‘try’. Also, ‘lookAhead’ should not be used in ‘skipBlockComment’.

											
										
										
											2015-09-13 15:51:15 +03:00
+								skipLineComment prefix = p >> void (manyTill C.anyChar n)
 								  where p = try $ C.string prefix
 								        n = lookAhead C.newline
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
 								-- | @skipBlockComment start end@ skips non-nested block comment starting
 								-- with @start@ and ending with @end@.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								skipBlockComment :: MonadParsec s m Char => String -> String -> m ()
-												fixed ‘skipLineComment’ and ‘skipBlockComment’

Multi-character staring/ending sequences should be wrapped with
‘try’. Also, ‘lookAhead’ should not be used in ‘skipBlockComment’.

											
										
										
											2015-09-13 15:51:15 +03:00
+								skipBlockComment start end = p >> void (manyTill C.anyChar n)
 								  where p = try $ C.string start
 								        n = try $ C.string end
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- Character and string literals
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | The lexeme parser parses a single literal character without
 								-- quotes. Purpose of this parser is to help with parsing of commonly used
 								-- escape sequences. It's your responsibility to take care of character
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- literal syntax in your language (by surrounding it with single quotes or
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- similar).
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								--
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- The literal character is parsed according to the grammar rules defined in
 								-- the Haskell report.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								--
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- Note that you can use this parser as a building block to parse various
 								-- string literals:
 								--
 								-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								charLiteral :: MonadParsec s m Char => m Char
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								charLiteral = label "literal character" $ do
 								  -- Peek at up to 10 characters without consuming them. The longest
 								  -- escape for a valid code point, @\o4177777@, is 9 characters long; a
 								  -- window of 8 cut it short, so 'readLitChar' decoded the truncated
 								  -- @\o417777@ as a different character and left a stray digit behind.
 								  r@(x:_) <- lookAhead $ count' 1 10 C.anyChar
 								  case listToMaybe (readLitChar r) of
 								    -- Consume exactly as many characters as 'readLitChar' used up.
 								    Just (c, r') -> count (length r - length r') C.anyChar >> return c
 								    -- Not a valid literal: report the first character that was seen.
 								    Nothing      -> unexpected (showToken x)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- Numbers
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | Parse an integer without sign in decimal representation (according to
 								-- format of integer literals described in Haskell report).
 								--
 								-- If you need to parse signed integers, see 'signed' combinator.
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								integer :: MonadParsec s m Char => m Integer
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								integer = label "integer" decimal
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label,
-												improved error messages for labelled ‘many’

Closes #35.

Since ‘many’ (and thus ‘some’) are the only combinator that can succeed
consuming input and produce hints at the same time we can conclude that
‘cok'’ continuation in ‘pLabel’ combinator is only called when ‘many’ is
labelled. By correcting label in this case prepending the phrase “rest
of ” to actual label we can greatly improve result error message.

											
										
										
											2015-09-22 12:09:40 +03:00
+								-- while this parser is labeled with “decimal integer”.
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								decimal :: MonadParsec s m Char => m Integer
-												improved error messages for labelled ‘many’

Closes #35.

Since ‘many’ (and thus ‘some’) are the only combinator that can succeed
consuming input and produce hints at the same time we can conclude that
‘cok'’ continuation in ‘pLabel’ combinator is only called when ‘many’ is
labelled. By correcting label in this case prepending the phrase “rest
of ” to actual label we can greatly improve result error message.

											
										
										
											2015-09-22 12:09:40 +03:00
+								decimal = label "decimal integer" (nump "" C.digitChar)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | Parse an integer in hexadecimal representation. Representation of
-												make ‘hexadecimal’ and ‘octal’ more powerful

Various languages may vary in how hexadecimal and octal literals should
be prefixed. Following the spirit of the new lexer we leave this to
programmer to decide.

											
										
										
											2015-09-09 11:15:39 +03:00
+								-- hexadecimal number is expected to be according to Haskell report except
 								-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is
 								-- responsibility of the programmer to parse correct prefix before parsing
 								-- the number itself.
 								--
 								-- For example you can make it conform to Haskell report like this:
 								--
 								-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								hexadecimal :: MonadParsec s m Char => m Integer
-												improved error messages for labelled ‘many’

Closes #35.

Since ‘many’ (and thus ‘some’) are the only combinator that can succeed
consuming input and produce hints at the same time we can conclude that
‘cok'’ continuation in ‘pLabel’ combinator is only called when ‘many’ is
labelled. By correcting label in this case prepending the phrase “rest
of ” to actual label we can greatly improve result error message.

											
										
										
											2015-09-22 12:09:40 +03:00
+								hexadecimal = label "hexadecimal integer" (nump "0x" C.hexDigitChar)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | Parse an integer in octal representation. Representation of octal
-												make ‘hexadecimal’ and ‘octal’ more powerful

Various languages may vary in how hexadecimal and octal literals should
be prefixed. Following the spirit of the new lexer we leave this to
programmer to decide.

											
										
										
											2015-09-09 11:15:39 +03:00
+								-- number is expected to be according to Haskell report except for the fact
 								-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility
 								-- of the programmer to parse correct prefix before parsing the number
 								-- itself.
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								octal :: MonadParsec s m Char => m Integer
-												improved error messages for labelled ‘many’

Closes #35.

Since ‘many’ (and thus ‘some’) are the only combinator that can succeed
consuming input and produce hints at the same time we can conclude that
‘cok'’ continuation in ‘pLabel’ combinator is only called when ‘many’ is
labelled. By correcting label in this case prepending the phrase “rest
of ” to actual label we can greatly improve result error message.

											
										
										
											2015-09-22 12:09:40 +03:00
+								octal = label "octal integer" (nump "0o" C.octDigitChar)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then
 								-- prepends @prefix@ to returned value and tries to interpret the result as
 								-- an integer according to Haskell syntax.
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								nump :: MonadParsec s m Char => String -> m Char -> m Integer
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								nump prefix baseDigit = fmap (\digits -> read (prefix ++ digits)) (some baseDigit)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | Parse a floating point value without sign. Representation of floating
 								-- point value is expected to be according to Haskell report.
 								--
-												cosmetic improvements, courtesy of @neongreen

Closes #37.

Most part of these changes is proposed by @neongreen. To apply precisely
what I deem acceptable, correct some of them in other way, and add some
other things, I've manually re-edited this.

											
										
										
											2015-09-23 14:23:24 +03:00
+								-- If you need to parse signed floats, see 'signed'.
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								float :: MonadParsec s m Char => m Double
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								float = label "float" (read <$> literal)
 								  where
 								    -- Integral digits followed by either a fractional part or a bare
 								    -- exponent; the pieces are glued back together for 'read'.
 								    literal = (++) <$> some C.digitChar <*> (fraction <|> fExp)
 								-- | This is a helper for 'float' parser. It parses fractional part of
 								-- floating point number, that is, dot and everything after it.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								fraction :: MonadParsec s m Char => m String
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								fraction = do
 								  _       <- C.char '.'
 								  digits  <- some C.digitChar
 								  -- The exponent is optional after the fractional digits.
 								  expPart <- option "" fExp
 								  return ('.' : digits ++ expPart)
 								-- | This helper parses exponent of floating point numbers.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								fExp :: MonadParsec s m Char => m String
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								fExp = do
 								  marker <- C.char' 'e'
 								  -- An optional sign, kept as a string of zero or one character.
 								  sgn    <- option "" ((: []) <$> choice (map C.char "+-"))
 								  digits <- some C.digitChar
 								  return (marker : sgn ++ digits)
 								-- | Parse a number: either integer or floating point. The parser can handle
 								-- overlapping grammars gracefully.
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								number :: MonadParsec s m Char => m (Either Integer Double)
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								number = label "number" (fmap Right (try float) <|> fmap Left integer)
 								-- | @signed space p@ parser parses optional sign, then if there is a sign
 								-- it will consume optional white space (using @space@ parser), then it runs
 								-- parser @p@ which should return a number. Sign of the number is changed
 								-- according to previously parsed sign.
 								--
 								-- For example, to parse signed integer you can write:
 								--
 								-- > lexeme        = L.lexeme spaceConsumer
 								-- > integer       = lexeme L.integer
 								-- > signedInteger = signed spaceConsumer integer
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								signed :: (MonadParsec s m Char, Num a) => m () -> m a -> m a
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								signed spc p = option id (lexeme spc sign) <*> p
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								-- | Parse a sign and return either 'id' or 'negate' according to parsed
 								-- sign.
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
-												eliminated user state and written ‘MonadParsec’

Close # 27.

Backtracking user state can be achieved via combination of ‘StateT’
monad transformer and ‘ParsecT’:

  StateT StateType (ParsecT s m a)

This user state can be more flexible. This fact renders current built-in
user state redundant.

To help work with this new approach (combining monad transformers more
freely) we introduce ‘MonadParsec’ MTL-style type class. All tools that
come with Megaparsec library were modified to work smoothly with any
instance of ‘MonadParsec’, not only ‘ParsecT’.

											
										
										
											2015-09-18 12:33:44 +03:00
+								sign :: (MonadParsec s m Char, Num a) => m (a -> a)
-												first version of the new lexer module

											
										
										
											2015-09-08 14:34:02 +03:00
+								sign = (const id <$> C.char '+') <|> (const negate <$> C.char '-')