megaparsec/Text/Megaparsec/Lexer.hs

-- |
-- Module      :  Text.Megaparsec.Lexer
-- Copyright   :  © 2015 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  BSD3
--
-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-- Stability   :  experimental
-- Portability :  non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements. See 'makeLexer' for a
-- description of how to use it. This module is supposed to be imported
-- qualified.

{-# OPTIONS_GHC -fno-warn-name-shadowing #-}

module Text.Megaparsec.Lexer
  ( LanguageDef (..)
  , defaultLang
  , skipLineComment
  , skipBlockComment
  , Lexer (..)
  , makeLexer )
where

import Control.Applicative ((<|>), many, some, empty)
import Control.Monad (void)
import Data.Char (isAlpha, toLower, toUpper)
import Data.List (sort)

import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import qualified Text.Megaparsec.Char as C

-- Language definition

-- | The @LanguageDef@ type is a record that contains all parameters used to
-- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can
-- be used as a basis for new language definitions.

data LanguageDef s u m =
  LanguageDef {

  -- | Parser for a single white space character. If indentation is
  -- significant in your language, you should probably not treat newline as
  -- a white space character. Also note that if newline is not a white space
  -- character, you will need to pick it up manually.

    spaceChar :: ParsecT s u m Char

  -- | Parser for line comments. It is the responsibility of this parser to
  -- stop at the end of the line. If your language doesn't support this kind
  -- of comment, set this value to 'empty'. In simple cases you can use
  -- 'skipLineComment' to quickly construct a line comment parser.

  , lineComment :: ParsecT s u m ()

  -- | Parser for block (multi-line) comments. If your language doesn't
  -- support this kind of comment, set this value to 'empty'. In simple
  -- cases you can use 'skipBlockComment' to quickly construct a block
  -- comment parser.

  , blockComment :: ParsecT s u m ()

  -- NEXT

  -- | This parser should accept any start character of identifiers, for
  -- example @letter \<|> char \'_\'@.

  , identStart :: ParsecT s u m Char

  -- | This parser should accept any legal tail character of identifiers,
  -- for example @alphaNum \<|> char \'_\'@.

  , identLetter :: ParsecT s u m Char

  -- | This parser should accept any start character of operators, for
  -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@

  , opStart :: ParsecT s u m Char

  -- | This parser should accept any legal tail character of operators.
  -- Note that this parser should be defined even if the language doesn't
  -- support user-defined operators, or otherwise the 'reservedOp' parser
  -- won't work correctly.

  , opLetter :: ParsecT s u m Char

  -- | The list of reserved identifiers.

  , reservedNames :: [String]

  -- | The list of reserved operators.

  , reservedOpNames :: [String]

  -- | Set to 'True' if the language is case sensitive.

  , caseSensitive :: Bool }

-- Default language definition

-- | The standard language definition. It is recommended to use it as the
-- basis for other definitions. @defaultLang@ is case sensitive, has no
-- reserved names or reserved operators and accepts no comments (both
-- comment parsers are 'empty'). Identifiers start with 'C.letterChar' or an
-- underscore and continue with 'C.alphaNumChar', underscores or single
-- quotes; operators use one and the same set of symbolic characters for
-- their first and for all following positions.

defaultLang :: Stream s m Char => LanguageDef s u m
defaultLang =
  LanguageDef
  { spaceChar       = C.spaceChar
  , lineComment     = empty
  , blockComment    = empty
  -- NEXT
  , identStart      = C.letterChar <|> C.char '_'
  , identLetter     = C.alphaNumChar <|> C.oneOf "_'"
  , opStart         = operatorChar
  , opLetter        = operatorChar
  , reservedOpNames = []
  , reservedNames   = []
  , caseSensitive   = True }
  where
    -- Shared by 'opStart' and 'opLetter': any operator character may
    -- appear in any position.
    operatorChar = C.oneOf ":!#$%&*+./<=>?@\\^|-~"

-- Utility functions

-- | Given a comment prefix this function returns a parser that skips line
-- comments. It consumes everything up to, but not including, the next
-- newline character; a comment that runs to the end of input without a
-- newline is skipped as well. The newline itself is either supposed to be
-- consumed by the 'space' parser or picked up manually.
--
-- The prefix is matched with 'try', so an input that merely shares its
-- first characters with the prefix makes the parser fail without consuming
-- anything.

skipLineComment :: Stream s m Char => String -> ParsecT s u m ()
skipLineComment prefix = try (C.string prefix) >> skipMany commentChar
  where
    -- Any character that is not a newline. At end of input 'C.anyChar'
    -- fails without consuming, which simply ends the comment.
    commentChar = notFollowedBy C.newline >> C.anyChar

-- | @skipBlockComment start end@ skips a non-nested block comment starting
-- with @start@ and ending with @end@. Both delimiters are consumed, so
-- parsing continues right after @end@.
--
-- Both delimiters are matched with 'try': a partial match of @start@ fails
-- without consuming input, and a partial match of @end@ inside the comment
-- body is treated as ordinary comment text.

skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m ()
skipBlockComment start end = open >> void (manyTill C.anyChar close)
  where open  = try (C.string start)
        close = try (C.string end)

-- Lexer

-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.

data Lexer s u m =
  Lexer {

  -- | Skips any white space. White space consists of /zero/ or more
  -- occurrences of 'spaceChar', a line comment or a block (multi-line)
  -- comment.

    space :: ParsecT s u m ()

  -- | @lexeme p@ first applies parser @p@ and then the 'space' parser,
  -- returning the value of @p@. Every lexical token (lexeme) is defined
  -- using @lexeme@, this way every parse starts at a point without white
  -- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this
  -- document.
  --
  -- The only point where the 'space' parser should be called explicitly is
  -- the start of the main parser in order to skip any leading white space.

  , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a

  -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
  -- trailing white space.

  , symbol :: String -> ParsecT s u m String

  -- | @indentGuard p@ consumes all white space it can consume, then checks
  -- the column number. The column number should satisfy the given predicate
  -- @p@, otherwise the parser fails with the “incorrect indentation”
  -- message. On success @indentGuard@ returns the current column number.

  , indentGuard :: (Int -> Bool) -> ParsecT s u m Int

  -- | Lexeme parser @parens p@ parses @p@ enclosed in parentheses,
  -- returning the value of @p@.

  , parens :: forall a. ParsecT s u m a -> ParsecT s u m a

  -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
  -- “}”), returning the value of @p@.

  , braces :: forall a. ParsecT s u m a -> ParsecT s u m a

  -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
  -- and “>”), returning the value of @p@.

  , angles :: forall a. ParsecT s u m a -> ParsecT s u m a

  -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
  -- and “]”), returning the value of @p@.

  , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a

  -- | Lexeme parser @semicolon@ parses the character “;” and skips any
  -- trailing white space. Returns the string “;”.

  , semicolon :: ParsecT s u m String

  -- | Lexeme parser @comma@ parses the character “,” and skips any
  -- trailing white space. Returns the string “,”.

  , comma :: ParsecT s u m String

  -- | Lexeme parser @colon@ parses the character “:” and skips any
  -- trailing white space. Returns the string “:”.

  , colon :: ParsecT s u m String

  -- | Lexeme parser @dot@ parses the character “.” and skips any
  -- trailing white space. Returns the string “.”.

  , dot :: ParsecT s u m String

  -- | This lexeme parser parses a single literal character. Returns the
  -- literal character value. This parser deals correctly with escape
  -- sequences. The literal character is parsed according to the grammar
  -- rules defined in the Haskell report (which matches most programming
  -- languages quite closely).

  , charLiteral :: ParsecT s u m Char

  -- | This lexeme parser parses a literal string. Returns the literal
  -- string value. This parser deals correctly with escape sequences and
  -- gaps. The literal string is parsed according to the grammar rules
  -- defined in the Haskell report (which matches most programming languages
  -- quite closely).

  , stringLiteral :: ParsecT s u m String

  -- | This lexeme parser parses an integer (a whole number). This parser
  -- /does not/ parse sign. Returns the value of the number.
  --
  -- Note that 'makeLexer' currently defines this parser as 'decimal', so
  -- only the decimal notation is recognised here; use 'hexadecimal' or
  -- 'octal' explicitly for the other notations.

  , integer :: ParsecT s u m Integer

  -- | This is just like 'integer', except it can parse sign.

  , integer' :: ParsecT s u m Integer

  -- | This lexeme parser parses an unsigned whole number in the decimal
  -- system. Returns the value of the number.

  , decimal :: ParsecT s u m Integer

  -- | This lexeme parser parses an unsigned whole number in the
  -- hexadecimal system. The number should be prefixed with “0x” or “0X”.
  -- Returns the value of the number.

  , hexadecimal :: ParsecT s u m Integer

  -- | This lexeme parser parses an unsigned whole number in the octal
  -- system. The number should be prefixed with “0o” or “0O”. Returns the
  -- value of the number.

  , octal :: ParsecT s u m Integer

  -- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
  -- then runs parser @p@, changing sign of its result accordingly. Note
  -- that there may be white space after the sign but not before it.

  , signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a

  -- | This lexeme parser parses a floating point value. Returns the value
  -- of the number. The number is parsed according to the grammar rules
  -- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
  -- achieve parsing of signed floating point values.

  , float :: ParsecT s u m Double

  -- | This is just like 'float', except it can parse sign.

  , float' :: ParsecT s u m Double

  -- | This lexeme parser parses either an 'integer' or a 'float'.
  -- Returns the value of the number. This parser deals with any overlap in
  -- the grammar rules for integers and floats. The number is parsed
  -- according to the grammar rules defined in the Haskell report.

  , number :: ParsecT s u m (Either Integer Double)

  -- | This is just like 'number', except it can parse sign.

  , number' :: ParsecT s u m (Either Integer Double)

  -- | This lexeme parser parses a legal identifier. Returns the identifier
  -- string. This parser will fail on identifiers that are reserved
  -- words. Legal identifier (start) characters and reserved words are
  -- defined in the 'LanguageDef' that is passed to 'makeLexer'.

  , identifier :: ParsecT s u m String

  -- | The lexeme parser @reserved name@ parses @symbol name@, but it also
  -- checks that the @name@ is not a prefix of a valid identifier.

  , reserved :: String -> ParsecT s u m ()

  -- | This lexeme parser parses a legal operator. Returns the name of the
  -- operator. This parser will fail on any operators that are reserved
  -- operators. Legal operator (start) characters and reserved operators are
  -- defined in the 'LanguageDef' that is passed to 'makeLexer'.

  , operator :: ParsecT s u m String

  -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
  -- also checks that the @name@ is not a prefix of a valid operator.

  , reservedOp :: String -> ParsecT s u m () }

-- | The expression @makeLexer language@ creates a 'Lexer' record that
-- contains lexical parsers that are defined using the definitions in the
-- @language@ record.
--
-- The use of this function is quite stylized — one imports the appropriate
-- language definition and selects the lexical parsers that are needed from
-- the resulting 'Lexer'.
--
-- > module Main (main) where
-- >
-- > import Text.Megaparsec
-- > import qualified Text.Megaparsec.Lexer as L
-- >
-- > -- The parser
-- > …
-- >
-- > expr =  parens expr
-- >     <|> identifier
-- >     <|> …
-- >
-- > -- The lexer
-- > lexer      = L.makeLexer L.defaultLang
-- >
-- > parens     = L.parens     lexer
-- > braces     = L.braces     lexer
-- > identifier = L.identifier lexer
-- > reserved   = L.reserved   lexer
-- > …

makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
makeLexer lang =
  Lexer
  { space         = space
  , lexeme        = lexeme
  , symbol        = symbol
  , indentGuard   = indentGuard

  , parens        = parens
  , braces        = braces
  , angles        = angles
  , brackets      = brackets
  , semicolon     = semicolon
  , comma         = comma
  , colon         = colon
  , dot           = dot

  , charLiteral   = charLiteral
  , stringLiteral = stringLiteral

  , integer       = integer
  , integer'      = integer'
  , decimal       = decimal
  , hexadecimal   = hexadecimal
  , octal         = octal
  , signed        = signed
  , float         = float
  , float'        = float'
  , number        = number
  , number'       = number'

  , identifier    = identifier
  , reserved      = reserved
  , operator      = operator
  , reservedOp    = reservedOp }
  where

  -- white space & indentation

  -- White space is any run of space characters, block comments and line
  -- comments, each parsed with the corresponding field of @lang@.

  space    = hidden . skipMany . choice $
             ($ lang) <$> [void . spaceChar, blockComment, lineComment]
  lexeme p = p <* space
  symbol   = lexeme . C.string

  -- Skips white space first, so the column that is checked is the column of
  -- the next token.

  indentGuard p = do
    space
    pos <- sourceColumn <$> getPosition
    if p pos
    then return pos
    else fail "incorrect indentation"

  -- auxiliary parsers

  parens    = between (symbol "(") (symbol ")")
  braces    = between (symbol "{") (symbol "}")
  angles    = between (symbol "<") (symbol ">")
  brackets  = between (symbol "[") (symbol "]")
  semicolon = symbol ";"
  comma     = symbol ","
  colon     = symbol ":"
  dot       = symbol "."

  -- char & string literals

  charLiteral = lexeme ( between (C.char '\'')
                                 (C.char '\'' <?> "end of character")
                                 characterChar )
                <?> "character"

  characterChar = charLetter <|> charEscape <?> "literal character"

  charEscape = C.char '\\' >> escapeCode
  charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))

  -- Every element of a string literal yields 'Maybe Char': gaps and the
  -- empty escape (@\\&@) yield 'Nothing' and are dropped by the fold.

  stringLiteral =
      lexeme ((foldr (maybe id (:)) "" <$>
               between (C.char '"') (C.char '"' <?> "end of string")
                           (many stringChar)) <?> "literal string")

  stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"

  stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))

  stringEscape = C.char '\\' >>
                 ( (escapeGap >> return Nothing)   <|>
                   (escapeEmpty >> return Nothing) <|>
                   (Just <$> escapeCode) )

  escapeEmpty = C.char '&'
  escapeGap   = some C.spaceChar >> C.char '\\' <?> "end of string gap"

  -- escape codes

  escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
               <?> "escape code"

  charEsc = choice (parseEsc <$> escMap)
      where parseEsc (c, code) = C.char c >> return code

  -- Numeric escapes are parsed with the raw digit parser 'nump' rather than
  -- with the lexeme parser 'decimal': 'decimal' skips trailing white space,
  -- which would swallow significant spaces inside a literal (for example
  -- the space in "\65 B"). Codes above the largest 'Char' are rejected with
  -- a parse error, because 'toEnum' would otherwise throw at run time.

  charNum = do
    code <- nump "" C.digitChar <|>
            (C.char 'o' >> nump "0o" C.octDigitChar) <|>
            (C.char 'x' >> nump "0x" C.hexDigitChar)
    if code > maxCharCode
    then fail "numeric escape sequence out of range"
    else return (toEnum (fromInteger code))

  maxCharCode :: Integer
  maxCharCode = toInteger (fromEnum (maxBound :: Char))

  -- 'try' is needed because several ASCII names share a prefix.

  charAscii = choice (parseAscii <$> asciiMap)
      where parseAscii (asc, code) = try (C.string asc >> return code)

  charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar)

  -- escape code tables

  escMap      = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"

  -- Three-letter names come first so that, for instance, "SOH" is tried
  -- before its prefix "SO".

  asciiMap    = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)

  ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
                 "FS","GS","RS","US","SP"]
  ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
                 "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
                 "CAN","SUB","ESC","DEL"]

  ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
  ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"

  -- numbers — integers

  integer  = decimal
  integer' = signed integer

  decimal     = lexeme (nump "" C.digitChar <?> "integer")
  hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar
  octal       = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar

  -- @nump prefix baseDigit@ parses one or more digits and converts them
  -- with 'read'; @prefix@ ("", "0o" or "0x") selects the base 'read' uses.
  -- 'read' cannot fail here as long as @baseDigit@ only accepts digits
  -- that are valid for that base.

  nump prefix baseDigit = read . (prefix ++) <$> some baseDigit

  signed p = ($) <$> option id (lexeme sign) <*> p

  sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
  sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)

  -- numbers — floats

  float  = lexeme ffloat <?> "float"
  float' = signed float

  -- The textual form of the number is collected first and then converted
  -- with 'read'. A fraction or an exponent is mandatory, which is what
  -- separates a float from an integer.

  ffloat = read <$> ffloat'
    where
      ffloat' = do
        whole <- fDec
        rest  <- fraction <|> fExp
        return $ whole ++ rest

  fraction = do
    void $ C.char '.'
    frac <- fDec
    expo <- option "" fExp
    return $ '.' : frac ++ expo

  fDec = some C.digitChar

  fExp = do
    expChar <- C.oneOf "eE"
    signStr <- option "" (pure <$> C.oneOf "+-")
    digits  <- fDec
    return $ expChar : signStr ++ digits

  -- numbers — a more general case

  -- 'float' is tried first and wrapped in 'try', so input that turns out
  -- not to be a float is re-parsed as an integer.

  number  = (Right <$> try float)  <|> (Left <$> integer)  <?> "number"
  number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"

  -- operators & reserved ops

  reservedOp name =
      lexeme $ try $ do
        void $ C.string name
        notFollowedBy (opLetter lang) <?> ("end of " ++ show name)

  operator =
      lexeme $ try $ do
        name <- oper
        if isReservedOp name
        then unexpected ("reserved operator " ++ show name)
        else return name

  oper = ((:) <$> opStart lang <*> many (opLetter lang))
         <?> "operator"

  isReservedOp = isReserved . sort $ reservedOpNames lang

  -- identifiers & reserved words

  reserved name =
      lexeme $ try $ do
        void $ caseString name
        notFollowedBy (identLetter lang) <?> ("end of " ++ show name)

  -- Matches @name@ exactly in a case sensitive language; otherwise every
  -- alphabetic character is accepted in either case. Returns @name@ as it
  -- was given, not as it appeared in the input.

  caseString name
      | caseSensitive lang = C.string name
      | otherwise                 = walk name >> return name
      where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
            caseChar c
                | isAlpha c = C.char (toLower c) <|> C.char (toUpper c)
                | otherwise = C.char c

  identifier =
      lexeme $ try $ do
        name <- ident
        if isReservedName name
        then unexpected ("reserved word " ++ show name)
        else return name

  ident = ((:) <$> identStart lang <*> many (identLetter lang))
          <?> "identifier"

  -- In a case insensitive language both the candidate and the reserved
  -- names are compared in lower case.

  isReservedName name = isReserved theReservedNames caseName
      where caseName
                | caseSensitive lang = name
                | otherwise                 = toLower <$> name

  -- @names@ must be sorted in ascending order: the scan stops at the first
  -- element that is greater than @name@.

  isReserved names name = scan names
      where scan []     = False
            scan (r:rs) = case compare r name of
                            LT  -> scan rs
                            EQ  -> True
                            GT  -> False

  theReservedNames
      | caseSensitive lang = sort names
      | otherwise                 = sort . fmap (fmap toLower) $ names
      where names = reservedNames lang
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
+								-- |
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- Module      :  Text.Megaparsec.Lexer
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								-- Copyright   :  © 2015 Megaparsec contributors
-												cosmetic changes in copyright (headers)

											
										
										
											2015-07-30 19:20:37 +03:00
+								--                © 2007 Paolo Martini
 								--                © 1999–2001 Daan Leijen
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- License     :  BSD3
 								--
 								-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-												refactoring, phase 2

											
										
										
											2015-07-29 11:38:32 +03:00
+								-- Stability   :  experimental
-												More accurate extension pragmas

											
										
										
											2008-01-20 09:39:18 +03:00
+								-- Portability :  non-portable (uses local universal quantification: PolymorphicComponents)
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								--
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- A helper module to parse lexical elements. See 'makeLexer' for a
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								-- description of how to use it. This module is supposed to be imported
 								-- qualified.
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												Clean most warnings

											
										
										
											2008-02-13 07:32:24 +03:00
+								{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								module Text.Megaparsec.Lexer
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  ( LanguageDef (..)
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  , defaultLang
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , skipLineComment
 								  , skipBlockComment
 								  , Lexer (..)
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								  , makeLexer )
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								where
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								import Control.Applicative ((<|>), many, some, empty)
-												re-export ‘(<|>)’, ‘many’, ‘some’, and ‘optional’, fixes #9

These functions are now re-exported from ‘Control.Applicative’
module. ‘many’ and ‘some’ are now part of ‘Alternative’ instance of
‘ParsecT’.

Note that these functions are re-exported only in ‘Text.MegaParsec’
module, but not in ‘Text.MegaParsec.Prim’ to avoid duplication of
floating doc-strings. Others internal modules now just casually import
‘Control.Applicative’ for their needs.

Note that ‘many1’ was renamed to ‘some’, the same is done for other
parsers that had ‘many1’ part in their names (for consistency).

											
										
										
											2015-08-01 17:39:20 +03:00
+								import Control.Monad (void)
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								import Data.Char (isAlpha, toLower, toUpper)
-												temporarily simplify token parsing

The improved error messages in Megaparsec are quite sensitive to how
parsers are written, which parts of parser are labeled, etc. Current
implementation of token parsers in ‘Text.Megaparsec.Token’ is written
without this in mind. We will improve the module later, for now let us
rewrite/simplify some parts to avoid failing tests.

											
										
										
											2015-08-19 22:11:21 +03:00
+								import Data.List (sort)
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								import Text.Megaparsec.Combinator
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								import Text.Megaparsec.Pos
 								import Text.Megaparsec.Prim
 								import qualified Text.Megaparsec.Char as C
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- Language definition
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- | The @LanguageDef@ type is a record that contains all parameters used to
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								-- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can
 								-- be used as a basis for new language definitions.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												improved documentation, fixes #1

											
										
										
											2015-07-30 21:36:54 +03:00
+								data LanguageDef s u m =
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  LanguageDef {
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- | The parser is used to parse single white space character. If
 								  -- indentation is important in your language you should probably not treat
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- newline as white space character. Also note that if newline is not
 								  -- white space character, you will need to pick it up manually.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								    spaceChar :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- | The parser parses line comments. It's responsibility of the parser to
 								  -- stop at the end of line. If your language doesn't support this type of
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- comments, set this value to 'empty'. In simple cases you can use
 								  -- 'skipLineComment' to quickly construct line comment parser.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  , lineComment :: ParsecT s u m ()
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- | The parser parses block (multi-line) comments. If your language
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- doesn't support this type of comments, set this value to 'empty'. In
 								  -- simple cases you can use 'skipBlockComment' to quickly construct block
 								  -- comment parser.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  , blockComment :: ParsecT s u m ()
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- NEXT
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This parser should accept any start characters of identifiers, for
 								  -- example @letter \<|> char \'_\'@.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , identStart :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This parser should accept any legal tail characters of identifiers,
 								  -- for example @alphaNum \<|> char \'_\'@.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , identLetter :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This parser should accept any start characters of operators, for
 								  -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , opStart :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This parser should accept any legal tail characters of operators.
 								  -- Note that this parser should even be defined if the language doesn't
 								  -- support user-defined operators, or otherwise the 'reservedOp' parser
 								  -- won't work correctly.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , opLetter :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The list of reserved identifiers.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , reservedNames :: [String]
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The list of reserved operators.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , reservedOpNames :: [String]
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | Set to 'True' if the language is case sensitive.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , caseSensitive :: Bool }
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								-- Default language definition
 								-- | This is the standard language definition. It is recommended to use
 								-- this definition as the basis for other definitions. @defaultLang@ has no
 								-- reserved names or operators, is case sensitive and doesn't accept
 								-- comments (both comment parsers are 'empty'). It /does/ accept
 								-- identifiers (a letter or underscore followed by alphanumeric
 								-- characters, underscores or apostrophes) and symbolic operators; the
 								-- same set of characters may start and continue an operator.
 								defaultLang :: Stream s m Char => LanguageDef s u m
 								defaultLang =
 								  LanguageDef
 								  { spaceChar       = C.spaceChar
 								  , lineComment     = empty
 								  , blockComment    = empty
 								  -- NEXT
 								  , identStart      = C.letterChar <|> C.char '_'
 								  , identLetter     = C.alphaNumChar <|> C.oneOf "_'"
 								  , opStart         = opChar
 								  , opLetter        = opChar
 								  , reservedOpNames = []
 								  , reservedNames   = []
 								  , caseSensitive   = True }
 								  where
 								    -- Shared by 'opStart' and 'opLetter' so the two can never drift
 								    -- apart (and so the record no longer refers to itself).
 								    opChar = C.oneOf ":!#$%&*+./<=>?@\\^|-~"
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								-- Utility functions
 								-- | Given a comment prefix this function returns a parser that skips line
 								-- comments. Note that it stops just before the newline character but
 								-- doesn't consume the newline. The newline is either supposed to be
 								-- consumed by the 'space' parser or picked up manually. A comment that
 								-- runs up to the end of input (no trailing newline) is accepted as well.
 								skipLineComment :: Stream s m Char => String -> ParsecT s u m ()
 								skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar lineEnd)
 								  where
 								    -- 'lookAhead' keeps the newline in the input. 'eof' is accepted too,
 								    -- otherwise 'C.anyChar' would fail on a comment in the last line of
 								    -- a file that has no final newline.
 								    lineEnd = lookAhead (void C.newline <|> eof)
 								-- | @skipBlockComment start end@ skips a non-nested block comment
 								-- starting with @start@ and ending with @end@. Both delimiters are
 								-- consumed, so parsing resumes right after @end@.
 								skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m ()
 								skipBlockComment start end = C.string start >> void (manyTill C.anyChar close)
 								  where
 								    -- The terminator has to be consumed (not merely looked at), otherwise
 								    -- @end@ would be left in the input after the comment. 'try' lets a
 								    -- partial match of @end@ inside the comment body be treated as
 								    -- ordinary comment text.
 								    close = try (C.string end)
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- Lexer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
 								-- | The type of the record that holds lexical parsers that work on
 								-- @s@ streams with state @u@ over a monad @m@.
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								data Lexer s u m =
 								  Lexer {
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- | Skips any white space. White space consists of /zero/ or more
 								  -- occurrences of 'spaceChar', a line comment or a block (multi-line)
 								  -- comment.
 								    space :: ParsecT s u m ()
 								  -- | @lexeme p@ first applies parser @p@ and then the 'space' parser,
 								  -- returning the value of @p@. Every lexical token (lexeme) is defined
 								  -- using @lexeme@, this way every parse starts at a point without white
 								  -- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this
 								  -- document.
 								  --
 								  -- The only point where the 'space' parser should be called explicitly is
 								  -- the start of the main parser in order to skip any leading white space.
 								  , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
 								  -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
 								  -- trailing white space.
 								  , symbol :: String -> ParsecT s u m String
 								  -- | @indentGuard p@ consumes all white space it can consume, then checks
 								  -- column number. The column number should satisfy given predicate @p@,
 								  -- otherwise the parser fails with “incorrect indentation” message. In
 								  -- successful cases @indentGuard@ returns current column number.
 								  , indentGuard :: (Int -> Bool) -> ParsecT s u m Int
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | Lexeme parser @parens p@ parses @p@ enclosed in parentheses,
 								  -- returning the value of @p@.
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , parens :: forall a. ParsecT s u m a -> ParsecT s u m a
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
 								  -- “}”), returning the value of @p@.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , braces :: forall a. ParsecT s u m a -> ParsecT s u m a
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
 								  -- and “>”), returning the value of @p@.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , angles :: forall a. ParsecT s u m a -> ParsecT s u m a
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
 								  -- and “]”), returning the value of @p@.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
 								  -- | Lexeme parser @semicolon@ parses the character “;” and skips any
 								  -- trailing white space. Returns the string “;”.
 								  , semicolon :: ParsecT s u m String
 								  -- | Lexeme parser @comma@ parses the character “,” and skips any
 								  -- trailing white space. Returns the string “,”.
 								  , comma :: ParsecT s u m String
 								  -- | Lexeme parser @colon@ parses the character “:” and skips any
 								  -- trailing white space. Returns the string “:”.
 								  , colon :: ParsecT s u m String
 								  -- | Lexeme parser @dot@ parses the character “.” and skips any
 								  -- trailing white space. Returns the string “.”.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , dot :: ParsecT s u m String
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a single literal character. Returns the
 								  -- literal character value. This parser deals correctly with escape
 								  -- sequences. The literal character is parsed according to the grammar
 								  -- rules defined in the Haskell report (which matches most programming
 								  -- languages quite closely).
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , charLiteral :: ParsecT s u m Char
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a literal string. Returns the literal
 								  -- string value. This parser deals correctly with escape sequences and
 								  -- gaps. The literal string is parsed according to the grammar rules
 								  -- defined in the Haskell report (which matches most programming languages
 								  -- quite closely).
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , stringLiteral :: ParsecT s u m String
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses an integer (a whole number). This parser
 								  -- /does not/ parse sign. Returns the value of the number. The number can
 								  -- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is
 								  -- parsed according to the grammar rules in the Haskell report.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , integer :: ParsecT s u m Integer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This is just like 'integer', except it can parse sign.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , integer' :: ParsecT s u m Integer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a positive whole number in the decimal system.
 								  -- Returns the value of the number.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , decimal :: ParsecT s u m Integer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a positive whole number in the hexadecimal
 								  -- system. The number should be prefixed with “0x” or “0X”. Returns the
 								  -- value of the number.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , hexadecimal :: ParsecT s u m Integer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a positive whole number in the octal system.
 								  -- The number should be prefixed with “0o” or “0O”. Returns the value of
 								  -- the number.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , octal :: ParsecT s u m Integer
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
 								  -- then runs parser @p@, changing sign of its result accordingly. Note
 								  -- that there may be white space after the sign but not before it.
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses a floating point value. Returns the value
 								  -- of the number. The number is parsed according to the grammar rules
 								  -- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
 								  -- achieve parsing of signed floating point values.
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , float :: ParsecT s u m Double
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This is just like 'float', except it can parse sign.
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , float' :: ParsecT s u m Double
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | The lexeme parser parses either 'integer' or a 'float'.
 								  -- Returns the value of the number. This parser deals with any overlap in
 								  -- the grammar rules for integers and floats. The number is parsed
 								  -- according to the grammar rules defined in the Haskell report.
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , number :: ParsecT s u m (Either Integer Double)
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- | This is just like 'number', except it can parse sign.
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  , number' :: ParsecT s u m (Either Integer Double)
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | The lexeme parser parses a legal identifier. Returns the identifier
 								  -- string. This parser will fail on identifiers that are reserved
 								  -- words. Legal identifier (start) characters and reserved words are
 								  -- defined in the 'LanguageDef' that is passed to 'makeLexer'.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , identifier :: ParsecT s u m String
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | The lexeme parser @reserved name@ parses @symbol name@, but it also
 								  -- checks that the @name@ is not a prefix of a valid identifier.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , reserved :: String -> ParsecT s u m ()
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | The lexeme parser parses a legal operator. Returns the name of the
 								  -- operator. This parser will fail on any operators that are reserved
 								  -- operators. Legal operator (start) characters and reserved operators are
 								  -- defined in the 'LanguageDef' that is passed to 'makeLexer'.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , operator :: ParsecT s u m String
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
 								  -- also checks that the @name@ is not a prefix of a valid operator.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , reservedOp :: String -> ParsecT s u m () }
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								-- | The expression @makeLexer language@ creates a 'Lexer' record that
 								-- contains lexical parsers that are defined using the definitions in the
 								-- @language@ record.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								--
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- The use of this function is quite stylized — one imports the appropriate
 								-- language definition and selects the lexical parsers that are needed from
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								-- the resulting 'Lexer'.
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								--
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- > module Main (main) where
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								-- >
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- > import Text.Megaparsec
 								-- > import Text.Megaparsec.Language (haskellDef)
 								-- > import qualified Text.Megaparsec.Lexer as L
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								-- >
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- > -- The parser
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- > …
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- >
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- > expr =  parens expr
 								-- >     <|> identifier
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- >     <|> …
-												Haddock documentation for Text.Parsec.Token

											
										
										
											2008-01-22 08:14:30 +03:00
+								-- >
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
+								-- > -- The lexer
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- > lexer      = L.makeLexer haskellDef
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- >
-												renamed ‘Text.Megaparsec.Token’ module

Now it's called ‘Text.Megaparsec.Lexer’. This commit contains other
cosmetic changes as well.

											
										
										
											2015-08-30 13:00:07 +03:00
+								-- > parens     = L.parens     lexer
 								-- > braces     = L.braces     lexer
 								-- > identifier = L.identifier lexer
 								-- > reserved   = L.reserved   lexer
 								-- > …
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								makeLexer lang =
-												various cosmetic changes (including renaming)

The following functions and data types have been renamed:

* ‘permute’ → ‘makePermParser’
* ‘buildExpressionParser’ → ‘makeExprParser’
* ‘GenLanguageDef’ → ‘LanguageDef’
* ‘GenTokenParser’ → ‘Lexer’
* ‘makeTokenParser’ → ‘makeLexer’

											
										
										
											2015-08-23 11:04:12 +03:00
+								  Lexer
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  { space         = space
 								  , lexeme        = lexeme
 								  , symbol        = symbol
 								  , indentGuard   = indentGuard
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , parens        = parens
 								  , braces        = braces
 								  , angles        = angles
 								  , brackets      = brackets
 								  , semicolon     = semicolon
 								  , comma         = comma
 								  , colon         = colon
 								  , dot           = dot
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  , charLiteral   = charLiteral
 								  , stringLiteral = stringLiteral
 								  , integer       = integer
 								  , integer'      = integer'
 								  , decimal       = decimal
 								  , hexadecimal   = hexadecimal
 								  , octal         = octal
 								  , signed        = signed
 								  , float         = float
 								  , float'        = float'
 								  , number        = number
 								  , number'       = number'
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  , identifier    = identifier
 								  , reserved      = reserved
 								  , operator      = operator
 								  , reservedOp    = reservedOp }
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  where
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  -- white space & indentation
 								  space    = hidden . skipMany . choice $
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								             ($ lang) <$> [void . spaceChar, blockComment, lineComment]
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  lexeme p = p <* space
 								  symbol   = lexeme . C.string
 								  indentGuard p = do
 								    space
 								    pos <- sourceColumn <$> getPosition
 								    if p pos
 								    then return pos
 								    else fail "incorrect indentation"
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- auxiliary parsers
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  parens    = between (symbol "(") (symbol ")")
 								  braces    = between (symbol "{") (symbol "}")
 								  angles    = between (symbol "<") (symbol ">")
 								  brackets  = between (symbol "[") (symbol "]")
 								  semicolon = symbol ";"
 								  comma     = symbol ","
 								  colon     = symbol ":"
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  dot       = symbol "."
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												further refinement

											
										
										
											2015-09-03 10:35:22 +03:00
+								  -- char & string literals
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  charLiteral = lexeme ( between (C.char '\'')
 								                                 (C.char '\'' <?> "end of character")
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								                                 characterChar )
 								                <?> "character"
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  characterChar = charLetter <|> charEscape <?> "literal character"
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  charEscape = C.char '\\' >> escapeCode
 								  charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  stringLiteral =
 								      lexeme ((foldr (maybe id (:)) "" <$>
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								               between (C.char '"') (C.char '"' <?> "end of string")
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								                           (many stringChar)) <?> "literal string")
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  stringEscape = C.char '\\' >>
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								                 ( (escapeGap >> return Nothing)   <|>
 								                   (escapeEmpty >> return Nothing) <|>
 								                   (Just <$> escapeCode) )
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  escapeEmpty = C.char '&'
 								  escapeGap   = some C.spaceChar >> C.char '\\' <?> "end of string gap"
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- escape codes
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
 								               <?> "escape code"
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  charEsc = choice (parseEsc <$> escMap)
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								      where parseEsc (c, code) = C.char c >> return code
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  charNum = toEnum . fromInteger <$>
 								            ( decimal <|>
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								             (C.char 'o' >> nump "0o" C.octDigitChar) <|>
 								             (C.char 'x' >> nump "0x" C.hexDigitChar) )
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  charAscii = choice (parseAscii <$> asciiMap)
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								      where parseAscii (asc, code) = try (C.string asc >> return code)
-												rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.

											
										
										
											2015-07-31 14:30:38 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar)
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- escape code tables
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  escMap      = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
 								  asciiMap    = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
 								                 "FS","GS","RS","US","SP"]
 								  ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
 								                 "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
 								                 "CAN","SUB","ESC","DEL"]
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
 								  ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- numbers — integers
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												temporarily simplify token parsing

The improved error messages in Megaparsec are quite sensitive to how
parsers are written, which parts of parser are labeled, etc. Current
implementation of token parsers in ‘Text.Megaparsec.Token’ is written
without this in mind. We will improve the module later, for now let us
rewrite/simplify some parts to avoid failing tests.

											
										
										
											2015-08-19 22:11:21 +03:00
+								  integer  = decimal
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  integer' = signed integer
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  decimal     = lexeme (nump "" C.digitChar <?> "integer")
 								  hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar
 								  octal       = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  signed p = ($) <$> option id (lexeme sign) <*> p
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)
-												refactoring, phase 3

											
										
										
											2015-07-30 18:45:06 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  -- numbers — floats
-												Initial import

											
										
										
											2008-01-13 20:53:15 +03:00
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								  float  = lexeme ffloat <?> "float"
 								  float' = signed float
 								  ffloat = read <$> ffloat'
 								    where
 								      ffloat' = do
 								        decimal <- fDec
 								        rest <- fraction <|> fExp
 								        return $ decimal ++ rest
 								  fraction = do
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								    void $ C.char '.'
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								    decimal <- fDec
 								    exp <- option "" fExp
 								    return $ '.' : decimal ++  exp
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  fDec = some C.digitChar
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  fExp = do
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								    expChar <- C.oneOf "eE"
 								    signStr <- option "" (pure <$> C.oneOf "+-")
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								    decimal <- fDec
 								    return $ expChar : signStr ++ decimal
 								  -- numbers — a more general case
 								  number  = (Right <$> try float)  <|> (Left <$> integer)  <?> "number"
 								  number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
 								  -- operators & reserved ops
 								  reservedOp name =
 								      lexeme $ try $ do
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								        void $ C.string name
 								        notFollowedBy (opLetter lang) <?> ("end of " ++ show name)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  operator =
 								      lexeme $ try $ do
 								        name <- oper
 								        if isReservedOp name
 								        then unexpected ("reserved operator " ++ show name)
 								        else return name
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  oper = ((:) <$> opStart lang <*> many (opLetter lang))
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								         <?> "operator"
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  isReservedOp = isReserved . sort $ reservedOpNames lang
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  -- identifiers & reserved words
 								  reserved name =
 								      lexeme $ try $ do
 								        void $ caseString name
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								        notFollowedBy (identLetter lang) <?> ("end of " ++ show name)
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  caseString name
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								      | caseSensitive lang = C.string name
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								      | otherwise                 = walk name >> return name
 								      where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
 								            caseChar c
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								                | isAlpha c = C.char (toLower c) <|> C.char (toUpper c)
 								                | otherwise = C.char c
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
 								  identifier =
 								      lexeme $ try $ do
 								        name <- ident
 								        if isReservedName name
 								        then unexpected ("reserved word " ++ show name)
 								        else return name
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								  ident = ((:) <$> identStart lang <*> many (identLetter lang))
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								          <?> "identifier"
 								  isReservedName name = isReserved theReservedNames caseName
 								      where caseName
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								                | caseSensitive lang = name
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								                | otherwise                 = toLower <$> name
 								  isReserved names name = scan names
 								      where scan []     = False
 								            scan (r:rs) = case compare r name of
 								                            LT  -> scan rs
 								                            EQ  -> True
 								                            GT  -> False
 								  theReservedNames
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								      | caseSensitive lang = sort reserved
-												cosmetic changes (indentation, etc)

											
										
										
											2015-08-12 20:51:06 +03:00
+								      | otherwise                 = sort . fmap (fmap toLower) $ reserved
-												started work on new lexer

Eliminated ‘Text.Megaparsec.Language’ module because at this point it is
clear that already existing definitions are of little use in
Megaparsec. I started writing “default” language definition in
‘Text.Megaparsec.Lexer’.

At this point it should be possible to parse languages where indentation
matters, although we will need to provide more helpers to make it
easier.

											
										
										
											2015-09-02 16:27:48 +03:00
+								      where reserved = reservedNames lang