first version of the new lexer module

This commit is contained in:
mrkkrp 2015-09-08 17:34:02 +06:00
parent f58d5bfe1c
commit 7298c01387
7 changed files with 249 additions and 524 deletions

View File

@ -9,125 +9,135 @@
-- Stability : experimental
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements. See 'makeLexer' for a
-- description of how to use it. This module is supposed to be imported
-- qualified.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
-- High-level parsers to help you write your lexer. The module doesn't
-- impose how you should write your parser, but certain approaches may be
-- more elegant than others. Especially important theme is parsing of write
-- space, comments and indentation.
--
-- This module is supposed to be imported qualified:
--
-- > import qualified Text.Megaparsec.Lexer as L
module Text.Megaparsec.Lexer
( LanguageDef (..)
, defaultLang
( -- * White space and indentation
space
, lexeme
, symbol
, symbol'
, indentGuard
, skipLineComment
, skipBlockComment
, Lexer (..)
, makeLexer )
-- * Character and string literals
, charLiteral
-- * Numbers
, integer
, decimal
, hexadecimal
, octal
, float
, number
, signed )
where
import Control.Applicative ((<|>), many, some, empty)
import Control.Applicative ((<|>), some)
import Control.Monad (void)
import Data.Char (isAlpha, toLower, toUpper)
import Data.List (sort)
import Data.Char (readLitChar)
import Data.Maybe (listToMaybe)
import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken
import qualified Text.Megaparsec.Char as C
-- Language definition
-- White space and indentation
-- | The @LanguageDef@ type is a record that contains all parameters used to
-- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can
-- be used as a basis for new language definitions.
-- | @space spaceChar lineComment blockComment@ produces parser that can
-- parse white space in general. It's expected that you create such a parser
-- once and pass it to many other function in this module as needed (it's
-- usually called @spaceConsumer@ in doc-strings here).
--
-- @spaceChar@ is used to parse trivial space characters. You can use
-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
-- your own parser (if you don't want automatically consume newlines, for
-- example).
--
-- @lineComment@ is used to parse line comments. You can use
-- 'skipLineComment' if you don't need anything special.
--
-- @blockComment@ is used to parse block (multi-line) comments. You can use
-- 'skipBlockComment' if you don't need anything special.
--
-- Parsing of white space is important part of any parser. We propose scheme
-- where every lexeme should consume all trailing white space, but not
-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve
-- this. You only need to call 'space' “manually” to consume any white space
-- before the first lexeme (at the beginning of file).
data LanguageDef s u m =
LanguageDef {
space :: Stream s m t => ParsecT s u m () ->
ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m ()
space ch line block = hidden . skipMany $ choice [ch, line, block]
-- | The parser is used to parse single white space character. If
-- indentation is important in your language you should probably not treat
-- newline as white space character. Also note that if newline is not
-- white space character, you will need to pick it up manually.
-- | This is wrapper for lexemes. Typical usage is to supply first argument
-- (parser that consumes white space, probably defined via 'space') and use
-- resulting function to wrap parsers for every lexeme.
--
-- > lexeme = L.lexeme spaceConsumer
-- > integer = lexeme L.integer
spaceChar :: ParsecT s u m Char
lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
lexeme spc p = p <* spc
-- | The parser parses line comments. It's responsibility of the parser to
-- stop at the end of line. If your language doesn't support this type of
-- comments, set this value to 'empty'. In simple cases you can use
-- 'skipLineComment' to quickly construct line comment parser.
-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
-- first argument (parser that consumes white space, probably defined via
-- 'space') and then you can use the resulting function to parse strings:
--
-- > symbol = L.symbol spaceConsumer
-- >
-- > parens = between (symbol "(") (symbol ")")
-- > braces = between (symbol "{") (symbol "}")
-- > angles = between (symbol "<") (symbol ">")
-- > brackets = between (symbol "[") (symbol "]")
-- > semicolon = symbol ";"
-- > comma = symbol ","
-- > colon = symbol ":"
-- > dot = symbol "."
, lineComment :: ParsecT s u m ()
symbol :: Stream s m Char =>
ParsecT s u m () -> String -> ParsecT s u m String
symbol spc = lexeme spc . C.string
-- | The parser parses block (multi-line) comments. If your language
-- doesn't support this type of comments, set this value to 'empty'. In
-- simple cases you can use 'skipBlockComment' to quickly construct block
-- comment parser.
-- | Case-insensitive version of 'symbol'. This may be helpful if you're
-- working with case-insensitive languages.
, blockComment :: ParsecT s u m ()
symbol' :: Stream s m Char =>
ParsecT s u m () -> String -> ParsecT s u m String
symbol' spc = lexeme spc . C.string'
-- NEXT
-- | @indentGuard spaceConsumer test@ first consumes all white space
-- (indentation) with @spaceConsumer@ parser, then it checks column
-- position. It should satisfy supplied predicate @test@, otherwise the
-- parser fails with error message “incorrect indentation”. On success
-- current column position is returned.
--
-- When you want to parse block of indentation first run this parser with
-- predicate like @(> 0)@ — this will make sure you have some
-- indentation. Use returned value to check indentation on every subsequent
-- line according to syntax of your language.
-- | This parser should accept any start characters of identifiers, for
-- example @letter \<|> char \'_\'@.
, identStart :: ParsecT s u m Char
-- | This parser should accept any legal tail characters of identifiers,
-- for example @alphaNum \<|> char \'_\'@.
, identLetter :: ParsecT s u m Char
-- | This parser should accept any start characters of operators, for
-- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
, opStart :: ParsecT s u m Char
-- | This parser should accept any legal tail characters of operators.
-- Note that this parser should even be defined if the language doesn't
-- support user-defined operators, or otherwise the 'reservedOp' parser
-- won't work correctly.
, opLetter :: ParsecT s u m Char
-- | The list of reserved identifiers.
, reservedNames :: [String]
-- | The list of reserved operators.
, reservedOpNames :: [String]
-- | Set to 'True' if the language is case sensitive.
, caseSensitive :: Bool }
-- Default language definition
-- | This is standard language definition. It is recommended to use
-- this definition as the basis for other definitions. @defaultLang@ has no
-- reserved names or operators, is case sensitive and doesn't accept
-- comments, identifiers or operators.
defaultLang :: Stream s m Char => LanguageDef s u m
defaultLang =
LanguageDef
{ spaceChar = C.spaceChar
, lineComment = empty
, blockComment = empty
-- NEXT
, identStart = C.letterChar <|> C.char '_'
, identLetter = C.alphaNumChar <|> C.oneOf "_'"
, opStart = opLetter defaultLang
, opLetter = C.oneOf ":!#$%&*+./<=>?@\\^|-~"
, reservedOpNames = []
, reservedNames = []
, caseSensitive = True }
-- Utility functions
indentGuard :: Stream s m t =>
ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int
indentGuard spc p = do
spc
pos <- sourceColumn <$> getPosition
if p pos
then return pos
else fail "incorrect indentation"
-- | Given comment prefix this function returns parser that skips line
-- comments. Note that it stops just before newline character but doesn't
-- consume the newline. Newline is either supposed to be consumed by 'space'
-- parser or picked manually.
-- parser or picked up manually.
skipLineComment :: Stream s m Char => String -> ParsecT s u m ()
skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n)
@ -138,437 +148,122 @@ skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n)
skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m ()
skipBlockComment start end = C.string start >> void (manyTill C.anyChar n)
where n = lookAhead (C.string end)
where n = lookAhead . try $ C.string end
-- Lexer
-- Character and string literals
-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.
data Lexer s u m =
Lexer {
-- | Skips any white space. White space consists of /zero/ or more
-- occurrences of 'spaceChar', a line comment or a block (multi-line)
-- comment.
space :: ParsecT s u m ()
-- | @lexeme p@ first applies parser @p@ and then the 'space' parser,
-- returning the value of @p@. Every lexical token (lexeme) is defined
-- using @lexeme@, this way every parse starts at a point without white
-- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this
-- document.
--
-- The only point where the 'space' parser should be called explicitly is
-- the start of the main parser in order to skip any leading white space.
, lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
-- trailing white space.
, symbol :: String -> ParsecT s u m String
-- | @indentGuard p@ consumes all white space it can consume, then checks
-- column number. The column number should satisfy given predicate @p@,
-- otherwise the parser fails with “incorrect indentation” message. In
-- successful cases @indentGuard@ returns current column number.
, indentGuard :: (Int -> Bool) -> ParsecT s u m Int
-- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
-- returning the value of @p@.
, parens :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
-- “}”), returning the value of @p@.
, braces :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
-- and “>”), returning the value of @p@.
, angles :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
-- and “]”), returning the value of @p@.
, brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @semicolon@ parses the character “;” and skips any
-- trailing white space. Returns the string “;”.
, semicolon :: ParsecT s u m String
-- | Lexeme parser @comma@ parses the character “,” and skips any
-- trailing white space. Returns the string “,”.
, comma :: ParsecT s u m String
-- | Lexeme parser @colon@ parses the character “:” and skips any
-- trailing white space. Returns the string “:”.
, colon :: ParsecT s u m String
-- | Lexeme parser @dot@ parses the character “.” and skips any
-- trailing white space. Returns the string “.”.
, dot :: ParsecT s u m String
-- | The lexeme parser parses a single literal character. Returns the
-- literal character value. This parsers deals correctly with escape
-- sequences. The literal character is parsed according to the grammar
-- rules defined in the Haskell report (which matches most programming
-- languages quite closely).
, charLiteral :: ParsecT s u m Char
-- | The lexeme parser parses a literal string. Returns the literal
-- string value. This parsers deals correctly with escape sequences and
-- gaps. The literal string is parsed according to the grammar rules
-- defined in the Haskell report (which matches most programming languages
-- quite closely).
, stringLiteral :: ParsecT s u m String
-- | The lexeme parser parses an integer (a whole number). This parser
-- /does not/ parse sign. Returns the value of the number. The number can
-- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is
-- parsed according to the grammar rules in the Haskell report.
, integer :: ParsecT s u m Integer
-- | This is just like 'integer', except it can parse sign.
, integer' :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the decimal system.
-- Returns the value of the number.
, decimal :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the hexadecimal
-- system. The number should be prefixed with “0x” or “0X”. Returns the
-- value of the number.
, hexadecimal :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the octal system.
-- The number should be prefixed with “0o” or “0O”. Returns the value of
-- the number.
, octal :: ParsecT s u m Integer
-- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
-- then runs parser @p@, changing sign of its result accordingly. Note
-- that there may be white space after the sign but not before it.
, signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a
-- | The lexeme parser parses a floating point value. Returns the value
-- of the number. The number is parsed according to the grammar rules
-- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
-- achieve parsing of signed floating point values.
, float :: ParsecT s u m Double
-- | This is just like 'float', except it can parse sign.
, float' :: ParsecT s u m Double
-- | The lexeme parser parses either 'integer' or a 'float'.
-- Returns the value of the number. This parser deals with any overlap in
-- the grammar rules for integers and floats. The number is parsed
-- according to the grammar rules defined in the Haskell report.
, number :: ParsecT s u m (Either Integer Double)
-- | This is just like 'number', except it can parse sign.
, number' :: ParsecT s u m (Either Integer Double)
-- | The lexeme parser parses a legal identifier. Returns the identifier
-- string. This parser will fail on identifiers that are reserved
-- words. Legal identifier (start) characters and reserved words are
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
, identifier :: ParsecT s u m String
-- | The lexeme parser @reserved name@ parses @symbol name@, but it also
-- checks that the @name@ is not a prefix of a valid identifier.
, reserved :: String -> ParsecT s u m ()
-- | The lexeme parser parses a legal operator. Returns the name of the
-- operator. This parser will fail on any operators that are reserved
-- operators. Legal operator (start) characters and reserved operators are
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
, operator :: ParsecT s u m String
-- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
-- also checks that the @name@ is not a prefix of a valid operator.
, reservedOp :: String -> ParsecT s u m () }
-- | The expression @makeLexer language@ creates a 'Lexer' record that
-- contains lexical parsers that are defined using the definitions in the
-- @language@ record.
-- | The lexeme parser parses a single literal character without
-- quotes. Purpose of this parser is to help with parsing of commonly used
-- escape sequences. It's your responsibility to take care of character
-- literal syntax in your language (surround it with single quotes or
-- similar).
--
-- The use of this function is quite stylized — one imports the appropriate
-- language definition and selects the lexical parsers that are needed from
-- the resulting 'Lexer'.
-- The literal character is parsed according to the grammar rules defined in
-- the Haskell report.
--
-- > module Main (main) where
-- >
-- > import Text.Megaparsec
-- > import Text.Megaparsec.Language (haskellDef)
-- > import qualified Text.Megaparsec.Lexer as L
-- >
-- > -- The parser
-- > …
-- >
-- > expr = parens expr
-- > <|> identifier
-- > <|> …
-- >
-- > -- The lexer
-- > lexer = L.makeLexer haskellDef
-- >
-- > parens = L.parens lexer
-- > braces = L.braces lexer
-- > identifier = L.identifier lexer
-- > reserved = L.reserved lexer
-- > …
-- Note that you can use this parser as a building block to parse various
-- string literals:
--
-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')
makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
makeLexer lang =
Lexer
{ space = space
, lexeme = lexeme
, symbol = symbol
, indentGuard = indentGuard
charLiteral :: Stream s m Char => ParsecT s u m Char
charLiteral = label "literal character" $ do
r@(x:_) <- lookAhead $ count' 1 8 C.anyChar
case listToMaybe (readLitChar r) of
Just (c, r') -> count (length r - length r') C.anyChar >> return c
Nothing -> unexpected (showToken x)
, parens = parens
, braces = braces
, angles = angles
, brackets = brackets
, semicolon = semicolon
, comma = comma
, colon = colon
, dot = dot
-- Numbers
, charLiteral = charLiteral
, stringLiteral = stringLiteral
-- | Parse an integer without sign in decimal representation (according to
-- format of integer literals described in Haskell report).
--
-- If you need to parse signed integers, see 'signed' combinator.
, integer = integer
, integer' = integer'
, decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, signed = signed
, float = float
, float' = float'
, number = number
, number' = number'
integer :: Stream s m Char => ParsecT s u m Integer
integer = decimal <?> "integer"
, identifier = identifier
, reserved = reserved
, operator = operator
, reservedOp = reservedOp }
where
-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label,
-- while this parser is not labeled.
-- white space & indentation
decimal :: Stream s m Char => ParsecT s u m Integer
decimal = nump "" C.digitChar
space = hidden . skipMany . choice $
($ lang) <$> [void . spaceChar, blockComment, lineComment]
lexeme p = p <* space
symbol = lexeme . C.string
indentGuard p = do
space
pos <- sourceColumn <$> getPosition
if p pos
then return pos
else fail "incorrect indentation"
-- | Parse an integer in hexadecimal representation. Representation of
-- hexadecimal number is expected to be according to Haskell report, that
-- is, it should be prefixed with “0x” or “0X” prefix.
-- auxiliary parsers
hexadecimal :: Stream s m Char => ParsecT s u m Integer
hexadecimal = C.char '0' >> C.char' 'x' >> nump "0x" C.hexDigitChar
parens = between (symbol "(") (symbol ")")
braces = between (symbol "{") (symbol "}")
angles = between (symbol "<") (symbol ">")
brackets = between (symbol "[") (symbol "]")
semicolon = symbol ";"
comma = symbol ","
colon = symbol ":"
dot = symbol "."
-- | Parse an integer in octal representation. Representation of octal
-- number is expected to be according to Haskell report, that is, it should
-- be prefixed with “0o” or “0O” prefix.
-- char & string literals
octal :: Stream s m Char => ParsecT s u m Integer
octal = C.char '0' >> C.char' 'o' >> nump "0o" C.octDigitChar
charLiteral = lexeme ( between (C.char '\'')
(C.char '\'' <?> "end of character")
characterChar )
<?> "character"
-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then
-- prepends @prefix@ to returned value and tries to interpret the result as
-- an integer according to Haskell syntax.
characterChar = charLetter <|> charEscape <?> "literal character"
nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
charEscape = C.char '\\' >> escapeCode
charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
-- | Parse a floating point value without sign. Representation of floating
-- point value is expected to be according to Haskell report.
--
-- If you need to parse signed floats, see 'signed' combinator.
stringLiteral =
lexeme ((foldr (maybe id (:)) "" <$>
between (C.char '"') (C.char '"' <?> "end of string")
(many stringChar)) <?> "literal string")
float :: Stream s m Char => ParsecT s u m Double
float = label "float" $ read <$> f
where f = do
d <- some C.digitChar
rest <- fraction <|> fExp
return $ d ++ rest
stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"
-- | This is a helper for 'float' parser. It parses fractional part of
-- floating point number, that is, dot and everything after it.
stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
fraction :: Stream s m Char => ParsecT s u m String
fraction = do
void $ C.char '.'
d <- some C.digitChar
e <- option "" fExp
return $ '.' : d ++ e
stringEscape = C.char '\\' >>
( (escapeGap >> return Nothing) <|>
(escapeEmpty >> return Nothing) <|>
(Just <$> escapeCode) )
-- | This helper parses exponent of floating point numbers.
escapeEmpty = C.char '&'
escapeGap = some C.spaceChar >> C.char '\\' <?> "end of string gap"
fExp :: Stream s m Char => ParsecT s u m String
fExp = do
expChar <- C.char' 'e'
signStr <- option "" (pure <$> choice (C.char <$> "+-"))
d <- some C.digitChar
return $ expChar : signStr ++ d
-- escape codes
-- | Parse a number: either integer or floating point. The parser can handle
-- overlapping grammars graciously.
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
<?> "escape code"
number :: Stream s m Char => ParsecT s u m (Either Integer Double)
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
charEsc = choice (parseEsc <$> escMap)
where parseEsc (c, code) = C.char c >> return code
-- | @signed space p@ parser parses optional sign, then if there is a sign
-- it will consume optional white space (using @space@ parser), then it runs
-- parser @p@ which should return a number. Sign of the number is changed
-- according to previously parsed sign.
--
-- For example, to parse signed integer you can write:
--
-- > lexeme = L.lexeme spaceConsumer
-- > integer = lexeme L.integer
-- > signedInteger = signed spaceConsumer integer
charNum = toEnum . fromInteger <$>
( decimal <|>
(C.char 'o' >> nump "0o" C.octDigitChar) <|>
(C.char 'x' >> nump "0x" C.hexDigitChar) )
signed :: (Stream s m Char, Num a) =>
ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
signed spc p = ($) <$> option id (lexeme spc sign) <*> p
charAscii = choice (parseAscii <$> asciiMap)
where parseAscii (asc, code) = try (C.string asc >> return code)
-- | Parse a sign and return either 'id' or 'negate' according to parsed
-- sign.
charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar)
-- escape code tables
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
"FS","GS","RS","US","SP"]
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
"CAN","SUB","ESC","DEL"]
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
-- numbers — integers
integer = decimal
integer' = signed integer
decimal = lexeme (nump "" C.digitChar <?> "integer")
hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar
octal = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
signed p = ($) <$> option id (lexeme sign) <*> p
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)
-- numbers — floats
float = lexeme ffloat <?> "float"
float' = signed float
ffloat = read <$> ffloat'
where
ffloat' = do
decimal <- fDec
rest <- fraction <|> fExp
return $ decimal ++ rest
fraction = do
void $ C.char '.'
decimal <- fDec
exp <- option "" fExp
return $ '.' : decimal ++ exp
fDec = some C.digitChar
fExp = do
expChar <- C.oneOf "eE"
signStr <- option "" (pure <$> C.oneOf "+-")
decimal <- fDec
return $ expChar : signStr ++ decimal
-- numbers — a more general case
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
-- operators & reserved ops
reservedOp name =
lexeme $ try $ do
void $ C.string name
notFollowedBy (opLetter lang) <?> ("end of " ++ show name)
operator =
lexeme $ try $ do
name <- oper
if isReservedOp name
then unexpected ("reserved operator " ++ show name)
else return name
oper = ((:) <$> opStart lang <*> many (opLetter lang))
<?> "operator"
isReservedOp = isReserved . sort $ reservedOpNames lang
-- identifiers & reserved words
reserved name =
lexeme $ try $ do
void $ caseString name
notFollowedBy (identLetter lang) <?> ("end of " ++ show name)
caseString name
| caseSensitive lang = C.string name
| otherwise = walk name >> return name
where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
caseChar c
| isAlpha c = C.char (toLower c) <|> C.char (toUpper c)
| otherwise = C.char c
identifier =
lexeme $ try $ do
name <- ident
if isReservedName name
then unexpected ("reserved word " ++ show name)
else return name
ident = ((:) <$> identStart lang <*> many (identLetter lang))
<?> "identifier"
isReservedName name = isReserved theReservedNames caseName
where caseName
| caseSensitive lang = name
| otherwise = toLower <$> name
isReserved names name = scan names
where scan [] = False
scan (r:rs) = case compare r name of
LT -> scan rs
EQ -> True
GT -> False
theReservedNames
| caseSensitive lang = sort reserved
| otherwise = sort . fmap (fmap toLower) $ reserved
where reserved = reservedNames lang
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)

View File

@ -115,6 +115,7 @@ test-suite old-tests
main-is: Main.hs
hs-source-dirs: old-tests
type: exitcode-stdio-1.0
ghc-options: -O2 -Wall
other-modules: Bugs
, Bugs.Bug2
, Bugs.Bug6
@ -125,7 +126,8 @@ test-suite old-tests
, HUnit == 1.2.*
, test-framework >= 0.6 && < 1
, test-framework-hunit >= 0.2 && < 0.4
ghc-options: -O2 -Wall
default-extensions:
FlexibleContexts
default-language: Haskell2010
test-suite tests

View File

@ -1,13 +1,26 @@
module Bugs.Bug2 (main) where
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Control.Applicative (empty)
import Control.Monad (void)
import Text.Megaparsec
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
sc :: Parser ()
sc = L.space (void spaceChar) empty empty
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
stringLiteral :: Parser String
stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"')
main :: Test
main =
testCase "Control Char Parsing (#2)" $
@ -15,7 +28,6 @@ main =
where
parseString :: String -> String
parseString input =
case parse parser "Example" input of
case parse stringLiteral "Example" input of
Left{} -> error "Parse failure"
Right str -> str
parser = L.stringLiteral $ L.makeLexer L.defaultLang

View File

@ -2,12 +2,11 @@
module Bugs.Bug35 (main) where
import Text.Megaparsec
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
trickyFloats :: [String]
trickyFloats =
@ -27,12 +26,9 @@ trickyFloats =
, "32.96176575630599"
, "38.47735512322269" ]
float :: Parser Double
float = L.float (L.makeLexer L.defaultLang)
testBatch :: Assertion
testBatch = mapM_ testFloat trickyFloats
where testFloat x = parse float "" x @?= Right (read x :: Double)
where testFloat x = parse L.float "" x @?= Right (read x :: Double)
main :: Test
main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch

View File

@ -1,15 +1,17 @@
module Bugs.Bug39 (main) where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Either (isLeft, isRight)
import Text.Megaparsec
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
shouldFail :: [String]
shouldFail = [" 1", " +1", " -1"]
@ -17,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"]
shouldSucceed :: [String]
shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
sc :: Parser ()
sc = L.space (void spaceChar) empty empty
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
integer :: Parser Integer
integer = L.integer' (L.makeLexer L.defaultLang)
integer = lexeme $ L.signed sc L.integer
testBatch :: Assertion
testBatch = mapM_ (f testFail) shouldFail >>

View File

@ -1,13 +1,13 @@
module Bugs.Bug6 (main) where
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Text.Megaparsec
import Text.Megaparsec.String
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
import Util
main :: Test
@ -19,5 +19,5 @@ variable :: Parser String
variable = do
x <- lookAhead (some letterChar)
if x == "return"
then fail "'return' is a reserved keyword"
else string x
then fail "'return' is a reserved keyword"
else string x

View File

@ -1,14 +1,17 @@
module Bugs.Bug9 (main) where
import Control.Applicative (empty)
import Control.Monad (void)
import Text.Megaparsec
import Text.Megaparsec.String (Parser)
import Text.Megaparsec.Expr
import Text.Megaparsec.String (Parser)
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
import Util
@ -24,12 +27,21 @@ main =
-- Syntax analysis
sc :: Stream s m Char => ParsecT s u m ()
sc = L.space (void spaceChar) empty empty
lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
lexeme = L.lexeme sc
integer :: Stream s m Char => ParsecT s u m Integer
integer = lexeme L.integer
operator :: Stream s m Char => String -> ParsecT s u m String
operator op = L.symbol sc op <?> "operator"
parseTopLevel :: Parser Expr
parseTopLevel = parseExpr <* eof
parseExpr :: Parser Expr
parseExpr = makeExprParser (Const <$> integer) table
where table = [[ InfixL (Op <$ reserved ">>>") ]]
lexer = L.makeLexer L.defaultLang { L.reservedOpNames = [">>>"] }
integer = L.integer lexer
reserved = L.reserved lexer
where table = [[ InfixL (Op <$ operator ">>>") ]]