Merge branch 'new-lexer'

This commit is contained in:
mrkkrp 2015-09-13 21:16:39 +06:00
commit 0d39e44f40
10 changed files with 514 additions and 713 deletions

View File

@ -41,11 +41,10 @@ data Operator s u m a
-- descending precedence. All operators in one list have the same precedence
-- (but may have a different associativity).
--
-- Prefix and postfix
-- operators of the same precedence can only occur once (i.e. @--2@ is not
-- allowed if @-@ is prefix negate). Prefix and postfix operators of the
-- same precedence associate to the left (i.e. if @++@ is postfix increment,
-- than @-2++@ equals @-1@, not @-3@).
-- Prefix and postfix operators of the same precedence can only occur once
-- (i.e. @--2@ is not allowed if @-@ is prefix negate). Prefix and postfix
-- operators of the same precedence associate to the left (i.e. if @++@ is
-- postfix increment, than @-2++@ equals @-1@, not @-3@).
--
-- The @makeExprParser@ takes care of all the complexity involved in
-- building expression parser. Here is an example of an expression parser
@ -66,6 +65,9 @@ data Operator s u m a
-- > binary name f = InfixL (reservedOp name >> return f)
-- > prefix name f = Prefix (reservedOp name >> return f)
-- > postfix name f = Postfix (reservedOp name >> return f)
--
-- Please note that multi-character operators should use 'try' in order to
-- be reported correctly in error messages.
makeExprParser :: Stream s m t => ParsecT s u m a ->
[[Operator s u m a]] -> ParsecT s u m a

View File

@ -1,115 +0,0 @@
-- |
-- Module : Text.Megaparsec.Language
-- Copyright : © 2015 Megaparsec contributors
-- © 2007 Paolo Martini
-- © 19992001 Daan Leijen
-- License : BSD3
--
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
-- Stability : experimental
-- Portability : non-portable (uses non-portable module Text.Megaparsec.Lexer)
--
-- A helper module that defines some language definitions that can be used
-- to instantiate a token parser (see "Text.Megaparsec.Lexer").
module Text.Megaparsec.Language
( LanguageDef
, emptyDef
, haskellStyle
, javaStyle
, haskellDef
, mondrianDef )
where
import Control.Monad.Identity
import Control.Applicative ((<|>))
import Text.Megaparsec.Char
import Text.Megaparsec.Lexer
-- | This is the most minimal token definition. It is recommended to use
-- this definition as the basis for other definitions. @emptyDef@ has no
-- reserved names or operators, is case sensitive and doesn't accept
-- comments, identifiers or operators.
emptyDef :: LanguageDef String st Identity
emptyDef =
LanguageDef
{ commentStart = ""
, commentEnd = ""
, commentLine = ""
, nestedComments = True
, identStart = letterChar <|> char '_'
, identLetter = alphaNumChar <|> oneOf "_'"
, opStart = opLetter emptyDef
, opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~"
, reservedOpNames = []
, reservedNames = []
, caseSensitive = True }
-- | This is a minimal token definition for Haskell-style languages. It
-- defines the style of comments, valid identifiers and case sensitivity. It
-- does not define any reserved words or operators.
haskellStyle :: LanguageDef String u Identity
haskellStyle =
emptyDef
{ commentStart = "{-"
, commentEnd = "-}"
, commentLine = "--"
, nestedComments = True
, identStart = letterChar
, identLetter = alphaNumChar <|> oneOf "_'"
, opStart = opLetter haskellStyle
, opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~"
, reservedOpNames = []
, reservedNames = []
, caseSensitive = True }
-- | This is a minimal token definition for Java-style languages. It
-- defines the style of comments, valid identifiers and case sensitivity. It
-- does not define any reserved words or operators.
javaStyle :: LanguageDef String u Identity
javaStyle =
emptyDef
{ commentStart = "/*"
, commentEnd = "*/"
, commentLine = "//"
, nestedComments = True
, identStart = letterChar
, identLetter = alphaNumChar <|> oneOf "_'"
, reservedNames = []
, reservedOpNames = []
, caseSensitive = False }
-- | The language definition for the Haskell language.
haskellDef :: LanguageDef String u Identity
haskellDef =
haskell98Def
{ identLetter = identLetter haskell98Def <|> char '#'
, reservedNames = reservedNames haskell98Def ++
[ "foreign", "import", "export", "primitive"
, "_ccall_", "_casm_", "forall"] }
-- | The language definition for the language Haskell98.
haskell98Def :: LanguageDef String u Identity
haskell98Def =
haskellStyle
{ reservedOpNames = ["::","..","=","\\","|","<-","->","@","~","=>"]
, reservedNames = [ "let", "in", "case", "of", "if", "then", "else"
, "data", "type", "class", "default", "deriving"
, "do", "import", "infix", "infixl", "infixr"
, "instance", "module", "newtype", "where"
, "primitive" ] }
-- | The language definition for the language Mondrian.
mondrianDef :: LanguageDef String u Identity
mondrianDef =
javaStyle
{ reservedNames = [ "case", "class", "default", "extends"
, "import", "in", "let", "new", "of", "package" ]
, caseSensitive = True }

View File

@ -9,573 +9,271 @@
-- Stability : experimental
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements. See 'makeLexer' for a
-- description of how to use it.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
-- High-level parsers to help you write your lexer. The module doesn't
-- impose how you should write your parser, but certain approaches may be
-- more elegant than others. Especially important theme is parsing of write
-- space, comments and indentation.
--
-- This module is supposed to be imported qualified:
--
-- > import qualified Text.Megaparsec.Lexer as L
module Text.Megaparsec.Lexer
( LanguageDef (..)
, Lexer (..)
, makeLexer )
( -- * White space and indentation
space
, lexeme
, symbol
, symbol'
, indentGuard
, skipLineComment
, skipBlockComment
-- * Character and string literals
, charLiteral
-- * Numbers
, integer
, decimal
, hexadecimal
, octal
, float
, number
, signed )
where
import Control.Applicative ((<|>), many, some)
import Control.Applicative ((<|>), some)
import Control.Monad (void)
import Data.Char (isAlpha, toLower, toUpper)
import Data.List (sort)
import Data.Char (readLitChar)
import Data.Maybe (listToMaybe)
import Text.Megaparsec.Prim
import Text.Megaparsec.Char
import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken
import qualified Text.Megaparsec.Char as C
-- Language definition
-- White space and indentation
-- | The @LanguageDef@ type is a record that contains all parameters used to
-- control features of the "Text.Megaparsec.Lexer" module. The module
-- "Text.Megaparsec.Language" contains some default definitions.
data LanguageDef s u m =
LanguageDef {
-- | Describes the start of a block comment. Use the empty string if the
-- language doesn't support block comments.
commentStart :: String
-- | Describes the end of a block comment. Use the empty string if the
-- language doesn't support block comments.
, commentEnd :: String
-- | Describes the start of a line comment. Use the empty string if the
-- language doesn't support line comments.
, commentLine :: String
-- | Set to 'True' if the language supports nested block comments.
, nestedComments :: Bool
-- | This parser should accept any start characters of identifiers, for
-- example @letter \<|> char \'_\'@.
, identStart :: ParsecT s u m Char
-- | This parser should accept any legal tail characters of identifiers,
-- for example @alphaNum \<|> char \'_\'@.
, identLetter :: ParsecT s u m Char
-- | This parser should accept any start characters of operators, for
-- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
, opStart :: ParsecT s u m Char
-- | This parser should accept any legal tail characters of operators.
-- Note that this parser should even be defined if the language doesn't
-- support user-defined operators, or otherwise the 'reservedOp' parser
-- won't work correctly.
, opLetter :: ParsecT s u m Char
-- | The list of reserved identifiers.
, reservedNames :: [String]
-- | The list of reserved operators.
, reservedOpNames :: [String]
-- | Set to 'True' if the language is case sensitive.
, caseSensitive :: Bool }
-- Lexer
-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.
data Lexer s u m =
Lexer {
-- | The lexeme parser parses a legal identifier. Returns the identifier
-- string. This parser will fail on identifiers that are reserved
-- words. Legal identifier (start) characters and reserved words are
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
identifier :: ParsecT s u m String
-- | The lexeme parser @reserved name@ parses @symbol name@, but it also
-- checks that the @name@ is not a prefix of a valid identifier.
, reserved :: String -> ParsecT s u m ()
-- | The lexeme parser parses a legal operator. Returns the name of the
-- operator. This parser will fail on any operators that are reserved
-- operators. Legal operator (start) characters and reserved operators are
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
, operator :: ParsecT s u m String
-- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
-- also checks that the @name@ is not a prefix of a valid operator.
, reservedOp :: String -> ParsecT s u m ()
-- | The lexeme parser parses a single literal character. Returns the
-- literal character value. This parsers deals correctly with escape
-- sequences. The literal character is parsed according to the grammar
-- rules defined in the Haskell report (which matches most programming
-- languages quite closely).
, charLiteral :: ParsecT s u m Char
-- | The lexeme parser parses a literal string. Returns the literal
-- string value. This parsers deals correctly with escape sequences and
-- gaps. The literal string is parsed according to the grammar rules
-- defined in the Haskell report (which matches most programming languages
-- quite closely).
, stringLiteral :: ParsecT s u m String
-- | The lexeme parser parses an integer (a whole number). This parser
-- /does not/ parse sign. Returns the value of the number. The number can
-- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is
-- parsed according to the grammar rules in the Haskell report.
, integer :: ParsecT s u m Integer
-- | This is just like 'integer', except it can parse sign.
, integer' :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the decimal system.
-- Returns the value of the number.
, decimal :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the hexadecimal
-- system. The number should be prefixed with “0x” or “0X”. Returns the
-- value of the number.
, hexadecimal :: ParsecT s u m Integer
-- | The lexeme parses a positive whole number in the octal system.
-- The number should be prefixed with “0o” or “0O”. Returns the value of
-- the number.
, octal :: ParsecT s u m Integer
-- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
-- then runs parser @p@, changing sign of its result accordingly. Note
-- that there may be white space after the sign but not before it.
, signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
-- | The lexeme parser parses a floating point value. Returns the value
-- of the number. The number is parsed according to the grammar rules
-- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
-- achieve parsing of signed floating point values.
, float :: ParsecT s u m Double
-- | This is just like 'float', except it can parse sign.
, float' :: ParsecT s u m Double
-- | The lexeme parser parses either 'integer' or a 'float'.
-- Returns the value of the number. This parser deals with any overlap in
-- the grammar rules for integers and floats. The number is parsed
-- according to the grammar rules defined in the Haskell report.
, number :: ParsecT s u m (Either Integer Double)
-- | This is just like 'number', except it can parse sign.
, number' :: ParsecT s u m (Either Integer Double)
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
-- trailing white space.
, symbol :: String -> ParsecT s u m String
-- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace'
-- parser, returning the value of @p@. Every lexical token (lexeme) is
-- defined using @lexeme@, this way every parse starts at a point without
-- white space. Parsers that use @lexeme@ are called /lexeme/ parsers in
-- this document.
--
-- The only point where the 'whiteSpace' parser should be called
-- explicitly is the start of the main parser in order to skip any leading
-- white space.
, lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Parses any white space. White space consists of /zero/ or more
-- occurrences of a 'space', a line comment or a block (multi line)
-- comment. Block comments may be nested. How comments are started and
-- ended is defined in the 'LanguageDef' that is passed to
-- 'makeLexer'.
, whiteSpace :: ParsecT s u m ()
-- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
-- returning the value of @p@.
, parens :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
-- “}”), returning the value of @p@.
, braces :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
-- and “>”), returning the value of @p@.
, angles :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
-- and “]”), returning the value of @p@.
, brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
-- | Lexeme parser @semicolon@ parses the character “;” and skips any
-- trailing white space. Returns the string “;”.
, semicolon :: ParsecT s u m String
-- | Lexeme parser @comma@ parses the character “,” and skips any
-- trailing white space. Returns the string “,”.
, comma :: ParsecT s u m String
-- | Lexeme parser @colon@ parses the character “:” and skips any
-- trailing white space. Returns the string “:”.
, colon :: ParsecT s u m String
-- | Lexeme parser @dot@ parses the character “.” and skips any
-- trailing white space. Returns the string “.”.
, dot :: ParsecT s u m String
-- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
-- separated by 'semicolon'. Returns a list of values returned by @p@.
, semicolonSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
-- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
-- separated by 'semi'. Returns a list of values returned by @p@.
, semicolonSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
-- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
, commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
-- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
, commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] }
-- | The expression @makeLexer language@ creates a 'Lexer' record that
-- contains lexical parsers that are defined using the definitions in the
-- @language@ record.
-- | @space spaceChar lineComment blockComment@ produces parser that can
-- parse white space in general. It's expected that you create such a parser
-- once and pass it to many other function in this module as needed (it's
-- usually called @spaceConsumer@ in doc-strings here).
--
-- The use of this function is quite stylized — one imports the appropriate
-- language definition and selects the lexical parsers that are needed from
-- the resulting 'Lexer'.
-- @spaceChar@ is used to parse trivial space characters. You can use
-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
-- your own parser (if you don't want automatically consume newlines, for
-- example).
--
-- > module Main (main) where
-- @lineComment@ is used to parse line comments. You can use
-- 'skipLineComment' if you don't need anything special.
--
-- @blockComment@ is used to parse block (multi-line) comments. You can use
-- 'skipBlockComment' if you don't need anything special.
--
-- Parsing of white space is important part of any parser. We propose scheme
-- where every lexeme should consume all trailing white space, but not
-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve
-- this. You only need to call 'space' “manually” to consume any white space
-- before the first lexeme (at the beginning of file).
space :: Stream s m t => ParsecT s u m () ->
ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m ()
space ch line block = hidden . skipMany $ choice [ch, line, block]
-- | This is wrapper for lexemes. Typical usage is to supply first argument
-- (parser that consumes white space, probably defined via 'space') and use
-- resulting function to wrap parsers for every lexeme.
--
-- > lexeme = L.lexeme spaceConsumer
-- > integer = lexeme L.integer
lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
lexeme spc p = p <* spc
-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
-- first argument (parser that consumes white space, probably defined via
-- 'space') and then you can use the resulting function to parse strings:
--
-- > symbol = L.symbol spaceConsumer
-- >
-- > import Text.Megaparsec
-- > import Text.Megaparsec.Language (haskellDef)
-- > import qualified Text.Megaparsec.Lexer as L
-- >
-- > -- The parser
-- > …
-- >
-- > expr = parens expr
-- > <|> identifier
-- > <|> …
-- >
-- > -- The lexer
-- > lexer = L.makeLexer haskellDef
-- >
-- > parens = L.parens lexer
-- > braces = L.braces lexer
-- > identifier = L.identifier lexer
-- > reserved = L.reserved lexer
-- > …
makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
makeLexer languageDef =
Lexer
{ identifier = identifier
, reserved = reserved
, operator = operator
, reservedOp = reservedOp
, charLiteral = charLiteral
, stringLiteral = stringLiteral
, integer = integer
, integer' = integer'
, decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, signed = signed
, float = float
, float' = float'
, number = number
, number' = number'
, symbol = symbol
, lexeme = lexeme
, whiteSpace = whiteSpace
, parens = parens
, braces = braces
, angles = angles
, brackets = brackets
, semicolon = semicolon
, comma = comma
, colon = colon
, dot = dot
, semicolonSep = semicolonSep
, semicolonSep1 = semicolonSep1
, commaSep = commaSep
, commaSep1 = commaSep1 }
where
-- bracketing
parens = between (symbol "(") (symbol ")")
braces = between (symbol "{") (symbol "}")
angles = between (symbol "<") (symbol ">")
brackets = between (symbol "[") (symbol "]")
semicolon = symbol ";"
comma = symbol ","
dot = symbol "."
colon = symbol ":"
commaSep = (`sepBy` comma)
semicolonSep = (`sepBy` semicolon)
commaSep1 = (`sepBy1` comma)
semicolonSep1 = (`sepBy1` semicolon)
-- chars & strings
charLiteral = lexeme ( between (char '\'')
(char '\'' <?> "end of character")
characterChar )
<?> "character"
characterChar = charLetter <|> charEscape <?> "literal character"
charEscape = char '\\' >> escapeCode
charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
stringLiteral =
lexeme ((foldr (maybe id (:)) "" <$>
between (char '"') (char '"' <?> "end of string")
(many stringChar)) <?> "literal string")
stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"
stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
stringEscape = char '\\' >>
( (escapeGap >> return Nothing) <|>
(escapeEmpty >> return Nothing) <|>
(Just <$> escapeCode) )
escapeEmpty = char '&'
escapeGap = some spaceChar >> char '\\' <?> "end of string gap"
-- escape codes
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
<?> "escape code"
charEsc = choice (parseEsc <$> escMap)
where parseEsc (c, code) = char c >> return code
charNum = toEnum . fromInteger <$>
( decimal <|>
(char 'o' >> nump "0o" octDigitChar) <|>
(char 'x' >> nump "0x" hexDigitChar) )
charAscii = choice (parseAscii <$> asciiMap)
where parseAscii (asc, code) = try (string asc >> return code)
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upperChar)
-- escape code tables
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
"FS","GS","RS","US","SP"]
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
"CAN","SUB","ESC","DEL"]
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
-- numbers — integers
integer = decimal
integer' = signed integer
decimal = lexeme (nump "" digitChar <?> "integer")
hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigitChar
octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigitChar
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
signed p = ($) <$> option id (lexeme sign) <*> p
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
sign = (char '+' *> return id) <|> (char '-' *> return negate)
-- numbers — floats
float = lexeme ffloat <?> "float"
float' = signed float
ffloat = read <$> ffloat'
where
ffloat' = do
decimal <- fDec
rest <- fraction <|> fExp
return $ decimal ++ rest
fraction = do
void $ char '.'
decimal <- fDec
exp <- option "" fExp
return $ '.' : decimal ++ exp
fDec = some digitChar
fExp = do
expChar <- oneOf "eE"
signStr <- option "" (pure <$> oneOf "+-")
decimal <- fDec
return $ expChar : signStr ++ decimal
-- numbers — a more general case
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
-- operators & reserved ops
reservedOp name =
lexeme $ try $ do
void $ string name
notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
operator =
lexeme $ try $ do
name <- oper
if isReservedOp name
then unexpected ("reserved operator " ++ show name)
else return name
oper = ((:) <$> opStart languageDef <*> many (opLetter languageDef))
<?> "operator"
isReservedOp = isReserved . sort $ reservedOpNames languageDef
-- identifiers & reserved words
reserved name =
lexeme $ try $ do
void $ caseString name
notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
caseString name
| caseSensitive languageDef = string name
| otherwise = walk name >> return name
where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
caseChar c
| isAlpha c = char (toLower c) <|> char (toUpper c)
| otherwise = char c
identifier =
lexeme $ try $ do
name <- ident
if isReservedName name
then unexpected ("reserved word " ++ show name)
else return name
ident = ((:) <$> identStart languageDef <*> many (identLetter languageDef))
<?> "identifier"
isReservedName name = isReserved theReservedNames caseName
where caseName
| caseSensitive languageDef = name
| otherwise = toLower <$> name
isReserved names name = scan names
where scan [] = False
scan (r:rs) = case compare r name of
LT -> scan rs
EQ -> True
GT -> False
theReservedNames
| caseSensitive languageDef = sort reserved
| otherwise = sort . fmap (fmap toLower) $ reserved
where reserved = reservedNames languageDef
-- white space & symbols
symbol = lexeme . string
lexeme p = p <* whiteSpace
whiteSpace = hidden space -- FIXME: write it in a decent manner
-- \| noLine && noMulti = skipMany (space <?> "")
-- \| noLine = skipMany (space <|>
-- multiLineComment <?> "")
-- \| noMulti = skipMany (space <|>
-- oneLineComment <?> "")
-- \| otherwise = skipMany (space <|>
-- oneLineComment <|>
-- multiLineComment <?> "")
-- where
-- noLine = null (commentLine languageDef)
-- noMulti = null (commentStart languageDef)
-- oneLineComment = void (try (string (commentLine languageDef))
-- >> skipMany (satisfy (/= '\n')))
-- multiLineComment = try (string (commentStart languageDef)) >> inComment
-- inComment = if nestedComments languageDef
-- then inCommentMulti
-- else inCommentSingle
-- inCommentMulti
-- = void (try . string $ commentEnd languageDef)
-- <|> (multiLineComment >> inCommentMulti)
-- <|> (skipSome (noneOf startEnd) >> inCommentMulti)
-- <|> (oneOf startEnd >> inCommentMulti)
-- <?> "end of comment"
-- inCommentSingle
-- = void (try . string $ commentEnd languageDef)
-- <|> (skipSome (noneOf startEnd) >> inCommentSingle)
-- <|> (oneOf startEnd >> inCommentSingle)
-- <?> "end of comment"
-- startEnd = nub $ (++) <$> commentEnd <*> commentStart $ languageDef
-- > parens = between (symbol "(") (symbol ")")
-- > braces = between (symbol "{") (symbol "}")
-- > angles = between (symbol "<") (symbol ">")
-- > brackets = between (symbol "[") (symbol "]")
-- > semicolon = symbol ";"
-- > comma = symbol ","
-- > colon = symbol ":"
-- > dot = symbol "."
symbol :: Stream s m Char =>
ParsecT s u m () -> String -> ParsecT s u m String
symbol spc = lexeme spc . C.string
-- | Case-insensitive version of 'symbol'. This may be helpful if you're
-- working with case-insensitive languages.
symbol' :: Stream s m Char =>
ParsecT s u m () -> String -> ParsecT s u m String
symbol' spc = lexeme spc . C.string'
-- | @indentGuard spaceConsumer test@ first consumes all white space
-- (indentation) with @spaceConsumer@ parser, then it checks column
-- position. It should satisfy supplied predicate @test@, otherwise the
-- parser fails with error message “incorrect indentation”. On success
-- current column position is returned.
--
-- When you want to parse block of indentation first run this parser with
-- predicate like @(> 1)@ — this will make sure you have some
-- indentation. Use returned value to check indentation on every subsequent
-- line according to syntax of your language.
indentGuard :: Stream s m t =>
ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int
indentGuard spc p = do
spc
pos <- sourceColumn <$> getPosition
if p pos
then return pos
else fail "incorrect indentation"
-- | Given comment prefix this function returns parser that skips line
-- comments. Note that it stops just before newline character but doesn't
-- consume the newline. Newline is either supposed to be consumed by 'space'
-- parser or picked up manually.
skipLineComment :: Stream s m Char => String -> ParsecT s u m ()
skipLineComment prefix = p >> void (manyTill C.anyChar n)
where p = try $ C.string prefix
n = lookAhead C.newline
-- | @skipBlockComment start end@ skips non-nested block comment starting
-- with @start@ and ending with @end@.
skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m ()
skipBlockComment start end = p >> void (manyTill C.anyChar n)
where p = try $ C.string start
n = try $ C.string end
-- Character and string literals
-- | The lexeme parser parses a single literal character without
-- quotes. Purpose of this parser is to help with parsing of commonly used
-- escape sequences. It's your responsibility to take care of character
-- literal syntax in your language (surround it with single quotes or
-- similar).
--
-- The literal character is parsed according to the grammar rules defined in
-- the Haskell report.
--
-- Note that you can use this parser as a building block to parse various
-- string literals:
--
-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')
charLiteral :: Stream s m Char => ParsecT s u m Char
charLiteral = label "literal character" $ do
r@(x:_) <- lookAhead $ count' 1 8 C.anyChar
case listToMaybe (readLitChar r) of
Just (c, r') -> count (length r - length r') C.anyChar >> return c
Nothing -> unexpected (showToken x)
-- Numbers
-- | Parse an integer without sign in decimal representation (according to
-- format of integer literals described in Haskell report).
--
-- If you need to parse signed integers, see 'signed' combinator.
integer :: Stream s m Char => ParsecT s u m Integer
integer = decimal <?> "integer"
-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label,
-- while this parser is not labeled.
decimal :: Stream s m Char => ParsecT s u m Integer
decimal = nump "" C.digitChar
-- | Parse an integer in hexadecimal representation. Representation of
-- hexadecimal number is expected to be according to Haskell report except
-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is
-- reponsibility of the programmer to parse correct prefix before parsing
-- the number itself.
--
-- For example you can make it conform to Haskell report like this:
--
-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal
hexadecimal :: Stream s m Char => ParsecT s u m Integer
hexadecimal = nump "0x" C.hexDigitChar
-- | Parse an integer in octal representation. Representation of octal
-- number is expected to be according to Haskell report except for the fact
-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility
-- of the programmer to parse correct prefix before parsing the number
-- itself.
octal :: Stream s m Char => ParsecT s u m Integer
octal = nump "0o" C.octDigitChar
-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then
-- prepends @prefix@ to returned value and tries to interpret the result as
-- an integer according to Haskell syntax.
nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
-- | Parse a floating point value without sign. Representation of floating
-- point value is expected to be according to Haskell report.
--
-- If you need to parse signed floats, see 'signed' combinator.
float :: Stream s m Char => ParsecT s u m Double
float = label "float" $ read <$> f
where f = do
d <- some C.digitChar
rest <- fraction <|> fExp
return $ d ++ rest
-- | This is a helper for 'float' parser. It parses fractional part of
-- floating point number, that is, dot and everything after it.
fraction :: Stream s m Char => ParsecT s u m String
fraction = do
void $ C.char '.'
d <- some C.digitChar
e <- option "" fExp
return $ '.' : d ++ e
-- | This helper parses exponent of floating point numbers.
fExp :: Stream s m Char => ParsecT s u m String
fExp = do
expChar <- C.char' 'e'
signStr <- option "" (pure <$> choice (C.char <$> "+-"))
d <- some C.digitChar
return $ expChar : signStr ++ d
-- | Parse a number: either integer or floating point. The parser can handle
-- overlapping grammars graciously.
number :: Stream s m Char => ParsecT s u m (Either Integer Double)
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
-- | @signed space p@ parser parses optional sign, then if there is a sign
-- it will consume optional white space (using @space@ parser), then it runs
-- parser @p@ which should return a number. Sign of the number is changed
-- according to previously parsed sign.
--
-- For example, to parse signed integer you can write:
--
-- > lexeme = L.lexeme spaceConsumer
-- > integer = lexeme L.integer
-- > signedInteger = signed spaceConsumer integer
signed :: (Stream s m Char, Num a) =>
ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
signed spc p = ($) <$> option id (lexeme spc sign) <*> p
-- | Parse a sign and return either 'id' or 'negate' according to parsed
-- sign.
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)

View File

@ -100,7 +100,6 @@ library
, Text.Megaparsec.Combinator
, Text.Megaparsec.Error
, Text.Megaparsec.Expr
, Text.Megaparsec.Language
, Text.Megaparsec.Lexer
, Text.Megaparsec.Perm
, Text.Megaparsec.Pos
@ -116,6 +115,7 @@ test-suite old-tests
main-is: Main.hs
hs-source-dirs: old-tests
type: exitcode-stdio-1.0
ghc-options: -O2 -Wall
other-modules: Bugs
, Bugs.Bug2
, Bugs.Bug6
@ -126,7 +126,8 @@ test-suite old-tests
, HUnit == 1.2.*
, test-framework >= 0.6 && < 1
, test-framework-hunit >= 0.2 && < 0.4
ghc-options: -O2 -Wall
default-extensions:
FlexibleContexts
default-language: Haskell2010
test-suite tests

View File

@ -1,21 +1,33 @@
module Bugs.Bug2 (main) where
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Control.Applicative (empty)
import Control.Monad (void)
import Text.Megaparsec
import Text.Megaparsec.Language (haskellDef)
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
sc :: Parser ()
sc = L.space (void spaceChar) empty empty
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
stringLiteral :: Parser String
stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"')
main :: Test
main =
testCase "Control Char Parsing (#2)" $
parseString "\"test\\^Bstring\"" @?= "test\^Bstring"
where
parseString :: String -> String
parseString input =
case parse parser "Example" input of
case parse stringLiteral "Example" input of
Left{} -> error "Parse failure"
Right str -> str
parser = L.stringLiteral $ L.makeLexer haskellDef

View File

@ -2,13 +2,11 @@
module Bugs.Bug35 (main) where
import Text.Megaparsec
import Text.Megaparsec.Language
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
trickyFloats :: [String]
trickyFloats =
@ -28,12 +26,9 @@ trickyFloats =
, "32.96176575630599"
, "38.47735512322269" ]
float :: Parser Double
float = L.float (L.makeLexer emptyDef)
testBatch :: Assertion
testBatch = mapM_ testFloat trickyFloats
where testFloat x = parse float "" x @?= Right (read x :: Double)
where testFloat x = parse L.float "" x @?= Right (read x :: Double)
main :: Test
main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch

View File

@ -1,16 +1,17 @@
module Bugs.Bug39 (main) where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Either (isLeft, isRight)
import Text.Megaparsec
import Text.Megaparsec.Language
import Text.Megaparsec.String
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
shouldFail :: [String]
shouldFail = [" 1", " +1", " -1"]
@ -18,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"]
shouldSucceed :: [String]
shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
sc :: Parser ()
sc = L.space (void spaceChar) empty empty
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
integer :: Parser Integer
integer = L.integer' (L.makeLexer emptyDef)
integer = lexeme $ L.signed sc L.integer
testBatch :: Assertion
testBatch = mapM_ (f testFail) shouldFail >>

View File

@ -1,13 +1,13 @@
module Bugs.Bug6 (main) where
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Text.Megaparsec
import Text.Megaparsec.String
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
import Util
main :: Test
@ -19,5 +19,5 @@ variable :: Parser String
variable = do
x <- lookAhead (some letterChar)
if x == "return"
then fail "'return' is a reserved keyword"
else string x
then fail "'return' is a reserved keyword"
else string x

View File

@ -1,15 +1,17 @@
module Bugs.Bug9 (main) where
import Control.Applicative (empty)
import Control.Monad (void)
import Text.Megaparsec
import Text.Megaparsec.Language (haskellStyle)
import Text.Megaparsec.String (Parser)
import Text.Megaparsec.Expr
import Text.Megaparsec.String (Parser)
import qualified Text.Megaparsec.Lexer as L
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)
import Util
@ -25,12 +27,21 @@ main =
-- Syntax analysis
sc :: Stream s m Char => ParsecT s u m ()
sc = L.space (void spaceChar) empty empty
lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
lexeme = L.lexeme sc
integer :: Stream s m Char => ParsecT s u m Integer
integer = lexeme L.integer
operator :: Stream s m Char => String -> ParsecT s u m String
operator = try . L.symbol sc
parseTopLevel :: Parser Expr
parseTopLevel = parseExpr <* eof
parseExpr :: Parser Expr
parseExpr = makeExprParser (Const <$> integer) table
where table = [[ InfixL (Op <$ reserved ">>>") ]]
lexer = L.makeLexer haskellStyle { L.reservedOpNames = [">>>"] }
integer = L.integer lexer
reserved = L.reserved lexer
where table = [[ InfixL (Op <$ operator ">>>") ]]

View File

@ -29,16 +29,206 @@
module Lexer (tests) where
import Control.Applicative (some, (<|>))
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Char
( readLitChar
, showLitChar
, isDigit
, isAlphaNum
, isSpace
, toLower )
import Data.List (findIndices, isInfixOf)
import Data.Maybe (listToMaybe, maybeToList, isNothing, fromJust)
import Numeric (showInt, showHex, showOct, showSigned)
import Test.Framework
import Test.Framework.Providers.QuickCheck2 (testProperty)
import Test.QuickCheck
import Text.Megaparsec.Char
import Text.Megaparsec.Combinator
import Text.Megaparsec.Error
import Text.Megaparsec.Lexer
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.String
import qualified Text.Megaparsec.Char as C
import Util
tests :: Test
tests = testGroup "Lexer"
[]
[ testProperty "space combinator" prop_space
, testProperty "symbol combinator" prop_symbol
, testProperty "symbol' combinator" prop_symbol'
, testProperty "indentGuard combinator" prop_indentGuard
, testProperty "charLiteral" prop_charLiteral
, testProperty "integer" prop_integer
, testProperty "decimal" prop_decimal
, testProperty "hexadecimal" prop_hexadecimal
, testProperty "octal" prop_octal
, testProperty "float 0" prop_float_0
, testProperty "float 1" prop_float_1
, testProperty "number" prop_number
, testProperty "signed" prop_signed ]
newtype WhiteSpace = WhiteSpace
{ getWhiteSpace :: String }
deriving (Show, Eq)
instance Arbitrary WhiteSpace where
arbitrary = WhiteSpace . concat <$> listOf whiteUnit
newtype Symbol = Symbol
{ getSymbol :: String }
deriving (Show, Eq)
instance Arbitrary Symbol where
arbitrary = Symbol <$> ((++) <$> symbolName <*> whiteChars)
whiteUnit :: Gen String
whiteUnit = oneof [whiteChars, whiteLine, whiteBlock]
whiteChars :: Gen String
whiteChars = listOf $ elements "\t\n "
whiteLine :: Gen String
whiteLine = commentOut <$> arbitrary `suchThat` goodEnough
where commentOut x = "//" ++ x ++ "\n"
goodEnough x = '\n' `notElem` x
whiteBlock :: Gen String
whiteBlock = commentOut <$> arbitrary `suchThat` goodEnough
where commentOut x = "/*" ++ x ++ "*/"
goodEnough x = not $ "*/" `isInfixOf` x
symbolName :: Gen String
symbolName = listOf $ arbitrary `suchThat` isAlphaNum
sc :: Parser ()
sc = space (void C.spaceChar) l b
where l = skipLineComment "//"
b = skipBlockComment "/*" "*/"
sc' :: Parser ()
sc' = space (void $ C.oneOf " \t") empty empty
prop_space :: WhiteSpace -> Property
prop_space w = checkParser p r s
where p = sc
r = Right ()
s = getWhiteSpace w
prop_symbol :: Symbol -> Maybe Char -> Property
prop_symbol = parseSymbol (symbol sc) id
prop_symbol' :: Symbol -> Maybe Char -> Property
prop_symbol' = parseSymbol (symbol' sc) (fmap toLower)
parseSymbol :: (String -> Parser String) -> (String -> String)
-> Symbol -> Maybe Char -> Property
parseSymbol p' f s' t = checkParser p r s
where p = p' (f g)
r | g == s || isSpace (last s) = Right (f g)
| otherwise = posErr (length s - 1) s [uneCh (last s), exEof]
g = takeWhile (not . isSpace) s
s = getSymbol s' ++ maybeToList t
newtype IndLine = IndLine
{ getIndLine :: String }
deriving (Show, Eq)
instance Arbitrary IndLine where
arbitrary = IndLine . concat <$> sequence [spc, sym, spc, eol]
where spc = listOf (elements " \t")
sym = return "xxx"
eol = return "\n"
prop_indentGuard :: IndLine -> IndLine -> IndLine -> Property
prop_indentGuard l0 l1 l2 = checkParser p r s
where p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp
ip = indentGuard sc'
sp = void $ symbol sc' "xxx" <* C.eol
r | f' l0 <= 1 = posErr 0 s msg'
| f' l1 /= f' l0 = posErr (f l1 + g [l0]) s msg'
| f' l2 <= f' l0 = posErr (f l2 + g [l0, l1]) s msg'
| otherwise = Right ()
msg' = [msg "incorrect indentation"]
f = length . takeWhile isSpace . getIndLine
f' x = sourceColumn $
updatePosString (initialPos "") $ take (f x) (getIndLine x)
g xs = sum $ length . getIndLine <$> xs
s = concat $ getIndLine <$> [l0, l1, l2]
prop_charLiteral :: String -> Bool -> Property
prop_charLiteral t i = checkParser charLiteral r s
where b = listToMaybe $ readLitChar s
(h, g) = fromJust b
r | isNothing b = posErr 0 s $ exSpec "literal character" :
[ if null s then uneEof else uneCh (head s) ]
| null g = Right h
| otherwise = posErr l s [uneCh (head g), exEof]
l = length s - length g
s = if null t || i then t else showLitChar (head t) (tail t)
prop_integer :: NonNegative Integer -> Int -> Property
prop_integer n' i = checkParser integer r s
where (r, s) = quasiCorrupted n' i showInt "integer"
prop_decimal :: NonNegative Integer -> Int -> Property
prop_decimal n' i = checkParser decimal r s
where (r, s) = quasiCorrupted n' i showInt "digit"
prop_hexadecimal :: NonNegative Integer -> Int -> Property
prop_hexadecimal n' i = checkParser hexadecimal r s
where (r, s) = quasiCorrupted n' i showHex "hexadecimal digit"
prop_octal :: NonNegative Integer -> Int -> Property
prop_octal n' i = checkParser octal r s
where (r, s) = quasiCorrupted n' i showOct "octal digit"
prop_float_0 :: NonNegative Double -> Property
prop_float_0 n' = checkParser float r s
where n = getNonNegative n'
r = Right n
s = show n
prop_float_1 :: Maybe (NonNegative Integer) -> Property
prop_float_1 n' = checkParser float r s
where r | isNothing n' = posErr 0 s [uneEof, exSpec "float"]
| otherwise = posErr (length s) s [ uneEof, exCh '.', exCh 'E'
, exCh 'e', exSpec "digit" ]
s = maybe "" (show . getNonNegative) n'
prop_number :: Either (NonNegative Integer) (NonNegative Double)
-> Integer -> Property
prop_number n' i = checkParser number r s
where r | null s = posErr 0 s [uneEof, exSpec "number"]
| otherwise =
Right $ case n' of
Left x -> Left $ getNonNegative x
Right x -> Right $ getNonNegative x
s = if i < 5
then ""
else either (show . getNonNegative) (show . getNonNegative) n'
prop_signed :: Integer -> Int -> Bool -> Property
prop_signed n i plus = checkParser p r s
where p = signed (hidden C.space) integer
r | i > length z = Right n
| otherwise = posErr i s $ [uneCh '?', exSpec "integer"] ++
(if i <= 0 then [exCh '+', exCh '-'] else []) ++
[exEof | i > head (findIndices isDigit s)]
z = let bar = showSigned showInt 0 n ""
in if n < 0 || plus then bar else '+' : bar
s = if i <= length z then take i z ++ "?" ++ drop i z else z
quasiCorrupted :: NonNegative Integer -> Int
-> (Integer -> String -> String) -> String
-> (Either ParseError Integer, String)
quasiCorrupted n' i shower l = (r, s)
where n = getNonNegative n'
r | i > length z = Right n
| otherwise = posErr i s $ [uneCh '?', exSpec l] ++
[ exEof | i > 0 ]
z = shower n ""
s = if i <= length z then take i z ++ "?" ++ drop i z else z