mirror of
https://github.com/mrkkrp/megaparsec.git
synced 2024-11-23 19:38:05 +03:00
Merge branch 'new-lexer'
This commit is contained in:
commit
0d39e44f40
@ -41,11 +41,10 @@ data Operator s u m a
|
||||
-- descending precedence. All operators in one list have the same precedence
|
||||
-- (but may have a different associativity).
|
||||
--
|
||||
-- Prefix and postfix
|
||||
-- operators of the same precedence can only occur once (i.e. @--2@ is not
|
||||
-- allowed if @-@ is prefix negate). Prefix and postfix operators of the
|
||||
-- same precedence associate to the left (i.e. if @++@ is postfix increment,
|
||||
-- than @-2++@ equals @-1@, not @-3@).
|
||||
-- Prefix and postfix operators of the same precedence can only occur once
|
||||
-- (i.e. @--2@ is not allowed if @-@ is prefix negate). Prefix and postfix
|
||||
-- operators of the same precedence associate to the left (i.e. if @++@ is
|
||||
-- postfix increment, than @-2++@ equals @-1@, not @-3@).
|
||||
--
|
||||
-- The @makeExprParser@ takes care of all the complexity involved in
|
||||
-- building expression parser. Here is an example of an expression parser
|
||||
@ -66,6 +65,9 @@ data Operator s u m a
|
||||
-- > binary name f = InfixL (reservedOp name >> return f)
|
||||
-- > prefix name f = Prefix (reservedOp name >> return f)
|
||||
-- > postfix name f = Postfix (reservedOp name >> return f)
|
||||
--
|
||||
-- Please note that multi-character operators should use 'try' in order to
|
||||
-- be reported correctly in error messages.
|
||||
|
||||
makeExprParser :: Stream s m t => ParsecT s u m a ->
|
||||
[[Operator s u m a]] -> ParsecT s u m a
|
||||
|
@ -1,115 +0,0 @@
|
||||
-- |
|
||||
-- Module : Text.Megaparsec.Language
|
||||
-- Copyright : © 2015 Megaparsec contributors
|
||||
-- © 2007 Paolo Martini
|
||||
-- © 1999–2001 Daan Leijen
|
||||
-- License : BSD3
|
||||
--
|
||||
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
|
||||
-- Stability : experimental
|
||||
-- Portability : non-portable (uses non-portable module Text.Megaparsec.Lexer)
|
||||
--
|
||||
-- A helper module that defines some language definitions that can be used
|
||||
-- to instantiate a token parser (see "Text.Megaparsec.Lexer").
|
||||
|
||||
module Text.Megaparsec.Language
|
||||
( LanguageDef
|
||||
, emptyDef
|
||||
, haskellStyle
|
||||
, javaStyle
|
||||
, haskellDef
|
||||
, mondrianDef )
|
||||
where
|
||||
|
||||
import Control.Monad.Identity
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Text.Megaparsec.Char
|
||||
import Text.Megaparsec.Lexer
|
||||
|
||||
-- | This is the most minimal token definition. It is recommended to use
|
||||
-- this definition as the basis for other definitions. @emptyDef@ has no
|
||||
-- reserved names or operators, is case sensitive and doesn't accept
|
||||
-- comments, identifiers or operators.
|
||||
|
||||
emptyDef :: LanguageDef String st Identity
|
||||
emptyDef =
|
||||
LanguageDef
|
||||
{ commentStart = ""
|
||||
, commentEnd = ""
|
||||
, commentLine = ""
|
||||
, nestedComments = True
|
||||
, identStart = letterChar <|> char '_'
|
||||
, identLetter = alphaNumChar <|> oneOf "_'"
|
||||
, opStart = opLetter emptyDef
|
||||
, opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~"
|
||||
, reservedOpNames = []
|
||||
, reservedNames = []
|
||||
, caseSensitive = True }
|
||||
|
||||
-- | This is a minimal token definition for Haskell-style languages. It
|
||||
-- defines the style of comments, valid identifiers and case sensitivity. It
|
||||
-- does not define any reserved words or operators.
|
||||
|
||||
haskellStyle :: LanguageDef String u Identity
|
||||
haskellStyle =
|
||||
emptyDef
|
||||
{ commentStart = "{-"
|
||||
, commentEnd = "-}"
|
||||
, commentLine = "--"
|
||||
, nestedComments = True
|
||||
, identStart = letterChar
|
||||
, identLetter = alphaNumChar <|> oneOf "_'"
|
||||
, opStart = opLetter haskellStyle
|
||||
, opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~"
|
||||
, reservedOpNames = []
|
||||
, reservedNames = []
|
||||
, caseSensitive = True }
|
||||
|
||||
-- | This is a minimal token definition for Java-style languages. It
|
||||
-- defines the style of comments, valid identifiers and case sensitivity. It
|
||||
-- does not define any reserved words or operators.
|
||||
|
||||
javaStyle :: LanguageDef String u Identity
|
||||
javaStyle =
|
||||
emptyDef
|
||||
{ commentStart = "/*"
|
||||
, commentEnd = "*/"
|
||||
, commentLine = "//"
|
||||
, nestedComments = True
|
||||
, identStart = letterChar
|
||||
, identLetter = alphaNumChar <|> oneOf "_'"
|
||||
, reservedNames = []
|
||||
, reservedOpNames = []
|
||||
, caseSensitive = False }
|
||||
|
||||
-- | The language definition for the Haskell language.
|
||||
|
||||
haskellDef :: LanguageDef String u Identity
|
||||
haskellDef =
|
||||
haskell98Def
|
||||
{ identLetter = identLetter haskell98Def <|> char '#'
|
||||
, reservedNames = reservedNames haskell98Def ++
|
||||
[ "foreign", "import", "export", "primitive"
|
||||
, "_ccall_", "_casm_", "forall"] }
|
||||
|
||||
-- | The language definition for the language Haskell98.
|
||||
|
||||
haskell98Def :: LanguageDef String u Identity
|
||||
haskell98Def =
|
||||
haskellStyle
|
||||
{ reservedOpNames = ["::","..","=","\\","|","<-","->","@","~","=>"]
|
||||
, reservedNames = [ "let", "in", "case", "of", "if", "then", "else"
|
||||
, "data", "type", "class", "default", "deriving"
|
||||
, "do", "import", "infix", "infixl", "infixr"
|
||||
, "instance", "module", "newtype", "where"
|
||||
, "primitive" ] }
|
||||
|
||||
-- | The language definition for the language Mondrian.
|
||||
|
||||
mondrianDef :: LanguageDef String u Identity
|
||||
mondrianDef =
|
||||
javaStyle
|
||||
{ reservedNames = [ "case", "class", "default", "extends"
|
||||
, "import", "in", "let", "new", "of", "package" ]
|
||||
, caseSensitive = True }
|
@ -9,573 +9,271 @@
|
||||
-- Stability : experimental
|
||||
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
|
||||
--
|
||||
-- A helper module to parse lexical elements. See 'makeLexer' for a
|
||||
-- description of how to use it.
|
||||
|
||||
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
|
||||
-- High-level parsers to help you write your lexer. The module doesn't
|
||||
-- impose how you should write your parser, but certain approaches may be
|
||||
-- more elegant than others. Especially important theme is parsing of write
|
||||
-- space, comments and indentation.
|
||||
--
|
||||
-- This module is supposed to be imported qualified:
|
||||
--
|
||||
-- > import qualified Text.Megaparsec.Lexer as L
|
||||
|
||||
module Text.Megaparsec.Lexer
|
||||
( LanguageDef (..)
|
||||
, Lexer (..)
|
||||
, makeLexer )
|
||||
( -- * White space and indentation
|
||||
space
|
||||
, lexeme
|
||||
, symbol
|
||||
, symbol'
|
||||
, indentGuard
|
||||
, skipLineComment
|
||||
, skipBlockComment
|
||||
-- * Character and string literals
|
||||
, charLiteral
|
||||
-- * Numbers
|
||||
, integer
|
||||
, decimal
|
||||
, hexadecimal
|
||||
, octal
|
||||
, float
|
||||
, number
|
||||
, signed )
|
||||
where
|
||||
|
||||
import Control.Applicative ((<|>), many, some)
|
||||
import Control.Applicative ((<|>), some)
|
||||
import Control.Monad (void)
|
||||
import Data.Char (isAlpha, toLower, toUpper)
|
||||
import Data.List (sort)
|
||||
import Data.Char (readLitChar)
|
||||
import Data.Maybe (listToMaybe)
|
||||
|
||||
import Text.Megaparsec.Prim
|
||||
import Text.Megaparsec.Char
|
||||
import Text.Megaparsec.Combinator
|
||||
import Text.Megaparsec.Pos
|
||||
import Text.Megaparsec.Prim
|
||||
import Text.Megaparsec.ShowToken
|
||||
import qualified Text.Megaparsec.Char as C
|
||||
|
||||
-- Language definition
|
||||
-- White space and indentation
|
||||
|
||||
-- | The @LanguageDef@ type is a record that contains all parameters used to
|
||||
-- control features of the "Text.Megaparsec.Lexer" module. The module
|
||||
-- "Text.Megaparsec.Language" contains some default definitions.
|
||||
|
||||
data LanguageDef s u m =
|
||||
LanguageDef {
|
||||
|
||||
-- | Describes the start of a block comment. Use the empty string if the
|
||||
-- language doesn't support block comments.
|
||||
|
||||
commentStart :: String
|
||||
|
||||
-- | Describes the end of a block comment. Use the empty string if the
|
||||
-- language doesn't support block comments.
|
||||
|
||||
, commentEnd :: String
|
||||
|
||||
-- | Describes the start of a line comment. Use the empty string if the
|
||||
-- language doesn't support line comments.
|
||||
|
||||
, commentLine :: String
|
||||
|
||||
-- | Set to 'True' if the language supports nested block comments.
|
||||
|
||||
, nestedComments :: Bool
|
||||
|
||||
-- | This parser should accept any start characters of identifiers, for
|
||||
-- example @letter \<|> char \'_\'@.
|
||||
|
||||
, identStart :: ParsecT s u m Char
|
||||
|
||||
-- | This parser should accept any legal tail characters of identifiers,
|
||||
-- for example @alphaNum \<|> char \'_\'@.
|
||||
|
||||
, identLetter :: ParsecT s u m Char
|
||||
|
||||
-- | This parser should accept any start characters of operators, for
|
||||
-- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
|
||||
|
||||
, opStart :: ParsecT s u m Char
|
||||
|
||||
-- | This parser should accept any legal tail characters of operators.
|
||||
-- Note that this parser should even be defined if the language doesn't
|
||||
-- support user-defined operators, or otherwise the 'reservedOp' parser
|
||||
-- won't work correctly.
|
||||
|
||||
, opLetter :: ParsecT s u m Char
|
||||
|
||||
-- | The list of reserved identifiers.
|
||||
|
||||
, reservedNames :: [String]
|
||||
|
||||
-- | The list of reserved operators.
|
||||
|
||||
, reservedOpNames :: [String]
|
||||
|
||||
-- | Set to 'True' if the language is case sensitive.
|
||||
|
||||
, caseSensitive :: Bool }
|
||||
|
||||
-- Lexer
|
||||
|
||||
-- | The type of the record that holds lexical parsers that work on
|
||||
-- @s@ streams with state @u@ over a monad @m@.
|
||||
|
||||
data Lexer s u m =
|
||||
Lexer {
|
||||
|
||||
-- | The lexeme parser parses a legal identifier. Returns the identifier
|
||||
-- string. This parser will fail on identifiers that are reserved
|
||||
-- words. Legal identifier (start) characters and reserved words are
|
||||
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
|
||||
|
||||
identifier :: ParsecT s u m String
|
||||
|
||||
-- | The lexeme parser @reserved name@ parses @symbol name@, but it also
|
||||
-- checks that the @name@ is not a prefix of a valid identifier.
|
||||
|
||||
, reserved :: String -> ParsecT s u m ()
|
||||
|
||||
-- | The lexeme parser parses a legal operator. Returns the name of the
|
||||
-- operator. This parser will fail on any operators that are reserved
|
||||
-- operators. Legal operator (start) characters and reserved operators are
|
||||
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
|
||||
|
||||
, operator :: ParsecT s u m String
|
||||
|
||||
-- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
|
||||
-- also checks that the @name@ is not a prefix of a valid operator.
|
||||
|
||||
, reservedOp :: String -> ParsecT s u m ()
|
||||
|
||||
-- | The lexeme parser parses a single literal character. Returns the
|
||||
-- literal character value. This parsers deals correctly with escape
|
||||
-- sequences. The literal character is parsed according to the grammar
|
||||
-- rules defined in the Haskell report (which matches most programming
|
||||
-- languages quite closely).
|
||||
|
||||
, charLiteral :: ParsecT s u m Char
|
||||
|
||||
-- | The lexeme parser parses a literal string. Returns the literal
|
||||
-- string value. This parsers deals correctly with escape sequences and
|
||||
-- gaps. The literal string is parsed according to the grammar rules
|
||||
-- defined in the Haskell report (which matches most programming languages
|
||||
-- quite closely).
|
||||
|
||||
, stringLiteral :: ParsecT s u m String
|
||||
|
||||
-- | The lexeme parser parses an integer (a whole number). This parser
|
||||
-- /does not/ parse sign. Returns the value of the number. The number can
|
||||
-- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is
|
||||
-- parsed according to the grammar rules in the Haskell report.
|
||||
|
||||
, integer :: ParsecT s u m Integer
|
||||
|
||||
-- | This is just like 'integer', except it can parse sign.
|
||||
|
||||
, integer' :: ParsecT s u m Integer
|
||||
|
||||
-- | The lexeme parses a positive whole number in the decimal system.
|
||||
-- Returns the value of the number.
|
||||
|
||||
, decimal :: ParsecT s u m Integer
|
||||
|
||||
-- | The lexeme parses a positive whole number in the hexadecimal
|
||||
-- system. The number should be prefixed with “0x” or “0X”. Returns the
|
||||
-- value of the number.
|
||||
|
||||
, hexadecimal :: ParsecT s u m Integer
|
||||
|
||||
-- | The lexeme parses a positive whole number in the octal system.
|
||||
-- The number should be prefixed with “0o” or “0O”. Returns the value of
|
||||
-- the number.
|
||||
|
||||
, octal :: ParsecT s u m Integer
|
||||
|
||||
-- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
|
||||
-- then runs parser @p@, changing sign of its result accordingly. Note
|
||||
-- that there may be white space after the sign but not before it.
|
||||
|
||||
, signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | The lexeme parser parses a floating point value. Returns the value
|
||||
-- of the number. The number is parsed according to the grammar rules
|
||||
-- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
|
||||
-- achieve parsing of signed floating point values.
|
||||
|
||||
, float :: ParsecT s u m Double
|
||||
|
||||
-- | This is just like 'float', except it can parse sign.
|
||||
|
||||
, float' :: ParsecT s u m Double
|
||||
|
||||
-- | The lexeme parser parses either 'integer' or a 'float'.
|
||||
-- Returns the value of the number. This parser deals with any overlap in
|
||||
-- the grammar rules for integers and floats. The number is parsed
|
||||
-- according to the grammar rules defined in the Haskell report.
|
||||
|
||||
, number :: ParsecT s u m (Either Integer Double)
|
||||
|
||||
-- | This is just like 'number', except it can parse sign.
|
||||
|
||||
, number' :: ParsecT s u m (Either Integer Double)
|
||||
|
||||
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
|
||||
-- trailing white space.
|
||||
|
||||
, symbol :: String -> ParsecT s u m String
|
||||
|
||||
-- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace'
|
||||
-- parser, returning the value of @p@. Every lexical token (lexeme) is
|
||||
-- defined using @lexeme@, this way every parse starts at a point without
|
||||
-- white space. Parsers that use @lexeme@ are called /lexeme/ parsers in
|
||||
-- this document.
|
||||
--
|
||||
-- The only point where the 'whiteSpace' parser should be called
|
||||
-- explicitly is the start of the main parser in order to skip any leading
|
||||
-- white space.
|
||||
|
||||
, lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | Parses any white space. White space consists of /zero/ or more
|
||||
-- occurrences of a 'space', a line comment or a block (multi line)
|
||||
-- comment. Block comments may be nested. How comments are started and
|
||||
-- ended is defined in the 'LanguageDef' that is passed to
|
||||
-- 'makeLexer'.
|
||||
|
||||
, whiteSpace :: ParsecT s u m ()
|
||||
|
||||
-- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
|
||||
-- returning the value of @p@.
|
||||
|
||||
, parens :: forall a. ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
|
||||
-- “}”), returning the value of @p@.
|
||||
|
||||
, braces :: forall a. ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
|
||||
-- and “>”), returning the value of @p@.
|
||||
|
||||
, angles :: forall a. ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
|
||||
-- and “]”), returning the value of @p@.
|
||||
|
||||
, brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | Lexeme parser @semicolon@ parses the character “;” and skips any
|
||||
-- trailing white space. Returns the string “;”.
|
||||
|
||||
, semicolon :: ParsecT s u m String
|
||||
|
||||
-- | Lexeme parser @comma@ parses the character “,” and skips any
|
||||
-- trailing white space. Returns the string “,”.
|
||||
|
||||
, comma :: ParsecT s u m String
|
||||
|
||||
-- | Lexeme parser @colon@ parses the character “:” and skips any
|
||||
-- trailing white space. Returns the string “:”.
|
||||
|
||||
, colon :: ParsecT s u m String
|
||||
|
||||
-- | Lexeme parser @dot@ parses the character “.” and skips any
|
||||
-- trailing white space. Returns the string “.”.
|
||||
|
||||
, dot :: ParsecT s u m String
|
||||
|
||||
-- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
|
||||
-- separated by 'semicolon'. Returns a list of values returned by @p@.
|
||||
|
||||
, semicolonSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
||||
|
||||
-- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
|
||||
-- separated by 'semi'. Returns a list of values returned by @p@.
|
||||
|
||||
, semicolonSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
||||
|
||||
-- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
|
||||
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
|
||||
|
||||
, commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
||||
|
||||
-- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
|
||||
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
|
||||
|
||||
, commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] }
|
||||
|
||||
-- | The expression @makeLexer language@ creates a 'Lexer' record that
|
||||
-- contains lexical parsers that are defined using the definitions in the
|
||||
-- @language@ record.
|
||||
-- | @space spaceChar lineComment blockComment@ produces parser that can
|
||||
-- parse white space in general. It's expected that you create such a parser
|
||||
-- once and pass it to many other function in this module as needed (it's
|
||||
-- usually called @spaceConsumer@ in doc-strings here).
|
||||
--
|
||||
-- The use of this function is quite stylized — one imports the appropriate
|
||||
-- language definition and selects the lexical parsers that are needed from
|
||||
-- the resulting 'Lexer'.
|
||||
-- @spaceChar@ is used to parse trivial space characters. You can use
|
||||
-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
|
||||
-- your own parser (if you don't want automatically consume newlines, for
|
||||
-- example).
|
||||
--
|
||||
-- > module Main (main) where
|
||||
-- @lineComment@ is used to parse line comments. You can use
|
||||
-- 'skipLineComment' if you don't need anything special.
|
||||
--
|
||||
-- @blockComment@ is used to parse block (multi-line) comments. You can use
|
||||
-- 'skipBlockComment' if you don't need anything special.
|
||||
--
|
||||
-- Parsing of white space is important part of any parser. We propose scheme
|
||||
-- where every lexeme should consume all trailing white space, but not
|
||||
-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve
|
||||
-- this. You only need to call 'space' “manually” to consume any white space
|
||||
-- before the first lexeme (at the beginning of file).
|
||||
|
||||
space :: Stream s m t => ParsecT s u m () ->
|
||||
ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m ()
|
||||
space ch line block = hidden . skipMany $ choice [ch, line, block]
|
||||
|
||||
-- | This is wrapper for lexemes. Typical usage is to supply first argument
|
||||
-- (parser that consumes white space, probably defined via 'space') and use
|
||||
-- resulting function to wrap parsers for every lexeme.
|
||||
--
|
||||
-- > lexeme = L.lexeme spaceConsumer
|
||||
-- > integer = lexeme L.integer
|
||||
|
||||
lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
|
||||
lexeme spc p = p <* spc
|
||||
|
||||
-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
|
||||
-- first argument (parser that consumes white space, probably defined via
|
||||
-- 'space') and then you can use the resulting function to parse strings:
|
||||
--
|
||||
-- > symbol = L.symbol spaceConsumer
|
||||
-- >
|
||||
-- > import Text.Megaparsec
|
||||
-- > import Text.Megaparsec.Language (haskellDef)
|
||||
-- > import qualified Text.Megaparsec.Lexer as L
|
||||
-- >
|
||||
-- > -- The parser
|
||||
-- > …
|
||||
-- >
|
||||
-- > expr = parens expr
|
||||
-- > <|> identifier
|
||||
-- > <|> …
|
||||
-- >
|
||||
-- > -- The lexer
|
||||
-- > lexer = L.makeLexer haskellDef
|
||||
-- >
|
||||
-- > parens = L.parens lexer
|
||||
-- > braces = L.braces lexer
|
||||
-- > identifier = L.identifier lexer
|
||||
-- > reserved = L.reserved lexer
|
||||
-- > …
|
||||
|
||||
makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
|
||||
makeLexer languageDef =
|
||||
Lexer
|
||||
{ identifier = identifier
|
||||
, reserved = reserved
|
||||
, operator = operator
|
||||
, reservedOp = reservedOp
|
||||
|
||||
, charLiteral = charLiteral
|
||||
, stringLiteral = stringLiteral
|
||||
|
||||
, integer = integer
|
||||
, integer' = integer'
|
||||
, decimal = decimal
|
||||
, hexadecimal = hexadecimal
|
||||
, octal = octal
|
||||
, signed = signed
|
||||
, float = float
|
||||
, float' = float'
|
||||
, number = number
|
||||
, number' = number'
|
||||
|
||||
, symbol = symbol
|
||||
, lexeme = lexeme
|
||||
, whiteSpace = whiteSpace
|
||||
|
||||
, parens = parens
|
||||
, braces = braces
|
||||
, angles = angles
|
||||
, brackets = brackets
|
||||
, semicolon = semicolon
|
||||
, comma = comma
|
||||
, colon = colon
|
||||
, dot = dot
|
||||
, semicolonSep = semicolonSep
|
||||
, semicolonSep1 = semicolonSep1
|
||||
, commaSep = commaSep
|
||||
, commaSep1 = commaSep1 }
|
||||
where
|
||||
|
||||
-- bracketing
|
||||
|
||||
parens = between (symbol "(") (symbol ")")
|
||||
braces = between (symbol "{") (symbol "}")
|
||||
angles = between (symbol "<") (symbol ">")
|
||||
brackets = between (symbol "[") (symbol "]")
|
||||
|
||||
semicolon = symbol ";"
|
||||
comma = symbol ","
|
||||
dot = symbol "."
|
||||
colon = symbol ":"
|
||||
|
||||
commaSep = (`sepBy` comma)
|
||||
semicolonSep = (`sepBy` semicolon)
|
||||
|
||||
commaSep1 = (`sepBy1` comma)
|
||||
semicolonSep1 = (`sepBy1` semicolon)
|
||||
|
||||
-- chars & strings
|
||||
|
||||
charLiteral = lexeme ( between (char '\'')
|
||||
(char '\'' <?> "end of character")
|
||||
characterChar )
|
||||
<?> "character"
|
||||
|
||||
characterChar = charLetter <|> charEscape <?> "literal character"
|
||||
|
||||
charEscape = char '\\' >> escapeCode
|
||||
charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
|
||||
|
||||
stringLiteral =
|
||||
lexeme ((foldr (maybe id (:)) "" <$>
|
||||
between (char '"') (char '"' <?> "end of string")
|
||||
(many stringChar)) <?> "literal string")
|
||||
|
||||
stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"
|
||||
|
||||
stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
|
||||
|
||||
stringEscape = char '\\' >>
|
||||
( (escapeGap >> return Nothing) <|>
|
||||
(escapeEmpty >> return Nothing) <|>
|
||||
(Just <$> escapeCode) )
|
||||
|
||||
escapeEmpty = char '&'
|
||||
escapeGap = some spaceChar >> char '\\' <?> "end of string gap"
|
||||
|
||||
-- escape codes
|
||||
|
||||
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
|
||||
<?> "escape code"
|
||||
|
||||
charEsc = choice (parseEsc <$> escMap)
|
||||
where parseEsc (c, code) = char c >> return code
|
||||
|
||||
charNum = toEnum . fromInteger <$>
|
||||
( decimal <|>
|
||||
(char 'o' >> nump "0o" octDigitChar) <|>
|
||||
(char 'x' >> nump "0x" hexDigitChar) )
|
||||
|
||||
charAscii = choice (parseAscii <$> asciiMap)
|
||||
where parseAscii (asc, code) = try (string asc >> return code)
|
||||
|
||||
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upperChar)
|
||||
|
||||
-- escape code tables
|
||||
|
||||
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
|
||||
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
|
||||
|
||||
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
|
||||
"FS","GS","RS","US","SP"]
|
||||
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
|
||||
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
|
||||
"CAN","SUB","ESC","DEL"]
|
||||
|
||||
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
|
||||
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
|
||||
|
||||
-- numbers — integers
|
||||
|
||||
integer = decimal
|
||||
integer' = signed integer
|
||||
|
||||
decimal = lexeme (nump "" digitChar <?> "integer")
|
||||
hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigitChar
|
||||
octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigitChar
|
||||
|
||||
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
|
||||
|
||||
signed p = ($) <$> option id (lexeme sign) <*> p
|
||||
|
||||
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
|
||||
sign = (char '+' *> return id) <|> (char '-' *> return negate)
|
||||
|
||||
-- numbers — floats
|
||||
|
||||
float = lexeme ffloat <?> "float"
|
||||
float' = signed float
|
||||
|
||||
ffloat = read <$> ffloat'
|
||||
where
|
||||
ffloat' = do
|
||||
decimal <- fDec
|
||||
rest <- fraction <|> fExp
|
||||
return $ decimal ++ rest
|
||||
|
||||
fraction = do
|
||||
void $ char '.'
|
||||
decimal <- fDec
|
||||
exp <- option "" fExp
|
||||
return $ '.' : decimal ++ exp
|
||||
|
||||
fDec = some digitChar
|
||||
|
||||
fExp = do
|
||||
expChar <- oneOf "eE"
|
||||
signStr <- option "" (pure <$> oneOf "+-")
|
||||
decimal <- fDec
|
||||
return $ expChar : signStr ++ decimal
|
||||
|
||||
-- numbers — a more general case
|
||||
|
||||
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
|
||||
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
|
||||
|
||||
-- operators & reserved ops
|
||||
|
||||
reservedOp name =
|
||||
lexeme $ try $ do
|
||||
void $ string name
|
||||
notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
|
||||
|
||||
operator =
|
||||
lexeme $ try $ do
|
||||
name <- oper
|
||||
if isReservedOp name
|
||||
then unexpected ("reserved operator " ++ show name)
|
||||
else return name
|
||||
|
||||
oper = ((:) <$> opStart languageDef <*> many (opLetter languageDef))
|
||||
<?> "operator"
|
||||
|
||||
isReservedOp = isReserved . sort $ reservedOpNames languageDef
|
||||
|
||||
-- identifiers & reserved words
|
||||
|
||||
reserved name =
|
||||
lexeme $ try $ do
|
||||
void $ caseString name
|
||||
notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
|
||||
|
||||
caseString name
|
||||
| caseSensitive languageDef = string name
|
||||
| otherwise = walk name >> return name
|
||||
where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
|
||||
caseChar c
|
||||
| isAlpha c = char (toLower c) <|> char (toUpper c)
|
||||
| otherwise = char c
|
||||
|
||||
identifier =
|
||||
lexeme $ try $ do
|
||||
name <- ident
|
||||
if isReservedName name
|
||||
then unexpected ("reserved word " ++ show name)
|
||||
else return name
|
||||
|
||||
ident = ((:) <$> identStart languageDef <*> many (identLetter languageDef))
|
||||
<?> "identifier"
|
||||
|
||||
isReservedName name = isReserved theReservedNames caseName
|
||||
where caseName
|
||||
| caseSensitive languageDef = name
|
||||
| otherwise = toLower <$> name
|
||||
|
||||
isReserved names name = scan names
|
||||
where scan [] = False
|
||||
scan (r:rs) = case compare r name of
|
||||
LT -> scan rs
|
||||
EQ -> True
|
||||
GT -> False
|
||||
|
||||
theReservedNames
|
||||
| caseSensitive languageDef = sort reserved
|
||||
| otherwise = sort . fmap (fmap toLower) $ reserved
|
||||
where reserved = reservedNames languageDef
|
||||
|
||||
-- white space & symbols
|
||||
|
||||
symbol = lexeme . string
|
||||
|
||||
lexeme p = p <* whiteSpace
|
||||
|
||||
whiteSpace = hidden space -- FIXME: write it in a decent manner
|
||||
-- \| noLine && noMulti = skipMany (space <?> "")
|
||||
-- \| noLine = skipMany (space <|>
|
||||
-- multiLineComment <?> "")
|
||||
-- \| noMulti = skipMany (space <|>
|
||||
-- oneLineComment <?> "")
|
||||
-- \| otherwise = skipMany (space <|>
|
||||
-- oneLineComment <|>
|
||||
-- multiLineComment <?> "")
|
||||
-- where
|
||||
-- noLine = null (commentLine languageDef)
|
||||
-- noMulti = null (commentStart languageDef)
|
||||
|
||||
-- oneLineComment = void (try (string (commentLine languageDef))
|
||||
-- >> skipMany (satisfy (/= '\n')))
|
||||
|
||||
-- multiLineComment = try (string (commentStart languageDef)) >> inComment
|
||||
|
||||
-- inComment = if nestedComments languageDef
|
||||
-- then inCommentMulti
|
||||
-- else inCommentSingle
|
||||
|
||||
-- inCommentMulti
|
||||
-- = void (try . string $ commentEnd languageDef)
|
||||
-- <|> (multiLineComment >> inCommentMulti)
|
||||
-- <|> (skipSome (noneOf startEnd) >> inCommentMulti)
|
||||
-- <|> (oneOf startEnd >> inCommentMulti)
|
||||
-- <?> "end of comment"
|
||||
|
||||
-- inCommentSingle
|
||||
-- = void (try . string $ commentEnd languageDef)
|
||||
-- <|> (skipSome (noneOf startEnd) >> inCommentSingle)
|
||||
-- <|> (oneOf startEnd >> inCommentSingle)
|
||||
-- <?> "end of comment"
|
||||
|
||||
-- startEnd = nub $ (++) <$> commentEnd <*> commentStart $ languageDef
|
||||
-- > parens = between (symbol "(") (symbol ")")
|
||||
-- > braces = between (symbol "{") (symbol "}")
|
||||
-- > angles = between (symbol "<") (symbol ">")
|
||||
-- > brackets = between (symbol "[") (symbol "]")
|
||||
-- > semicolon = symbol ";"
|
||||
-- > comma = symbol ","
|
||||
-- > colon = symbol ":"
|
||||
-- > dot = symbol "."
|
||||
|
||||
symbol :: Stream s m Char =>
|
||||
ParsecT s u m () -> String -> ParsecT s u m String
|
||||
symbol spc = lexeme spc . C.string
|
||||
|
||||
-- | Case-insensitive version of 'symbol'. This may be helpful if you're
|
||||
-- working with case-insensitive languages.
|
||||
|
||||
symbol' :: Stream s m Char =>
|
||||
ParsecT s u m () -> String -> ParsecT s u m String
|
||||
symbol' spc = lexeme spc . C.string'
|
||||
|
||||
-- | @indentGuard spaceConsumer test@ first consumes all white space
|
||||
-- (indentation) with @spaceConsumer@ parser, then it checks column
|
||||
-- position. It should satisfy supplied predicate @test@, otherwise the
|
||||
-- parser fails with error message “incorrect indentation”. On success
|
||||
-- current column position is returned.
|
||||
--
|
||||
-- When you want to parse block of indentation first run this parser with
|
||||
-- predicate like @(> 1)@ — this will make sure you have some
|
||||
-- indentation. Use returned value to check indentation on every subsequent
|
||||
-- line according to syntax of your language.
|
||||
|
||||
indentGuard :: Stream s m t =>
|
||||
ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int
|
||||
indentGuard spc p = do
|
||||
spc
|
||||
pos <- sourceColumn <$> getPosition
|
||||
if p pos
|
||||
then return pos
|
||||
else fail "incorrect indentation"
|
||||
|
||||
-- | Given comment prefix this function returns parser that skips line
|
||||
-- comments. Note that it stops just before newline character but doesn't
|
||||
-- consume the newline. Newline is either supposed to be consumed by 'space'
|
||||
-- parser or picked up manually.
|
||||
|
||||
skipLineComment :: Stream s m Char => String -> ParsecT s u m ()
|
||||
skipLineComment prefix = p >> void (manyTill C.anyChar n)
|
||||
where p = try $ C.string prefix
|
||||
n = lookAhead C.newline
|
||||
|
||||
-- | @skipBlockComment start end@ skips non-nested block comment starting
|
||||
-- with @start@ and ending with @end@.
|
||||
|
||||
skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m ()
|
||||
skipBlockComment start end = p >> void (manyTill C.anyChar n)
|
||||
where p = try $ C.string start
|
||||
n = try $ C.string end
|
||||
|
||||
-- Character and string literals
|
||||
|
||||
-- | The lexeme parser parses a single literal character without
|
||||
-- quotes. Purpose of this parser is to help with parsing of commonly used
|
||||
-- escape sequences. It's your responsibility to take care of character
|
||||
-- literal syntax in your language (surround it with single quotes or
|
||||
-- similar).
|
||||
--
|
||||
-- The literal character is parsed according to the grammar rules defined in
|
||||
-- the Haskell report.
|
||||
--
|
||||
-- Note that you can use this parser as a building block to parse various
|
||||
-- string literals:
|
||||
--
|
||||
-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"')
|
||||
|
||||
charLiteral :: Stream s m Char => ParsecT s u m Char
|
||||
charLiteral = label "literal character" $ do
|
||||
r@(x:_) <- lookAhead $ count' 1 8 C.anyChar
|
||||
case listToMaybe (readLitChar r) of
|
||||
Just (c, r') -> count (length r - length r') C.anyChar >> return c
|
||||
Nothing -> unexpected (showToken x)
|
||||
|
||||
-- Numbers
|
||||
|
||||
-- | Parse an integer without sign in decimal representation (according to
|
||||
-- format of integer literals described in Haskell report).
|
||||
--
|
||||
-- If you need to parse signed integers, see 'signed' combinator.
|
||||
|
||||
integer :: Stream s m Char => ParsecT s u m Integer
|
||||
integer = decimal <?> "integer"
|
||||
|
||||
-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label,
|
||||
-- while this parser is not labeled.
|
||||
|
||||
decimal :: Stream s m Char => ParsecT s u m Integer
|
||||
decimal = nump "" C.digitChar
|
||||
|
||||
-- | Parse an integer in hexadecimal representation. Representation of
|
||||
-- hexadecimal number is expected to be according to Haskell report except
|
||||
-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is
|
||||
-- reponsibility of the programmer to parse correct prefix before parsing
|
||||
-- the number itself.
|
||||
--
|
||||
-- For example you can make it conform to Haskell report like this:
|
||||
--
|
||||
-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal
|
||||
|
||||
hexadecimal :: Stream s m Char => ParsecT s u m Integer
|
||||
hexadecimal = nump "0x" C.hexDigitChar
|
||||
|
||||
-- | Parse an integer in octal representation. Representation of octal
|
||||
-- number is expected to be according to Haskell report except for the fact
|
||||
-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility
|
||||
-- of the programmer to parse correct prefix before parsing the number
|
||||
-- itself.
|
||||
|
||||
octal :: Stream s m Char => ParsecT s u m Integer
|
||||
octal = nump "0o" C.octDigitChar
|
||||
|
||||
-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then
|
||||
-- prepends @prefix@ to returned value and tries to interpret the result as
|
||||
-- an integer according to Haskell syntax.
|
||||
|
||||
nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer
|
||||
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
|
||||
|
||||
-- | Parse a floating point value without sign. Representation of floating
|
||||
-- point value is expected to be according to Haskell report.
|
||||
--
|
||||
-- If you need to parse signed floats, see 'signed' combinator.
|
||||
|
||||
float :: Stream s m Char => ParsecT s u m Double
|
||||
float = label "float" $ read <$> f
|
||||
where f = do
|
||||
d <- some C.digitChar
|
||||
rest <- fraction <|> fExp
|
||||
return $ d ++ rest
|
||||
|
||||
-- | This is a helper for 'float' parser. It parses fractional part of
|
||||
-- floating point number, that is, dot and everything after it.
|
||||
|
||||
fraction :: Stream s m Char => ParsecT s u m String
|
||||
fraction = do
|
||||
void $ C.char '.'
|
||||
d <- some C.digitChar
|
||||
e <- option "" fExp
|
||||
return $ '.' : d ++ e
|
||||
|
||||
-- | This helper parses exponent of floating point numbers.
|
||||
|
||||
fExp :: Stream s m Char => ParsecT s u m String
|
||||
fExp = do
|
||||
expChar <- C.char' 'e'
|
||||
signStr <- option "" (pure <$> choice (C.char <$> "+-"))
|
||||
d <- some C.digitChar
|
||||
return $ expChar : signStr ++ d
|
||||
|
||||
-- | Parse a number: either integer or floating point. The parser can handle
|
||||
-- overlapping grammars graciously.
|
||||
|
||||
number :: Stream s m Char => ParsecT s u m (Either Integer Double)
|
||||
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
|
||||
|
||||
-- | @signed space p@ parser parses optional sign, then if there is a sign
|
||||
-- it will consume optional white space (using @space@ parser), then it runs
|
||||
-- parser @p@ which should return a number. Sign of the number is changed
|
||||
-- according to previously parsed sign.
|
||||
--
|
||||
-- For example, to parse signed integer you can write:
|
||||
--
|
||||
-- > lexeme = L.lexeme spaceConsumer
|
||||
-- > integer = lexeme L.integer
|
||||
-- > signedInteger = signed spaceConsumer integer
|
||||
|
||||
signed :: (Stream s m Char, Num a) =>
|
||||
ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a
|
||||
signed spc p = ($) <$> option id (lexeme spc sign) <*> p
|
||||
|
||||
-- | Parse a sign and return either 'id' or 'negate' according to parsed
|
||||
-- sign.
|
||||
|
||||
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
|
||||
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)
|
||||
|
@ -100,7 +100,6 @@ library
|
||||
, Text.Megaparsec.Combinator
|
||||
, Text.Megaparsec.Error
|
||||
, Text.Megaparsec.Expr
|
||||
, Text.Megaparsec.Language
|
||||
, Text.Megaparsec.Lexer
|
||||
, Text.Megaparsec.Perm
|
||||
, Text.Megaparsec.Pos
|
||||
@ -116,6 +115,7 @@ test-suite old-tests
|
||||
main-is: Main.hs
|
||||
hs-source-dirs: old-tests
|
||||
type: exitcode-stdio-1.0
|
||||
ghc-options: -O2 -Wall
|
||||
other-modules: Bugs
|
||||
, Bugs.Bug2
|
||||
, Bugs.Bug6
|
||||
@ -126,7 +126,8 @@ test-suite old-tests
|
||||
, HUnit == 1.2.*
|
||||
, test-framework >= 0.6 && < 1
|
||||
, test-framework-hunit >= 0.2 && < 0.4
|
||||
ghc-options: -O2 -Wall
|
||||
default-extensions:
|
||||
FlexibleContexts
|
||||
default-language: Haskell2010
|
||||
|
||||
test-suite tests
|
||||
|
@ -1,21 +1,33 @@
|
||||
|
||||
module Bugs.Bug2 (main) where
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Control.Applicative (empty)
|
||||
import Control.Monad (void)
|
||||
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Language (haskellDef)
|
||||
import Text.Megaparsec.String
|
||||
import qualified Text.Megaparsec.Lexer as L
|
||||
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
sc :: Parser ()
|
||||
sc = L.space (void spaceChar) empty empty
|
||||
|
||||
lexeme :: Parser a -> Parser a
|
||||
lexeme = L.lexeme sc
|
||||
|
||||
stringLiteral :: Parser String
|
||||
stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"')
|
||||
|
||||
main :: Test
|
||||
main =
|
||||
testCase "Control Char Parsing (#2)" $
|
||||
parseString "\"test\\^Bstring\"" @?= "test\^Bstring"
|
||||
where
|
||||
parseString :: String -> String
|
||||
parseString input =
|
||||
case parse parser "Example" input of
|
||||
case parse stringLiteral "Example" input of
|
||||
Left{} -> error "Parse failure"
|
||||
Right str -> str
|
||||
parser = L.stringLiteral $ L.makeLexer haskellDef
|
||||
|
@ -2,13 +2,11 @@
|
||||
module Bugs.Bug35 (main) where
|
||||
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Language
|
||||
import Text.Megaparsec.String
|
||||
import qualified Text.Megaparsec.Lexer as L
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
trickyFloats :: [String]
|
||||
trickyFloats =
|
||||
@ -28,12 +26,9 @@ trickyFloats =
|
||||
, "32.96176575630599"
|
||||
, "38.47735512322269" ]
|
||||
|
||||
float :: Parser Double
|
||||
float = L.float (L.makeLexer emptyDef)
|
||||
|
||||
testBatch :: Assertion
|
||||
testBatch = mapM_ testFloat trickyFloats
|
||||
where testFloat x = parse float "" x @?= Right (read x :: Double)
|
||||
where testFloat x = parse L.float "" x @?= Right (read x :: Double)
|
||||
|
||||
main :: Test
|
||||
main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch
|
||||
|
@ -1,16 +1,17 @@
|
||||
|
||||
module Bugs.Bug39 (main) where
|
||||
|
||||
import Control.Applicative (empty)
|
||||
import Control.Monad (void)
|
||||
import Data.Either (isLeft, isRight)
|
||||
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Language
|
||||
import Text.Megaparsec.String
|
||||
import qualified Text.Megaparsec.Lexer as L
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
shouldFail :: [String]
|
||||
shouldFail = [" 1", " +1", " -1"]
|
||||
@ -18,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"]
|
||||
shouldSucceed :: [String]
|
||||
shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
|
||||
|
||||
sc :: Parser ()
|
||||
sc = L.space (void spaceChar) empty empty
|
||||
|
||||
lexeme :: Parser a -> Parser a
|
||||
lexeme = L.lexeme sc
|
||||
|
||||
integer :: Parser Integer
|
||||
integer = L.integer' (L.makeLexer emptyDef)
|
||||
integer = lexeme $ L.signed sc L.integer
|
||||
|
||||
testBatch :: Assertion
|
||||
testBatch = mapM_ (f testFail) shouldFail >>
|
||||
|
@ -1,13 +1,13 @@
|
||||
|
||||
module Bugs.Bug6 (main) where
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.String
|
||||
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
import Util
|
||||
|
||||
main :: Test
|
||||
@ -19,5 +19,5 @@ variable :: Parser String
|
||||
variable = do
|
||||
x <- lookAhead (some letterChar)
|
||||
if x == "return"
|
||||
then fail "'return' is a reserved keyword"
|
||||
else string x
|
||||
then fail "'return' is a reserved keyword"
|
||||
else string x
|
||||
|
@ -1,15 +1,17 @@
|
||||
|
||||
module Bugs.Bug9 (main) where
|
||||
|
||||
import Control.Applicative (empty)
|
||||
import Control.Monad (void)
|
||||
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Language (haskellStyle)
|
||||
import Text.Megaparsec.String (Parser)
|
||||
import Text.Megaparsec.Expr
|
||||
import Text.Megaparsec.String (Parser)
|
||||
import qualified Text.Megaparsec.Lexer as L
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
import Util
|
||||
|
||||
@ -25,12 +27,21 @@ main =
|
||||
|
||||
-- Syntax analysis
|
||||
|
||||
sc :: Stream s m Char => ParsecT s u m ()
|
||||
sc = L.space (void spaceChar) empty empty
|
||||
|
||||
lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
|
||||
lexeme = L.lexeme sc
|
||||
|
||||
integer :: Stream s m Char => ParsecT s u m Integer
|
||||
integer = lexeme L.integer
|
||||
|
||||
operator :: Stream s m Char => String -> ParsecT s u m String
|
||||
operator = try . L.symbol sc
|
||||
|
||||
parseTopLevel :: Parser Expr
|
||||
parseTopLevel = parseExpr <* eof
|
||||
|
||||
parseExpr :: Parser Expr
|
||||
parseExpr = makeExprParser (Const <$> integer) table
|
||||
where table = [[ InfixL (Op <$ reserved ">>>") ]]
|
||||
lexer = L.makeLexer haskellStyle { L.reservedOpNames = [">>>"] }
|
||||
integer = L.integer lexer
|
||||
reserved = L.reserved lexer
|
||||
where table = [[ InfixL (Op <$ operator ">>>") ]]
|
||||
|
198
tests/Lexer.hs
198
tests/Lexer.hs
@ -29,16 +29,206 @@
|
||||
|
||||
module Lexer (tests) where
|
||||
|
||||
import Control.Applicative (some, (<|>))
|
||||
import Control.Applicative (empty)
|
||||
import Control.Monad (void)
|
||||
import Data.Char
|
||||
( readLitChar
|
||||
, showLitChar
|
||||
, isDigit
|
||||
, isAlphaNum
|
||||
, isSpace
|
||||
, toLower )
|
||||
import Data.List (findIndices, isInfixOf)
|
||||
import Data.Maybe (listToMaybe, maybeToList, isNothing, fromJust)
|
||||
import Numeric (showInt, showHex, showOct, showSigned)
|
||||
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.QuickCheck2 (testProperty)
|
||||
import Test.QuickCheck
|
||||
|
||||
import Text.Megaparsec.Char
|
||||
import Text.Megaparsec.Combinator
|
||||
import Text.Megaparsec.Error
|
||||
import Text.Megaparsec.Lexer
|
||||
import Text.Megaparsec.Pos
|
||||
import Text.Megaparsec.Prim
|
||||
import Text.Megaparsec.String
|
||||
import qualified Text.Megaparsec.Char as C
|
||||
|
||||
import Util
|
||||
|
||||
tests :: Test
|
||||
tests = testGroup "Lexer"
|
||||
[]
|
||||
[ testProperty "space combinator" prop_space
|
||||
, testProperty "symbol combinator" prop_symbol
|
||||
, testProperty "symbol' combinator" prop_symbol'
|
||||
, testProperty "indentGuard combinator" prop_indentGuard
|
||||
, testProperty "charLiteral" prop_charLiteral
|
||||
, testProperty "integer" prop_integer
|
||||
, testProperty "decimal" prop_decimal
|
||||
, testProperty "hexadecimal" prop_hexadecimal
|
||||
, testProperty "octal" prop_octal
|
||||
, testProperty "float 0" prop_float_0
|
||||
, testProperty "float 1" prop_float_1
|
||||
, testProperty "number" prop_number
|
||||
, testProperty "signed" prop_signed ]
|
||||
|
||||
newtype WhiteSpace = WhiteSpace
|
||||
{ getWhiteSpace :: String }
|
||||
deriving (Show, Eq)
|
||||
|
||||
instance Arbitrary WhiteSpace where
|
||||
arbitrary = WhiteSpace . concat <$> listOf whiteUnit
|
||||
|
||||
newtype Symbol = Symbol
|
||||
{ getSymbol :: String }
|
||||
deriving (Show, Eq)
|
||||
|
||||
instance Arbitrary Symbol where
|
||||
arbitrary = Symbol <$> ((++) <$> symbolName <*> whiteChars)
|
||||
|
||||
whiteUnit :: Gen String
|
||||
whiteUnit = oneof [whiteChars, whiteLine, whiteBlock]
|
||||
|
||||
whiteChars :: Gen String
|
||||
whiteChars = listOf $ elements "\t\n "
|
||||
|
||||
whiteLine :: Gen String
|
||||
whiteLine = commentOut <$> arbitrary `suchThat` goodEnough
|
||||
where commentOut x = "//" ++ x ++ "\n"
|
||||
goodEnough x = '\n' `notElem` x
|
||||
|
||||
whiteBlock :: Gen String
|
||||
whiteBlock = commentOut <$> arbitrary `suchThat` goodEnough
|
||||
where commentOut x = "/*" ++ x ++ "*/"
|
||||
goodEnough x = not $ "*/" `isInfixOf` x
|
||||
|
||||
symbolName :: Gen String
|
||||
symbolName = listOf $ arbitrary `suchThat` isAlphaNum
|
||||
|
||||
sc :: Parser ()
|
||||
sc = space (void C.spaceChar) l b
|
||||
where l = skipLineComment "//"
|
||||
b = skipBlockComment "/*" "*/"
|
||||
|
||||
sc' :: Parser ()
|
||||
sc' = space (void $ C.oneOf " \t") empty empty
|
||||
|
||||
prop_space :: WhiteSpace -> Property
|
||||
prop_space w = checkParser p r s
|
||||
where p = sc
|
||||
r = Right ()
|
||||
s = getWhiteSpace w
|
||||
|
||||
prop_symbol :: Symbol -> Maybe Char -> Property
|
||||
prop_symbol = parseSymbol (symbol sc) id
|
||||
|
||||
prop_symbol' :: Symbol -> Maybe Char -> Property
|
||||
prop_symbol' = parseSymbol (symbol' sc) (fmap toLower)
|
||||
|
||||
parseSymbol :: (String -> Parser String) -> (String -> String)
|
||||
-> Symbol -> Maybe Char -> Property
|
||||
parseSymbol p' f s' t = checkParser p r s
|
||||
where p = p' (f g)
|
||||
r | g == s || isSpace (last s) = Right (f g)
|
||||
| otherwise = posErr (length s - 1) s [uneCh (last s), exEof]
|
||||
g = takeWhile (not . isSpace) s
|
||||
s = getSymbol s' ++ maybeToList t
|
||||
|
||||
newtype IndLine = IndLine
|
||||
{ getIndLine :: String }
|
||||
deriving (Show, Eq)
|
||||
|
||||
instance Arbitrary IndLine where
|
||||
arbitrary = IndLine . concat <$> sequence [spc, sym, spc, eol]
|
||||
where spc = listOf (elements " \t")
|
||||
sym = return "xxx"
|
||||
eol = return "\n"
|
||||
|
||||
prop_indentGuard :: IndLine -> IndLine -> IndLine -> Property
|
||||
prop_indentGuard l0 l1 l2 = checkParser p r s
|
||||
where p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp
|
||||
ip = indentGuard sc'
|
||||
sp = void $ symbol sc' "xxx" <* C.eol
|
||||
r | f' l0 <= 1 = posErr 0 s msg'
|
||||
| f' l1 /= f' l0 = posErr (f l1 + g [l0]) s msg'
|
||||
| f' l2 <= f' l0 = posErr (f l2 + g [l0, l1]) s msg'
|
||||
| otherwise = Right ()
|
||||
msg' = [msg "incorrect indentation"]
|
||||
f = length . takeWhile isSpace . getIndLine
|
||||
f' x = sourceColumn $
|
||||
updatePosString (initialPos "") $ take (f x) (getIndLine x)
|
||||
g xs = sum $ length . getIndLine <$> xs
|
||||
s = concat $ getIndLine <$> [l0, l1, l2]
|
||||
|
||||
prop_charLiteral :: String -> Bool -> Property
|
||||
prop_charLiteral t i = checkParser charLiteral r s
|
||||
where b = listToMaybe $ readLitChar s
|
||||
(h, g) = fromJust b
|
||||
r | isNothing b = posErr 0 s $ exSpec "literal character" :
|
||||
[ if null s then uneEof else uneCh (head s) ]
|
||||
| null g = Right h
|
||||
| otherwise = posErr l s [uneCh (head g), exEof]
|
||||
l = length s - length g
|
||||
s = if null t || i then t else showLitChar (head t) (tail t)
|
||||
|
||||
prop_integer :: NonNegative Integer -> Int -> Property
|
||||
prop_integer n' i = checkParser integer r s
|
||||
where (r, s) = quasiCorrupted n' i showInt "integer"
|
||||
|
||||
prop_decimal :: NonNegative Integer -> Int -> Property
|
||||
prop_decimal n' i = checkParser decimal r s
|
||||
where (r, s) = quasiCorrupted n' i showInt "digit"
|
||||
|
||||
prop_hexadecimal :: NonNegative Integer -> Int -> Property
|
||||
prop_hexadecimal n' i = checkParser hexadecimal r s
|
||||
where (r, s) = quasiCorrupted n' i showHex "hexadecimal digit"
|
||||
|
||||
prop_octal :: NonNegative Integer -> Int -> Property
|
||||
prop_octal n' i = checkParser octal r s
|
||||
where (r, s) = quasiCorrupted n' i showOct "octal digit"
|
||||
|
||||
prop_float_0 :: NonNegative Double -> Property
|
||||
prop_float_0 n' = checkParser float r s
|
||||
where n = getNonNegative n'
|
||||
r = Right n
|
||||
s = show n
|
||||
|
||||
prop_float_1 :: Maybe (NonNegative Integer) -> Property
|
||||
prop_float_1 n' = checkParser float r s
|
||||
where r | isNothing n' = posErr 0 s [uneEof, exSpec "float"]
|
||||
| otherwise = posErr (length s) s [ uneEof, exCh '.', exCh 'E'
|
||||
, exCh 'e', exSpec "digit" ]
|
||||
s = maybe "" (show . getNonNegative) n'
|
||||
|
||||
prop_number :: Either (NonNegative Integer) (NonNegative Double)
|
||||
-> Integer -> Property
|
||||
prop_number n' i = checkParser number r s
|
||||
where r | null s = posErr 0 s [uneEof, exSpec "number"]
|
||||
| otherwise =
|
||||
Right $ case n' of
|
||||
Left x -> Left $ getNonNegative x
|
||||
Right x -> Right $ getNonNegative x
|
||||
s = if i < 5
|
||||
then ""
|
||||
else either (show . getNonNegative) (show . getNonNegative) n'
|
||||
|
||||
prop_signed :: Integer -> Int -> Bool -> Property
|
||||
prop_signed n i plus = checkParser p r s
|
||||
where p = signed (hidden C.space) integer
|
||||
r | i > length z = Right n
|
||||
| otherwise = posErr i s $ [uneCh '?', exSpec "integer"] ++
|
||||
(if i <= 0 then [exCh '+', exCh '-'] else []) ++
|
||||
[exEof | i > head (findIndices isDigit s)]
|
||||
z = let bar = showSigned showInt 0 n ""
|
||||
in if n < 0 || plus then bar else '+' : bar
|
||||
s = if i <= length z then take i z ++ "?" ++ drop i z else z
|
||||
|
||||
quasiCorrupted :: NonNegative Integer -> Int
|
||||
-> (Integer -> String -> String) -> String
|
||||
-> (Either ParseError Integer, String)
|
||||
quasiCorrupted n' i shower l = (r, s)
|
||||
where n = getNonNegative n'
|
||||
r | i > length z = Right n
|
||||
| otherwise = posErr i s $ [uneCh '?', exSpec l] ++
|
||||
[ exEof | i > 0 ]
|
||||
z = shower n ""
|
||||
s = if i <= length z then take i z ++ "?" ++ drop i z else z
|
||||
|
Loading…
Reference in New Issue
Block a user