diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index db32d55..ddc3088 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -9,125 +9,135 @@ -- Stability : experimental -- Portability : non-portable (uses local universal quantification: PolymorphicComponents) -- --- A helper module to parse lexical elements. See 'makeLexer' for a --- description of how to use it. This module is supposed to be imported --- qualified. - -{-# OPTIONS_GHC -fno-warn-name-shadowing #-} +-- High-level parsers to help you write your lexer. The module doesn't +-- impose how you should write your parser, but certain approaches may be +-- more elegant than others. Especially important theme is parsing of write +-- space, comments and indentation. +-- +-- This module is supposed to be imported qualified: +-- +-- > import qualified Text.Megaparsec.Lexer as L module Text.Megaparsec.Lexer - ( LanguageDef (..) - , defaultLang + ( -- * White space and indentation + space + , lexeme + , symbol + , symbol' + , indentGuard , skipLineComment , skipBlockComment - , Lexer (..) - , makeLexer ) + -- * Character and string literals + , charLiteral + -- * Numbers + , integer + , decimal + , hexadecimal + , octal + , float + , number + , signed ) where -import Control.Applicative ((<|>), many, some, empty) +import Control.Applicative ((<|>), some) import Control.Monad (void) -import Data.Char (isAlpha, toLower, toUpper) -import Data.List (sort) +import Data.Char (readLitChar) +import Data.Maybe (listToMaybe) import Text.Megaparsec.Combinator import Text.Megaparsec.Pos import Text.Megaparsec.Prim +import Text.Megaparsec.ShowToken import qualified Text.Megaparsec.Char as C --- Language definition +-- White space and indentation --- | The @LanguageDef@ type is a record that contains all parameters used to --- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can --- be used as a basis for new language definitions. +-- | @space spaceChar lineComment blockComment@ produces parser that can +-- parse white space in general. It's expected that you create such a parser +-- once and pass it to many other function in this module as needed (it's +-- usually called @spaceConsumer@ in doc-strings here). +-- +-- @spaceChar@ is used to parse trivial space characters. You can use +-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as +-- your own parser (if you don't want automatically consume newlines, for +-- example). +-- +-- @lineComment@ is used to parse line comments. You can use +-- 'skipLineComment' if you don't need anything special. +-- +-- @blockComment@ is used to parse block (multi-line) comments. You can use +-- 'skipBlockComment' if you don't need anything special. +-- +-- Parsing of white space is important part of any parser. We propose scheme +-- where every lexeme should consume all trailing white space, but not +-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve +-- this. You only need to call 'space' “manually” to consume any white space +-- before the first lexeme (at the beginning of file). -data LanguageDef s u m = - LanguageDef { +space :: Stream s m t => ParsecT s u m () -> + ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m () +space ch line block = hidden . skipMany $ choice [ch, line, block] - -- | The parser is used to parse single white space character. If - -- indentation is important in your language you should probably not treat - -- newline as white space character. Also note that if newline is not - -- white space character, you will need to pick it up manually. +-- | This is wrapper for lexemes. Typical usage is to supply first argument +-- (parser that consumes white space, probably defined via 'space') and use +-- resulting function to wrap parsers for every lexeme. +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer - spaceChar :: ParsecT s u m Char +lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +lexeme spc p = p <* spc - -- | The parser parses line comments. It's responsibility of the parser to - -- stop at the end of line. If your language doesn't support this type of - -- comments, set this value to 'empty'. In simple cases you can use - -- 'skipLineComment' to quickly construct line comment parser. +-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the +-- first argument (parser that consumes white space, probably defined via +-- 'space') and then you can use the resulting function to parse strings: +-- +-- > symbol = L.symbol spaceConsumer +-- > +-- > parens = between (symbol "(") (symbol ")") +-- > braces = between (symbol "{") (symbol "}") +-- > angles = between (symbol "<") (symbol ">") +-- > brackets = between (symbol "[") (symbol "]") +-- > semicolon = symbol ";" +-- > comma = symbol "," +-- > colon = symbol ":" +-- > dot = symbol "." - , lineComment :: ParsecT s u m () +symbol :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol spc = lexeme spc . C.string - -- | The parser parses block (multi-line) comments. If your language - -- doesn't support this type of comments, set this value to 'empty'. In - -- simple cases you can use 'skipBlockComment' to quickly construct block - -- comment parser. +-- | Case-insensitive version of 'symbol'. This may be helpful if you're +-- working with case-insensitive languages. - , blockComment :: ParsecT s u m () +symbol' :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol' spc = lexeme spc . C.string' - -- NEXT +-- | @indentGuard spaceConsumer test@ first consumes all white space +-- (indentation) with @spaceConsumer@ parser, then it checks column +-- position. It should satisfy supplied predicate @test@, otherwise the +-- parser fails with error message “incorrect indentation”. On success +-- current column position is returned. +-- +-- When you want to parse block of indentation first run this parser with +-- predicate like @(> 0)@ — this will make sure you have some +-- indentation. Use returned value to check indentation on every subsequent +-- line according to syntax of your language. - -- | This parser should accept any start characters of identifiers, for - -- example @letter \<|> char \'_\'@. - - , identStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of identifiers, - -- for example @alphaNum \<|> char \'_\'@. - - , identLetter :: ParsecT s u m Char - - -- | This parser should accept any start characters of operators, for - -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@ - - , opStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of operators. - -- Note that this parser should even be defined if the language doesn't - -- support user-defined operators, or otherwise the 'reservedOp' parser - -- won't work correctly. - - , opLetter :: ParsecT s u m Char - - -- | The list of reserved identifiers. - - , reservedNames :: [String] - - -- | The list of reserved operators. - - , reservedOpNames :: [String] - - -- | Set to 'True' if the language is case sensitive. - - , caseSensitive :: Bool } - --- Default language definition - --- | This is standard language definition. It is recommended to use --- this definition as the basis for other definitions. @defaultLang@ has no --- reserved names or operators, is case sensitive and doesn't accept --- comments, identifiers or operators. - -defaultLang :: Stream s m Char => LanguageDef s u m -defaultLang = - LanguageDef - { spaceChar = C.spaceChar - , lineComment = empty - , blockComment = empty - -- NEXT - , identStart = C.letterChar <|> C.char '_' - , identLetter = C.alphaNumChar <|> C.oneOf "_'" - , opStart = opLetter defaultLang - , opLetter = C.oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- Utility functions +indentGuard :: Stream s m t => + ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int +indentGuard spc p = do + spc + pos <- sourceColumn <$> getPosition + if p pos + then return pos + else fail "incorrect indentation" -- | Given comment prefix this function returns parser that skips line -- comments. Note that it stops just before newline character but doesn't -- consume the newline. Newline is either supposed to be consumed by 'space' --- parser or picked manually. +-- parser or picked up manually. skipLineComment :: Stream s m Char => String -> ParsecT s u m () skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) @@ -138,437 +148,122 @@ skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m () skipBlockComment start end = C.string start >> void (manyTill C.anyChar n) - where n = lookAhead (C.string end) + where n = lookAhead . try $ C.string end --- Lexer +-- Character and string literals --- | The type of the record that holds lexical parsers that work on --- @s@ streams with state @u@ over a monad @m@. - -data Lexer s u m = - Lexer { - - -- | Skips any white space. White space consists of /zero/ or more - -- occurrences of 'spaceChar', a line comment or a block (multi-line) - -- comment. - - space :: ParsecT s u m () - - -- | @lexeme p@ first applies parser @p@ and then the 'space' parser, - -- returning the value of @p@. Every lexical token (lexeme) is defined - -- using @lexeme@, this way every parse starts at a point without white - -- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this - -- document. - -- - -- The only point where the 'space' parser should be called explicitly is - -- the start of the main parser in order to skip any leading white space. - - , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips - -- trailing white space. - - , symbol :: String -> ParsecT s u m String - - -- | @indentGuard p@ consumes all white space it can consume, then checks - -- column number. The column number should satisfy given predicate @p@, - -- otherwise the parser fails with “incorrect indentation” message. In - -- successful cases @indentGuard@ returns current column number. - - , indentGuard :: (Int -> Bool) -> ParsecT s u m Int - - -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, - -- returning the value of @p@. - - , parens :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and - -- “}”), returning the value of @p@. - - , braces :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<” - -- and “>”), returning the value of @p@. - - , angles :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[” - -- and “]”), returning the value of @p@. - - , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @semicolon@ parses the character “;” and skips any - -- trailing white space. Returns the string “;”. - - , semicolon :: ParsecT s u m String - - -- | Lexeme parser @comma@ parses the character “,” and skips any - -- trailing white space. Returns the string “,”. - - , comma :: ParsecT s u m String - - -- | Lexeme parser @colon@ parses the character “:” and skips any - -- trailing white space. Returns the string “:”. - - , colon :: ParsecT s u m String - - -- | Lexeme parser @dot@ parses the character “.” and skips any - -- trailing white space. Returns the string “.”. - - , dot :: ParsecT s u m String - - -- | The lexeme parser parses a single literal character. Returns the - -- literal character value. This parsers deals correctly with escape - -- sequences. The literal character is parsed according to the grammar - -- rules defined in the Haskell report (which matches most programming - -- languages quite closely). - - , charLiteral :: ParsecT s u m Char - - -- | The lexeme parser parses a literal string. Returns the literal - -- string value. This parsers deals correctly with escape sequences and - -- gaps. The literal string is parsed according to the grammar rules - -- defined in the Haskell report (which matches most programming languages - -- quite closely). - - , stringLiteral :: ParsecT s u m String - - -- | The lexeme parser parses an integer (a whole number). This parser - -- /does not/ parse sign. Returns the value of the number. The number can - -- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is - -- parsed according to the grammar rules in the Haskell report. - - , integer :: ParsecT s u m Integer - - -- | This is just like 'integer', except it can parse sign. - - , integer' :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the decimal system. - -- Returns the value of the number. - - , decimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the hexadecimal - -- system. The number should be prefixed with “0x” or “0X”. Returns the - -- value of the number. - - , hexadecimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the octal system. - -- The number should be prefixed with “0o” or “0O”. Returns the value of - -- the number. - - , octal :: ParsecT s u m Integer - - -- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and - -- then runs parser @p@, changing sign of its result accordingly. Note - -- that there may be white space after the sign but not before it. - - , signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a - - -- | The lexeme parser parses a floating point value. Returns the value - -- of the number. The number is parsed according to the grammar rules - -- defined in the Haskell report, sign is /not/ parsed, use 'float'' to - -- achieve parsing of signed floating point values. - - , float :: ParsecT s u m Double - - -- | This is just like 'float', except it can parse sign. - - , float' :: ParsecT s u m Double - - -- | The lexeme parser parses either 'integer' or a 'float'. - -- Returns the value of the number. This parser deals with any overlap in - -- the grammar rules for integers and floats. The number is parsed - -- according to the grammar rules defined in the Haskell report. - - , number :: ParsecT s u m (Either Integer Double) - - -- | This is just like 'number', except it can parse sign. - - , number' :: ParsecT s u m (Either Integer Double) - - -- | The lexeme parser parses a legal identifier. Returns the identifier - -- string. This parser will fail on identifiers that are reserved - -- words. Legal identifier (start) characters and reserved words are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - , identifier :: ParsecT s u m String - - -- | The lexeme parser @reserved name@ parses @symbol name@, but it also - -- checks that the @name@ is not a prefix of a valid identifier. - - , reserved :: String -> ParsecT s u m () - - -- | The lexeme parser parses a legal operator. Returns the name of the - -- operator. This parser will fail on any operators that are reserved - -- operators. Legal operator (start) characters and reserved operators are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - , operator :: ParsecT s u m String - - -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it - -- also checks that the @name@ is not a prefix of a valid operator. - - , reservedOp :: String -> ParsecT s u m () } - --- | The expression @makeLexer language@ creates a 'Lexer' record that --- contains lexical parsers that are defined using the definitions in the --- @language@ record. +-- | The lexeme parser parses a single literal character without +-- quotes. Purpose of this parser is to help with parsing of commonly used +-- escape sequences. It's your responsibility to take care of character +-- literal syntax in your language (surround it with single quotes or +-- similar). -- --- The use of this function is quite stylized — one imports the appropriate --- language definition and selects the lexical parsers that are needed from --- the resulting 'Lexer'. +-- The literal character is parsed according to the grammar rules defined in +-- the Haskell report. -- --- > module Main (main) where --- > --- > import Text.Megaparsec --- > import Text.Megaparsec.Language (haskellDef) --- > import qualified Text.Megaparsec.Lexer as L --- > --- > -- The parser --- > … --- > --- > expr = parens expr --- > <|> identifier --- > <|> … --- > --- > -- The lexer --- > lexer = L.makeLexer haskellDef --- > --- > parens = L.parens lexer --- > braces = L.braces lexer --- > identifier = L.identifier lexer --- > reserved = L.reserved lexer --- > … +-- Note that you can use this parser as a building block to parse various +-- string literals: +-- +-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"') -makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m -makeLexer lang = - Lexer - { space = space - , lexeme = lexeme - , symbol = symbol - , indentGuard = indentGuard +charLiteral :: Stream s m Char => ParsecT s u m Char +charLiteral = label "literal character" $ do + r@(x:_) <- lookAhead $ count' 1 8 C.anyChar + case listToMaybe (readLitChar r) of + Just (c, r') -> count (length r - length r') C.anyChar >> return c + Nothing -> unexpected (showToken x) - , parens = parens - , braces = braces - , angles = angles - , brackets = brackets - , semicolon = semicolon - , comma = comma - , colon = colon - , dot = dot +-- Numbers - , charLiteral = charLiteral - , stringLiteral = stringLiteral +-- | Parse an integer without sign in decimal representation (according to +-- format of integer literals described in Haskell report). +-- +-- If you need to parse signed integers, see 'signed' combinator. - , integer = integer - , integer' = integer' - , decimal = decimal - , hexadecimal = hexadecimal - , octal = octal - , signed = signed - , float = float - , float' = float' - , number = number - , number' = number' +integer :: Stream s m Char => ParsecT s u m Integer +integer = decimal "integer" - , identifier = identifier - , reserved = reserved - , operator = operator - , reservedOp = reservedOp } - where +-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label, +-- while this parser is not labeled. - -- white space & indentation +decimal :: Stream s m Char => ParsecT s u m Integer +decimal = nump "" C.digitChar - space = hidden . skipMany . choice $ - ($ lang) <$> [void . spaceChar, blockComment, lineComment] - lexeme p = p <* space - symbol = lexeme . C.string - indentGuard p = do - space - pos <- sourceColumn <$> getPosition - if p pos - then return pos - else fail "incorrect indentation" +-- | Parse an integer in hexadecimal representation. Representation of +-- hexadecimal number is expected to be according to Haskell report, that +-- is, it should be prefixed with “0x” or “0X” prefix. - -- auxiliary parsers +hexadecimal :: Stream s m Char => ParsecT s u m Integer +hexadecimal = C.char '0' >> C.char' 'x' >> nump "0x" C.hexDigitChar - parens = between (symbol "(") (symbol ")") - braces = between (symbol "{") (symbol "}") - angles = between (symbol "<") (symbol ">") - brackets = between (symbol "[") (symbol "]") - semicolon = symbol ";" - comma = symbol "," - colon = symbol ":" - dot = symbol "." +-- | Parse an integer in octal representation. Representation of octal +-- number is expected to be according to Haskell report, that is, it should +-- be prefixed with “0o” or “0O” prefix. - -- char & string literals +octal :: Stream s m Char => ParsecT s u m Integer +octal = C.char '0' >> C.char' 'o' >> nump "0o" C.octDigitChar - charLiteral = lexeme ( between (C.char '\'') - (C.char '\'' "end of character") - characterChar ) - "character" +-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then +-- prepends @prefix@ to returned value and tries to interpret the result as +-- an integer according to Haskell syntax. - characterChar = charLetter <|> charEscape "literal character" +nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer +nump prefix baseDigit = read . (prefix ++) <$> some baseDigit - charEscape = C.char '\\' >> escapeCode - charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) +-- | Parse a floating point value without sign. Representation of floating +-- point value is expected to be according to Haskell report. +-- +-- If you need to parse signed floats, see 'signed' combinator. - stringLiteral = - lexeme ((foldr (maybe id (:)) "" <$> - between (C.char '"') (C.char '"' "end of string") - (many stringChar)) "literal string") +float :: Stream s m Char => ParsecT s u m Double +float = label "float" $ read <$> f + where f = do + d <- some C.digitChar + rest <- fraction <|> fExp + return $ d ++ rest - stringChar = (Just <$> stringLetter) <|> stringEscape "string character" +-- | This is a helper for 'float' parser. It parses fractional part of +-- floating point number, that is, dot and everything after it. - stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) +fraction :: Stream s m Char => ParsecT s u m String +fraction = do + void $ C.char '.' + d <- some C.digitChar + e <- option "" fExp + return $ '.' : d ++ e - stringEscape = C.char '\\' >> - ( (escapeGap >> return Nothing) <|> - (escapeEmpty >> return Nothing) <|> - (Just <$> escapeCode) ) +-- | This helper parses exponent of floating point numbers. - escapeEmpty = C.char '&' - escapeGap = some C.spaceChar >> C.char '\\' "end of string gap" +fExp :: Stream s m Char => ParsecT s u m String +fExp = do + expChar <- C.char' 'e' + signStr <- option "" (pure <$> choice (C.char <$> "+-")) + d <- some C.digitChar + return $ expChar : signStr ++ d - -- escape codes +-- | Parse a number: either integer or floating point. The parser can handle +-- overlapping grammars graciously. - escapeCode = charEsc <|> charNum <|> charAscii <|> charControl - "escape code" +number :: Stream s m Char => ParsecT s u m (Either Integer Double) +number = (Right <$> try float) <|> (Left <$> integer) "number" - charEsc = choice (parseEsc <$> escMap) - where parseEsc (c, code) = C.char c >> return code +-- | @signed space p@ parser parses optional sign, then if there is a sign +-- it will consume optional white space (using @space@ parser), then it runs +-- parser @p@ which should return a number. Sign of the number is changed +-- according to previously parsed sign. +-- +-- For example, to parse signed integer you can write: +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer +-- > signedInteger = signed spaceConsumer integer - charNum = toEnum . fromInteger <$> - ( decimal <|> - (C.char 'o' >> nump "0o" C.octDigitChar) <|> - (C.char 'x' >> nump "0x" C.hexDigitChar) ) +signed :: (Stream s m Char, Num a) => + ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +signed spc p = ($) <$> option id (lexeme spc sign) <*> p - charAscii = choice (parseAscii <$> asciiMap) - where parseAscii (asc, code) = try (C.string asc >> return code) +-- | Parse a sign and return either 'id' or 'negate' according to parsed +-- sign. - charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar) - - -- escape code tables - - escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" - asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) - - ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", - "FS","GS","RS","US","SP"] - ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", - "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", - "CAN","SUB","ESC","DEL"] - - ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " - ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" - - -- numbers — integers - - integer = decimal - integer' = signed integer - - decimal = lexeme (nump "" C.digitChar "integer") - hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar - octal = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar - - nump prefix baseDigit = read . (prefix ++) <$> some baseDigit - - signed p = ($) <$> option id (lexeme sign) <*> p - - sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) - sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) - - -- numbers — floats - - float = lexeme ffloat "float" - float' = signed float - - ffloat = read <$> ffloat' - where - ffloat' = do - decimal <- fDec - rest <- fraction <|> fExp - return $ decimal ++ rest - - fraction = do - void $ C.char '.' - decimal <- fDec - exp <- option "" fExp - return $ '.' : decimal ++ exp - - fDec = some C.digitChar - - fExp = do - expChar <- C.oneOf "eE" - signStr <- option "" (pure <$> C.oneOf "+-") - decimal <- fDec - return $ expChar : signStr ++ decimal - - -- numbers — a more general case - - number = (Right <$> try float) <|> (Left <$> integer) "number" - number' = (Right <$> try float') <|> (Left <$> integer') "number" - - -- operators & reserved ops - - reservedOp name = - lexeme $ try $ do - void $ C.string name - notFollowedBy (opLetter lang) ("end of " ++ show name) - - operator = - lexeme $ try $ do - name <- oper - if isReservedOp name - then unexpected ("reserved operator " ++ show name) - else return name - - oper = ((:) <$> opStart lang <*> many (opLetter lang)) - "operator" - - isReservedOp = isReserved . sort $ reservedOpNames lang - - -- identifiers & reserved words - - reserved name = - lexeme $ try $ do - void $ caseString name - notFollowedBy (identLetter lang) ("end of " ++ show name) - - caseString name - | caseSensitive lang = C.string name - | otherwise = walk name >> return name - where walk = foldr (\c -> ((caseChar c show name) >>)) (return ()) - caseChar c - | isAlpha c = C.char (toLower c) <|> C.char (toUpper c) - | otherwise = C.char c - - identifier = - lexeme $ try $ do - name <- ident - if isReservedName name - then unexpected ("reserved word " ++ show name) - else return name - - ident = ((:) <$> identStart lang <*> many (identLetter lang)) - "identifier" - - isReservedName name = isReserved theReservedNames caseName - where caseName - | caseSensitive lang = name - | otherwise = toLower <$> name - - isReserved names name = scan names - where scan [] = False - scan (r:rs) = case compare r name of - LT -> scan rs - EQ -> True - GT -> False - - theReservedNames - | caseSensitive lang = sort reserved - | otherwise = sort . fmap (fmap toLower) $ reserved - where reserved = reservedNames lang +sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) +sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) diff --git a/megaparsec.cabal b/megaparsec.cabal index d988ffb..a914b94 100644 --- a/megaparsec.cabal +++ b/megaparsec.cabal @@ -115,6 +115,7 @@ test-suite old-tests main-is: Main.hs hs-source-dirs: old-tests type: exitcode-stdio-1.0 + ghc-options: -O2 -Wall other-modules: Bugs , Bugs.Bug2 , Bugs.Bug6 @@ -125,7 +126,8 @@ test-suite old-tests , HUnit == 1.2.* , test-framework >= 0.6 && < 1 , test-framework-hunit >= 0.2 && < 0.4 - ghc-options: -O2 -Wall + default-extensions: + FlexibleContexts default-language: Haskell2010 test-suite tests diff --git a/old-tests/Bugs/Bug2.hs b/old-tests/Bugs/Bug2.hs index 5cee124..6696c1e 100644 --- a/old-tests/Bugs/Bug2.hs +++ b/old-tests/Bugs/Bug2.hs @@ -1,13 +1,26 @@ module Bugs.Bug2 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit +import Control.Applicative (empty) +import Control.Monad (void) import Text.Megaparsec +import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + +stringLiteral :: Parser String +stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"') + main :: Test main = testCase "Control Char Parsing (#2)" $ @@ -15,7 +28,6 @@ main = where parseString :: String -> String parseString input = - case parse parser "Example" input of + case parse stringLiteral "Example" input of Left{} -> error "Parse failure" Right str -> str - parser = L.stringLiteral $ L.makeLexer L.defaultLang diff --git a/old-tests/Bugs/Bug35.hs b/old-tests/Bugs/Bug35.hs index 1bde9cd..ace833c 100644 --- a/old-tests/Bugs/Bug35.hs +++ b/old-tests/Bugs/Bug35.hs @@ -2,12 +2,11 @@ module Bugs.Bug35 (main) where import Text.Megaparsec -import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) trickyFloats :: [String] trickyFloats = @@ -27,12 +26,9 @@ trickyFloats = , "32.96176575630599" , "38.47735512322269" ] -float :: Parser Double -float = L.float (L.makeLexer L.defaultLang) - testBatch :: Assertion testBatch = mapM_ testFloat trickyFloats - where testFloat x = parse float "" x @?= Right (read x :: Double) + where testFloat x = parse L.float "" x @?= Right (read x :: Double) main :: Test main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch diff --git a/old-tests/Bugs/Bug39.hs b/old-tests/Bugs/Bug39.hs index 14a1c25..91b88e0 100644 --- a/old-tests/Bugs/Bug39.hs +++ b/old-tests/Bugs/Bug39.hs @@ -1,15 +1,17 @@ module Bugs.Bug39 (main) where +import Control.Applicative (empty) +import Control.Monad (void) import Data.Either (isLeft, isRight) import Text.Megaparsec import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) shouldFail :: [String] shouldFail = [" 1", " +1", " -1"] @@ -17,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"] shouldSucceed :: [String] shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "] +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + integer :: Parser Integer -integer = L.integer' (L.makeLexer L.defaultLang) +integer = lexeme $ L.signed sc L.integer testBatch :: Assertion testBatch = mapM_ (f testFail) shouldFail >> diff --git a/old-tests/Bugs/Bug6.hs b/old-tests/Bugs/Bug6.hs index 55b1363..fc8f736 100644 --- a/old-tests/Bugs/Bug6.hs +++ b/old-tests/Bugs/Bug6.hs @@ -1,13 +1,13 @@ module Bugs.Bug6 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit - import Text.Megaparsec import Text.Megaparsec.String +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + import Util main :: Test @@ -19,5 +19,5 @@ variable :: Parser String variable = do x <- lookAhead (some letterChar) if x == "return" - then fail "'return' is a reserved keyword" - else string x + then fail "'return' is a reserved keyword" + else string x diff --git a/old-tests/Bugs/Bug9.hs b/old-tests/Bugs/Bug9.hs index 79d7205..734be38 100644 --- a/old-tests/Bugs/Bug9.hs +++ b/old-tests/Bugs/Bug9.hs @@ -1,14 +1,17 @@ module Bugs.Bug9 (main) where +import Control.Applicative (empty) +import Control.Monad (void) + import Text.Megaparsec -import Text.Megaparsec.String (Parser) import Text.Megaparsec.Expr +import Text.Megaparsec.String (Parser) import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) import Util @@ -24,12 +27,21 @@ main = -- Syntax analysis +sc :: Stream s m Char => ParsecT s u m () +sc = L.space (void spaceChar) empty empty + +lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a +lexeme = L.lexeme sc + +integer :: Stream s m Char => ParsecT s u m Integer +integer = lexeme L.integer + +operator :: Stream s m Char => String -> ParsecT s u m String +operator op = L.symbol sc op "operator" + parseTopLevel :: Parser Expr parseTopLevel = parseExpr <* eof parseExpr :: Parser Expr parseExpr = makeExprParser (Const <$> integer) table - where table = [[ InfixL (Op <$ reserved ">>>") ]] - lexer = L.makeLexer L.defaultLang { L.reservedOpNames = [">>>"] } - integer = L.integer lexer - reserved = L.reserved lexer + where table = [[ InfixL (Op <$ operator ">>>") ]]