diff --git a/Text/Megaparsec/Expr.hs b/Text/Megaparsec/Expr.hs index 02b896a..be03039 100644 --- a/Text/Megaparsec/Expr.hs +++ b/Text/Megaparsec/Expr.hs @@ -41,11 +41,10 @@ data Operator s u m a -- descending precedence. All operators in one list have the same precedence -- (but may have a different associativity). -- --- Prefix and postfix --- operators of the same precedence can only occur once (i.e. @--2@ is not --- allowed if @-@ is prefix negate). Prefix and postfix operators of the --- same precedence associate to the left (i.e. if @++@ is postfix increment, --- than @-2++@ equals @-1@, not @-3@). +-- Prefix and postfix operators of the same precedence can only occur once +-- (i.e. @--2@ is not allowed if @-@ is prefix negate). Prefix and postfix +-- operators of the same precedence associate to the left (i.e. if @++@ is +-- postfix increment, than @-2++@ equals @-1@, not @-3@). -- -- The @makeExprParser@ takes care of all the complexity involved in -- building expression parser. Here is an example of an expression parser @@ -66,6 +65,9 @@ data Operator s u m a -- > binary name f = InfixL (reservedOp name >> return f) -- > prefix name f = Prefix (reservedOp name >> return f) -- > postfix name f = Postfix (reservedOp name >> return f) +-- +-- Please note that multi-character operators should use 'try' in order to +-- be reported correctly in error messages. makeExprParser :: Stream s m t => ParsecT s u m a -> [[Operator s u m a]] -> ParsecT s u m a diff --git a/Text/Megaparsec/Language.hs b/Text/Megaparsec/Language.hs deleted file mode 100644 index 05df486..0000000 --- a/Text/Megaparsec/Language.hs +++ /dev/null @@ -1,115 +0,0 @@ --- | --- Module : Text.Megaparsec.Language --- Copyright : © 2015 Megaparsec contributors --- © 2007 Paolo Martini --- © 1999–2001 Daan Leijen --- License : BSD3 --- --- Maintainer : Mark Karpov --- Stability : experimental --- Portability : non-portable (uses non-portable module Text.Megaparsec.Lexer) --- --- A helper module that defines some language definitions that can be used --- to instantiate a token parser (see "Text.Megaparsec.Lexer"). - -module Text.Megaparsec.Language - ( LanguageDef - , emptyDef - , haskellStyle - , javaStyle - , haskellDef - , mondrianDef ) -where - -import Control.Monad.Identity - -import Control.Applicative ((<|>)) -import Text.Megaparsec.Char -import Text.Megaparsec.Lexer - --- | This is the most minimal token definition. It is recommended to use --- this definition as the basis for other definitions. @emptyDef@ has no --- reserved names or operators, is case sensitive and doesn't accept --- comments, identifiers or operators. - -emptyDef :: LanguageDef String st Identity -emptyDef = - LanguageDef - { commentStart = "" - , commentEnd = "" - , commentLine = "" - , nestedComments = True - , identStart = letterChar <|> char '_' - , identLetter = alphaNumChar <|> oneOf "_'" - , opStart = opLetter emptyDef - , opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- | This is a minimal token definition for Haskell-style languages. It --- defines the style of comments, valid identifiers and case sensitivity. It --- does not define any reserved words or operators. - -haskellStyle :: LanguageDef String u Identity -haskellStyle = - emptyDef - { commentStart = "{-" - , commentEnd = "-}" - , commentLine = "--" - , nestedComments = True - , identStart = letterChar - , identLetter = alphaNumChar <|> oneOf "_'" - , opStart = opLetter haskellStyle - , opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- | This is a minimal token definition for Java-style languages. It --- defines the style of comments, valid identifiers and case sensitivity. It --- does not define any reserved words or operators. - -javaStyle :: LanguageDef String u Identity -javaStyle = - emptyDef - { commentStart = "/*" - , commentEnd = "*/" - , commentLine = "//" - , nestedComments = True - , identStart = letterChar - , identLetter = alphaNumChar <|> oneOf "_'" - , reservedNames = [] - , reservedOpNames = [] - , caseSensitive = False } - --- | The language definition for the Haskell language. - -haskellDef :: LanguageDef String u Identity -haskellDef = - haskell98Def - { identLetter = identLetter haskell98Def <|> char '#' - , reservedNames = reservedNames haskell98Def ++ - [ "foreign", "import", "export", "primitive" - , "_ccall_", "_casm_", "forall"] } - --- | The language definition for the language Haskell98. - -haskell98Def :: LanguageDef String u Identity -haskell98Def = - haskellStyle - { reservedOpNames = ["::","..","=","\\","|","<-","->","@","~","=>"] - , reservedNames = [ "let", "in", "case", "of", "if", "then", "else" - , "data", "type", "class", "default", "deriving" - , "do", "import", "infix", "infixl", "infixr" - , "instance", "module", "newtype", "where" - , "primitive" ] } - --- | The language definition for the language Mondrian. - -mondrianDef :: LanguageDef String u Identity -mondrianDef = - javaStyle - { reservedNames = [ "case", "class", "default", "extends" - , "import", "in", "let", "new", "of", "package" ] - , caseSensitive = True } diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index 2d593f0..87c4677 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -9,573 +9,271 @@ -- Stability : experimental -- Portability : non-portable (uses local universal quantification: PolymorphicComponents) -- --- A helper module to parse lexical elements. See 'makeLexer' for a --- description of how to use it. - -{-# OPTIONS_GHC -fno-warn-name-shadowing #-} +-- High-level parsers to help you write your lexer. The module doesn't +-- impose how you should write your parser, but certain approaches may be +-- more elegant than others. Especially important theme is parsing of write +-- space, comments and indentation. +-- +-- This module is supposed to be imported qualified: +-- +-- > import qualified Text.Megaparsec.Lexer as L module Text.Megaparsec.Lexer - ( LanguageDef (..) - , Lexer (..) - , makeLexer ) + ( -- * White space and indentation + space + , lexeme + , symbol + , symbol' + , indentGuard + , skipLineComment + , skipBlockComment + -- * Character and string literals + , charLiteral + -- * Numbers + , integer + , decimal + , hexadecimal + , octal + , float + , number + , signed ) where -import Control.Applicative ((<|>), many, some) +import Control.Applicative ((<|>), some) import Control.Monad (void) -import Data.Char (isAlpha, toLower, toUpper) -import Data.List (sort) +import Data.Char (readLitChar) +import Data.Maybe (listToMaybe) -import Text.Megaparsec.Prim -import Text.Megaparsec.Char import Text.Megaparsec.Combinator +import Text.Megaparsec.Pos +import Text.Megaparsec.Prim +import Text.Megaparsec.ShowToken +import qualified Text.Megaparsec.Char as C --- Language definition +-- White space and indentation --- | The @LanguageDef@ type is a record that contains all parameters used to --- control features of the "Text.Megaparsec.Lexer" module. The module --- "Text.Megaparsec.Language" contains some default definitions. - -data LanguageDef s u m = - LanguageDef { - - -- | Describes the start of a block comment. Use the empty string if the - -- language doesn't support block comments. - - commentStart :: String - - -- | Describes the end of a block comment. Use the empty string if the - -- language doesn't support block comments. - - , commentEnd :: String - - -- | Describes the start of a line comment. Use the empty string if the - -- language doesn't support line comments. - - , commentLine :: String - - -- | Set to 'True' if the language supports nested block comments. - - , nestedComments :: Bool - - -- | This parser should accept any start characters of identifiers, for - -- example @letter \<|> char \'_\'@. - - , identStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of identifiers, - -- for example @alphaNum \<|> char \'_\'@. - - , identLetter :: ParsecT s u m Char - - -- | This parser should accept any start characters of operators, for - -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@ - - , opStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of operators. - -- Note that this parser should even be defined if the language doesn't - -- support user-defined operators, or otherwise the 'reservedOp' parser - -- won't work correctly. - - , opLetter :: ParsecT s u m Char - - -- | The list of reserved identifiers. - - , reservedNames :: [String] - - -- | The list of reserved operators. - - , reservedOpNames :: [String] - - -- | Set to 'True' if the language is case sensitive. - - , caseSensitive :: Bool } - --- Lexer - --- | The type of the record that holds lexical parsers that work on --- @s@ streams with state @u@ over a monad @m@. - -data Lexer s u m = - Lexer { - - -- | The lexeme parser parses a legal identifier. Returns the identifier - -- string. This parser will fail on identifiers that are reserved - -- words. Legal identifier (start) characters and reserved words are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - identifier :: ParsecT s u m String - - -- | The lexeme parser @reserved name@ parses @symbol name@, but it also - -- checks that the @name@ is not a prefix of a valid identifier. - - , reserved :: String -> ParsecT s u m () - - -- | The lexeme parser parses a legal operator. Returns the name of the - -- operator. This parser will fail on any operators that are reserved - -- operators. Legal operator (start) characters and reserved operators are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - , operator :: ParsecT s u m String - - -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it - -- also checks that the @name@ is not a prefix of a valid operator. - - , reservedOp :: String -> ParsecT s u m () - - -- | The lexeme parser parses a single literal character. Returns the - -- literal character value. This parsers deals correctly with escape - -- sequences. The literal character is parsed according to the grammar - -- rules defined in the Haskell report (which matches most programming - -- languages quite closely). - - , charLiteral :: ParsecT s u m Char - - -- | The lexeme parser parses a literal string. Returns the literal - -- string value. This parsers deals correctly with escape sequences and - -- gaps. The literal string is parsed according to the grammar rules - -- defined in the Haskell report (which matches most programming languages - -- quite closely). - - , stringLiteral :: ParsecT s u m String - - -- | The lexeme parser parses an integer (a whole number). This parser - -- /does not/ parse sign. Returns the value of the number. The number can - -- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is - -- parsed according to the grammar rules in the Haskell report. - - , integer :: ParsecT s u m Integer - - -- | This is just like 'integer', except it can parse sign. - - , integer' :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the decimal system. - -- Returns the value of the number. - - , decimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the hexadecimal - -- system. The number should be prefixed with “0x” or “0X”. Returns the - -- value of the number. - - , hexadecimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the octal system. - -- The number should be prefixed with “0o” or “0O”. Returns the value of - -- the number. - - , octal :: ParsecT s u m Integer - - -- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and - -- then runs parser @p@, changing sign of its result accordingly. Note - -- that there may be white space after the sign but not before it. - - , signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a - - -- | The lexeme parser parses a floating point value. Returns the value - -- of the number. The number is parsed according to the grammar rules - -- defined in the Haskell report, sign is /not/ parsed, use 'float'' to - -- achieve parsing of signed floating point values. - - , float :: ParsecT s u m Double - - -- | This is just like 'float', except it can parse sign. - - , float' :: ParsecT s u m Double - - -- | The lexeme parser parses either 'integer' or a 'float'. - -- Returns the value of the number. This parser deals with any overlap in - -- the grammar rules for integers and floats. The number is parsed - -- according to the grammar rules defined in the Haskell report. - - , number :: ParsecT s u m (Either Integer Double) - - -- | This is just like 'number', except it can parse sign. - - , number' :: ParsecT s u m (Either Integer Double) - - -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips - -- trailing white space. - - , symbol :: String -> ParsecT s u m String - - -- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace' - -- parser, returning the value of @p@. Every lexical token (lexeme) is - -- defined using @lexeme@, this way every parse starts at a point without - -- white space. Parsers that use @lexeme@ are called /lexeme/ parsers in - -- this document. - -- - -- The only point where the 'whiteSpace' parser should be called - -- explicitly is the start of the main parser in order to skip any leading - -- white space. - - , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Parses any white space. White space consists of /zero/ or more - -- occurrences of a 'space', a line comment or a block (multi line) - -- comment. Block comments may be nested. How comments are started and - -- ended is defined in the 'LanguageDef' that is passed to - -- 'makeLexer'. - - , whiteSpace :: ParsecT s u m () - - -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, - -- returning the value of @p@. - - , parens :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and - -- “}”), returning the value of @p@. - - , braces :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<” - -- and “>”), returning the value of @p@. - - , angles :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[” - -- and “]”), returning the value of @p@. - - , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @semicolon@ parses the character “;” and skips any - -- trailing white space. Returns the string “;”. - - , semicolon :: ParsecT s u m String - - -- | Lexeme parser @comma@ parses the character “,” and skips any - -- trailing white space. Returns the string “,”. - - , comma :: ParsecT s u m String - - -- | Lexeme parser @colon@ parses the character “:” and skips any - -- trailing white space. Returns the string “:”. - - , colon :: ParsecT s u m String - - -- | Lexeme parser @dot@ parses the character “.” and skips any - -- trailing white space. Returns the string “.”. - - , dot :: ParsecT s u m String - - -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@ - -- separated by 'semicolon'. Returns a list of values returned by @p@. - - , semicolonSep :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@ - -- separated by 'semi'. Returns a list of values returned by @p@. - - , semicolonSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of - -- @p@ separated by 'comma'. Returns a list of values returned by @p@. - - , commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of - -- @p@ separated by 'comma'. Returns a list of values returned by @p@. - - , commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] } - --- | The expression @makeLexer language@ creates a 'Lexer' record that --- contains lexical parsers that are defined using the definitions in the --- @language@ record. +-- | @space spaceChar lineComment blockComment@ produces parser that can +-- parse white space in general. It's expected that you create such a parser +-- once and pass it to many other function in this module as needed (it's +-- usually called @spaceConsumer@ in doc-strings here). -- --- The use of this function is quite stylized — one imports the appropriate --- language definition and selects the lexical parsers that are needed from --- the resulting 'Lexer'. +-- @spaceChar@ is used to parse trivial space characters. You can use +-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as +-- your own parser (if you don't want automatically consume newlines, for +-- example). -- --- > module Main (main) where +-- @lineComment@ is used to parse line comments. You can use +-- 'skipLineComment' if you don't need anything special. +-- +-- @blockComment@ is used to parse block (multi-line) comments. You can use +-- 'skipBlockComment' if you don't need anything special. +-- +-- Parsing of white space is important part of any parser. We propose scheme +-- where every lexeme should consume all trailing white space, but not +-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve +-- this. You only need to call 'space' “manually” to consume any white space +-- before the first lexeme (at the beginning of file). + +space :: Stream s m t => ParsecT s u m () -> + ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m () +space ch line block = hidden . skipMany $ choice [ch, line, block] + +-- | This is wrapper for lexemes. Typical usage is to supply first argument +-- (parser that consumes white space, probably defined via 'space') and use +-- resulting function to wrap parsers for every lexeme. +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer + +lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +lexeme spc p = p <* spc + +-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the +-- first argument (parser that consumes white space, probably defined via +-- 'space') and then you can use the resulting function to parse strings: +-- +-- > symbol = L.symbol spaceConsumer -- > --- > import Text.Megaparsec --- > import Text.Megaparsec.Language (haskellDef) --- > import qualified Text.Megaparsec.Lexer as L --- > --- > -- The parser --- > … --- > --- > expr = parens expr --- > <|> identifier --- > <|> … --- > --- > -- The lexer --- > lexer = L.makeLexer haskellDef --- > --- > parens = L.parens lexer --- > braces = L.braces lexer --- > identifier = L.identifier lexer --- > reserved = L.reserved lexer --- > … - -makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m -makeLexer languageDef = - Lexer - { identifier = identifier - , reserved = reserved - , operator = operator - , reservedOp = reservedOp - - , charLiteral = charLiteral - , stringLiteral = stringLiteral - - , integer = integer - , integer' = integer' - , decimal = decimal - , hexadecimal = hexadecimal - , octal = octal - , signed = signed - , float = float - , float' = float' - , number = number - , number' = number' - - , symbol = symbol - , lexeme = lexeme - , whiteSpace = whiteSpace - - , parens = parens - , braces = braces - , angles = angles - , brackets = brackets - , semicolon = semicolon - , comma = comma - , colon = colon - , dot = dot - , semicolonSep = semicolonSep - , semicolonSep1 = semicolonSep1 - , commaSep = commaSep - , commaSep1 = commaSep1 } - where - - -- bracketing - - parens = between (symbol "(") (symbol ")") - braces = between (symbol "{") (symbol "}") - angles = between (symbol "<") (symbol ">") - brackets = between (symbol "[") (symbol "]") - - semicolon = symbol ";" - comma = symbol "," - dot = symbol "." - colon = symbol ":" - - commaSep = (`sepBy` comma) - semicolonSep = (`sepBy` semicolon) - - commaSep1 = (`sepBy1` comma) - semicolonSep1 = (`sepBy1` semicolon) - - -- chars & strings - - charLiteral = lexeme ( between (char '\'') - (char '\'' "end of character") - characterChar ) - "character" - - characterChar = charLetter <|> charEscape "literal character" - - charEscape = char '\\' >> escapeCode - charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) - - stringLiteral = - lexeme ((foldr (maybe id (:)) "" <$> - between (char '"') (char '"' "end of string") - (many stringChar)) "literal string") - - stringChar = (Just <$> stringLetter) <|> stringEscape "string character" - - stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) - - stringEscape = char '\\' >> - ( (escapeGap >> return Nothing) <|> - (escapeEmpty >> return Nothing) <|> - (Just <$> escapeCode) ) - - escapeEmpty = char '&' - escapeGap = some spaceChar >> char '\\' "end of string gap" - - -- escape codes - - escapeCode = charEsc <|> charNum <|> charAscii <|> charControl - "escape code" - - charEsc = choice (parseEsc <$> escMap) - where parseEsc (c, code) = char c >> return code - - charNum = toEnum . fromInteger <$> - ( decimal <|> - (char 'o' >> nump "0o" octDigitChar) <|> - (char 'x' >> nump "0x" hexDigitChar) ) - - charAscii = choice (parseAscii <$> asciiMap) - where parseAscii (asc, code) = try (string asc >> return code) - - charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upperChar) - - -- escape code tables - - escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" - asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) - - ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", - "FS","GS","RS","US","SP"] - ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", - "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", - "CAN","SUB","ESC","DEL"] - - ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " - ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" - - -- numbers — integers - - integer = decimal - integer' = signed integer - - decimal = lexeme (nump "" digitChar "integer") - hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigitChar - octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigitChar - - nump prefix baseDigit = read . (prefix ++) <$> some baseDigit - - signed p = ($) <$> option id (lexeme sign) <*> p - - sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) - sign = (char '+' *> return id) <|> (char '-' *> return negate) - - -- numbers — floats - - float = lexeme ffloat "float" - float' = signed float - - ffloat = read <$> ffloat' - where - ffloat' = do - decimal <- fDec - rest <- fraction <|> fExp - return $ decimal ++ rest - - fraction = do - void $ char '.' - decimal <- fDec - exp <- option "" fExp - return $ '.' : decimal ++ exp - - fDec = some digitChar - - fExp = do - expChar <- oneOf "eE" - signStr <- option "" (pure <$> oneOf "+-") - decimal <- fDec - return $ expChar : signStr ++ decimal - - -- numbers — a more general case - - number = (Right <$> try float) <|> (Left <$> integer) "number" - number' = (Right <$> try float') <|> (Left <$> integer') "number" - - -- operators & reserved ops - - reservedOp name = - lexeme $ try $ do - void $ string name - notFollowedBy (opLetter languageDef) ("end of " ++ show name) - - operator = - lexeme $ try $ do - name <- oper - if isReservedOp name - then unexpected ("reserved operator " ++ show name) - else return name - - oper = ((:) <$> opStart languageDef <*> many (opLetter languageDef)) - "operator" - - isReservedOp = isReserved . sort $ reservedOpNames languageDef - - -- identifiers & reserved words - - reserved name = - lexeme $ try $ do - void $ caseString name - notFollowedBy (identLetter languageDef) ("end of " ++ show name) - - caseString name - | caseSensitive languageDef = string name - | otherwise = walk name >> return name - where walk = foldr (\c -> ((caseChar c show name) >>)) (return ()) - caseChar c - | isAlpha c = char (toLower c) <|> char (toUpper c) - | otherwise = char c - - identifier = - lexeme $ try $ do - name <- ident - if isReservedName name - then unexpected ("reserved word " ++ show name) - else return name - - ident = ((:) <$> identStart languageDef <*> many (identLetter languageDef)) - "identifier" - - isReservedName name = isReserved theReservedNames caseName - where caseName - | caseSensitive languageDef = name - | otherwise = toLower <$> name - - isReserved names name = scan names - where scan [] = False - scan (r:rs) = case compare r name of - LT -> scan rs - EQ -> True - GT -> False - - theReservedNames - | caseSensitive languageDef = sort reserved - | otherwise = sort . fmap (fmap toLower) $ reserved - where reserved = reservedNames languageDef - - -- white space & symbols - - symbol = lexeme . string - - lexeme p = p <* whiteSpace - - whiteSpace = hidden space -- FIXME: write it in a decent manner - -- \| noLine && noMulti = skipMany (space "") - -- \| noLine = skipMany (space <|> - -- multiLineComment "") - -- \| noMulti = skipMany (space <|> - -- oneLineComment "") - -- \| otherwise = skipMany (space <|> - -- oneLineComment <|> - -- multiLineComment "") - -- where - -- noLine = null (commentLine languageDef) - -- noMulti = null (commentStart languageDef) - - -- oneLineComment = void (try (string (commentLine languageDef)) - -- >> skipMany (satisfy (/= '\n'))) - - -- multiLineComment = try (string (commentStart languageDef)) >> inComment - - -- inComment = if nestedComments languageDef - -- then inCommentMulti - -- else inCommentSingle - - -- inCommentMulti - -- = void (try . string $ commentEnd languageDef) - -- <|> (multiLineComment >> inCommentMulti) - -- <|> (skipSome (noneOf startEnd) >> inCommentMulti) - -- <|> (oneOf startEnd >> inCommentMulti) - -- "end of comment" - - -- inCommentSingle - -- = void (try . string $ commentEnd languageDef) - -- <|> (skipSome (noneOf startEnd) >> inCommentSingle) - -- <|> (oneOf startEnd >> inCommentSingle) - -- "end of comment" - - -- startEnd = nub $ (++) <$> commentEnd <*> commentStart $ languageDef +-- > parens = between (symbol "(") (symbol ")") +-- > braces = between (symbol "{") (symbol "}") +-- > angles = between (symbol "<") (symbol ">") +-- > brackets = between (symbol "[") (symbol "]") +-- > semicolon = symbol ";" +-- > comma = symbol "," +-- > colon = symbol ":" +-- > dot = symbol "." + +symbol :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol spc = lexeme spc . C.string + +-- | Case-insensitive version of 'symbol'. This may be helpful if you're +-- working with case-insensitive languages. + +symbol' :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol' spc = lexeme spc . C.string' + +-- | @indentGuard spaceConsumer test@ first consumes all white space +-- (indentation) with @spaceConsumer@ parser, then it checks column +-- position. It should satisfy supplied predicate @test@, otherwise the +-- parser fails with error message “incorrect indentation”. On success +-- current column position is returned. +-- +-- When you want to parse block of indentation first run this parser with +-- predicate like @(> 1)@ — this will make sure you have some +-- indentation. Use returned value to check indentation on every subsequent +-- line according to syntax of your language. + +indentGuard :: Stream s m t => + ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int +indentGuard spc p = do + spc + pos <- sourceColumn <$> getPosition + if p pos + then return pos + else fail "incorrect indentation" + +-- | Given comment prefix this function returns parser that skips line +-- comments. Note that it stops just before newline character but doesn't +-- consume the newline. Newline is either supposed to be consumed by 'space' +-- parser or picked up manually. + +skipLineComment :: Stream s m Char => String -> ParsecT s u m () +skipLineComment prefix = p >> void (manyTill C.anyChar n) + where p = try $ C.string prefix + n = lookAhead C.newline + +-- | @skipBlockComment start end@ skips non-nested block comment starting +-- with @start@ and ending with @end@. + +skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m () +skipBlockComment start end = p >> void (manyTill C.anyChar n) + where p = try $ C.string start + n = try $ C.string end + +-- Character and string literals + +-- | The lexeme parser parses a single literal character without +-- quotes. Purpose of this parser is to help with parsing of commonly used +-- escape sequences. It's your responsibility to take care of character +-- literal syntax in your language (surround it with single quotes or +-- similar). +-- +-- The literal character is parsed according to the grammar rules defined in +-- the Haskell report. +-- +-- Note that you can use this parser as a building block to parse various +-- string literals: +-- +-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"') + +charLiteral :: Stream s m Char => ParsecT s u m Char +charLiteral = label "literal character" $ do + r@(x:_) <- lookAhead $ count' 1 8 C.anyChar + case listToMaybe (readLitChar r) of + Just (c, r') -> count (length r - length r') C.anyChar >> return c + Nothing -> unexpected (showToken x) + +-- Numbers + +-- | Parse an integer without sign in decimal representation (according to +-- format of integer literals described in Haskell report). +-- +-- If you need to parse signed integers, see 'signed' combinator. + +integer :: Stream s m Char => ParsecT s u m Integer +integer = decimal "integer" + +-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label, +-- while this parser is not labeled. + +decimal :: Stream s m Char => ParsecT s u m Integer +decimal = nump "" C.digitChar + +-- | Parse an integer in hexadecimal representation. Representation of +-- hexadecimal number is expected to be according to Haskell report except +-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is +-- reponsibility of the programmer to parse correct prefix before parsing +-- the number itself. +-- +-- For example you can make it conform to Haskell report like this: +-- +-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal + +hexadecimal :: Stream s m Char => ParsecT s u m Integer +hexadecimal = nump "0x" C.hexDigitChar + +-- | Parse an integer in octal representation. Representation of octal +-- number is expected to be according to Haskell report except for the fact +-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility +-- of the programmer to parse correct prefix before parsing the number +-- itself. + +octal :: Stream s m Char => ParsecT s u m Integer +octal = nump "0o" C.octDigitChar + +-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then +-- prepends @prefix@ to returned value and tries to interpret the result as +-- an integer according to Haskell syntax. + +nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer +nump prefix baseDigit = read . (prefix ++) <$> some baseDigit + +-- | Parse a floating point value without sign. Representation of floating +-- point value is expected to be according to Haskell report. +-- +-- If you need to parse signed floats, see 'signed' combinator. + +float :: Stream s m Char => ParsecT s u m Double +float = label "float" $ read <$> f + where f = do + d <- some C.digitChar + rest <- fraction <|> fExp + return $ d ++ rest + +-- | This is a helper for 'float' parser. It parses fractional part of +-- floating point number, that is, dot and everything after it. + +fraction :: Stream s m Char => ParsecT s u m String +fraction = do + void $ C.char '.' + d <- some C.digitChar + e <- option "" fExp + return $ '.' : d ++ e + +-- | This helper parses exponent of floating point numbers. + +fExp :: Stream s m Char => ParsecT s u m String +fExp = do + expChar <- C.char' 'e' + signStr <- option "" (pure <$> choice (C.char <$> "+-")) + d <- some C.digitChar + return $ expChar : signStr ++ d + +-- | Parse a number: either integer or floating point. The parser can handle +-- overlapping grammars graciously. + +number :: Stream s m Char => ParsecT s u m (Either Integer Double) +number = (Right <$> try float) <|> (Left <$> integer) "number" + +-- | @signed space p@ parser parses optional sign, then if there is a sign +-- it will consume optional white space (using @space@ parser), then it runs +-- parser @p@ which should return a number. Sign of the number is changed +-- according to previously parsed sign. +-- +-- For example, to parse signed integer you can write: +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer +-- > signedInteger = signed spaceConsumer integer + +signed :: (Stream s m Char, Num a) => + ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +signed spc p = ($) <$> option id (lexeme spc sign) <*> p + +-- | Parse a sign and return either 'id' or 'negate' according to parsed +-- sign. + +sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) +sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) diff --git a/megaparsec.cabal b/megaparsec.cabal index 355d6a7..dabde57 100644 --- a/megaparsec.cabal +++ b/megaparsec.cabal @@ -100,7 +100,6 @@ library , Text.Megaparsec.Combinator , Text.Megaparsec.Error , Text.Megaparsec.Expr - , Text.Megaparsec.Language , Text.Megaparsec.Lexer , Text.Megaparsec.Perm , Text.Megaparsec.Pos @@ -116,6 +115,7 @@ test-suite old-tests main-is: Main.hs hs-source-dirs: old-tests type: exitcode-stdio-1.0 + ghc-options: -O2 -Wall other-modules: Bugs , Bugs.Bug2 , Bugs.Bug6 @@ -126,7 +126,8 @@ test-suite old-tests , HUnit == 1.2.* , test-framework >= 0.6 && < 1 , test-framework-hunit >= 0.2 && < 0.4 - ghc-options: -O2 -Wall + default-extensions: + FlexibleContexts default-language: Haskell2010 test-suite tests diff --git a/old-tests/Bugs/Bug2.hs b/old-tests/Bugs/Bug2.hs index e73a1dc..6696c1e 100644 --- a/old-tests/Bugs/Bug2.hs +++ b/old-tests/Bugs/Bug2.hs @@ -1,21 +1,33 @@ module Bugs.Bug2 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit +import Control.Applicative (empty) +import Control.Monad (void) import Text.Megaparsec -import Text.Megaparsec.Language (haskellDef) +import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + +stringLiteral :: Parser String +stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"') + main :: Test main = testCase "Control Char Parsing (#2)" $ parseString "\"test\\^Bstring\"" @?= "test\^Bstring" where + parseString :: String -> String parseString input = - case parse parser "Example" input of + case parse stringLiteral "Example" input of Left{} -> error "Parse failure" Right str -> str - parser = L.stringLiteral $ L.makeLexer haskellDef diff --git a/old-tests/Bugs/Bug35.hs b/old-tests/Bugs/Bug35.hs index 57018c4..ace833c 100644 --- a/old-tests/Bugs/Bug35.hs +++ b/old-tests/Bugs/Bug35.hs @@ -2,13 +2,11 @@ module Bugs.Bug35 (main) where import Text.Megaparsec -import Text.Megaparsec.Language -import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) trickyFloats :: [String] trickyFloats = @@ -28,12 +26,9 @@ trickyFloats = , "32.96176575630599" , "38.47735512322269" ] -float :: Parser Double -float = L.float (L.makeLexer emptyDef) - testBatch :: Assertion testBatch = mapM_ testFloat trickyFloats - where testFloat x = parse float "" x @?= Right (read x :: Double) + where testFloat x = parse L.float "" x @?= Right (read x :: Double) main :: Test main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch diff --git a/old-tests/Bugs/Bug39.hs b/old-tests/Bugs/Bug39.hs index e3f9dbc..91b88e0 100644 --- a/old-tests/Bugs/Bug39.hs +++ b/old-tests/Bugs/Bug39.hs @@ -1,16 +1,17 @@ module Bugs.Bug39 (main) where +import Control.Applicative (empty) +import Control.Monad (void) import Data.Either (isLeft, isRight) import Text.Megaparsec -import Text.Megaparsec.Language import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) shouldFail :: [String] shouldFail = [" 1", " +1", " -1"] @@ -18,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"] shouldSucceed :: [String] shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "] +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + integer :: Parser Integer -integer = L.integer' (L.makeLexer emptyDef) +integer = lexeme $ L.signed sc L.integer testBatch :: Assertion testBatch = mapM_ (f testFail) shouldFail >> diff --git a/old-tests/Bugs/Bug6.hs b/old-tests/Bugs/Bug6.hs index 55b1363..fc8f736 100644 --- a/old-tests/Bugs/Bug6.hs +++ b/old-tests/Bugs/Bug6.hs @@ -1,13 +1,13 @@ module Bugs.Bug6 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit - import Text.Megaparsec import Text.Megaparsec.String +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + import Util main :: Test @@ -19,5 +19,5 @@ variable :: Parser String variable = do x <- lookAhead (some letterChar) if x == "return" - then fail "'return' is a reserved keyword" - else string x + then fail "'return' is a reserved keyword" + else string x diff --git a/old-tests/Bugs/Bug9.hs b/old-tests/Bugs/Bug9.hs index 5aa8d68..82e9ba0 100644 --- a/old-tests/Bugs/Bug9.hs +++ b/old-tests/Bugs/Bug9.hs @@ -1,15 +1,17 @@ module Bugs.Bug9 (main) where +import Control.Applicative (empty) +import Control.Monad (void) + import Text.Megaparsec -import Text.Megaparsec.Language (haskellStyle) -import Text.Megaparsec.String (Parser) import Text.Megaparsec.Expr +import Text.Megaparsec.String (Parser) import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) import Util @@ -25,12 +27,21 @@ main = -- Syntax analysis +sc :: Stream s m Char => ParsecT s u m () +sc = L.space (void spaceChar) empty empty + +lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a +lexeme = L.lexeme sc + +integer :: Stream s m Char => ParsecT s u m Integer +integer = lexeme L.integer + +operator :: Stream s m Char => String -> ParsecT s u m String +operator = try . L.symbol sc + parseTopLevel :: Parser Expr parseTopLevel = parseExpr <* eof parseExpr :: Parser Expr parseExpr = makeExprParser (Const <$> integer) table - where table = [[ InfixL (Op <$ reserved ">>>") ]] - lexer = L.makeLexer haskellStyle { L.reservedOpNames = [">>>"] } - integer = L.integer lexer - reserved = L.reserved lexer + where table = [[ InfixL (Op <$ operator ">>>") ]] diff --git a/tests/Lexer.hs b/tests/Lexer.hs index cbbd187..344172f 100644 --- a/tests/Lexer.hs +++ b/tests/Lexer.hs @@ -29,16 +29,206 @@ module Lexer (tests) where -import Control.Applicative (some, (<|>)) +import Control.Applicative (empty) +import Control.Monad (void) +import Data.Char + ( readLitChar + , showLitChar + , isDigit + , isAlphaNum + , isSpace + , toLower ) +import Data.List (findIndices, isInfixOf) +import Data.Maybe (listToMaybe, maybeToList, isNothing, fromJust) +import Numeric (showInt, showHex, showOct, showSigned) import Test.Framework import Test.Framework.Providers.QuickCheck2 (testProperty) import Test.QuickCheck -import Text.Megaparsec.Char -import Text.Megaparsec.Combinator +import Text.Megaparsec.Error import Text.Megaparsec.Lexer +import Text.Megaparsec.Pos +import Text.Megaparsec.Prim +import Text.Megaparsec.String +import qualified Text.Megaparsec.Char as C + +import Util tests :: Test tests = testGroup "Lexer" - [] + [ testProperty "space combinator" prop_space + , testProperty "symbol combinator" prop_symbol + , testProperty "symbol' combinator" prop_symbol' + , testProperty "indentGuard combinator" prop_indentGuard + , testProperty "charLiteral" prop_charLiteral + , testProperty "integer" prop_integer + , testProperty "decimal" prop_decimal + , testProperty "hexadecimal" prop_hexadecimal + , testProperty "octal" prop_octal + , testProperty "float 0" prop_float_0 + , testProperty "float 1" prop_float_1 + , testProperty "number" prop_number + , testProperty "signed" prop_signed ] + +newtype WhiteSpace = WhiteSpace + { getWhiteSpace :: String } + deriving (Show, Eq) + +instance Arbitrary WhiteSpace where + arbitrary = WhiteSpace . concat <$> listOf whiteUnit + +newtype Symbol = Symbol + { getSymbol :: String } + deriving (Show, Eq) + +instance Arbitrary Symbol where + arbitrary = Symbol <$> ((++) <$> symbolName <*> whiteChars) + +whiteUnit :: Gen String +whiteUnit = oneof [whiteChars, whiteLine, whiteBlock] + +whiteChars :: Gen String +whiteChars = listOf $ elements "\t\n " + +whiteLine :: Gen String +whiteLine = commentOut <$> arbitrary `suchThat` goodEnough + where commentOut x = "//" ++ x ++ "\n" + goodEnough x = '\n' `notElem` x + +whiteBlock :: Gen String +whiteBlock = commentOut <$> arbitrary `suchThat` goodEnough + where commentOut x = "/*" ++ x ++ "*/" + goodEnough x = not $ "*/" `isInfixOf` x + +symbolName :: Gen String +symbolName = listOf $ arbitrary `suchThat` isAlphaNum + +sc :: Parser () +sc = space (void C.spaceChar) l b + where l = skipLineComment "//" + b = skipBlockComment "/*" "*/" + +sc' :: Parser () +sc' = space (void $ C.oneOf " \t") empty empty + +prop_space :: WhiteSpace -> Property +prop_space w = checkParser p r s + where p = sc + r = Right () + s = getWhiteSpace w + +prop_symbol :: Symbol -> Maybe Char -> Property +prop_symbol = parseSymbol (symbol sc) id + +prop_symbol' :: Symbol -> Maybe Char -> Property +prop_symbol' = parseSymbol (symbol' sc) (fmap toLower) + +parseSymbol :: (String -> Parser String) -> (String -> String) + -> Symbol -> Maybe Char -> Property +parseSymbol p' f s' t = checkParser p r s + where p = p' (f g) + r | g == s || isSpace (last s) = Right (f g) + | otherwise = posErr (length s - 1) s [uneCh (last s), exEof] + g = takeWhile (not . isSpace) s + s = getSymbol s' ++ maybeToList t + +newtype IndLine = IndLine + { getIndLine :: String } + deriving (Show, Eq) + +instance Arbitrary IndLine where + arbitrary = IndLine . concat <$> sequence [spc, sym, spc, eol] + where spc = listOf (elements " \t") + sym = return "xxx" + eol = return "\n" + +prop_indentGuard :: IndLine -> IndLine -> IndLine -> Property +prop_indentGuard l0 l1 l2 = checkParser p r s + where p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp + ip = indentGuard sc' + sp = void $ symbol sc' "xxx" <* C.eol + r | f' l0 <= 1 = posErr 0 s msg' + | f' l1 /= f' l0 = posErr (f l1 + g [l0]) s msg' + | f' l2 <= f' l0 = posErr (f l2 + g [l0, l1]) s msg' + | otherwise = Right () + msg' = [msg "incorrect indentation"] + f = length . takeWhile isSpace . getIndLine + f' x = sourceColumn $ + updatePosString (initialPos "") $ take (f x) (getIndLine x) + g xs = sum $ length . getIndLine <$> xs + s = concat $ getIndLine <$> [l0, l1, l2] + +prop_charLiteral :: String -> Bool -> Property +prop_charLiteral t i = checkParser charLiteral r s + where b = listToMaybe $ readLitChar s + (h, g) = fromJust b + r | isNothing b = posErr 0 s $ exSpec "literal character" : + [ if null s then uneEof else uneCh (head s) ] + | null g = Right h + | otherwise = posErr l s [uneCh (head g), exEof] + l = length s - length g + s = if null t || i then t else showLitChar (head t) (tail t) + +prop_integer :: NonNegative Integer -> Int -> Property +prop_integer n' i = checkParser integer r s + where (r, s) = quasiCorrupted n' i showInt "integer" + +prop_decimal :: NonNegative Integer -> Int -> Property +prop_decimal n' i = checkParser decimal r s + where (r, s) = quasiCorrupted n' i showInt "digit" + +prop_hexadecimal :: NonNegative Integer -> Int -> Property +prop_hexadecimal n' i = checkParser hexadecimal r s + where (r, s) = quasiCorrupted n' i showHex "hexadecimal digit" + +prop_octal :: NonNegative Integer -> Int -> Property +prop_octal n' i = checkParser octal r s + where (r, s) = quasiCorrupted n' i showOct "octal digit" + +prop_float_0 :: NonNegative Double -> Property +prop_float_0 n' = checkParser float r s + where n = getNonNegative n' + r = Right n + s = show n + +prop_float_1 :: Maybe (NonNegative Integer) -> Property +prop_float_1 n' = checkParser float r s + where r | isNothing n' = posErr 0 s [uneEof, exSpec "float"] + | otherwise = posErr (length s) s [ uneEof, exCh '.', exCh 'E' + , exCh 'e', exSpec "digit" ] + s = maybe "" (show . getNonNegative) n' + +prop_number :: Either (NonNegative Integer) (NonNegative Double) + -> Integer -> Property +prop_number n' i = checkParser number r s + where r | null s = posErr 0 s [uneEof, exSpec "number"] + | otherwise = + Right $ case n' of + Left x -> Left $ getNonNegative x + Right x -> Right $ getNonNegative x + s = if i < 5 + then "" + else either (show . getNonNegative) (show . getNonNegative) n' + +prop_signed :: Integer -> Int -> Bool -> Property +prop_signed n i plus = checkParser p r s + where p = signed (hidden C.space) integer + r | i > length z = Right n + | otherwise = posErr i s $ [uneCh '?', exSpec "integer"] ++ + (if i <= 0 then [exCh '+', exCh '-'] else []) ++ + [exEof | i > head (findIndices isDigit s)] + z = let bar = showSigned showInt 0 n "" + in if n < 0 || plus then bar else '+' : bar + s = if i <= length z then take i z ++ "?" ++ drop i z else z + +quasiCorrupted :: NonNegative Integer -> Int + -> (Integer -> String -> String) -> String + -> (Either ParseError Integer, String) +quasiCorrupted n' i shower l = (r, s) + where n = getNonNegative n' + r | i > length z = Right n + | otherwise = posErr i s $ [uneCh '?', exSpec l] ++ + [ exEof | i > 0 ] + z = shower n "" + s = if i <= length z then take i z ++ "?" ++ drop i z else z