From b6a43c33355c9ab5c15c03e17052afb6dd6ed39c Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Wed, 2 Sep 2015 19:27:48 +0600 Subject: [PATCH 01/10] started work on new lexer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminated ‘Text.Megaparsec.Language’ module because at this point it is clear that already existing definitions are of little use in Megaparsec. I started writing “default” language definition in ‘Text.Megaparsec.Lexer’. At this point it should be possible to parse languages where indentation matters, although we will need to provide more helpers to make it easier. --- Text/Megaparsec/Language.hs | 115 ---------------- Text/Megaparsec/Lexer.hs | 255 ++++++++++++++++++------------------ megaparsec.cabal | 1 - old-tests/Bugs/Bug2.hs | 4 +- old-tests/Bugs/Bug35.hs | 3 +- old-tests/Bugs/Bug39.hs | 3 +- old-tests/Bugs/Bug9.hs | 3 +- 7 files changed, 133 insertions(+), 251 deletions(-) delete mode 100644 Text/Megaparsec/Language.hs diff --git a/Text/Megaparsec/Language.hs b/Text/Megaparsec/Language.hs deleted file mode 100644 index 05df486..0000000 --- a/Text/Megaparsec/Language.hs +++ /dev/null @@ -1,115 +0,0 @@ --- | --- Module : Text.Megaparsec.Language --- Copyright : © 2015 Megaparsec contributors --- © 2007 Paolo Martini --- © 1999–2001 Daan Leijen --- License : BSD3 --- --- Maintainer : Mark Karpov --- Stability : experimental --- Portability : non-portable (uses non-portable module Text.Megaparsec.Lexer) --- --- A helper module that defines some language definitions that can be used --- to instantiate a token parser (see "Text.Megaparsec.Lexer"). - -module Text.Megaparsec.Language - ( LanguageDef - , emptyDef - , haskellStyle - , javaStyle - , haskellDef - , mondrianDef ) -where - -import Control.Monad.Identity - -import Control.Applicative ((<|>)) -import Text.Megaparsec.Char -import Text.Megaparsec.Lexer - --- | This is the most minimal token definition. It is recommended to use --- this definition as the basis for other definitions. @emptyDef@ has no --- reserved names or operators, is case sensitive and doesn't accept --- comments, identifiers or operators. - -emptyDef :: LanguageDef String st Identity -emptyDef = - LanguageDef - { commentStart = "" - , commentEnd = "" - , commentLine = "" - , nestedComments = True - , identStart = letterChar <|> char '_' - , identLetter = alphaNumChar <|> oneOf "_'" - , opStart = opLetter emptyDef - , opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- | This is a minimal token definition for Haskell-style languages. It --- defines the style of comments, valid identifiers and case sensitivity. It --- does not define any reserved words or operators. - -haskellStyle :: LanguageDef String u Identity -haskellStyle = - emptyDef - { commentStart = "{-" - , commentEnd = "-}" - , commentLine = "--" - , nestedComments = True - , identStart = letterChar - , identLetter = alphaNumChar <|> oneOf "_'" - , opStart = opLetter haskellStyle - , opLetter = oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- | This is a minimal token definition for Java-style languages. It --- defines the style of comments, valid identifiers and case sensitivity. It --- does not define any reserved words or operators. - -javaStyle :: LanguageDef String u Identity -javaStyle = - emptyDef - { commentStart = "/*" - , commentEnd = "*/" - , commentLine = "//" - , nestedComments = True - , identStart = letterChar - , identLetter = alphaNumChar <|> oneOf "_'" - , reservedNames = [] - , reservedOpNames = [] - , caseSensitive = False } - --- | The language definition for the Haskell language. - -haskellDef :: LanguageDef String u Identity -haskellDef = - haskell98Def - { identLetter = identLetter haskell98Def <|> char '#' - , reservedNames = reservedNames haskell98Def ++ - [ "foreign", "import", "export", "primitive" - , "_ccall_", "_casm_", "forall"] } - --- | The language definition for the language Haskell98. - -haskell98Def :: LanguageDef String u Identity -haskell98Def = - haskellStyle - { reservedOpNames = ["::","..","=","\\","|","<-","->","@","~","=>"] - , reservedNames = [ "let", "in", "case", "of", "if", "then", "else" - , "data", "type", "class", "default", "deriving" - , "do", "import", "infix", "infixl", "infixr" - , "instance", "module", "newtype", "where" - , "primitive" ] } - --- | The language definition for the language Mondrian. - -mondrianDef :: LanguageDef String u Identity -mondrianDef = - javaStyle - { reservedNames = [ "case", "class", "default", "extends" - , "import", "in", "let", "new", "of", "package" ] - , caseSensitive = True } diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index 2d593f0..ddd2057 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -17,45 +17,47 @@ module Text.Megaparsec.Lexer ( LanguageDef (..) , Lexer (..) + , defaultLang , makeLexer ) where -import Control.Applicative ((<|>), many, some) +import Control.Applicative ((<|>), many, some, empty) import Control.Monad (void) import Data.Char (isAlpha, toLower, toUpper) import Data.List (sort) -import Text.Megaparsec.Prim -import Text.Megaparsec.Char import Text.Megaparsec.Combinator +import Text.Megaparsec.Pos +import Text.Megaparsec.Prim +import qualified Text.Megaparsec.Char as C -- Language definition -- | The @LanguageDef@ type is a record that contains all parameters used to --- control features of the "Text.Megaparsec.Lexer" module. The module --- "Text.Megaparsec.Language" contains some default definitions. +-- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can +-- be used as a basis for new language definitions. data LanguageDef s u m = LanguageDef { - -- | Describes the start of a block comment. Use the empty string if the - -- language doesn't support block comments. + -- | The parser is used to parse single white space character. If + -- indentation is important in your language you should probably not treat + -- newline as white space character. - commentStart :: String + spaceChar :: ParsecT s u m Char - -- | Describes the end of a block comment. Use the empty string if the - -- language doesn't support block comments. + -- | The parser parses line comments. It's responsibility of the parser to + -- stop at the end of line. If your language doesn't support this type of + -- comments, set this value to 'empty'. - , commentEnd :: String + , lineComment :: ParsecT s u m () - -- | Describes the start of a line comment. Use the empty string if the - -- language doesn't support line comments. + -- | The parser parses block (multi-line) comments. If your language + -- doesn't support this type of comments, set this value to 'empty'. - , commentLine :: String + , blockComment :: ParsecT s u m () - -- | Set to 'True' if the language supports nested block comments. - - , nestedComments :: Bool + -- NEXT -- | This parser should accept any start characters of identifiers, for -- example @letter \<|> char \'_\'@. @@ -91,6 +93,28 @@ data LanguageDef s u m = , caseSensitive :: Bool } +-- Default language definition + +-- | This is standard language definition. It is recommended to use +-- this definition as the basis for other definitions. @defaultLang@ has no +-- reserved names or operators, is case sensitive and doesn't accept +-- comments, identifiers or operators. + +defaultLang :: Stream s m Char => LanguageDef s u m +defaultLang = + LanguageDef + { spaceChar = C.spaceChar + , lineComment = empty + , blockComment = empty + -- NEXT + , identStart = C.letterChar <|> C.char '_' + , identLetter = C.alphaNumChar <|> C.oneOf "_'" + , opStart = opLetter defaultLang + , opLetter = C.oneOf ":!#$%&*+./<=>?@\\^|-~" + , reservedOpNames = [] + , reservedNames = [] + , caseSensitive = True } + -- Lexer -- | The type of the record that holds lexical parsers that work on @@ -99,12 +123,43 @@ data LanguageDef s u m = data Lexer s u m = Lexer { + -- | Skips any white space. White space consists of /zero/ or more + -- occurrences of 'spaceChar', a line comment or a block (multi-line) + -- comment. + + space :: ParsecT s u m () + + -- | @lexeme p@ first applies parser @p@ and then the 'space' parser, + -- returning the value of @p@. Every lexical token (lexeme) is defined + -- using @lexeme@, this way every parse starts at a point without white + -- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this + -- document. + -- + -- The only point where the 'space' parser should be called explicitly is + -- the start of the main parser in order to skip any leading white space. + + , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a + + -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips + -- trailing white space. + + , symbol :: String -> ParsecT s u m String + + -- | @indentGuard p@ consumes all white space it can consume, then checks + -- column number. The column number should satisfy given predicate @p@, + -- otherwise the parser fails with “incorrect indentation” message. In + -- successful cases @indentGuard@ returns current column number. + + , indentGuard :: (Int -> Bool) -> ParsecT s u m Int + + -- NEXT + -- | The lexeme parser parses a legal identifier. Returns the identifier -- string. This parser will fail on identifiers that are reserved -- words. Legal identifier (start) characters and reserved words are -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - identifier :: ParsecT s u m String + , identifier :: ParsecT s u m String -- | The lexeme parser @reserved name@ parses @symbol name@, but it also -- checks that the @name@ is not a prefix of a valid identifier. @@ -195,31 +250,6 @@ data Lexer s u m = , number' :: ParsecT s u m (Either Integer Double) - -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips - -- trailing white space. - - , symbol :: String -> ParsecT s u m String - - -- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace' - -- parser, returning the value of @p@. Every lexical token (lexeme) is - -- defined using @lexeme@, this way every parse starts at a point without - -- white space. Parsers that use @lexeme@ are called /lexeme/ parsers in - -- this document. - -- - -- The only point where the 'whiteSpace' parser should be called - -- explicitly is the start of the main parser in order to skip any leading - -- white space. - - , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Parses any white space. White space consists of /zero/ or more - -- occurrences of a 'space', a line comment or a block (multi line) - -- comment. Block comments may be nested. How comments are started and - -- ended is defined in the 'LanguageDef' that is passed to - -- 'makeLexer'. - - , whiteSpace :: ParsecT s u m () - -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, -- returning the value of @p@. @@ -311,9 +341,14 @@ data Lexer s u m = -- > … makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m -makeLexer languageDef = +makeLexer lang = Lexer - { identifier = identifier + { space = space + , lexeme = lexeme + , symbol = symbol + , indentGuard = indentGuard + + , identifier = identifier , reserved = reserved , operator = operator , reservedOp = reservedOp @@ -332,10 +367,6 @@ makeLexer languageDef = , number = number , number' = number' - , symbol = symbol - , lexeme = lexeme - , whiteSpace = whiteSpace - , parens = parens , braces = braces , angles = angles @@ -350,7 +381,20 @@ makeLexer languageDef = , commaSep1 = commaSep1 } where - -- bracketing + -- white space & indentation + + space = hidden . skipMany . choice $ + ($ lang) <$> [blockComment, lineComment, void . spaceChar] + lexeme p = p <* space + symbol = lexeme . C.string + indentGuard p = do + space + pos <- sourceColumn <$> getPosition + if p pos + then return pos + else fail "incorrect indentation" + + -- bracketing NEXT parens = between (symbol "(") (symbol ")") braces = between (symbol "{") (symbol "}") @@ -370,32 +414,32 @@ makeLexer languageDef = -- chars & strings - charLiteral = lexeme ( between (char '\'') - (char '\'' "end of character") + charLiteral = lexeme ( between (C.char '\'') + (C.char '\'' "end of character") characterChar ) "character" characterChar = charLetter <|> charEscape "literal character" - charEscape = char '\\' >> escapeCode - charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) + charEscape = C.char '\\' >> escapeCode + charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) stringLiteral = lexeme ((foldr (maybe id (:)) "" <$> - between (char '"') (char '"' "end of string") + between (C.char '"') (C.char '"' "end of string") (many stringChar)) "literal string") stringChar = (Just <$> stringLetter) <|> stringEscape "string character" - stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) + stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) - stringEscape = char '\\' >> + stringEscape = C.char '\\' >> ( (escapeGap >> return Nothing) <|> (escapeEmpty >> return Nothing) <|> (Just <$> escapeCode) ) - escapeEmpty = char '&' - escapeGap = some spaceChar >> char '\\' "end of string gap" + escapeEmpty = C.char '&' + escapeGap = some C.spaceChar >> C.char '\\' "end of string gap" -- escape codes @@ -403,17 +447,17 @@ makeLexer languageDef = "escape code" charEsc = choice (parseEsc <$> escMap) - where parseEsc (c, code) = char c >> return code + where parseEsc (c, code) = C.char c >> return code charNum = toEnum . fromInteger <$> ( decimal <|> - (char 'o' >> nump "0o" octDigitChar) <|> - (char 'x' >> nump "0x" hexDigitChar) ) + (C.char 'o' >> nump "0o" C.octDigitChar) <|> + (C.char 'x' >> nump "0x" C.hexDigitChar) ) charAscii = choice (parseAscii <$> asciiMap) - where parseAscii (asc, code) = try (string asc >> return code) + where parseAscii (asc, code) = try (C.string asc >> return code) - charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upperChar) + charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar) -- escape code tables @@ -434,16 +478,16 @@ makeLexer languageDef = integer = decimal integer' = signed integer - decimal = lexeme (nump "" digitChar "integer") - hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigitChar - octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigitChar + decimal = lexeme (nump "" C.digitChar "integer") + hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar + octal = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar nump prefix baseDigit = read . (prefix ++) <$> some baseDigit signed p = ($) <$> option id (lexeme sign) <*> p sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) - sign = (char '+' *> return id) <|> (char '-' *> return negate) + sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) -- numbers — floats @@ -458,16 +502,16 @@ makeLexer languageDef = return $ decimal ++ rest fraction = do - void $ char '.' + void $ C.char '.' decimal <- fDec exp <- option "" fExp return $ '.' : decimal ++ exp - fDec = some digitChar + fDec = some C.digitChar fExp = do - expChar <- oneOf "eE" - signStr <- option "" (pure <$> oneOf "+-") + expChar <- C.oneOf "eE" + signStr <- option "" (pure <$> C.oneOf "+-") decimal <- fDec return $ expChar : signStr ++ decimal @@ -480,8 +524,8 @@ makeLexer languageDef = reservedOp name = lexeme $ try $ do - void $ string name - notFollowedBy (opLetter languageDef) ("end of " ++ show name) + void $ C.string name + notFollowedBy (opLetter lang) ("end of " ++ show name) operator = lexeme $ try $ do @@ -490,25 +534,25 @@ makeLexer languageDef = then unexpected ("reserved operator " ++ show name) else return name - oper = ((:) <$> opStart languageDef <*> many (opLetter languageDef)) + oper = ((:) <$> opStart lang <*> many (opLetter lang)) "operator" - isReservedOp = isReserved . sort $ reservedOpNames languageDef + isReservedOp = isReserved . sort $ reservedOpNames lang -- identifiers & reserved words reserved name = lexeme $ try $ do void $ caseString name - notFollowedBy (identLetter languageDef) ("end of " ++ show name) + notFollowedBy (identLetter lang) ("end of " ++ show name) caseString name - | caseSensitive languageDef = string name + | caseSensitive lang = C.string name | otherwise = walk name >> return name where walk = foldr (\c -> ((caseChar c show name) >>)) (return ()) caseChar c - | isAlpha c = char (toLower c) <|> char (toUpper c) - | otherwise = char c + | isAlpha c = C.char (toLower c) <|> C.char (toUpper c) + | otherwise = C.char c identifier = lexeme $ try $ do @@ -517,12 +561,12 @@ makeLexer languageDef = then unexpected ("reserved word " ++ show name) else return name - ident = ((:) <$> identStart languageDef <*> many (identLetter languageDef)) + ident = ((:) <$> identStart lang <*> many (identLetter lang)) "identifier" isReservedName name = isReserved theReservedNames caseName where caseName - | caseSensitive languageDef = name + | caseSensitive lang = name | otherwise = toLower <$> name isReserved names name = scan names @@ -533,49 +577,6 @@ makeLexer languageDef = GT -> False theReservedNames - | caseSensitive languageDef = sort reserved + | caseSensitive lang = sort reserved | otherwise = sort . fmap (fmap toLower) $ reserved - where reserved = reservedNames languageDef - - -- white space & symbols - - symbol = lexeme . string - - lexeme p = p <* whiteSpace - - whiteSpace = hidden space -- FIXME: write it in a decent manner - -- \| noLine && noMulti = skipMany (space "") - -- \| noLine = skipMany (space <|> - -- multiLineComment "") - -- \| noMulti = skipMany (space <|> - -- oneLineComment "") - -- \| otherwise = skipMany (space <|> - -- oneLineComment <|> - -- multiLineComment "") - -- where - -- noLine = null (commentLine languageDef) - -- noMulti = null (commentStart languageDef) - - -- oneLineComment = void (try (string (commentLine languageDef)) - -- >> skipMany (satisfy (/= '\n'))) - - -- multiLineComment = try (string (commentStart languageDef)) >> inComment - - -- inComment = if nestedComments languageDef - -- then inCommentMulti - -- else inCommentSingle - - -- inCommentMulti - -- = void (try . string $ commentEnd languageDef) - -- <|> (multiLineComment >> inCommentMulti) - -- <|> (skipSome (noneOf startEnd) >> inCommentMulti) - -- <|> (oneOf startEnd >> inCommentMulti) - -- "end of comment" - - -- inCommentSingle - -- = void (try . string $ commentEnd languageDef) - -- <|> (skipSome (noneOf startEnd) >> inCommentSingle) - -- <|> (oneOf startEnd >> inCommentSingle) - -- "end of comment" - - -- startEnd = nub $ (++) <$> commentEnd <*> commentStart $ languageDef + where reserved = reservedNames lang diff --git a/megaparsec.cabal b/megaparsec.cabal index cbace6d..d988ffb 100644 --- a/megaparsec.cabal +++ b/megaparsec.cabal @@ -100,7 +100,6 @@ library , Text.Megaparsec.Combinator , Text.Megaparsec.Error , Text.Megaparsec.Expr - , Text.Megaparsec.Language , Text.Megaparsec.Lexer , Text.Megaparsec.Perm , Text.Megaparsec.Pos diff --git a/old-tests/Bugs/Bug2.hs b/old-tests/Bugs/Bug2.hs index e73a1dc..5cee124 100644 --- a/old-tests/Bugs/Bug2.hs +++ b/old-tests/Bugs/Bug2.hs @@ -6,7 +6,6 @@ import Test.Framework import Test.Framework.Providers.HUnit import Text.Megaparsec -import Text.Megaparsec.Language (haskellDef) import qualified Text.Megaparsec.Lexer as L main :: Test @@ -14,8 +13,9 @@ main = testCase "Control Char Parsing (#2)" $ parseString "\"test\\^Bstring\"" @?= "test\^Bstring" where + parseString :: String -> String parseString input = case parse parser "Example" input of Left{} -> error "Parse failure" Right str -> str - parser = L.stringLiteral $ L.makeLexer haskellDef + parser = L.stringLiteral $ L.makeLexer L.defaultLang diff --git a/old-tests/Bugs/Bug35.hs b/old-tests/Bugs/Bug35.hs index 57018c4..1bde9cd 100644 --- a/old-tests/Bugs/Bug35.hs +++ b/old-tests/Bugs/Bug35.hs @@ -2,7 +2,6 @@ module Bugs.Bug35 (main) where import Text.Megaparsec -import Text.Megaparsec.Language import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L @@ -29,7 +28,7 @@ trickyFloats = , "38.47735512322269" ] float :: Parser Double -float = L.float (L.makeLexer emptyDef) +float = L.float (L.makeLexer L.defaultLang) testBatch :: Assertion testBatch = mapM_ testFloat trickyFloats diff --git a/old-tests/Bugs/Bug39.hs b/old-tests/Bugs/Bug39.hs index e3f9dbc..14a1c25 100644 --- a/old-tests/Bugs/Bug39.hs +++ b/old-tests/Bugs/Bug39.hs @@ -4,7 +4,6 @@ module Bugs.Bug39 (main) where import Data.Either (isLeft, isRight) import Text.Megaparsec -import Text.Megaparsec.Language import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L @@ -19,7 +18,7 @@ shouldSucceed :: [String] shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "] integer :: Parser Integer -integer = L.integer' (L.makeLexer emptyDef) +integer = L.integer' (L.makeLexer L.defaultLang) testBatch :: Assertion testBatch = mapM_ (f testFail) shouldFail >> diff --git a/old-tests/Bugs/Bug9.hs b/old-tests/Bugs/Bug9.hs index 5aa8d68..79d7205 100644 --- a/old-tests/Bugs/Bug9.hs +++ b/old-tests/Bugs/Bug9.hs @@ -2,7 +2,6 @@ module Bugs.Bug9 (main) where import Text.Megaparsec -import Text.Megaparsec.Language (haskellStyle) import Text.Megaparsec.String (Parser) import Text.Megaparsec.Expr import qualified Text.Megaparsec.Lexer as L @@ -31,6 +30,6 @@ parseTopLevel = parseExpr <* eof parseExpr :: Parser Expr parseExpr = makeExprParser (Const <$> integer) table where table = [[ InfixL (Op <$ reserved ">>>") ]] - lexer = L.makeLexer haskellStyle { L.reservedOpNames = [">>>"] } + lexer = L.makeLexer L.defaultLang { L.reservedOpNames = [">>>"] } integer = L.integer lexer reserved = L.reserved lexer From f58d5bfe1ce19a4433f4e6859b9d6252df507fb6 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Thu, 3 Sep 2015 13:35:22 +0600 Subject: [PATCH 02/10] further refinement --- Text/Megaparsec/Lexer.hs | 196 +++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 102 deletions(-) diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index ddd2057..db32d55 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -10,14 +10,17 @@ -- Portability : non-portable (uses local universal quantification: PolymorphicComponents) -- -- A helper module to parse lexical elements. See 'makeLexer' for a --- description of how to use it. +-- description of how to use it. This module is supposed to be imported +-- qualified. {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Text.Megaparsec.Lexer ( LanguageDef (..) - , Lexer (..) , defaultLang + , skipLineComment + , skipBlockComment + , Lexer (..) , makeLexer ) where @@ -42,18 +45,22 @@ data LanguageDef s u m = -- | The parser is used to parse single white space character. If -- indentation is important in your language you should probably not treat - -- newline as white space character. + -- newline as white space character. Also note that if newline is not + -- white space character, you will need to pick it up manually. spaceChar :: ParsecT s u m Char -- | The parser parses line comments. It's responsibility of the parser to -- stop at the end of line. If your language doesn't support this type of - -- comments, set this value to 'empty'. + -- comments, set this value to 'empty'. In simple cases you can use + -- 'skipLineComment' to quickly construct line comment parser. , lineComment :: ParsecT s u m () -- | The parser parses block (multi-line) comments. If your language - -- doesn't support this type of comments, set this value to 'empty'. + -- doesn't support this type of comments, set this value to 'empty'. In + -- simple cases you can use 'skipBlockComment' to quickly construct block + -- comment parser. , blockComment :: ParsecT s u m () @@ -115,6 +122,24 @@ defaultLang = , reservedNames = [] , caseSensitive = True } +-- Utility functions + +-- | Given comment prefix this function returns parser that skips line +-- comments. Note that it stops just before newline character but doesn't +-- consume the newline. Newline is either supposed to be consumed by 'space' +-- parser or picked manually. + +skipLineComment :: Stream s m Char => String -> ParsecT s u m () +skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) + where n = lookAhead C.newline + +-- | @skipBlockComment start end@ skips non-nested block comment starting +-- with @start@ and ending with @end@. + +skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m () +skipBlockComment start end = C.string start >> void (manyTill C.anyChar n) + where n = lookAhead (C.string end) + -- Lexer -- | The type of the record that holds lexical parsers that work on @@ -152,31 +177,45 @@ data Lexer s u m = , indentGuard :: (Int -> Bool) -> ParsecT s u m Int - -- NEXT + -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, + -- returning the value of @p@. - -- | The lexeme parser parses a legal identifier. Returns the identifier - -- string. This parser will fail on identifiers that are reserved - -- words. Legal identifier (start) characters and reserved words are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. + , parens :: forall a. ParsecT s u m a -> ParsecT s u m a - , identifier :: ParsecT s u m String + -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and + -- “}”), returning the value of @p@. - -- | The lexeme parser @reserved name@ parses @symbol name@, but it also - -- checks that the @name@ is not a prefix of a valid identifier. + , braces :: forall a. ParsecT s u m a -> ParsecT s u m a - , reserved :: String -> ParsecT s u m () + -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<” + -- and “>”), returning the value of @p@. - -- | The lexeme parser parses a legal operator. Returns the name of the - -- operator. This parser will fail on any operators that are reserved - -- operators. Legal operator (start) characters and reserved operators are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. + , angles :: forall a. ParsecT s u m a -> ParsecT s u m a - , operator :: ParsecT s u m String + -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[” + -- and “]”), returning the value of @p@. - -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it - -- also checks that the @name@ is not a prefix of a valid operator. + , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a - , reservedOp :: String -> ParsecT s u m () + -- | Lexeme parser @semicolon@ parses the character “;” and skips any + -- trailing white space. Returns the string “;”. + + , semicolon :: ParsecT s u m String + + -- | Lexeme parser @comma@ parses the character “,” and skips any + -- trailing white space. Returns the string “,”. + + , comma :: ParsecT s u m String + + -- | Lexeme parser @colon@ parses the character “:” and skips any + -- trailing white space. Returns the string “:”. + + , colon :: ParsecT s u m String + + -- | Lexeme parser @dot@ parses the character “.” and skips any + -- trailing white space. Returns the string “.”. + + , dot :: ParsecT s u m String -- | The lexeme parser parses a single literal character. Returns the -- literal character value. This parsers deals correctly with escape @@ -226,7 +265,7 @@ data Lexer s u m = -- then runs parser @p@, changing sign of its result accordingly. Note -- that there may be white space after the sign but not before it. - , signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a + , signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a -- | The lexeme parser parses a floating point value. Returns the value -- of the number. The number is parsed according to the grammar rules @@ -250,65 +289,29 @@ data Lexer s u m = , number' :: ParsecT s u m (Either Integer Double) - -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, - -- returning the value of @p@. + -- | The lexeme parser parses a legal identifier. Returns the identifier + -- string. This parser will fail on identifiers that are reserved + -- words. Legal identifier (start) characters and reserved words are + -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - , parens :: forall a. ParsecT s u m a -> ParsecT s u m a + , identifier :: ParsecT s u m String - -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and - -- “}”), returning the value of @p@. + -- | The lexeme parser @reserved name@ parses @symbol name@, but it also + -- checks that the @name@ is not a prefix of a valid identifier. - , braces :: forall a. ParsecT s u m a -> ParsecT s u m a + , reserved :: String -> ParsecT s u m () - -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<” - -- and “>”), returning the value of @p@. + -- | The lexeme parser parses a legal operator. Returns the name of the + -- operator. This parser will fail on any operators that are reserved + -- operators. Legal operator (start) characters and reserved operators are + -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - , angles :: forall a. ParsecT s u m a -> ParsecT s u m a + , operator :: ParsecT s u m String - -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[” - -- and “]”), returning the value of @p@. + -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it + -- also checks that the @name@ is not a prefix of a valid operator. - , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @semicolon@ parses the character “;” and skips any - -- trailing white space. Returns the string “;”. - - , semicolon :: ParsecT s u m String - - -- | Lexeme parser @comma@ parses the character “,” and skips any - -- trailing white space. Returns the string “,”. - - , comma :: ParsecT s u m String - - -- | Lexeme parser @colon@ parses the character “:” and skips any - -- trailing white space. Returns the string “:”. - - , colon :: ParsecT s u m String - - -- | Lexeme parser @dot@ parses the character “.” and skips any - -- trailing white space. Returns the string “.”. - - , dot :: ParsecT s u m String - - -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@ - -- separated by 'semicolon'. Returns a list of values returned by @p@. - - , semicolonSep :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@ - -- separated by 'semi'. Returns a list of values returned by @p@. - - , semicolonSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of - -- @p@ separated by 'comma'. Returns a list of values returned by @p@. - - , commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a] - - -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of - -- @p@ separated by 'comma'. Returns a list of values returned by @p@. - - , commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] } + , reservedOp :: String -> ParsecT s u m () } -- | The expression @makeLexer language@ creates a 'Lexer' record that -- contains lexical parsers that are defined using the definitions in the @@ -348,10 +351,14 @@ makeLexer lang = , symbol = symbol , indentGuard = indentGuard - , identifier = identifier - , reserved = reserved - , operator = operator - , reservedOp = reservedOp + , parens = parens + , braces = braces + , angles = angles + , brackets = brackets + , semicolon = semicolon + , comma = comma + , colon = colon + , dot = dot , charLiteral = charLiteral , stringLiteral = stringLiteral @@ -367,24 +374,16 @@ makeLexer lang = , number = number , number' = number' - , parens = parens - , braces = braces - , angles = angles - , brackets = brackets - , semicolon = semicolon - , comma = comma - , colon = colon - , dot = dot - , semicolonSep = semicolonSep - , semicolonSep1 = semicolonSep1 - , commaSep = commaSep - , commaSep1 = commaSep1 } + , identifier = identifier + , reserved = reserved + , operator = operator + , reservedOp = reservedOp } where -- white space & indentation space = hidden . skipMany . choice $ - ($ lang) <$> [blockComment, lineComment, void . spaceChar] + ($ lang) <$> [void . spaceChar, blockComment, lineComment] lexeme p = p <* space symbol = lexeme . C.string indentGuard p = do @@ -394,25 +393,18 @@ makeLexer lang = then return pos else fail "incorrect indentation" - -- bracketing NEXT + -- auxiliary parsers parens = between (symbol "(") (symbol ")") braces = between (symbol "{") (symbol "}") angles = between (symbol "<") (symbol ">") brackets = between (symbol "[") (symbol "]") - semicolon = symbol ";" comma = symbol "," - dot = symbol "." colon = symbol ":" + dot = symbol "." - commaSep = (`sepBy` comma) - semicolonSep = (`sepBy` semicolon) - - commaSep1 = (`sepBy1` comma) - semicolonSep1 = (`sepBy1` semicolon) - - -- chars & strings + -- char & string literals charLiteral = lexeme ( between (C.char '\'') (C.char '\'' "end of character") From 7298c0138788b02d0ab58986c63b23f581ebd237 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Tue, 8 Sep 2015 17:34:02 +0600 Subject: [PATCH 03/10] first version of the new lexer module --- Text/Megaparsec/Lexer.hs | 691 +++++++++++---------------------------- megaparsec.cabal | 4 +- old-tests/Bugs/Bug2.hs | 22 +- old-tests/Bugs/Bug35.hs | 8 +- old-tests/Bugs/Bug39.hs | 12 +- old-tests/Bugs/Bug6.hs | 12 +- old-tests/Bugs/Bug9.hs | 24 +- 7 files changed, 249 insertions(+), 524 deletions(-) diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index db32d55..ddc3088 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -9,125 +9,135 @@ -- Stability : experimental -- Portability : non-portable (uses local universal quantification: PolymorphicComponents) -- --- A helper module to parse lexical elements. See 'makeLexer' for a --- description of how to use it. This module is supposed to be imported --- qualified. - -{-# OPTIONS_GHC -fno-warn-name-shadowing #-} +-- High-level parsers to help you write your lexer. The module doesn't +-- impose how you should write your parser, but certain approaches may be +-- more elegant than others. Especially important theme is parsing of write +-- space, comments and indentation. +-- +-- This module is supposed to be imported qualified: +-- +-- > import qualified Text.Megaparsec.Lexer as L module Text.Megaparsec.Lexer - ( LanguageDef (..) - , defaultLang + ( -- * White space and indentation + space + , lexeme + , symbol + , symbol' + , indentGuard , skipLineComment , skipBlockComment - , Lexer (..) - , makeLexer ) + -- * Character and string literals + , charLiteral + -- * Numbers + , integer + , decimal + , hexadecimal + , octal + , float + , number + , signed ) where -import Control.Applicative ((<|>), many, some, empty) +import Control.Applicative ((<|>), some) import Control.Monad (void) -import Data.Char (isAlpha, toLower, toUpper) -import Data.List (sort) +import Data.Char (readLitChar) +import Data.Maybe (listToMaybe) import Text.Megaparsec.Combinator import Text.Megaparsec.Pos import Text.Megaparsec.Prim +import Text.Megaparsec.ShowToken import qualified Text.Megaparsec.Char as C --- Language definition +-- White space and indentation --- | The @LanguageDef@ type is a record that contains all parameters used to --- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can --- be used as a basis for new language definitions. +-- | @space spaceChar lineComment blockComment@ produces parser that can +-- parse white space in general. It's expected that you create such a parser +-- once and pass it to many other function in this module as needed (it's +-- usually called @spaceConsumer@ in doc-strings here). +-- +-- @spaceChar@ is used to parse trivial space characters. You can use +-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as +-- your own parser (if you don't want automatically consume newlines, for +-- example). +-- +-- @lineComment@ is used to parse line comments. You can use +-- 'skipLineComment' if you don't need anything special. +-- +-- @blockComment@ is used to parse block (multi-line) comments. You can use +-- 'skipBlockComment' if you don't need anything special. +-- +-- Parsing of white space is important part of any parser. We propose scheme +-- where every lexeme should consume all trailing white space, but not +-- leading one. You should wrap every lexeme parser with 'lexeme' to achieve +-- this. You only need to call 'space' “manually” to consume any white space +-- before the first lexeme (at the beginning of file). -data LanguageDef s u m = - LanguageDef { +space :: Stream s m t => ParsecT s u m () -> + ParsecT s u m () -> ParsecT s u m () -> ParsecT s u m () +space ch line block = hidden . skipMany $ choice [ch, line, block] - -- | The parser is used to parse single white space character. If - -- indentation is important in your language you should probably not treat - -- newline as white space character. Also note that if newline is not - -- white space character, you will need to pick it up manually. +-- | This is wrapper for lexemes. Typical usage is to supply first argument +-- (parser that consumes white space, probably defined via 'space') and use +-- resulting function to wrap parsers for every lexeme. +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer - spaceChar :: ParsecT s u m Char +lexeme :: ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +lexeme spc p = p <* spc - -- | The parser parses line comments. It's responsibility of the parser to - -- stop at the end of line. If your language doesn't support this type of - -- comments, set this value to 'empty'. In simple cases you can use - -- 'skipLineComment' to quickly construct line comment parser. +-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the +-- first argument (parser that consumes white space, probably defined via +-- 'space') and then you can use the resulting function to parse strings: +-- +-- > symbol = L.symbol spaceConsumer +-- > +-- > parens = between (symbol "(") (symbol ")") +-- > braces = between (symbol "{") (symbol "}") +-- > angles = between (symbol "<") (symbol ">") +-- > brackets = between (symbol "[") (symbol "]") +-- > semicolon = symbol ";" +-- > comma = symbol "," +-- > colon = symbol ":" +-- > dot = symbol "." - , lineComment :: ParsecT s u m () +symbol :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol spc = lexeme spc . C.string - -- | The parser parses block (multi-line) comments. If your language - -- doesn't support this type of comments, set this value to 'empty'. In - -- simple cases you can use 'skipBlockComment' to quickly construct block - -- comment parser. +-- | Case-insensitive version of 'symbol'. This may be helpful if you're +-- working with case-insensitive languages. - , blockComment :: ParsecT s u m () +symbol' :: Stream s m Char => + ParsecT s u m () -> String -> ParsecT s u m String +symbol' spc = lexeme spc . C.string' - -- NEXT +-- | @indentGuard spaceConsumer test@ first consumes all white space +-- (indentation) with @spaceConsumer@ parser, then it checks column +-- position. It should satisfy supplied predicate @test@, otherwise the +-- parser fails with error message “incorrect indentation”. On success +-- current column position is returned. +-- +-- When you want to parse block of indentation first run this parser with +-- predicate like @(> 0)@ — this will make sure you have some +-- indentation. Use returned value to check indentation on every subsequent +-- line according to syntax of your language. - -- | This parser should accept any start characters of identifiers, for - -- example @letter \<|> char \'_\'@. - - , identStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of identifiers, - -- for example @alphaNum \<|> char \'_\'@. - - , identLetter :: ParsecT s u m Char - - -- | This parser should accept any start characters of operators, for - -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@ - - , opStart :: ParsecT s u m Char - - -- | This parser should accept any legal tail characters of operators. - -- Note that this parser should even be defined if the language doesn't - -- support user-defined operators, or otherwise the 'reservedOp' parser - -- won't work correctly. - - , opLetter :: ParsecT s u m Char - - -- | The list of reserved identifiers. - - , reservedNames :: [String] - - -- | The list of reserved operators. - - , reservedOpNames :: [String] - - -- | Set to 'True' if the language is case sensitive. - - , caseSensitive :: Bool } - --- Default language definition - --- | This is standard language definition. It is recommended to use --- this definition as the basis for other definitions. @defaultLang@ has no --- reserved names or operators, is case sensitive and doesn't accept --- comments, identifiers or operators. - -defaultLang :: Stream s m Char => LanguageDef s u m -defaultLang = - LanguageDef - { spaceChar = C.spaceChar - , lineComment = empty - , blockComment = empty - -- NEXT - , identStart = C.letterChar <|> C.char '_' - , identLetter = C.alphaNumChar <|> C.oneOf "_'" - , opStart = opLetter defaultLang - , opLetter = C.oneOf ":!#$%&*+./<=>?@\\^|-~" - , reservedOpNames = [] - , reservedNames = [] - , caseSensitive = True } - --- Utility functions +indentGuard :: Stream s m t => + ParsecT s u m () -> (Int -> Bool) -> ParsecT s u m Int +indentGuard spc p = do + spc + pos <- sourceColumn <$> getPosition + if p pos + then return pos + else fail "incorrect indentation" -- | Given comment prefix this function returns parser that skips line -- comments. Note that it stops just before newline character but doesn't -- consume the newline. Newline is either supposed to be consumed by 'space' --- parser or picked manually. +-- parser or picked up manually. skipLineComment :: Stream s m Char => String -> ParsecT s u m () skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) @@ -138,437 +148,122 @@ skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m () skipBlockComment start end = C.string start >> void (manyTill C.anyChar n) - where n = lookAhead (C.string end) + where n = lookAhead . try $ C.string end --- Lexer +-- Character and string literals --- | The type of the record that holds lexical parsers that work on --- @s@ streams with state @u@ over a monad @m@. - -data Lexer s u m = - Lexer { - - -- | Skips any white space. White space consists of /zero/ or more - -- occurrences of 'spaceChar', a line comment or a block (multi-line) - -- comment. - - space :: ParsecT s u m () - - -- | @lexeme p@ first applies parser @p@ and then the 'space' parser, - -- returning the value of @p@. Every lexical token (lexeme) is defined - -- using @lexeme@, this way every parse starts at a point without white - -- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this - -- document. - -- - -- The only point where the 'space' parser should be called explicitly is - -- the start of the main parser in order to skip any leading white space. - - , lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips - -- trailing white space. - - , symbol :: String -> ParsecT s u m String - - -- | @indentGuard p@ consumes all white space it can consume, then checks - -- column number. The column number should satisfy given predicate @p@, - -- otherwise the parser fails with “incorrect indentation” message. In - -- successful cases @indentGuard@ returns current column number. - - , indentGuard :: (Int -> Bool) -> ParsecT s u m Int - - -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis, - -- returning the value of @p@. - - , parens :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and - -- “}”), returning the value of @p@. - - , braces :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<” - -- and “>”), returning the value of @p@. - - , angles :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[” - -- and “]”), returning the value of @p@. - - , brackets :: forall a. ParsecT s u m a -> ParsecT s u m a - - -- | Lexeme parser @semicolon@ parses the character “;” and skips any - -- trailing white space. Returns the string “;”. - - , semicolon :: ParsecT s u m String - - -- | Lexeme parser @comma@ parses the character “,” and skips any - -- trailing white space. Returns the string “,”. - - , comma :: ParsecT s u m String - - -- | Lexeme parser @colon@ parses the character “:” and skips any - -- trailing white space. Returns the string “:”. - - , colon :: ParsecT s u m String - - -- | Lexeme parser @dot@ parses the character “.” and skips any - -- trailing white space. Returns the string “.”. - - , dot :: ParsecT s u m String - - -- | The lexeme parser parses a single literal character. Returns the - -- literal character value. This parsers deals correctly with escape - -- sequences. The literal character is parsed according to the grammar - -- rules defined in the Haskell report (which matches most programming - -- languages quite closely). - - , charLiteral :: ParsecT s u m Char - - -- | The lexeme parser parses a literal string. Returns the literal - -- string value. This parsers deals correctly with escape sequences and - -- gaps. The literal string is parsed according to the grammar rules - -- defined in the Haskell report (which matches most programming languages - -- quite closely). - - , stringLiteral :: ParsecT s u m String - - -- | The lexeme parser parses an integer (a whole number). This parser - -- /does not/ parse sign. Returns the value of the number. The number can - -- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is - -- parsed according to the grammar rules in the Haskell report. - - , integer :: ParsecT s u m Integer - - -- | This is just like 'integer', except it can parse sign. - - , integer' :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the decimal system. - -- Returns the value of the number. - - , decimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the hexadecimal - -- system. The number should be prefixed with “0x” or “0X”. Returns the - -- value of the number. - - , hexadecimal :: ParsecT s u m Integer - - -- | The lexeme parses a positive whole number in the octal system. - -- The number should be prefixed with “0o” or “0O”. Returns the value of - -- the number. - - , octal :: ParsecT s u m Integer - - -- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and - -- then runs parser @p@, changing sign of its result accordingly. Note - -- that there may be white space after the sign but not before it. - - , signed :: forall a. Num a => ParsecT s u m a -> ParsecT s u m a - - -- | The lexeme parser parses a floating point value. Returns the value - -- of the number. The number is parsed according to the grammar rules - -- defined in the Haskell report, sign is /not/ parsed, use 'float'' to - -- achieve parsing of signed floating point values. - - , float :: ParsecT s u m Double - - -- | This is just like 'float', except it can parse sign. - - , float' :: ParsecT s u m Double - - -- | The lexeme parser parses either 'integer' or a 'float'. - -- Returns the value of the number. This parser deals with any overlap in - -- the grammar rules for integers and floats. The number is parsed - -- according to the grammar rules defined in the Haskell report. - - , number :: ParsecT s u m (Either Integer Double) - - -- | This is just like 'number', except it can parse sign. - - , number' :: ParsecT s u m (Either Integer Double) - - -- | The lexeme parser parses a legal identifier. Returns the identifier - -- string. This parser will fail on identifiers that are reserved - -- words. Legal identifier (start) characters and reserved words are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - , identifier :: ParsecT s u m String - - -- | The lexeme parser @reserved name@ parses @symbol name@, but it also - -- checks that the @name@ is not a prefix of a valid identifier. - - , reserved :: String -> ParsecT s u m () - - -- | The lexeme parser parses a legal operator. Returns the name of the - -- operator. This parser will fail on any operators that are reserved - -- operators. Legal operator (start) characters and reserved operators are - -- defined in the 'LanguageDef' that is passed to 'makeLexer'. - - , operator :: ParsecT s u m String - - -- | The lexeme parser @reservedOp name@ parses @symbol name@, but it - -- also checks that the @name@ is not a prefix of a valid operator. - - , reservedOp :: String -> ParsecT s u m () } - --- | The expression @makeLexer language@ creates a 'Lexer' record that --- contains lexical parsers that are defined using the definitions in the --- @language@ record. +-- | The lexeme parser parses a single literal character without +-- quotes. Purpose of this parser is to help with parsing of commonly used +-- escape sequences. It's your responsibility to take care of character +-- literal syntax in your language (surround it with single quotes or +-- similar). -- --- The use of this function is quite stylized — one imports the appropriate --- language definition and selects the lexical parsers that are needed from --- the resulting 'Lexer'. +-- The literal character is parsed according to the grammar rules defined in +-- the Haskell report. -- --- > module Main (main) where --- > --- > import Text.Megaparsec --- > import Text.Megaparsec.Language (haskellDef) --- > import qualified Text.Megaparsec.Lexer as L --- > --- > -- The parser --- > … --- > --- > expr = parens expr --- > <|> identifier --- > <|> … --- > --- > -- The lexer --- > lexer = L.makeLexer haskellDef --- > --- > parens = L.parens lexer --- > braces = L.braces lexer --- > identifier = L.identifier lexer --- > reserved = L.reserved lexer --- > … +-- Note that you can use this parser as a building block to parse various +-- string literals: +-- +-- > stringLiteral = char '"' >> manyTill L.charLiteral (char '"') -makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m -makeLexer lang = - Lexer - { space = space - , lexeme = lexeme - , symbol = symbol - , indentGuard = indentGuard +charLiteral :: Stream s m Char => ParsecT s u m Char +charLiteral = label "literal character" $ do + r@(x:_) <- lookAhead $ count' 1 8 C.anyChar + case listToMaybe (readLitChar r) of + Just (c, r') -> count (length r - length r') C.anyChar >> return c + Nothing -> unexpected (showToken x) - , parens = parens - , braces = braces - , angles = angles - , brackets = brackets - , semicolon = semicolon - , comma = comma - , colon = colon - , dot = dot +-- Numbers - , charLiteral = charLiteral - , stringLiteral = stringLiteral +-- | Parse an integer without sign in decimal representation (according to +-- format of integer literals described in Haskell report). +-- +-- If you need to parse signed integers, see 'signed' combinator. - , integer = integer - , integer' = integer' - , decimal = decimal - , hexadecimal = hexadecimal - , octal = octal - , signed = signed - , float = float - , float' = float' - , number = number - , number' = number' +integer :: Stream s m Char => ParsecT s u m Integer +integer = decimal "integer" - , identifier = identifier - , reserved = reserved - , operator = operator - , reservedOp = reservedOp } - where +-- | The same as 'integer', but 'integer' is 'label'ed with “integer” label, +-- while this parser is not labeled. - -- white space & indentation +decimal :: Stream s m Char => ParsecT s u m Integer +decimal = nump "" C.digitChar - space = hidden . skipMany . choice $ - ($ lang) <$> [void . spaceChar, blockComment, lineComment] - lexeme p = p <* space - symbol = lexeme . C.string - indentGuard p = do - space - pos <- sourceColumn <$> getPosition - if p pos - then return pos - else fail "incorrect indentation" +-- | Parse an integer in hexadecimal representation. Representation of +-- hexadecimal number is expected to be according to Haskell report, that +-- is, it should be prefixed with “0x” or “0X” prefix. - -- auxiliary parsers +hexadecimal :: Stream s m Char => ParsecT s u m Integer +hexadecimal = C.char '0' >> C.char' 'x' >> nump "0x" C.hexDigitChar - parens = between (symbol "(") (symbol ")") - braces = between (symbol "{") (symbol "}") - angles = between (symbol "<") (symbol ">") - brackets = between (symbol "[") (symbol "]") - semicolon = symbol ";" - comma = symbol "," - colon = symbol ":" - dot = symbol "." +-- | Parse an integer in octal representation. Representation of octal +-- number is expected to be according to Haskell report, that is, it should +-- be prefixed with “0o” or “0O” prefix. - -- char & string literals +octal :: Stream s m Char => ParsecT s u m Integer +octal = C.char '0' >> C.char' 'o' >> nump "0o" C.octDigitChar - charLiteral = lexeme ( between (C.char '\'') - (C.char '\'' "end of character") - characterChar ) - "character" +-- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then +-- prepends @prefix@ to returned value and tries to interpret the result as +-- an integer according to Haskell syntax. - characterChar = charLetter <|> charEscape "literal character" +nump :: String -> ParsecT s u m Char -> ParsecT s u m Integer +nump prefix baseDigit = read . (prefix ++) <$> some baseDigit - charEscape = C.char '\\' >> escapeCode - charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) +-- | Parse a floating point value without sign. Representation of floating +-- point value is expected to be according to Haskell report. +-- +-- If you need to parse signed floats, see 'signed' combinator. - stringLiteral = - lexeme ((foldr (maybe id (:)) "" <$> - between (C.char '"') (C.char '"' "end of string") - (many stringChar)) "literal string") +float :: Stream s m Char => ParsecT s u m Double +float = label "float" $ read <$> f + where f = do + d <- some C.digitChar + rest <- fraction <|> fExp + return $ d ++ rest - stringChar = (Just <$> stringLetter) <|> stringEscape "string character" +-- | This is a helper for 'float' parser. It parses fractional part of +-- floating point number, that is, dot and everything after it. - stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) +fraction :: Stream s m Char => ParsecT s u m String +fraction = do + void $ C.char '.' + d <- some C.digitChar + e <- option "" fExp + return $ '.' : d ++ e - stringEscape = C.char '\\' >> - ( (escapeGap >> return Nothing) <|> - (escapeEmpty >> return Nothing) <|> - (Just <$> escapeCode) ) +-- | This helper parses exponent of floating point numbers. - escapeEmpty = C.char '&' - escapeGap = some C.spaceChar >> C.char '\\' "end of string gap" +fExp :: Stream s m Char => ParsecT s u m String +fExp = do + expChar <- C.char' 'e' + signStr <- option "" (pure <$> choice (C.char <$> "+-")) + d <- some C.digitChar + return $ expChar : signStr ++ d - -- escape codes +-- | Parse a number: either integer or floating point. The parser can handle +-- overlapping grammars graciously. - escapeCode = charEsc <|> charNum <|> charAscii <|> charControl - "escape code" +number :: Stream s m Char => ParsecT s u m (Either Integer Double) +number = (Right <$> try float) <|> (Left <$> integer) "number" - charEsc = choice (parseEsc <$> escMap) - where parseEsc (c, code) = C.char c >> return code +-- | @signed space p@ parser parses optional sign, then if there is a sign +-- it will consume optional white space (using @space@ parser), then it runs +-- parser @p@ which should return a number. Sign of the number is changed +-- according to previously parsed sign. +-- +-- For example, to parse signed integer you can write: +-- +-- > lexeme = L.lexeme spaceConsumer +-- > integer = lexeme L.integer +-- > signedInteger = signed spaceConsumer integer - charNum = toEnum . fromInteger <$> - ( decimal <|> - (C.char 'o' >> nump "0o" C.octDigitChar) <|> - (C.char 'x' >> nump "0x" C.hexDigitChar) ) +signed :: (Stream s m Char, Num a) => + ParsecT s u m () -> ParsecT s u m a -> ParsecT s u m a +signed spc p = ($) <$> option id (lexeme spc sign) <*> p - charAscii = choice (parseAscii <$> asciiMap) - where parseAscii (asc, code) = try (C.string asc >> return code) +-- | Parse a sign and return either 'id' or 'negate' according to parsed +-- sign. - charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar) - - -- escape code tables - - escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" - asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) - - ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", - "FS","GS","RS","US","SP"] - ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", - "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", - "CAN","SUB","ESC","DEL"] - - ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " - ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" - - -- numbers — integers - - integer = decimal - integer' = signed integer - - decimal = lexeme (nump "" C.digitChar "integer") - hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar - octal = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar - - nump prefix baseDigit = read . (prefix ++) <$> some baseDigit - - signed p = ($) <$> option id (lexeme sign) <*> p - - sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) - sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) - - -- numbers — floats - - float = lexeme ffloat "float" - float' = signed float - - ffloat = read <$> ffloat' - where - ffloat' = do - decimal <- fDec - rest <- fraction <|> fExp - return $ decimal ++ rest - - fraction = do - void $ C.char '.' - decimal <- fDec - exp <- option "" fExp - return $ '.' : decimal ++ exp - - fDec = some C.digitChar - - fExp = do - expChar <- C.oneOf "eE" - signStr <- option "" (pure <$> C.oneOf "+-") - decimal <- fDec - return $ expChar : signStr ++ decimal - - -- numbers — a more general case - - number = (Right <$> try float) <|> (Left <$> integer) "number" - number' = (Right <$> try float') <|> (Left <$> integer') "number" - - -- operators & reserved ops - - reservedOp name = - lexeme $ try $ do - void $ C.string name - notFollowedBy (opLetter lang) ("end of " ++ show name) - - operator = - lexeme $ try $ do - name <- oper - if isReservedOp name - then unexpected ("reserved operator " ++ show name) - else return name - - oper = ((:) <$> opStart lang <*> many (opLetter lang)) - "operator" - - isReservedOp = isReserved . sort $ reservedOpNames lang - - -- identifiers & reserved words - - reserved name = - lexeme $ try $ do - void $ caseString name - notFollowedBy (identLetter lang) ("end of " ++ show name) - - caseString name - | caseSensitive lang = C.string name - | otherwise = walk name >> return name - where walk = foldr (\c -> ((caseChar c show name) >>)) (return ()) - caseChar c - | isAlpha c = C.char (toLower c) <|> C.char (toUpper c) - | otherwise = C.char c - - identifier = - lexeme $ try $ do - name <- ident - if isReservedName name - then unexpected ("reserved word " ++ show name) - else return name - - ident = ((:) <$> identStart lang <*> many (identLetter lang)) - "identifier" - - isReservedName name = isReserved theReservedNames caseName - where caseName - | caseSensitive lang = name - | otherwise = toLower <$> name - - isReserved names name = scan names - where scan [] = False - scan (r:rs) = case compare r name of - LT -> scan rs - EQ -> True - GT -> False - - theReservedNames - | caseSensitive lang = sort reserved - | otherwise = sort . fmap (fmap toLower) $ reserved - where reserved = reservedNames lang +sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) +sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate) diff --git a/megaparsec.cabal b/megaparsec.cabal index d988ffb..a914b94 100644 --- a/megaparsec.cabal +++ b/megaparsec.cabal @@ -115,6 +115,7 @@ test-suite old-tests main-is: Main.hs hs-source-dirs: old-tests type: exitcode-stdio-1.0 + ghc-options: -O2 -Wall other-modules: Bugs , Bugs.Bug2 , Bugs.Bug6 @@ -125,7 +126,8 @@ test-suite old-tests , HUnit == 1.2.* , test-framework >= 0.6 && < 1 , test-framework-hunit >= 0.2 && < 0.4 - ghc-options: -O2 -Wall + default-extensions: + FlexibleContexts default-language: Haskell2010 test-suite tests diff --git a/old-tests/Bugs/Bug2.hs b/old-tests/Bugs/Bug2.hs index 5cee124..6696c1e 100644 --- a/old-tests/Bugs/Bug2.hs +++ b/old-tests/Bugs/Bug2.hs @@ -1,13 +1,26 @@ module Bugs.Bug2 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit +import Control.Applicative (empty) +import Control.Monad (void) import Text.Megaparsec +import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + +stringLiteral :: Parser String +stringLiteral = lexeme $ char '"' >> manyTill L.charLiteral (char '"') + main :: Test main = testCase "Control Char Parsing (#2)" $ @@ -15,7 +28,6 @@ main = where parseString :: String -> String parseString input = - case parse parser "Example" input of + case parse stringLiteral "Example" input of Left{} -> error "Parse failure" Right str -> str - parser = L.stringLiteral $ L.makeLexer L.defaultLang diff --git a/old-tests/Bugs/Bug35.hs b/old-tests/Bugs/Bug35.hs index 1bde9cd..ace833c 100644 --- a/old-tests/Bugs/Bug35.hs +++ b/old-tests/Bugs/Bug35.hs @@ -2,12 +2,11 @@ module Bugs.Bug35 (main) where import Text.Megaparsec -import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) trickyFloats :: [String] trickyFloats = @@ -27,12 +26,9 @@ trickyFloats = , "32.96176575630599" , "38.47735512322269" ] -float :: Parser Double -float = L.float (L.makeLexer L.defaultLang) - testBatch :: Assertion testBatch = mapM_ testFloat trickyFloats - where testFloat x = parse float "" x @?= Right (read x :: Double) + where testFloat x = parse L.float "" x @?= Right (read x :: Double) main :: Test main = testCase "Output of Text.Megaparsec.Lexer.float (#35)" testBatch diff --git a/old-tests/Bugs/Bug39.hs b/old-tests/Bugs/Bug39.hs index 14a1c25..91b88e0 100644 --- a/old-tests/Bugs/Bug39.hs +++ b/old-tests/Bugs/Bug39.hs @@ -1,15 +1,17 @@ module Bugs.Bug39 (main) where +import Control.Applicative (empty) +import Control.Monad (void) import Data.Either (isLeft, isRight) import Text.Megaparsec import Text.Megaparsec.String import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) shouldFail :: [String] shouldFail = [" 1", " +1", " -1"] @@ -17,8 +19,14 @@ shouldFail = [" 1", " +1", " -1"] shouldSucceed :: [String] shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "] +sc :: Parser () +sc = L.space (void spaceChar) empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + integer :: Parser Integer -integer = L.integer' (L.makeLexer L.defaultLang) +integer = lexeme $ L.signed sc L.integer testBatch :: Assertion testBatch = mapM_ (f testFail) shouldFail >> diff --git a/old-tests/Bugs/Bug6.hs b/old-tests/Bugs/Bug6.hs index 55b1363..fc8f736 100644 --- a/old-tests/Bugs/Bug6.hs +++ b/old-tests/Bugs/Bug6.hs @@ -1,13 +1,13 @@ module Bugs.Bug6 (main) where -import Test.HUnit hiding (Test) -import Test.Framework -import Test.Framework.Providers.HUnit - import Text.Megaparsec import Text.Megaparsec.String +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) + import Util main :: Test @@ -19,5 +19,5 @@ variable :: Parser String variable = do x <- lookAhead (some letterChar) if x == "return" - then fail "'return' is a reserved keyword" - else string x + then fail "'return' is a reserved keyword" + else string x diff --git a/old-tests/Bugs/Bug9.hs b/old-tests/Bugs/Bug9.hs index 79d7205..734be38 100644 --- a/old-tests/Bugs/Bug9.hs +++ b/old-tests/Bugs/Bug9.hs @@ -1,14 +1,17 @@ module Bugs.Bug9 (main) where +import Control.Applicative (empty) +import Control.Monad (void) + import Text.Megaparsec -import Text.Megaparsec.String (Parser) import Text.Megaparsec.Expr +import Text.Megaparsec.String (Parser) import qualified Text.Megaparsec.Lexer as L -import Test.HUnit hiding (Test) import Test.Framework import Test.Framework.Providers.HUnit +import Test.HUnit hiding (Test) import Util @@ -24,12 +27,21 @@ main = -- Syntax analysis +sc :: Stream s m Char => ParsecT s u m () +sc = L.space (void spaceChar) empty empty + +lexeme :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a +lexeme = L.lexeme sc + +integer :: Stream s m Char => ParsecT s u m Integer +integer = lexeme L.integer + +operator :: Stream s m Char => String -> ParsecT s u m String +operator op = L.symbol sc op "operator" + parseTopLevel :: Parser Expr parseTopLevel = parseExpr <* eof parseExpr :: Parser Expr parseExpr = makeExprParser (Const <$> integer) table - where table = [[ InfixL (Op <$ reserved ">>>") ]] - lexer = L.makeLexer L.defaultLang { L.reservedOpNames = [">>>"] } - integer = L.integer lexer - reserved = L.reserved lexer + where table = [[ InfixL (Op <$ operator ">>>") ]] From 3de3f6965dff300d910178ed0035498e425386fc Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Wed, 9 Sep 2015 14:15:39 +0600 Subject: [PATCH 04/10] =?UTF-8?q?make=20=E2=80=98hexadecimal=E2=80=99=20an?= =?UTF-8?q?d=20=E2=80=98octal=E2=80=99=20more=20powerful?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various languages may vary in how hexadecimal and octal literals should be prefixed. Following the spirit of the new lexer we leave this to programmer to decide. --- Text/Megaparsec/Lexer.hs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index ddc3088..c4a7fc6 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -190,18 +190,26 @@ decimal :: Stream s m Char => ParsecT s u m Integer decimal = nump "" C.digitChar -- | Parse an integer in hexadecimal representation. Representation of --- hexadecimal number is expected to be according to Haskell report, that --- is, it should be prefixed with “0x” or “0X” prefix. +-- hexadecimal number is expected to be according to Haskell report except +-- for the fact that this parser doesn't parse “0x” or “0X” prefix. It is +-- reponsibility of the programmer to parse correct prefix before parsing +-- the number itself. +-- +-- For example you can make it conform to Haskell report like this: +-- +-- > hexadecimal = char '0' >> char' 'x' >> L.hexadecimal hexadecimal :: Stream s m Char => ParsecT s u m Integer -hexadecimal = C.char '0' >> C.char' 'x' >> nump "0x" C.hexDigitChar +hexadecimal = nump "0x" C.hexDigitChar -- | Parse an integer in octal representation. Representation of octal --- number is expected to be according to Haskell report, that is, it should --- be prefixed with “0o” or “0O” prefix. +-- number is expected to be according to Haskell report except for the fact +-- that this parser doesn't parse “0o” or “0O” prefix. It is responsibility +-- of the programmer to parse correct prefix before parsing the number +-- itself. octal :: Stream s m Char => ParsecT s u m Integer -octal = C.char '0' >> C.char' 'o' >> nump "0o" C.octDigitChar +octal = nump "0o" C.octDigitChar -- | @nump prefix p@ parses /one/ or more characters with @p@ parser, then -- prepends @prefix@ to returned value and tries to interpret the result as From 4e8a1c298abc6c9d36d1fd5cfccf08f4b2e96e4a Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Fri, 11 Sep 2015 15:10:14 +0600 Subject: [PATCH 05/10] fixed the failing old test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-character operators should use ‘try’ in order to be reported correctly (as “operator”). I've mentioned it in doc-string of ‘makeExprParser’. It's tempting to include ‘try’ directly in expression parsing code, but following general spirit of Parsec toward ‘try’, I think current solution is the best. --- Text/Megaparsec/Expr.hs | 12 +++++++----- old-tests/Bugs/Bug9.hs | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Text/Megaparsec/Expr.hs b/Text/Megaparsec/Expr.hs index 02b896a..be03039 100644 --- a/Text/Megaparsec/Expr.hs +++ b/Text/Megaparsec/Expr.hs @@ -41,11 +41,10 @@ data Operator s u m a -- descending precedence. All operators in one list have the same precedence -- (but may have a different associativity). -- --- Prefix and postfix --- operators of the same precedence can only occur once (i.e. @--2@ is not --- allowed if @-@ is prefix negate). Prefix and postfix operators of the --- same precedence associate to the left (i.e. if @++@ is postfix increment, --- than @-2++@ equals @-1@, not @-3@). +-- Prefix and postfix operators of the same precedence can only occur once +-- (i.e. @--2@ is not allowed if @-@ is prefix negate). Prefix and postfix +-- operators of the same precedence associate to the left (i.e. if @++@ is +-- postfix increment, than @-2++@ equals @-1@, not @-3@). -- -- The @makeExprParser@ takes care of all the complexity involved in -- building expression parser. Here is an example of an expression parser @@ -66,6 +65,9 @@ data Operator s u m a -- > binary name f = InfixL (reservedOp name >> return f) -- > prefix name f = Prefix (reservedOp name >> return f) -- > postfix name f = Postfix (reservedOp name >> return f) +-- +-- Please note that multi-character operators should use 'try' in order to +-- be reported correctly in error messages. makeExprParser :: Stream s m t => ParsecT s u m a -> [[Operator s u m a]] -> ParsecT s u m a diff --git a/old-tests/Bugs/Bug9.hs b/old-tests/Bugs/Bug9.hs index 734be38..82e9ba0 100644 --- a/old-tests/Bugs/Bug9.hs +++ b/old-tests/Bugs/Bug9.hs @@ -37,7 +37,7 @@ integer :: Stream s m Char => ParsecT s u m Integer integer = lexeme L.integer operator :: Stream s m Char => String -> ParsecT s u m String -operator op = L.symbol sc op "operator" +operator = try . L.symbol sc parseTopLevel :: Parser Expr parseTopLevel = parseExpr <* eof From de16f4242f3486654c4a80288c063502d10f113d Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Fri, 11 Sep 2015 17:15:46 +0600 Subject: [PATCH 06/10] =?UTF-8?q?started=20writing=20tests=20for=20?= =?UTF-8?q?=E2=80=98Text.Megaparsec.Lexer=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the following combinators are tested: * ‘charLiteral’ * ‘integer’ * ‘decimal’ * ‘hexadecimal’ * ‘octal’ --- tests/Lexer.hs | 94 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/tests/Lexer.hs b/tests/Lexer.hs index cbbd187..003b44e 100644 --- a/tests/Lexer.hs +++ b/tests/Lexer.hs @@ -30,15 +30,103 @@ module Lexer (tests) where import Control.Applicative (some, (<|>)) +import Data.Char (readLitChar, showLitChar) +import Data.Maybe (listToMaybe, isNothing, fromJust) +import Numeric (showInt, showHex, showOct, showSigned) import Test.Framework import Test.Framework.Providers.QuickCheck2 (testProperty) import Test.QuickCheck -import Text.Megaparsec.Char -import Text.Megaparsec.Combinator +-- import Text.Megaparsec.Combinator +import Text.Megaparsec.Error import Text.Megaparsec.Lexer +import Text.Megaparsec.Prim + +import Util tests :: Test tests = testGroup "Lexer" - [] + [ testProperty "space combinator" prop_space + , testProperty "space lexeme" prop_lexeme + , testProperty "space symbol" prop_symbol + , testProperty "space symbol'" prop_symbol' + , testProperty "space indentGuard" prop_indentGuard + , testProperty "space skipLineComment" prop_skipLineComment + , testProperty "space skipBlockComment" prop_skipBlockComment + , testProperty "space charLiteral" prop_charLiteral + , testProperty "space integer" prop_integer + , testProperty "space decimal" prop_decimal + , testProperty "space hexadecimal" prop_hexadecimal + , testProperty "space octal" prop_octal + , testProperty "space float" prop_float + , testProperty "space number" prop_number + , testProperty "space signed" prop_signed ] + +prop_space :: Property +prop_space = property True + +prop_lexeme :: Property +prop_lexeme = property True + +prop_symbol :: Property +prop_symbol = property True + +prop_symbol' :: Property +prop_symbol' = property True + +prop_indentGuard :: Property +prop_indentGuard = property True + +prop_skipLineComment :: Property +prop_skipLineComment = property True + +prop_skipBlockComment :: Property +prop_skipBlockComment = property True + +prop_charLiteral :: String -> Bool -> Property +prop_charLiteral t i = checkParser charLiteral r s + where b = listToMaybe $ readLitChar s + (h, g) = fromJust b + r | isNothing b = posErr 0 s $ exSpec "literal character" : + [ if null s then uneEof else uneCh (head s) ] + | null g = Right h + | otherwise = posErr l s [uneCh (head g), exEof] + l = length s - length g + s = if null t || i then t else showLitChar (head t) (tail t) + +prop_integer :: NonNegative Integer -> Int -> Property +prop_integer n' i = checkParser integer r s + where (r, s) = quasiCorrupted n' i showInt "integer" + +prop_decimal :: NonNegative Integer -> Int -> Property +prop_decimal n' i = checkParser decimal r s + where (r, s) = quasiCorrupted n' i showInt "digit" + +prop_hexadecimal :: NonNegative Integer -> Int -> Property +prop_hexadecimal n' i = checkParser hexadecimal r s + where (r, s) = quasiCorrupted n' i showHex "hexadecimal digit" + +prop_octal :: NonNegative Integer -> Int -> Property +prop_octal n' i = checkParser octal r s + where (r, s) = quasiCorrupted n' i showOct "octal digit" + +prop_float :: Property +prop_float = property True + +prop_number :: Property +prop_number = property True + +prop_signed :: Property +prop_signed = property True + +quasiCorrupted :: NonNegative Integer -> Int + -> (Integer -> String -> String) -> String + -> (Either ParseError Integer, String) +quasiCorrupted n' i shower l = (r, s) + where n = getNonNegative n' + r | i > length z = Right n + | otherwise = posErr i s $ [uneCh '?', exSpec l] ++ + [ exEof | i > 0 ] + z = shower n "" + s = if i <= length z then take i z ++ "?" ++ drop i z else z From 26f3039c8b721ceb6081e1786f61472999514ce1 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Sat, 12 Sep 2015 17:06:58 +0600 Subject: [PATCH 07/10] =?UTF-8?q?added=20more=20tests=20for=20=E2=80=98Tex?= =?UTF-8?q?t.Megaparsec.Lexer=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following functions are covered: * ‘float’ * ‘number’ * ‘signed’ --- tests/Lexer.hs | 76 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/tests/Lexer.hs b/tests/Lexer.hs index 003b44e..c4047d3 100644 --- a/tests/Lexer.hs +++ b/tests/Lexer.hs @@ -29,8 +29,8 @@ module Lexer (tests) where -import Control.Applicative (some, (<|>)) -import Data.Char (readLitChar, showLitChar) +import Data.Char (readLitChar, showLitChar, isDigit) +import Data.List (findIndices) import Data.Maybe (listToMaybe, isNothing, fromJust) import Numeric (showInt, showHex, showOct, showSigned) @@ -38,30 +38,31 @@ import Test.Framework import Test.Framework.Providers.QuickCheck2 (testProperty) import Test.QuickCheck --- import Text.Megaparsec.Combinator import Text.Megaparsec.Error import Text.Megaparsec.Lexer import Text.Megaparsec.Prim +import qualified Text.Megaparsec.Char as C import Util tests :: Test tests = testGroup "Lexer" - [ testProperty "space combinator" prop_space - , testProperty "space lexeme" prop_lexeme - , testProperty "space symbol" prop_symbol - , testProperty "space symbol'" prop_symbol' - , testProperty "space indentGuard" prop_indentGuard - , testProperty "space skipLineComment" prop_skipLineComment - , testProperty "space skipBlockComment" prop_skipBlockComment - , testProperty "space charLiteral" prop_charLiteral - , testProperty "space integer" prop_integer - , testProperty "space decimal" prop_decimal - , testProperty "space hexadecimal" prop_hexadecimal - , testProperty "space octal" prop_octal - , testProperty "space float" prop_float - , testProperty "space number" prop_number - , testProperty "space signed" prop_signed ] + [ testProperty "space combinator" prop_space + , testProperty "lexeme combinator" prop_lexeme + , testProperty "symbol combinator" prop_symbol + , testProperty "symbol' combinator" prop_symbol' + , testProperty "indentGuard combinator" prop_indentGuard + , testProperty "skipLineComment combinator" prop_skipLineComment + , testProperty "skipBlockComment combinator" prop_skipBlockComment + , testProperty "charLiteral" prop_charLiteral + , testProperty "integer" prop_integer + , testProperty "decimal" prop_decimal + , testProperty "hexadecimal" prop_hexadecimal + , testProperty "octal" prop_octal + , testProperty "float 0" prop_float_0 + , testProperty "float 1" prop_float_1 + , testProperty "number" prop_number + , testProperty "signed" prop_signed ] prop_space :: Property prop_space = property True @@ -111,14 +112,41 @@ prop_octal :: NonNegative Integer -> Int -> Property prop_octal n' i = checkParser octal r s where (r, s) = quasiCorrupted n' i showOct "octal digit" -prop_float :: Property -prop_float = property True +prop_float_0 :: NonNegative Double -> Property +prop_float_0 n' = checkParser float r s + where n = getNonNegative n' + r = Right n + s = show n -prop_number :: Property -prop_number = property True +prop_float_1 :: Maybe (NonNegative Integer) -> Property +prop_float_1 n' = checkParser float r s + where r | isNothing n' = posErr 0 s [uneEof, exSpec "float"] + | otherwise = posErr (length s) s [ uneEof, exCh '.', exCh 'E' + , exCh 'e', exSpec "digit" ] + s = maybe "" (show . getNonNegative) n' -prop_signed :: Property -prop_signed = property True +prop_number :: Either (NonNegative Integer) (NonNegative Double) + -> Integer -> Property +prop_number n' i = checkParser number r s + where r | null s = posErr 0 s [uneEof, exSpec "number"] + | otherwise = + Right $ case n' of + Left x -> Left $ getNonNegative x + Right x -> Right $ getNonNegative x + s = if i < 5 + then "" + else either (show . getNonNegative) (show . getNonNegative) n' + +prop_signed :: Integer -> Int -> Bool -> Property +prop_signed n i plus = checkParser p r s + where p = signed (hidden C.space) integer + r | i > length z = Right n + | otherwise = posErr i s $ [uneCh '?', exSpec "integer"] ++ + (if i <= 0 then [exCh '+', exCh '-'] else []) ++ + [exEof | i > head (findIndices isDigit s)] + z = let bar = showSigned showInt 0 n "" + in if n < 0 || plus then bar else '+' : bar + s = if i <= length z then take i z ++ "?" ++ drop i z else z quasiCorrupted :: NonNegative Integer -> Int -> (Integer -> String -> String) -> String From 193d7ade079d8f9ab4b17a6f07831da10a0a9ac8 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Sun, 13 Sep 2015 18:51:15 +0600 Subject: [PATCH 08/10] =?UTF-8?q?fixed=20=E2=80=98skipLineComment=E2=80=99?= =?UTF-8?q?=20and=20=E2=80=98skipBlockComment=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-character staring/ending sequences should be wrapped with ‘try’. Also, ‘lookAhead’ should not be used in ‘skipBlockComment’. --- Text/Megaparsec/Lexer.hs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index c4a7fc6..037a976 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -140,15 +140,17 @@ indentGuard spc p = do -- parser or picked up manually. skipLineComment :: Stream s m Char => String -> ParsecT s u m () -skipLineComment prefix = C.string prefix >> void (manyTill C.anyChar n) - where n = lookAhead C.newline +skipLineComment prefix = p >> void (manyTill C.anyChar n) + where p = try $ C.string prefix + n = lookAhead C.newline -- | @skipBlockComment start end@ skips non-nested block comment starting -- with @start@ and ending with @end@. skipBlockComment :: Stream s m Char => String -> String -> ParsecT s u m () -skipBlockComment start end = C.string start >> void (manyTill C.anyChar n) - where n = lookAhead . try $ C.string end +skipBlockComment start end = p >> void (manyTill C.anyChar n) + where p = try $ C.string start + n = try $ C.string end -- Character and string literals From ec3b5934f0ef4c658b5061bfc0e44f90d0e97aea Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Sun, 13 Sep 2015 21:00:22 +0600 Subject: [PATCH 09/10] fix a typo (columns starts from 1) --- Text/Megaparsec/Lexer.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Text/Megaparsec/Lexer.hs b/Text/Megaparsec/Lexer.hs index 037a976..87c4677 100644 --- a/Text/Megaparsec/Lexer.hs +++ b/Text/Megaparsec/Lexer.hs @@ -121,7 +121,7 @@ symbol' spc = lexeme spc . C.string' -- current column position is returned. -- -- When you want to parse block of indentation first run this parser with --- predicate like @(> 0)@ — this will make sure you have some +-- predicate like @(> 1)@ — this will make sure you have some -- indentation. Use returned value to check indentation on every subsequent -- line according to syntax of your language. From 704f84f0181ba05a996d723c7c232b6bab3907bc Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Sun, 13 Sep 2015 21:00:39 +0600 Subject: [PATCH 10/10] =?UTF-8?q?finished=20tests=20for=20=E2=80=98Text.Me?= =?UTF-8?q?gaparsec.Lexer=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covered the rest of public functions: * ‘space’ * ‘symbol’ * ‘symbol'’ * ‘indentGuard’ * ‘skipLineComment’ * ‘skipBlockComment’ --- tests/Lexer.hs | 114 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 20 deletions(-) diff --git a/tests/Lexer.hs b/tests/Lexer.hs index c4047d3..344172f 100644 --- a/tests/Lexer.hs +++ b/tests/Lexer.hs @@ -29,9 +29,17 @@ module Lexer (tests) where -import Data.Char (readLitChar, showLitChar, isDigit) -import Data.List (findIndices) -import Data.Maybe (listToMaybe, isNothing, fromJust) +import Control.Applicative (empty) +import Control.Monad (void) +import Data.Char + ( readLitChar + , showLitChar + , isDigit + , isAlphaNum + , isSpace + , toLower ) +import Data.List (findIndices, isInfixOf) +import Data.Maybe (listToMaybe, maybeToList, isNothing, fromJust) import Numeric (showInt, showHex, showOct, showSigned) import Test.Framework @@ -40,7 +48,9 @@ import Test.QuickCheck import Text.Megaparsec.Error import Text.Megaparsec.Lexer +import Text.Megaparsec.Pos import Text.Megaparsec.Prim +import Text.Megaparsec.String import qualified Text.Megaparsec.Char as C import Util @@ -48,12 +58,9 @@ import Util tests :: Test tests = testGroup "Lexer" [ testProperty "space combinator" prop_space - , testProperty "lexeme combinator" prop_lexeme , testProperty "symbol combinator" prop_symbol , testProperty "symbol' combinator" prop_symbol' , testProperty "indentGuard combinator" prop_indentGuard - , testProperty "skipLineComment combinator" prop_skipLineComment - , testProperty "skipBlockComment combinator" prop_skipBlockComment , testProperty "charLiteral" prop_charLiteral , testProperty "integer" prop_integer , testProperty "decimal" prop_decimal @@ -64,26 +71,93 @@ tests = testGroup "Lexer" , testProperty "number" prop_number , testProperty "signed" prop_signed ] -prop_space :: Property -prop_space = property True +newtype WhiteSpace = WhiteSpace + { getWhiteSpace :: String } + deriving (Show, Eq) -prop_lexeme :: Property -prop_lexeme = property True +instance Arbitrary WhiteSpace where + arbitrary = WhiteSpace . concat <$> listOf whiteUnit -prop_symbol :: Property -prop_symbol = property True +newtype Symbol = Symbol + { getSymbol :: String } + deriving (Show, Eq) -prop_symbol' :: Property -prop_symbol' = property True +instance Arbitrary Symbol where + arbitrary = Symbol <$> ((++) <$> symbolName <*> whiteChars) -prop_indentGuard :: Property -prop_indentGuard = property True +whiteUnit :: Gen String +whiteUnit = oneof [whiteChars, whiteLine, whiteBlock] -prop_skipLineComment :: Property -prop_skipLineComment = property True +whiteChars :: Gen String +whiteChars = listOf $ elements "\t\n " -prop_skipBlockComment :: Property -prop_skipBlockComment = property True +whiteLine :: Gen String +whiteLine = commentOut <$> arbitrary `suchThat` goodEnough + where commentOut x = "//" ++ x ++ "\n" + goodEnough x = '\n' `notElem` x + +whiteBlock :: Gen String +whiteBlock = commentOut <$> arbitrary `suchThat` goodEnough + where commentOut x = "/*" ++ x ++ "*/" + goodEnough x = not $ "*/" `isInfixOf` x + +symbolName :: Gen String +symbolName = listOf $ arbitrary `suchThat` isAlphaNum + +sc :: Parser () +sc = space (void C.spaceChar) l b + where l = skipLineComment "//" + b = skipBlockComment "/*" "*/" + +sc' :: Parser () +sc' = space (void $ C.oneOf " \t") empty empty + +prop_space :: WhiteSpace -> Property +prop_space w = checkParser p r s + where p = sc + r = Right () + s = getWhiteSpace w + +prop_symbol :: Symbol -> Maybe Char -> Property +prop_symbol = parseSymbol (symbol sc) id + +prop_symbol' :: Symbol -> Maybe Char -> Property +prop_symbol' = parseSymbol (symbol' sc) (fmap toLower) + +parseSymbol :: (String -> Parser String) -> (String -> String) + -> Symbol -> Maybe Char -> Property +parseSymbol p' f s' t = checkParser p r s + where p = p' (f g) + r | g == s || isSpace (last s) = Right (f g) + | otherwise = posErr (length s - 1) s [uneCh (last s), exEof] + g = takeWhile (not . isSpace) s + s = getSymbol s' ++ maybeToList t + +newtype IndLine = IndLine + { getIndLine :: String } + deriving (Show, Eq) + +instance Arbitrary IndLine where + arbitrary = IndLine . concat <$> sequence [spc, sym, spc, eol] + where spc = listOf (elements " \t") + sym = return "xxx" + eol = return "\n" + +prop_indentGuard :: IndLine -> IndLine -> IndLine -> Property +prop_indentGuard l0 l1 l2 = checkParser p r s + where p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp + ip = indentGuard sc' + sp = void $ symbol sc' "xxx" <* C.eol + r | f' l0 <= 1 = posErr 0 s msg' + | f' l1 /= f' l0 = posErr (f l1 + g [l0]) s msg' + | f' l2 <= f' l0 = posErr (f l2 + g [l0, l1]) s msg' + | otherwise = Right () + msg' = [msg "incorrect indentation"] + f = length . takeWhile isSpace . getIndLine + f' x = sourceColumn $ + updatePosString (initialPos "") $ take (f x) (getIndLine x) + g xs = sum $ length . getIndLine <$> xs + s = concat $ getIndLine <$> [l0, l1, l2] prop_charLiteral :: String -> Bool -> Property prop_charLiteral t i = checkParser charLiteral r s