From c5bcdfb220f90a1bbc216d12847b3c65f2417317 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Fri, 31 Jul 2015 17:30:38 +0600 Subject: [PATCH] rewritten parsing of numbers, fixes #2 and #3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed how numbers are parsed because they were parsed in a naïf and hairy way. Added tests for #2 and #3 (in old Parsec project these are number 35 and 39 respectively). * Since Haskell report doesn't say anything about sign, I've made ‘integer’ and ‘float’ parse numbers without sign. * Removed ‘natural’ parser, it's equal to new ‘integer’ now. * Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too. * Added new combinator ‘signed’ to parse all sorts of signed numbers. * For the sake of convenience I've added ‘integer'’, ‘float'’, and ‘number'’ combinators that also can parse signed numbers out of box. --- Text/MegaParsec/Token.hs | 238 +++++++++++++++++++-------------------- test/Bugs.hs | 6 +- test/Bugs/Bug35.hs | 39 +++++++ test/Bugs/Bug39.hs | 34 ++++++ 4 files changed, 195 insertions(+), 122 deletions(-) create mode 100644 test/Bugs/Bug35.hs create mode 100644 test/Bugs/Bug39.hs diff --git a/Text/MegaParsec/Token.hs b/Text/MegaParsec/Token.hs index a8ff8ce..21a191f 100644 --- a/Text/MegaParsec/Token.hs +++ b/Text/MegaParsec/Token.hs @@ -20,7 +20,7 @@ module Text.MegaParsec.Token , makeTokenParser ) where -import Data.Char (isAlpha, toLower, toUpper, isSpace, digitToInt) +import Data.Char (isAlpha, toLower, toUpper, isSpace) import Data.List (nub, sort) import Control.Monad (void) @@ -99,7 +99,7 @@ data LanguageDef s u m = data TokenParser s u m = TokenParser { - -- | This lexeme parser parses a legal identifier. Returns the identifier + -- | The lexeme parser parses a legal identifier. Returns the identifier -- string. This parser will fail on identifiers that are reserved -- words. Legal identifier (start) characters and reserved words are -- defined in the 'LanguageDef' that is passed to 'makeTokenParser'. An @@ -114,7 +114,7 @@ data TokenParser s u m = , reserved :: String -> ParsecT s u m () - -- | This lexeme parser parses a legal operator. Returns the name of the + -- | The lexeme parser parses a legal operator. Returns the name of the -- operator. This parser will fail on any operators that are reserved -- operators. Legal operator (start) characters and reserved operators -- are defined in the 'LanguageDef' that is passed to @@ -129,7 +129,7 @@ data TokenParser s u m = , reservedOp :: String -> ParsecT s u m () - -- | This lexeme parser parses a single literal character. Returns the + -- | The lexeme parser parses a single literal character. Returns the -- literal character value. This parsers deals correctly with escape -- sequences. The literal character is parsed according to the grammar -- rules defined in the Haskell report (which matches most programming @@ -137,7 +137,7 @@ data TokenParser s u m = , charLiteral :: ParsecT s u m Char - -- | This lexeme parser parses a literal string. Returns the literal + -- | The lexeme parser parses a literal string. Returns the literal -- string value. This parsers deals correctly with escape sequences and -- gaps. The literal string is parsed according to the grammar rules -- defined in the Haskell report (which matches most programming @@ -145,51 +145,62 @@ data TokenParser s u m = , stringLiteral :: ParsecT s u m String - -- | This lexeme parser parses a natural number (a positive whole - -- number). Returns the value of the number. The number can be specified - -- in 'decimal', 'hexadecimal' or 'octal'. The number is parsed - -- according to the grammar rules in the Haskell report. - - , natural :: ParsecT s u m Integer - - -- | This lexeme parser parses an integer (a whole number). This parser - -- is like 'natural' except that it can be prefixed with sign - -- (i.e. \'-\' or \'+\'). Returns the value of the number. The number + -- | The lexeme parser parses an integer (a whole number). This parser + -- /does not/ parse sign. Returns the value of the number. The number -- can be specified in 'decimal', 'hexadecimal' or 'octal'. The number -- is parsed according to the grammar rules in the Haskell report. , integer :: ParsecT s u m Integer - -- | This lexeme parser parses a floating point value. Returns the value - -- of the number. The number is parsed according to the grammar rules - -- defined in the Haskell report. + -- | This is just like 'integer', except it can parse sign. - , float :: ParsecT s u m Double + , integer' :: ParsecT s u m Integer - -- | This lexeme parser parses either 'natural' or a 'float'. - -- Returns the value of the number. This parsers deals with any overlap - -- in the grammar rules for naturals and floats. The number is parsed - -- according to the grammar rules defined in the Haskell report. - - , naturalOrFloat :: ParsecT s u m (Either Integer Double) - - -- | Parses a positive whole number in the decimal system. Returns the - -- value of the number. + -- | The lexeme parses a positive whole number in the decimal system. + -- Returns the value of the number. , decimal :: ParsecT s u m Integer - -- | Parses a positive whole number in the hexadecimal system. The number - -- should be prefixed with \"0x\" or \"0X\". Returns the value of the - -- number. + -- | The lexeme parses a positive whole number in the hexadecimal + -- system. The number should be prefixed with \"0x\" or \"0X\". Returns + -- the value of the number. , hexadecimal :: ParsecT s u m Integer - -- | Parses a positive whole number in the octal system. The number - -- should be prefixed with \"0o\" or \"0O\". Returns the value of the - -- number. + -- | The lexeme parses a positive whole number in the octal system. + -- The number should be prefixed with \"0o\" or \"0O\". Returns the + -- value of the number. , octal :: ParsecT s u m Integer + -- | @signed p@ tries to parse sign (i.e. \'+\', \'-\', or nothing) and + -- then runs parser @p@, changing sign of its result accordingly. Note + -- that there may be white space after the sign but not before it. + + , signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a + + -- | The lexeme parser parses a floating point value. Returns the value + -- of the number. The number is parsed according to the grammar rules + -- defined in the Haskell report, sign is /not/ parsed, use 'signed' to + -- achieve parsing of signed floating point values. + + , float :: ParsecT s u m Double + + -- | This is just like 'float', except it can parse sign. + + , float' :: ParsecT s u m Double + + -- | The lexeme parser parses either 'integer' or a 'float'. + -- Returns the value of the number. This parser deals with any overlap + -- in the grammar rules for integers and floats. The number is parsed + -- according to the grammar rules defined in the Haskell report. + + , number :: ParsecT s u m (Either Integer Double) + + -- | This is just like 'number', except it can parse sign. + + , number' :: ParsecT s u m (Either Integer Double) + -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips -- trailing white space. @@ -310,37 +321,41 @@ data TokenParser s u m = makeTokenParser :: Stream s m Char => LanguageDef s u m -> TokenParser s u m makeTokenParser languageDef = TokenParser - { identifier = identifier - , reserved = reserved - , operator = operator - , reservedOp = reservedOp + { identifier = identifier + , reserved = reserved + , operator = operator + , reservedOp = reservedOp - , charLiteral = charLiteral - , stringLiteral = stringLiteral - , natural = natural - , integer = integer - , float = float - , naturalOrFloat = naturalOrFloat - , decimal = decimal - , hexadecimal = hexadecimal - , octal = octal + , charLiteral = charLiteral + , stringLiteral = stringLiteral - , symbol = symbol - , lexeme = lexeme - , whiteSpace = whiteSpace + , integer = integer + , integer' = integer' + , decimal = decimal + , hexadecimal = hexadecimal + , octal = octal + , signed = signed + , float = float + , float' = float' + , number = number + , number' = number' - , parens = parens - , braces = braces - , angles = angles - , brackets = brackets - , semi = semi - , comma = comma - , colon = colon - , dot = dot - , semiSep = semiSep - , semiSep1 = semiSep1 - , commaSep = commaSep - , commaSep1 = commaSep1 } + , symbol = symbol + , lexeme = lexeme + , whiteSpace = whiteSpace + + , parens = parens + , braces = braces + , angles = angles + , brackets = brackets + , semi = semi + , comma = comma + , colon = colon + , dot = dot + , semiSep = semiSep + , semiSep1 = semiSep1 + , commaSep = commaSep + , commaSep1 = commaSep1 } where -- bracketing @@ -395,19 +410,19 @@ makeTokenParser languageDef = escapeCode = charEsc <|> charNum <|> charAscii <|> charControl "escape code" - charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper) - - charNum = toEnum . fromInteger <$> - ( decimal <|> - (char 'o' >> number 8 octDigit) <|> - (char 'x' >> number 16 hexDigit) ) - charEsc = choice (parseEsc <$> escMap) where parseEsc (c, code) = char c >> return code + charNum = toEnum . fromInteger <$> + ( decimal <|> + (char 'o' >> nump "0o" octDigit) <|> + (char 'x' >> nump "0x" hexDigit) ) + charAscii = choice (parseAscii <$> asciiMap) where parseAscii (asc, code) = try (string asc >> return code) + charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper) + -- escape code tables escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" @@ -422,72 +437,53 @@ makeTokenParser languageDef = ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" - -- numbers + -- numbers — integers - naturalOrFloat = lexeme natFloat "number" + integer = decimal "unsigned integer" + integer' = signed integer "integer" - float = lexeme floating "float" - integer = lexeme int "integer" - natural = lexeme nat "natural" + decimal = lexeme $ nump "" digit + hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigit + octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigit - -- floats + nump prefix baseDigit = read . (prefix ++) <$> many1 baseDigit - floating = decimal >>= fractExponent + signed p = ($) <$> option id (lexeme sign) <*> p - natFloat = (char '0' >> zeroNumFloat) <|> decimalFloat + sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a) + sign = (char '+' *> return id) <|> (char '-' *> return negate) - zeroNumFloat = (Left <$> (hexadecimal <|> octal)) <|> - decimalFloat <|> - fractFloat 0 <|> - return (Left 0) + -- numbers — floats - decimalFloat = decimal >>= \n -> option (Left n) (fractFloat n) + float = lexeme ffloat "unsigned float" + float' = signed float "float" - fractFloat n = Right <$> fractExponent n + ffloat = read <$> (ffir <|> fsec) - fractExponent n = - do { fract <- fraction - ; expo <- option 1.0 exponent' - ; return $ (n' + fract) * expo - } <|> ((* n') <$> exponent') - where n' = fromInteger n + ffir = do + decimal1 <- fDec + void $ char '.' + decimal2 <- fDec + exponent <- option "" fExp + return $ decimal1 ++ "." ++ decimal2 ++ exponent - fraction = - do { void $ char '.' - ; digits <- many1 digit "fraction" - ; return $ foldr op 0.0 digits - } "fraction" - where op d f = (f + fromIntegral (digitToInt d)) / 10.0 + fsec = do + decimal <- fDec + exponent <- fExp + return $ decimal ++ exponent - exponent' = - do { void $ oneOf "eE" - ; f <- sign - ; e <- decimal "exponent" - ; return $ power (f e) - } "exponent" - where power e - | e < 0 = 1.0 / power (-e) - | otherwise = fromInteger (10^e) + fDec = many1 digit - -- integers and naturals + fExp = do + expChar <- oneOf "eE" + signStr <- option "" (pure <$> oneOf "+-") + decimal <- fDec + return $ expChar : signStr ++ decimal - int = ($) <$> lexeme sign <*> nat + -- numbers — a more general case - sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id - - nat = zeroNumber <|> decimal - - zeroNumber = (char '0' >> (hexadecimal <|> octal <|> decimal <|> return 0)) - "" - - decimal = number 10 digit - hexadecimal = oneOf "xX" >> number 16 hexDigit - octal = oneOf "oO" >> number 8 octDigit - - number base baseDigit = do - digits <- many1 baseDigit - let n = foldl (\x d -> base * x + toInteger (digitToInt d)) 0 digits - n `seq` return n + number = (Right <$> try float) <|> (Left <$> integer) "unsigned number" + number' = (Right <$> try float') <|> (Left <$> integer') "number" -- operators & reserved ops diff --git a/test/Bugs.hs b/test/Bugs.hs index a66a731..b3c82c0 100644 --- a/test/Bugs.hs +++ b/test/Bugs.hs @@ -6,8 +6,12 @@ import Test.Framework import qualified Bugs.Bug2 import qualified Bugs.Bug6 import qualified Bugs.Bug9 +import qualified Bugs.Bug35 +import qualified Bugs.Bug39 bugs :: [Test] bugs = [ Bugs.Bug2.main , Bugs.Bug6.main - , Bugs.Bug9.main ] + , Bugs.Bug9.main + , Bugs.Bug35.main + , Bugs.Bug39.main ] diff --git a/test/Bugs/Bug35.hs b/test/Bugs/Bug35.hs new file mode 100644 index 0000000..93033e0 --- /dev/null +++ b/test/Bugs/Bug35.hs @@ -0,0 +1,39 @@ + +module Bugs.Bug35 (main) where + +import Text.MegaParsec +import Text.MegaParsec.Language +import Text.MegaParsec.String +import qualified Text.MegaParsec.Token as Token + +import Test.HUnit hiding (Test) +import Test.Framework +import Test.Framework.Providers.HUnit + +trickyFloats :: [String] +trickyFloats = + [ "1.5339794352098402e-118" + , "2.108934760892056e-59" + , "2.250634744599241e-19" + , "5.0e-324" + , "5.960464477539063e-8" + , "0.25996181067141905" + , "0.3572019862807257" + , "0.46817723004874223" + , "0.9640035681058178" + , "4.23808622486133" + , "4.540362294799751" + , "5.212384849884261" + , "13.958257048123212" + , "32.96176575630599" + , "38.47735512322269" ] + +float :: Parser Double +float = Token.float (Token.makeTokenParser emptyDef) + +testBatch :: Assertion +testBatch = mapM_ testFloat trickyFloats + where testFloat x = parse float "" x @?= Right (read x :: Double) + +main :: Test +main = testCase "Quality of output of Text.Parsec.Token.float (#35)" testBatch diff --git a/test/Bugs/Bug39.hs b/test/Bugs/Bug39.hs new file mode 100644 index 0000000..151a80b --- /dev/null +++ b/test/Bugs/Bug39.hs @@ -0,0 +1,34 @@ + +module Bugs.Bug39 (main) where + +import Data.Either (isLeft, isRight) + +import Text.MegaParsec +import Text.MegaParsec.Language +import Text.MegaParsec.String +import qualified Text.MegaParsec.Token as Token + +import Test.HUnit hiding (Test) +import Test.Framework +import Test.Framework.Providers.HUnit + +shouldFail :: [String] +shouldFail = [" 1", " +1", " -1"] + +shouldSucceed :: [String] +shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "] + +integer :: Parser Integer +integer = Token.integer' (Token.makeTokenParser emptyDef) + +testBatch :: Assertion +testBatch = mapM_ (f testFail) shouldFail >> + mapM_ (f testSucceed) shouldSucceed + where f t a = t (parse integer "" a) a + testFail x a = assertBool + ("Should fail on " ++ show a) (isLeft x) + testSucceed x a = assertBool + ("Should succeed on " ++ show a) (isRight x) + +main :: Test +main = testCase "TokenParser should fail on leading whitespace (#39)" testBatch