rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and hairy way. Added tests for #2 and #3 (in old Parsec project these are number 35 and 39 respectively). * Since Haskell report doesn't say anything about sign, I've made ‘integer’ and ‘float’ parse numbers without sign. * Removed ‘natural’ parser, it's equal to new ‘integer’ now. * Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too. * Added new combinator ‘signed’ to parse all sorts of signed numbers. * For the sake of convenience I've added ‘integer'’, ‘float'’, and ‘number'’ combinators that also can parse signed numbers out of box.
2024-11-24 03:52:07 +03:00 · 2015-07-31 17:30:38 +06:00 · 2015-07-31 17:30:38 +06:00 · c5bcdfb220
commit c5bcdfb220
parent b19dae4315
4 changed files with 195 additions and 122 deletions
--- a/Text/MegaParsec/Token.hs
+++ b/Text/MegaParsec/Token.hs
@ -20,7 +20,7 @@ module Text.MegaParsec.Token
    , makeTokenParser )
 where
-import Data.Char (isAlpha, toLower, toUpper, isSpace, digitToInt)
+import Data.Char (isAlpha, toLower, toUpper, isSpace)
 import Data.List (nub, sort)
 import Control.Monad (void)
@ -99,7 +99,7 @@ data LanguageDef s u m =
 data TokenParser s u m =
    TokenParser {
-    -- | This lexeme parser parses a legal identifier. Returns the identifier
+    -- | The lexeme parser parses a legal identifier. Returns the identifier
    -- string. This parser will fail on identifiers that are reserved
    -- words. Legal identifier (start) characters and reserved words are
    -- defined in the 'LanguageDef' that is passed to 'makeTokenParser'. An
@ -114,7 +114,7 @@ data TokenParser s u m =
    , reserved :: String -> ParsecT s u m ()
-    -- | This lexeme parser parses a legal operator. Returns the name of the
+    -- | The lexeme parser parses a legal operator. Returns the name of the
    -- operator. This parser will fail on any operators that are reserved
    -- operators. Legal operator (start) characters and reserved operators
    -- are defined in the 'LanguageDef' that is passed to
@ -129,7 +129,7 @@ data TokenParser s u m =
    , reservedOp :: String -> ParsecT s u m ()
-    -- | This lexeme parser parses a single literal character. Returns the
+    -- | The lexeme parser parses a single literal character. Returns the
    -- literal character value. This parsers deals correctly with escape
    -- sequences. The literal character is parsed according to the grammar
    -- rules defined in the Haskell report (which matches most programming
@ -137,7 +137,7 @@ data TokenParser s u m =
    , charLiteral :: ParsecT s u m Char
-    -- | This lexeme parser parses a literal string. Returns the literal
+    -- | The lexeme parser parses a literal string. Returns the literal
    -- string value. This parsers deals correctly with escape sequences and
    -- gaps. The literal string is parsed according to the grammar rules
    -- defined in the Haskell report (which matches most programming
@ -145,51 +145,62 @@ data TokenParser s u m =
    , stringLiteral :: ParsecT s u m String
-    -- | This lexeme parser parses a natural number (a positive whole
+    -- | The lexeme parser parses an integer (a whole number). This parser
-    -- number). Returns the value of the number. The number can be specified
+    -- /does not/ parse sign. Returns the value of the number. The number
    -- in 'decimal', 'hexadecimal' or 'octal'. The number is parsed
    -- according to the grammar rules in the Haskell report.
    , natural :: ParsecT s u m Integer
    -- | This lexeme parser parses an integer (a whole number). This parser
    -- is like 'natural' except that it can be prefixed with sign
    -- (i.e. \'-\' or \'+\'). Returns the value of the number. The number
    -- can be specified in 'decimal', 'hexadecimal' or 'octal'. The number
    -- is parsed according to the grammar rules in the Haskell report.
    , integer :: ParsecT s u m Integer
-    -- | This lexeme parser parses a floating point value. Returns the value
+    -- | This is just like 'integer', except it can parse sign.
    -- of the number. The number is parsed according to the grammar rules
    -- defined in the Haskell report.
-    , float :: ParsecT s u m Double
+    , integer' :: ParsecT s u m Integer
-    -- | This lexeme parser parses either 'natural' or a 'float'.
+    -- | The lexeme parses a positive whole number in the decimal system.
-    -- Returns the value of the number. This parsers deals with any overlap
+    -- Returns the value of the number.
    -- in the grammar rules for naturals and floats. The number is parsed
    -- according to the grammar rules defined in the Haskell report.
    , naturalOrFloat :: ParsecT s u m (Either Integer Double)
    -- | Parses a positive whole number in the decimal system. Returns the
    -- value of the number.
    , decimal :: ParsecT s u m Integer
-    -- | Parses a positive whole number in the hexadecimal system. The number
+    -- | The lexeme parses a positive whole number in the hexadecimal
-    -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
+    -- system. The number should be prefixed with \"0x\" or \"0X\". Returns
-    -- number.
+    -- the value of the number.
    , hexadecimal :: ParsecT s u m Integer
-    -- | Parses a positive whole number in the octal system. The number
+    -- | The lexeme parses a positive whole number in the octal system.
-    -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
+    -- The number should be prefixed with \"0o\" or \"0O\". Returns the
-    -- number.
+    -- value of the number.
    , octal :: ParsecT s u m Integer
    -- | @signed p@ tries to parse sign (i.e. \'+\', \'-\', or nothing) and
    -- then runs parser @p@, changing sign of its result accordingly. Note
    -- that there may be white space after the sign but not before it.
    , signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
    -- | The lexeme parser parses a floating point value. Returns the value
    -- of the number. The number is parsed according to the grammar rules
    -- defined in the Haskell report, sign is /not/ parsed, use 'signed' to
    -- achieve parsing of signed floating point values.
    , float :: ParsecT s u m Double
    -- | This is just like 'float', except it can parse sign.
    , float' :: ParsecT s u m Double
    -- | The lexeme parser parses either 'integer' or a 'float'.
    -- Returns the value of the number. This parser deals with any overlap
    -- in the grammar rules for integers and floats. The number is parsed
    -- according to the grammar rules defined in the Haskell report.
    , number :: ParsecT s u m (Either Integer Double)
    -- | This is just like 'number', except it can parse sign.
    , number' :: ParsecT s u m (Either Integer Double)
    -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
    -- trailing white space.
@ -310,37 +321,41 @@ data TokenParser s u m =
 makeTokenParser :: Stream s m Char => LanguageDef s u m -> TokenParser s u m
 makeTokenParser languageDef =
    TokenParser
-    { identifier     = identifier
+    { identifier    = identifier
-    , reserved       = reserved
+    , reserved      = reserved
-    , operator       = operator
+    , operator      = operator
-    , reservedOp     = reservedOp
+    , reservedOp    = reservedOp
-    , charLiteral    = charLiteral
+    , charLiteral   = charLiteral
-    , stringLiteral  = stringLiteral
+    , stringLiteral = stringLiteral
    , natural        = natural
    , integer        = integer
    , float          = float
    , naturalOrFloat = naturalOrFloat
    , decimal        = decimal
    , hexadecimal    = hexadecimal
    , octal          = octal
-    , symbol         = symbol
+    , integer       = integer
-    , lexeme         = lexeme
+    , integer'      = integer'
-    , whiteSpace     = whiteSpace
+    , decimal       = decimal
    , hexadecimal   = hexadecimal
    , octal         = octal
    , signed        = signed
    , float         = float
    , float'        = float'
    , number        = number
    , number'       = number'
-    , parens         = parens
+    , symbol        = symbol
-    , braces         = braces
+    , lexeme        = lexeme
-    , angles         = angles
+    , whiteSpace    = whiteSpace
-    , brackets       = brackets
+
-    , semi           = semi
+    , parens        = parens
-    , comma          = comma
+    , braces        = braces
-    , colon          = colon
+    , angles        = angles
-    , dot            = dot
+    , brackets      = brackets
-    , semiSep        = semiSep
+    , semi          = semi
-    , semiSep1       = semiSep1
+    , comma         = comma
-    , commaSep       = commaSep
+    , colon         = colon
-    , commaSep1      = commaSep1 }
+    , dot           = dot
    , semiSep       = semiSep
    , semiSep1      = semiSep1
    , commaSep      = commaSep
    , commaSep1     = commaSep1 }
    where
    -- bracketing
@ -395,19 +410,19 @@ makeTokenParser languageDef =
    escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
                 <?> "escape code"
    charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
    charNum = toEnum . fromInteger <$>
              ( decimal <|>
               (char 'o' >> number 8 octDigit) <|>
               (char 'x' >> number 16 hexDigit) )
    charEsc = choice (parseEsc <$> escMap)
        where parseEsc (c, code) = char c >> return code
    charNum = toEnum . fromInteger <$>
              ( decimal <|>
               (char 'o' >> nump "0o" octDigit) <|>
               (char 'x' >> nump "0x" hexDigit) )
    charAscii = choice (parseAscii <$> asciiMap)
        where parseAscii (asc, code) = try (string asc >> return code)
    charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
    -- escape code tables
    escMap      = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
@ -422,72 +437,53 @@ makeTokenParser languageDef =
    ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
    ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
-    -- numbers
+    -- numbers — integers
-    naturalOrFloat = lexeme natFloat <?> "number"
+    integer  = decimal        <?> "unsigned integer"
    integer' = signed integer <?> "integer"
-    float   = lexeme floating <?> "float"
+    decimal     = lexeme $ nump "" digit
-    integer = lexeme int      <?> "integer"
+    hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigit
-    natural = lexeme nat      <?> "natural"
+    octal       = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigit
-    -- floats
+    nump prefix baseDigit = read . (prefix ++) <$> many1 baseDigit
-    floating = decimal >>= fractExponent
+    signed p = ($) <$> option id (lexeme sign) <*> p
-    natFloat = (char '0' >> zeroNumFloat) <|> decimalFloat
+    sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
    sign = (char '+' *> return id) <|> (char '-' *> return negate)
-    zeroNumFloat = (Left <$> (hexadecimal <|> octal)) <|>
+    -- numbers — floats
                   decimalFloat <|>
                   fractFloat 0 <|>
                   return (Left 0)
-    decimalFloat = decimal >>= \n -> option (Left n) (fractFloat n)
+    float  = lexeme ffloat <?> "unsigned float"
    float' = signed float  <?> "float"
-    fractFloat n = Right <$> fractExponent n
+    ffloat = read <$> (ffir <|> fsec)
-    fractExponent n =
+    ffir = do
-        do { fract <- fraction
+      decimal1 <- fDec
-           ; expo  <- option 1.0 exponent'
+      void $ char '.'
-           ; return $ (n' + fract) * expo
+      decimal2 <- fDec
-           } <|> ((* n') <$> exponent')
+      exponent <- option "" fExp
-        where n' = fromInteger n
+      return $ decimal1 ++ "." ++ decimal2 ++ exponent
-    fraction =
+    fsec = do
-        do { void $ char '.'
+      decimal  <- fDec
-           ; digits <- many1 digit <?> "fraction"
+      exponent <- fExp
-           ; return $ foldr op 0.0 digits
+      return $ decimal ++ exponent
           } <?> "fraction"
        where op d f = (f + fromIntegral (digitToInt d)) / 10.0
-    exponent' =
+    fDec = many1 digit
        do { void $ oneOf "eE"
           ; f <- sign
           ; e <- decimal <?> "exponent"
           ; return $ power (f e)
           } <?> "exponent"
        where power e
                  | e < 0     = 1.0 / power (-e)
                  | otherwise = fromInteger (10^e)
-    -- integers and naturals
+    fExp = do
      expChar <- oneOf "eE"
      signStr <- option "" (pure <$> oneOf "+-")
      decimal <- fDec
      return $ expChar : signStr ++ decimal
-    int = ($) <$> lexeme sign <*> nat
+    -- numbers — a more general case
-    sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id
+    number  = (Right <$> try float) <|> (Left <$> integer) <?> "unsigned number"
-
+    number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
    nat = zeroNumber <|> decimal
    zeroNumber = (char '0' >> (hexadecimal <|> octal <|> decimal <|> return 0))
                 <?> ""
    decimal     = number 10 digit
    hexadecimal = oneOf "xX" >> number 16 hexDigit
    octal       = oneOf "oO" >> number 8  octDigit
    number base baseDigit = do
      digits <- many1 baseDigit
      let n = foldl (\x d -> base * x + toInteger (digitToInt d)) 0 digits
      n `seq` return n
    -- operators & reserved ops
--- a/test/Bugs.hs
+++ b/test/Bugs.hs
@ -6,8 +6,12 @@ import Test.Framework
 import qualified Bugs.Bug2
 import qualified Bugs.Bug6
 import qualified Bugs.Bug9
 import qualified Bugs.Bug35
 import qualified Bugs.Bug39
 bugs :: [Test]
 bugs = [ Bugs.Bug2.main
       , Bugs.Bug6.main
-       , Bugs.Bug9.main ]
+       , Bugs.Bug9.main
       , Bugs.Bug35.main
       , Bugs.Bug39.main ]
--- a/test/Bugs/Bug35.hs
+++ b/test/Bugs/Bug35.hs
@ -0,0 +1,39 @@
 module Bugs.Bug35 (main) where
 import Text.MegaParsec
 import Text.MegaParsec.Language
 import Text.MegaParsec.String
 import qualified Text.MegaParsec.Token as Token
 import Test.HUnit hiding (Test)
 import Test.Framework
 import Test.Framework.Providers.HUnit
 trickyFloats :: [String]
 trickyFloats =
    [ "1.5339794352098402e-118"
    , "2.108934760892056e-59"
    , "2.250634744599241e-19"
    , "5.0e-324"
    , "5.960464477539063e-8"
    , "0.25996181067141905"
    , "0.3572019862807257"
    , "0.46817723004874223"
    , "0.9640035681058178"
    , "4.23808622486133"
    , "4.540362294799751"
    , "5.212384849884261"
    , "13.958257048123212"
    , "32.96176575630599"
    , "38.47735512322269" ]
 float :: Parser Double
 float = Token.float (Token.makeTokenParser emptyDef)
 testBatch :: Assertion
 testBatch = mapM_ testFloat trickyFloats
    where testFloat x = parse float "" x @?= Right (read x :: Double)
 main :: Test
 main = testCase "Quality of output of Text.Parsec.Token.float (#35)" testBatch
--- a/test/Bugs/Bug39.hs
+++ b/test/Bugs/Bug39.hs
@ -0,0 +1,34 @@
 module Bugs.Bug39 (main) where
 import Data.Either (isLeft, isRight)
 import Text.MegaParsec
 import Text.MegaParsec.Language
 import Text.MegaParsec.String
 import qualified Text.MegaParsec.Token as Token
 import Test.HUnit hiding (Test)
 import Test.Framework
 import Test.Framework.Providers.HUnit
 shouldFail :: [String]
 shouldFail = [" 1", " +1", " -1"]
 shouldSucceed :: [String]
 shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
 integer :: Parser Integer
 integer = Token.integer' (Token.makeTokenParser emptyDef)
 testBatch :: Assertion
 testBatch = mapM_ (f testFail)    shouldFail >>
            mapM_ (f testSucceed) shouldSucceed
    where f           t a = t (parse integer "" a) a
          testFail    x a = assertBool
                            ("Should fail on " ++ show a) (isLeft x)
          testSucceed x a = assertBool
                            ("Should succeed on " ++ show a) (isRight x)
 main :: Test
 main = testCase "TokenParser should fail on leading whitespace (#39)" testBatch