rewritten parsing of numbers, fixes #2 and #3

Changed how numbers are parsed because they were parsed in a naïf and
hairy way. Added tests for #2 and #3 (in old Parsec project these are
number 35 and 39 respectively).

* Since Haskell report doesn't say anything about sign, I've made
  ‘integer’ and ‘float’ parse numbers without sign.

* Removed ‘natural’ parser, it's equal to new ‘integer’ now.

* Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too.

* Added new combinator ‘signed’ to parse all sorts of signed numbers.

* For the sake of convenience I've added ‘integer'’, ‘float'’, and
 ‘number'’ combinators that also can parse signed numbers out of box.
This commit is contained in:
mrkkrp 2015-07-31 17:30:38 +06:00
parent b19dae4315
commit c5bcdfb220
4 changed files with 195 additions and 122 deletions

View File

@ -20,7 +20,7 @@ module Text.MegaParsec.Token
, makeTokenParser ) , makeTokenParser )
where where
import Data.Char (isAlpha, toLower, toUpper, isSpace, digitToInt) import Data.Char (isAlpha, toLower, toUpper, isSpace)
import Data.List (nub, sort) import Data.List (nub, sort)
import Control.Monad (void) import Control.Monad (void)
@ -99,7 +99,7 @@ data LanguageDef s u m =
data TokenParser s u m = data TokenParser s u m =
TokenParser { TokenParser {
-- | This lexeme parser parses a legal identifier. Returns the identifier -- | The lexeme parser parses a legal identifier. Returns the identifier
-- string. This parser will fail on identifiers that are reserved -- string. This parser will fail on identifiers that are reserved
-- words. Legal identifier (start) characters and reserved words are -- words. Legal identifier (start) characters and reserved words are
-- defined in the 'LanguageDef' that is passed to 'makeTokenParser'. An -- defined in the 'LanguageDef' that is passed to 'makeTokenParser'. An
@ -114,7 +114,7 @@ data TokenParser s u m =
, reserved :: String -> ParsecT s u m () , reserved :: String -> ParsecT s u m ()
-- | This lexeme parser parses a legal operator. Returns the name of the -- | The lexeme parser parses a legal operator. Returns the name of the
-- operator. This parser will fail on any operators that are reserved -- operator. This parser will fail on any operators that are reserved
-- operators. Legal operator (start) characters and reserved operators -- operators. Legal operator (start) characters and reserved operators
-- are defined in the 'LanguageDef' that is passed to -- are defined in the 'LanguageDef' that is passed to
@ -129,7 +129,7 @@ data TokenParser s u m =
, reservedOp :: String -> ParsecT s u m () , reservedOp :: String -> ParsecT s u m ()
-- | This lexeme parser parses a single literal character. Returns the -- | The lexeme parser parses a single literal character. Returns the
-- literal character value. This parsers deals correctly with escape -- literal character value. This parsers deals correctly with escape
-- sequences. The literal character is parsed according to the grammar -- sequences. The literal character is parsed according to the grammar
-- rules defined in the Haskell report (which matches most programming -- rules defined in the Haskell report (which matches most programming
@ -137,7 +137,7 @@ data TokenParser s u m =
, charLiteral :: ParsecT s u m Char , charLiteral :: ParsecT s u m Char
-- | This lexeme parser parses a literal string. Returns the literal -- | The lexeme parser parses a literal string. Returns the literal
-- string value. This parsers deals correctly with escape sequences and -- string value. This parsers deals correctly with escape sequences and
-- gaps. The literal string is parsed according to the grammar rules -- gaps. The literal string is parsed according to the grammar rules
-- defined in the Haskell report (which matches most programming -- defined in the Haskell report (which matches most programming
@ -145,51 +145,62 @@ data TokenParser s u m =
, stringLiteral :: ParsecT s u m String , stringLiteral :: ParsecT s u m String
-- | This lexeme parser parses a natural number (a positive whole -- | The lexeme parser parses an integer (a whole number). This parser
-- number). Returns the value of the number. The number can be specified -- /does not/ parse sign. Returns the value of the number. The number
-- in 'decimal', 'hexadecimal' or 'octal'. The number is parsed
-- according to the grammar rules in the Haskell report.
, natural :: ParsecT s u m Integer
-- | This lexeme parser parses an integer (a whole number). This parser
-- is like 'natural' except that it can be prefixed with sign
-- (i.e. \'-\' or \'+\'). Returns the value of the number. The number
-- can be specified in 'decimal', 'hexadecimal' or 'octal'. The number -- can be specified in 'decimal', 'hexadecimal' or 'octal'. The number
-- is parsed according to the grammar rules in the Haskell report. -- is parsed according to the grammar rules in the Haskell report.
, integer :: ParsecT s u m Integer , integer :: ParsecT s u m Integer
-- | This lexeme parser parses a floating point value. Returns the value -- | This is just like 'integer', except it can parse sign.
-- of the number. The number is parsed according to the grammar rules
-- defined in the Haskell report.
, float :: ParsecT s u m Double , integer' :: ParsecT s u m Integer
-- | This lexeme parser parses either 'natural' or a 'float'. -- | The lexeme parses a positive whole number in the decimal system.
-- Returns the value of the number. This parsers deals with any overlap -- Returns the value of the number.
-- in the grammar rules for naturals and floats. The number is parsed
-- according to the grammar rules defined in the Haskell report.
, naturalOrFloat :: ParsecT s u m (Either Integer Double)
-- | Parses a positive whole number in the decimal system. Returns the
-- value of the number.
, decimal :: ParsecT s u m Integer , decimal :: ParsecT s u m Integer
-- | Parses a positive whole number in the hexadecimal system. The number -- | The lexeme parses a positive whole number in the hexadecimal
-- should be prefixed with \"0x\" or \"0X\". Returns the value of the -- system. The number should be prefixed with \"0x\" or \"0X\". Returns
-- number. -- the value of the number.
, hexadecimal :: ParsecT s u m Integer , hexadecimal :: ParsecT s u m Integer
-- | Parses a positive whole number in the octal system. The number -- | The lexeme parses a positive whole number in the octal system.
-- should be prefixed with \"0o\" or \"0O\". Returns the value of the -- The number should be prefixed with \"0o\" or \"0O\". Returns the
-- number. -- value of the number.
, octal :: ParsecT s u m Integer , octal :: ParsecT s u m Integer
-- | @signed p@ tries to parse sign (i.e. \'+\', \'-\', or nothing) and
-- then runs parser @p@, changing sign of its result accordingly. Note
-- that there may be white space after the sign but not before it.
, signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
-- | The lexeme parser parses a floating point value. Returns the value
-- of the number. The number is parsed according to the grammar rules
-- defined in the Haskell report, sign is /not/ parsed, use 'signed' to
-- achieve parsing of signed floating point values.
, float :: ParsecT s u m Double
-- | This is just like 'float', except it can parse sign.
, float' :: ParsecT s u m Double
-- | The lexeme parser parses either 'integer' or a 'float'.
-- Returns the value of the number. This parser deals with any overlap
-- in the grammar rules for integers and floats. The number is parsed
-- according to the grammar rules defined in the Haskell report.
, number :: ParsecT s u m (Either Integer Double)
-- | This is just like 'number', except it can parse sign.
, number' :: ParsecT s u m (Either Integer Double)
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
-- trailing white space. -- trailing white space.
@ -310,37 +321,41 @@ data TokenParser s u m =
makeTokenParser :: Stream s m Char => LanguageDef s u m -> TokenParser s u m makeTokenParser :: Stream s m Char => LanguageDef s u m -> TokenParser s u m
makeTokenParser languageDef = makeTokenParser languageDef =
TokenParser TokenParser
{ identifier = identifier { identifier = identifier
, reserved = reserved , reserved = reserved
, operator = operator , operator = operator
, reservedOp = reservedOp , reservedOp = reservedOp
, charLiteral = charLiteral , charLiteral = charLiteral
, stringLiteral = stringLiteral , stringLiteral = stringLiteral
, natural = natural
, integer = integer
, float = float
, naturalOrFloat = naturalOrFloat
, decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, symbol = symbol , integer = integer
, lexeme = lexeme , integer' = integer'
, whiteSpace = whiteSpace , decimal = decimal
, hexadecimal = hexadecimal
, octal = octal
, signed = signed
, float = float
, float' = float'
, number = number
, number' = number'
, parens = parens , symbol = symbol
, braces = braces , lexeme = lexeme
, angles = angles , whiteSpace = whiteSpace
, brackets = brackets
, semi = semi , parens = parens
, comma = comma , braces = braces
, colon = colon , angles = angles
, dot = dot , brackets = brackets
, semiSep = semiSep , semi = semi
, semiSep1 = semiSep1 , comma = comma
, commaSep = commaSep , colon = colon
, commaSep1 = commaSep1 } , dot = dot
, semiSep = semiSep
, semiSep1 = semiSep1
, commaSep = commaSep
, commaSep1 = commaSep1 }
where where
-- bracketing -- bracketing
@ -395,19 +410,19 @@ makeTokenParser languageDef =
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
<?> "escape code" <?> "escape code"
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
charNum = toEnum . fromInteger <$>
( decimal <|>
(char 'o' >> number 8 octDigit) <|>
(char 'x' >> number 16 hexDigit) )
charEsc = choice (parseEsc <$> escMap) charEsc = choice (parseEsc <$> escMap)
where parseEsc (c, code) = char c >> return code where parseEsc (c, code) = char c >> return code
charNum = toEnum . fromInteger <$>
( decimal <|>
(char 'o' >> nump "0o" octDigit) <|>
(char 'x' >> nump "0x" hexDigit) )
charAscii = choice (parseAscii <$> asciiMap) charAscii = choice (parseAscii <$> asciiMap)
where parseAscii (asc, code) = try (string asc >> return code) where parseAscii (asc, code) = try (string asc >> return code)
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
-- escape code tables -- escape code tables
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
@ -422,72 +437,53 @@ makeTokenParser languageDef =
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
-- numbers -- numbers — integers
naturalOrFloat = lexeme natFloat <?> "number" integer = decimal <?> "unsigned integer"
integer' = signed integer <?> "integer"
float = lexeme floating <?> "float" decimal = lexeme $ nump "" digit
integer = lexeme int <?> "integer" hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigit
natural = lexeme nat <?> "natural" octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigit
-- floats nump prefix baseDigit = read . (prefix ++) <$> many1 baseDigit
floating = decimal >>= fractExponent signed p = ($) <$> option id (lexeme sign) <*> p
natFloat = (char '0' >> zeroNumFloat) <|> decimalFloat sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
sign = (char '+' *> return id) <|> (char '-' *> return negate)
zeroNumFloat = (Left <$> (hexadecimal <|> octal)) <|> -- numbers — floats
decimalFloat <|>
fractFloat 0 <|>
return (Left 0)
decimalFloat = decimal >>= \n -> option (Left n) (fractFloat n) float = lexeme ffloat <?> "unsigned float"
float' = signed float <?> "float"
fractFloat n = Right <$> fractExponent n ffloat = read <$> (ffir <|> fsec)
fractExponent n = ffir = do
do { fract <- fraction decimal1 <- fDec
; expo <- option 1.0 exponent' void $ char '.'
; return $ (n' + fract) * expo decimal2 <- fDec
} <|> ((* n') <$> exponent') exponent <- option "" fExp
where n' = fromInteger n return $ decimal1 ++ "." ++ decimal2 ++ exponent
fraction = fsec = do
do { void $ char '.' decimal <- fDec
; digits <- many1 digit <?> "fraction" exponent <- fExp
; return $ foldr op 0.0 digits return $ decimal ++ exponent
} <?> "fraction"
where op d f = (f + fromIntegral (digitToInt d)) / 10.0
exponent' = fDec = many1 digit
do { void $ oneOf "eE"
; f <- sign
; e <- decimal <?> "exponent"
; return $ power (f e)
} <?> "exponent"
where power e
| e < 0 = 1.0 / power (-e)
| otherwise = fromInteger (10^e)
-- integers and naturals fExp = do
expChar <- oneOf "eE"
signStr <- option "" (pure <$> oneOf "+-")
decimal <- fDec
return $ expChar : signStr ++ decimal
int = ($) <$> lexeme sign <*> nat -- numbers — a more general case
sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id number = (Right <$> try float) <|> (Left <$> integer) <?> "unsigned number"
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
nat = zeroNumber <|> decimal
zeroNumber = (char '0' >> (hexadecimal <|> octal <|> decimal <|> return 0))
<?> ""
decimal = number 10 digit
hexadecimal = oneOf "xX" >> number 16 hexDigit
octal = oneOf "oO" >> number 8 octDigit
number base baseDigit = do
digits <- many1 baseDigit
let n = foldl (\x d -> base * x + toInteger (digitToInt d)) 0 digits
n `seq` return n
-- operators & reserved ops -- operators & reserved ops

View File

@ -6,8 +6,12 @@ import Test.Framework
import qualified Bugs.Bug2 import qualified Bugs.Bug2
import qualified Bugs.Bug6 import qualified Bugs.Bug6
import qualified Bugs.Bug9 import qualified Bugs.Bug9
import qualified Bugs.Bug35
import qualified Bugs.Bug39
bugs :: [Test] bugs :: [Test]
bugs = [ Bugs.Bug2.main bugs = [ Bugs.Bug2.main
, Bugs.Bug6.main , Bugs.Bug6.main
, Bugs.Bug9.main ] , Bugs.Bug9.main
, Bugs.Bug35.main
, Bugs.Bug39.main ]

39
test/Bugs/Bug35.hs Normal file
View File

@ -0,0 +1,39 @@
module Bugs.Bug35 (main) where
import Text.MegaParsec
import Text.MegaParsec.Language
import Text.MegaParsec.String
import qualified Text.MegaParsec.Token as Token
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
trickyFloats :: [String]
trickyFloats =
[ "1.5339794352098402e-118"
, "2.108934760892056e-59"
, "2.250634744599241e-19"
, "5.0e-324"
, "5.960464477539063e-8"
, "0.25996181067141905"
, "0.3572019862807257"
, "0.46817723004874223"
, "0.9640035681058178"
, "4.23808622486133"
, "4.540362294799751"
, "5.212384849884261"
, "13.958257048123212"
, "32.96176575630599"
, "38.47735512322269" ]
float :: Parser Double
float = Token.float (Token.makeTokenParser emptyDef)
testBatch :: Assertion
testBatch = mapM_ testFloat trickyFloats
where testFloat x = parse float "" x @?= Right (read x :: Double)
main :: Test
main = testCase "Quality of output of Text.Parsec.Token.float (#35)" testBatch

34
test/Bugs/Bug39.hs Normal file
View File

@ -0,0 +1,34 @@
module Bugs.Bug39 (main) where
import Data.Either (isLeft, isRight)
import Text.MegaParsec
import Text.MegaParsec.Language
import Text.MegaParsec.String
import qualified Text.MegaParsec.Token as Token
import Test.HUnit hiding (Test)
import Test.Framework
import Test.Framework.Providers.HUnit
shouldFail :: [String]
shouldFail = [" 1", " +1", " -1"]
shouldSucceed :: [String]
shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
integer :: Parser Integer
integer = Token.integer' (Token.makeTokenParser emptyDef)
testBatch :: Assertion
testBatch = mapM_ (f testFail) shouldFail >>
mapM_ (f testSucceed) shouldSucceed
where f t a = t (parse integer "" a) a
testFail x a = assertBool
("Should fail on " ++ show a) (isLeft x)
testSucceed x a = assertBool
("Should succeed on " ++ show a) (isRight x)
main :: Test
main = testCase "TokenParser should fail on leading whitespace (#39)" testBatch