mirror of
https://github.com/mrkkrp/megaparsec.git
synced 2024-11-24 03:52:07 +03:00
Changed how numbers are parsed because they were parsed in a naïf and hairy way. Added tests for #2 and #3 (in old Parsec project these are number 35 and 39 respectively). * Since Haskell report doesn't say anything about sign, I've made ‘integer’ and ‘float’ parse numbers without sign. * Removed ‘natural’ parser, it's equal to new ‘integer’ now. * Renamed ‘naturalOrFloat’ → ‘number’ — this doesn't parse sign too. * Added new combinator ‘signed’ to parse all sorts of signed numbers. * For the sake of convenience I've added ‘integer'’, ‘float'’, and ‘number'’ combinators that also can parse signed numbers out of box.
This commit is contained in:
parent
b19dae4315
commit
c5bcdfb220
@ -20,7 +20,7 @@ module Text.MegaParsec.Token
|
||||
, makeTokenParser )
|
||||
where
|
||||
|
||||
import Data.Char (isAlpha, toLower, toUpper, isSpace, digitToInt)
|
||||
import Data.Char (isAlpha, toLower, toUpper, isSpace)
|
||||
import Data.List (nub, sort)
|
||||
|
||||
import Control.Monad (void)
|
||||
@ -99,7 +99,7 @@ data LanguageDef s u m =
|
||||
data TokenParser s u m =
|
||||
TokenParser {
|
||||
|
||||
-- | This lexeme parser parses a legal identifier. Returns the identifier
|
||||
-- | The lexeme parser parses a legal identifier. Returns the identifier
|
||||
-- string. This parser will fail on identifiers that are reserved
|
||||
-- words. Legal identifier (start) characters and reserved words are
|
||||
-- defined in the 'LanguageDef' that is passed to 'makeTokenParser'. An
|
||||
@ -114,7 +114,7 @@ data TokenParser s u m =
|
||||
|
||||
, reserved :: String -> ParsecT s u m ()
|
||||
|
||||
-- | This lexeme parser parses a legal operator. Returns the name of the
|
||||
-- | The lexeme parser parses a legal operator. Returns the name of the
|
||||
-- operator. This parser will fail on any operators that are reserved
|
||||
-- operators. Legal operator (start) characters and reserved operators
|
||||
-- are defined in the 'LanguageDef' that is passed to
|
||||
@ -129,7 +129,7 @@ data TokenParser s u m =
|
||||
|
||||
, reservedOp :: String -> ParsecT s u m ()
|
||||
|
||||
-- | This lexeme parser parses a single literal character. Returns the
|
||||
-- | The lexeme parser parses a single literal character. Returns the
|
||||
-- literal character value. This parsers deals correctly with escape
|
||||
-- sequences. The literal character is parsed according to the grammar
|
||||
-- rules defined in the Haskell report (which matches most programming
|
||||
@ -137,7 +137,7 @@ data TokenParser s u m =
|
||||
|
||||
, charLiteral :: ParsecT s u m Char
|
||||
|
||||
-- | This lexeme parser parses a literal string. Returns the literal
|
||||
-- | The lexeme parser parses a literal string. Returns the literal
|
||||
-- string value. This parsers deals correctly with escape sequences and
|
||||
-- gaps. The literal string is parsed according to the grammar rules
|
||||
-- defined in the Haskell report (which matches most programming
|
||||
@ -145,51 +145,62 @@ data TokenParser s u m =
|
||||
|
||||
, stringLiteral :: ParsecT s u m String
|
||||
|
||||
-- | This lexeme parser parses a natural number (a positive whole
|
||||
-- number). Returns the value of the number. The number can be specified
|
||||
-- in 'decimal', 'hexadecimal' or 'octal'. The number is parsed
|
||||
-- according to the grammar rules in the Haskell report.
|
||||
|
||||
, natural :: ParsecT s u m Integer
|
||||
|
||||
-- | This lexeme parser parses an integer (a whole number). This parser
|
||||
-- is like 'natural' except that it can be prefixed with sign
|
||||
-- (i.e. \'-\' or \'+\'). Returns the value of the number. The number
|
||||
-- | The lexeme parser parses an integer (a whole number). This parser
|
||||
-- /does not/ parse sign. Returns the value of the number. The number
|
||||
-- can be specified in 'decimal', 'hexadecimal' or 'octal'. The number
|
||||
-- is parsed according to the grammar rules in the Haskell report.
|
||||
|
||||
, integer :: ParsecT s u m Integer
|
||||
|
||||
-- | This lexeme parser parses a floating point value. Returns the value
|
||||
-- of the number. The number is parsed according to the grammar rules
|
||||
-- defined in the Haskell report.
|
||||
-- | This is just like 'integer', except it can parse sign.
|
||||
|
||||
, float :: ParsecT s u m Double
|
||||
, integer' :: ParsecT s u m Integer
|
||||
|
||||
-- | This lexeme parser parses either 'natural' or a 'float'.
|
||||
-- Returns the value of the number. This parsers deals with any overlap
|
||||
-- in the grammar rules for naturals and floats. The number is parsed
|
||||
-- according to the grammar rules defined in the Haskell report.
|
||||
|
||||
, naturalOrFloat :: ParsecT s u m (Either Integer Double)
|
||||
|
||||
-- | Parses a positive whole number in the decimal system. Returns the
|
||||
-- value of the number.
|
||||
-- | The lexeme parses a positive whole number in the decimal system.
|
||||
-- Returns the value of the number.
|
||||
|
||||
, decimal :: ParsecT s u m Integer
|
||||
|
||||
-- | Parses a positive whole number in the hexadecimal system. The number
|
||||
-- should be prefixed with \"0x\" or \"0X\". Returns the value of the
|
||||
-- number.
|
||||
-- | The lexeme parses a positive whole number in the hexadecimal
|
||||
-- system. The number should be prefixed with \"0x\" or \"0X\". Returns
|
||||
-- the value of the number.
|
||||
|
||||
, hexadecimal :: ParsecT s u m Integer
|
||||
|
||||
-- | Parses a positive whole number in the octal system. The number
|
||||
-- should be prefixed with \"0o\" or \"0O\". Returns the value of the
|
||||
-- number.
|
||||
-- | The lexeme parses a positive whole number in the octal system.
|
||||
-- The number should be prefixed with \"0o\" or \"0O\". Returns the
|
||||
-- value of the number.
|
||||
|
||||
, octal :: ParsecT s u m Integer
|
||||
|
||||
-- | @signed p@ tries to parse sign (i.e. \'+\', \'-\', or nothing) and
|
||||
-- then runs parser @p@, changing sign of its result accordingly. Note
|
||||
-- that there may be white space after the sign but not before it.
|
||||
|
||||
, signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
|
||||
|
||||
-- | The lexeme parser parses a floating point value. Returns the value
|
||||
-- of the number. The number is parsed according to the grammar rules
|
||||
-- defined in the Haskell report, sign is /not/ parsed, use 'signed' to
|
||||
-- achieve parsing of signed floating point values.
|
||||
|
||||
, float :: ParsecT s u m Double
|
||||
|
||||
-- | This is just like 'float', except it can parse sign.
|
||||
|
||||
, float' :: ParsecT s u m Double
|
||||
|
||||
-- | The lexeme parser parses either 'integer' or a 'float'.
|
||||
-- Returns the value of the number. This parser deals with any overlap
|
||||
-- in the grammar rules for integers and floats. The number is parsed
|
||||
-- according to the grammar rules defined in the Haskell report.
|
||||
|
||||
, number :: ParsecT s u m (Either Integer Double)
|
||||
|
||||
-- | This is just like 'number', except it can parse sign.
|
||||
|
||||
, number' :: ParsecT s u m (Either Integer Double)
|
||||
|
||||
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
|
||||
-- trailing white space.
|
||||
|
||||
@ -310,37 +321,41 @@ data TokenParser s u m =
|
||||
makeTokenParser :: Stream s m Char => LanguageDef s u m -> TokenParser s u m
|
||||
makeTokenParser languageDef =
|
||||
TokenParser
|
||||
{ identifier = identifier
|
||||
, reserved = reserved
|
||||
, operator = operator
|
||||
, reservedOp = reservedOp
|
||||
{ identifier = identifier
|
||||
, reserved = reserved
|
||||
, operator = operator
|
||||
, reservedOp = reservedOp
|
||||
|
||||
, charLiteral = charLiteral
|
||||
, stringLiteral = stringLiteral
|
||||
, natural = natural
|
||||
, integer = integer
|
||||
, float = float
|
||||
, naturalOrFloat = naturalOrFloat
|
||||
, decimal = decimal
|
||||
, hexadecimal = hexadecimal
|
||||
, octal = octal
|
||||
, charLiteral = charLiteral
|
||||
, stringLiteral = stringLiteral
|
||||
|
||||
, symbol = symbol
|
||||
, lexeme = lexeme
|
||||
, whiteSpace = whiteSpace
|
||||
, integer = integer
|
||||
, integer' = integer'
|
||||
, decimal = decimal
|
||||
, hexadecimal = hexadecimal
|
||||
, octal = octal
|
||||
, signed = signed
|
||||
, float = float
|
||||
, float' = float'
|
||||
, number = number
|
||||
, number' = number'
|
||||
|
||||
, parens = parens
|
||||
, braces = braces
|
||||
, angles = angles
|
||||
, brackets = brackets
|
||||
, semi = semi
|
||||
, comma = comma
|
||||
, colon = colon
|
||||
, dot = dot
|
||||
, semiSep = semiSep
|
||||
, semiSep1 = semiSep1
|
||||
, commaSep = commaSep
|
||||
, commaSep1 = commaSep1 }
|
||||
, symbol = symbol
|
||||
, lexeme = lexeme
|
||||
, whiteSpace = whiteSpace
|
||||
|
||||
, parens = parens
|
||||
, braces = braces
|
||||
, angles = angles
|
||||
, brackets = brackets
|
||||
, semi = semi
|
||||
, comma = comma
|
||||
, colon = colon
|
||||
, dot = dot
|
||||
, semiSep = semiSep
|
||||
, semiSep1 = semiSep1
|
||||
, commaSep = commaSep
|
||||
, commaSep1 = commaSep1 }
|
||||
where
|
||||
|
||||
-- bracketing
|
||||
@ -395,19 +410,19 @@ makeTokenParser languageDef =
|
||||
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
|
||||
<?> "escape code"
|
||||
|
||||
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
|
||||
|
||||
charNum = toEnum . fromInteger <$>
|
||||
( decimal <|>
|
||||
(char 'o' >> number 8 octDigit) <|>
|
||||
(char 'x' >> number 16 hexDigit) )
|
||||
|
||||
charEsc = choice (parseEsc <$> escMap)
|
||||
where parseEsc (c, code) = char c >> return code
|
||||
|
||||
charNum = toEnum . fromInteger <$>
|
||||
( decimal <|>
|
||||
(char 'o' >> nump "0o" octDigit) <|>
|
||||
(char 'x' >> nump "0x" hexDigit) )
|
||||
|
||||
charAscii = choice (parseAscii <$> asciiMap)
|
||||
where parseAscii (asc, code) = try (string asc >> return code)
|
||||
|
||||
charControl = toEnum . subtract 64 . fromEnum <$> (char '^' >> upper)
|
||||
|
||||
-- escape code tables
|
||||
|
||||
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
|
||||
@ -422,72 +437,53 @@ makeTokenParser languageDef =
|
||||
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
|
||||
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
|
||||
|
||||
-- numbers
|
||||
-- numbers — integers
|
||||
|
||||
naturalOrFloat = lexeme natFloat <?> "number"
|
||||
integer = decimal <?> "unsigned integer"
|
||||
integer' = signed integer <?> "integer"
|
||||
|
||||
float = lexeme floating <?> "float"
|
||||
integer = lexeme int <?> "integer"
|
||||
natural = lexeme nat <?> "natural"
|
||||
decimal = lexeme $ nump "" digit
|
||||
hexadecimal = lexeme $ char '0' >> oneOf "xX" >> nump "0x" hexDigit
|
||||
octal = lexeme $ char '0' >> oneOf "oO" >> nump "0o" octDigit
|
||||
|
||||
-- floats
|
||||
nump prefix baseDigit = read . (prefix ++) <$> many1 baseDigit
|
||||
|
||||
floating = decimal >>= fractExponent
|
||||
signed p = ($) <$> option id (lexeme sign) <*> p
|
||||
|
||||
natFloat = (char '0' >> zeroNumFloat) <|> decimalFloat
|
||||
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
|
||||
sign = (char '+' *> return id) <|> (char '-' *> return negate)
|
||||
|
||||
zeroNumFloat = (Left <$> (hexadecimal <|> octal)) <|>
|
||||
decimalFloat <|>
|
||||
fractFloat 0 <|>
|
||||
return (Left 0)
|
||||
-- numbers — floats
|
||||
|
||||
decimalFloat = decimal >>= \n -> option (Left n) (fractFloat n)
|
||||
float = lexeme ffloat <?> "unsigned float"
|
||||
float' = signed float <?> "float"
|
||||
|
||||
fractFloat n = Right <$> fractExponent n
|
||||
ffloat = read <$> (ffir <|> fsec)
|
||||
|
||||
fractExponent n =
|
||||
do { fract <- fraction
|
||||
; expo <- option 1.0 exponent'
|
||||
; return $ (n' + fract) * expo
|
||||
} <|> ((* n') <$> exponent')
|
||||
where n' = fromInteger n
|
||||
ffir = do
|
||||
decimal1 <- fDec
|
||||
void $ char '.'
|
||||
decimal2 <- fDec
|
||||
exponent <- option "" fExp
|
||||
return $ decimal1 ++ "." ++ decimal2 ++ exponent
|
||||
|
||||
fraction =
|
||||
do { void $ char '.'
|
||||
; digits <- many1 digit <?> "fraction"
|
||||
; return $ foldr op 0.0 digits
|
||||
} <?> "fraction"
|
||||
where op d f = (f + fromIntegral (digitToInt d)) / 10.0
|
||||
fsec = do
|
||||
decimal <- fDec
|
||||
exponent <- fExp
|
||||
return $ decimal ++ exponent
|
||||
|
||||
exponent' =
|
||||
do { void $ oneOf "eE"
|
||||
; f <- sign
|
||||
; e <- decimal <?> "exponent"
|
||||
; return $ power (f e)
|
||||
} <?> "exponent"
|
||||
where power e
|
||||
| e < 0 = 1.0 / power (-e)
|
||||
| otherwise = fromInteger (10^e)
|
||||
fDec = many1 digit
|
||||
|
||||
-- integers and naturals
|
||||
fExp = do
|
||||
expChar <- oneOf "eE"
|
||||
signStr <- option "" (pure <$> oneOf "+-")
|
||||
decimal <- fDec
|
||||
return $ expChar : signStr ++ decimal
|
||||
|
||||
int = ($) <$> lexeme sign <*> nat
|
||||
-- numbers — a more general case
|
||||
|
||||
sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id
|
||||
|
||||
nat = zeroNumber <|> decimal
|
||||
|
||||
zeroNumber = (char '0' >> (hexadecimal <|> octal <|> decimal <|> return 0))
|
||||
<?> ""
|
||||
|
||||
decimal = number 10 digit
|
||||
hexadecimal = oneOf "xX" >> number 16 hexDigit
|
||||
octal = oneOf "oO" >> number 8 octDigit
|
||||
|
||||
number base baseDigit = do
|
||||
digits <- many1 baseDigit
|
||||
let n = foldl (\x d -> base * x + toInteger (digitToInt d)) 0 digits
|
||||
n `seq` return n
|
||||
number = (Right <$> try float) <|> (Left <$> integer) <?> "unsigned number"
|
||||
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
|
||||
|
||||
-- operators & reserved ops
|
||||
|
||||
|
@ -6,8 +6,12 @@ import Test.Framework
|
||||
import qualified Bugs.Bug2
|
||||
import qualified Bugs.Bug6
|
||||
import qualified Bugs.Bug9
|
||||
import qualified Bugs.Bug35
|
||||
import qualified Bugs.Bug39
|
||||
|
||||
bugs :: [Test]
|
||||
bugs = [ Bugs.Bug2.main
|
||||
, Bugs.Bug6.main
|
||||
, Bugs.Bug9.main ]
|
||||
, Bugs.Bug9.main
|
||||
, Bugs.Bug35.main
|
||||
, Bugs.Bug39.main ]
|
||||
|
39
test/Bugs/Bug35.hs
Normal file
39
test/Bugs/Bug35.hs
Normal file
@ -0,0 +1,39 @@
|
||||
|
||||
module Bugs.Bug35 (main) where
|
||||
|
||||
import Text.MegaParsec
|
||||
import Text.MegaParsec.Language
|
||||
import Text.MegaParsec.String
|
||||
import qualified Text.MegaParsec.Token as Token
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
|
||||
trickyFloats :: [String]
|
||||
trickyFloats =
|
||||
[ "1.5339794352098402e-118"
|
||||
, "2.108934760892056e-59"
|
||||
, "2.250634744599241e-19"
|
||||
, "5.0e-324"
|
||||
, "5.960464477539063e-8"
|
||||
, "0.25996181067141905"
|
||||
, "0.3572019862807257"
|
||||
, "0.46817723004874223"
|
||||
, "0.9640035681058178"
|
||||
, "4.23808622486133"
|
||||
, "4.540362294799751"
|
||||
, "5.212384849884261"
|
||||
, "13.958257048123212"
|
||||
, "32.96176575630599"
|
||||
, "38.47735512322269" ]
|
||||
|
||||
float :: Parser Double
|
||||
float = Token.float (Token.makeTokenParser emptyDef)
|
||||
|
||||
testBatch :: Assertion
|
||||
testBatch = mapM_ testFloat trickyFloats
|
||||
where testFloat x = parse float "" x @?= Right (read x :: Double)
|
||||
|
||||
main :: Test
|
||||
main = testCase "Quality of output of Text.Parsec.Token.float (#35)" testBatch
|
34
test/Bugs/Bug39.hs
Normal file
34
test/Bugs/Bug39.hs
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
module Bugs.Bug39 (main) where
|
||||
|
||||
import Data.Either (isLeft, isRight)
|
||||
|
||||
import Text.MegaParsec
|
||||
import Text.MegaParsec.Language
|
||||
import Text.MegaParsec.String
|
||||
import qualified Text.MegaParsec.Token as Token
|
||||
|
||||
import Test.HUnit hiding (Test)
|
||||
import Test.Framework
|
||||
import Test.Framework.Providers.HUnit
|
||||
|
||||
shouldFail :: [String]
|
||||
shouldFail = [" 1", " +1", " -1"]
|
||||
|
||||
shouldSucceed :: [String]
|
||||
shouldSucceed = ["1", "+1", "-1", "+ 1 ", "- 1 ", "1 "]
|
||||
|
||||
integer :: Parser Integer
|
||||
integer = Token.integer' (Token.makeTokenParser emptyDef)
|
||||
|
||||
testBatch :: Assertion
|
||||
testBatch = mapM_ (f testFail) shouldFail >>
|
||||
mapM_ (f testSucceed) shouldSucceed
|
||||
where f t a = t (parse integer "" a) a
|
||||
testFail x a = assertBool
|
||||
("Should fail on " ++ show a) (isLeft x)
|
||||
testSucceed x a = assertBool
|
||||
("Should succeed on " ++ show a) (isRight x)
|
||||
|
||||
main :: Test
|
||||
main = testCase "TokenParser should fail on leading whitespace (#39)" testBatch
|
Loading…
Reference in New Issue
Block a user