Standardize and fix floating-point parsing.

The existence of #1705 showed me that it's time to remove the bandaid of a fix that was `normalizeFloatString` in #1537. This patch introduces a proper Attoparsec parser for Scientific values that handles the vaguaries of cross-language floating-point syntax. We already depended on Attoparsec indirectly, so adding it as an explicit dependency is fine. A unit test is included, with examples taken from the tree-sitter corpora.
2024-11-28 01:47:01 +03:00 · 2018-04-03 17:02:56 -04:00 · 2018-04-03 17:02:56 -04:00 · c36256e943
commit c36256e943
parent 0e64794172
11 changed files with 163 additions and 50 deletions
--- a/semantic.cabal
+++ b/semantic.cabal
@ -81,6 +81,7 @@ library
                     , Data.Range
                     , Data.Record
                     , Data.Semigroup.App
+                     , Data.Scientific.Exts
                     , Data.Source
                     , Data.Span
                     , Data.SplitDiff
@ -151,6 +152,7 @@ library
                     , ansi-terminal
                     , array
                     , async
+                     , attoparsec
                     , bifunctors
                     , bytestring
                     , cmark-gfm
@ -233,6 +235,7 @@ test-suite test
                     , Data.Functor.Classes.Generic.Spec
                     , Data.Functor.Listable
                     , Data.Mergeable.Spec
+                     , Data.Scientific.Spec
                     , Data.Source.Spec
                     , Data.Term.Spec
                     , Diffing.Algorithm.RWS.Spec
--- a/src/Data/Scientific/Exts.hs
+++ b/src/Data/Scientific/Exts.hs
@ -0,0 +1,98 @@
+module Data.Scientific.Exts
+    ( module Data.Scientific
+    , parseScientific
+    , parser
+    ) where
+
+import Prelude hiding (filter, null, takeWhile)
+
+import Control.Applicative
+import Control.Monad
+import Data.Attoparsec.ByteString.Char8
+import Data.ByteString.Char8 hiding (readInt, takeWhile)
+import Data.Char (isOctDigit)
+import Data.Scientific
+import Data.Semigroup
+import Numeric
+import Text.Read (readMaybe)
+
+parseScientific :: ByteString -> Either String Scientific
+parseScientific = parseOnly parser
+
+-- | This is a very flexible and forgiving parser for Scientific values.
+-- Unlike 'scientificP' or Scientific's 'Read' instance, this handles the myriad
+-- array of floating-point syntaxes across languages:
+-- * omitted whole parts, e.g. @.5@
+-- * omitted decimal parts, e.g. @5.@
+-- * numbers with trailing imaginary/length specifiers, @1.7j, 20L@
+-- * numeric parts, in whole or decimal or exponent parts, with @_@ characters
+-- * hexadecimal, octal, and binary literals (TypeScript needs this because all numbers are floats)
+-- You may either omit the whole or the leading part, not both; this parser also rejects the empty string.
+-- It does /not/ handle hexadecimal floating-point numbers yet, as no language we parse supports them.
+-- This will need to be changed when we support Java.
+-- Please note there are extant parser bugs where complex literals (e.g. @123j@) are parsed
+-- as floating-point rather than complex quantities. This parser discards all suffixes.
+-- This parser is unit-tested in Data.Scientific.Spec.
+
+parser :: Parser Scientific
+parser = signed (choice [hex, oct, bin, dec]) where
+  -- The ending stanza. Note the explicit endOfInput call to ensure we haven't left any dangling input.
+  done = skipWhile (inClass "iIjJlL") *> endOfInput
+
+  -- Wrapper around readMaybe. Analogous to maybeFail in the Prologue, but no need to pull that in.
+  attempt :: Read a => String -> Parser a
+  attempt str = maybe (fail ("No parse: " <> str)) pure (readMaybe str)
+
+  -- Parse a hex value, leaning on the parser provided by Attoparsec.
+  hex = fromIntegral <$> (string "0x" *> hexadecimal @Integer)
+
+  -- Here's where things start getting icky. We lean on Haskell's octal integer support, parsing
+  -- the given string as an integer then coercing it to a Scientific.
+  oct = do
+    void (char '0' <* optional (char 'o'))
+    digs <- takeWhile1 isOctDigit <* done
+    fromIntegral <$> attempt @Integer (unpack ("0o" <> digs))
+
+  -- This is where it starts getting really horrible. Despite having binary literal support, Integer's
+  -- Read instance does not handle binary literals. So we have to shell out to Numeric.readInt, which is
+  -- in all respects a miserable excuse for an API, and apply a ReadS manually. This sucks so much.
+  -- The use of 'error' looks partial, but I really promise you it isn't.
+  bin = do
+    void (string "0b")
+    let isBin = inClass "01"
+    digs <- unpack <$> (takeWhile1 isBin <* done)
+    let c2b c = case c of
+          '0' -> 0
+          '1' -> 1
+          x   -> error ("Invariant violated: both Attoparsec and readInt let a bad digit through: " <> [x])
+    let res = readInt 2 isBin c2b digs
+    case res of
+      []        -> fail ("No parse of binary literal: " <> digs)
+      [(x, "")] -> pure x
+      others    -> fail ("Too many parses of binary literal: " <> show others)
+
+  -- Compared to the binary parser, this is positively breezy.
+  dec = do
+    let notUnder = filter (/= '_')
+    let decOrUnder c = isDigit c || (c == '_')
+
+    -- Try getting the whole part of a floating literal.
+    leadings <- notUnder <$> takeWhile decOrUnder
+
+    -- Try reading a dot.
+    void (optional (char '.'))
+
+    -- The trailing part...
+    trailings <- notUnder <$> takeWhile decOrUnder
+
+    -- ...and the exponent.
+    exponent  <- notUnder <$> takeWhile (inClass "eE_0123456789+-")
+
+    -- Ensure we don't read an empty string, or one consisting only of a dot and/or an exponent.
+    when (null trailings && null leadings) (fail "Does not accept a single dot")
+
+    -- Replace empty parts with a zero.
+    let leads = if null leadings then "0" else leadings
+    let trail = if null trailings then "0" else trailings
+
+    attempt (unpack (leads <> "." <> trail <> exponent))
--- a/src/Data/Syntax/Literal.hs
+++ b/src/Data/Syntax/Literal.hs
@ -5,8 +5,7 @@ import Control.Arrow ((>>>))
 import Data.Abstract.Evaluatable
 import Data.ByteString.Char8 (readInteger, unpack)
 import qualified Data.ByteString.Char8 as B
-import Data.Monoid (Endo (..), appEndo)
-import Data.Scientific (Scientific)
+import Data.Scientific.Exts
 import Diffing.Algorithm
 import Prelude hiding (Float, fail, null)
 import Prologue hiding (Set, hash, null)
@ -58,45 +57,11 @@ instance Eq1 Data.Syntax.Literal.Float where liftEq = genericLiftEq
 instance Ord1 Data.Syntax.Literal.Float where liftCompare = genericLiftCompare
 instance Show1 Data.Syntax.Literal.Float where liftShowsPrec = genericLiftShowsPrec

-- | Ensures that numbers of the form '.52' are parsed correctly. Most languages need this.
-padWithLeadingZero :: ByteString -> ByteString
-padWithLeadingZero b
-  | fmap fst (B.uncons b) == Just '.' = B.cons '0' b
-  | otherwise                         = b
-
-- | As @padWithLeadingZero@, but on the end. Not all languages need this.
-padWithTrailingZero :: ByteString -> ByteString
-padWithTrailingZero b
-  | fmap snd (B.unsnoc b) == Just '.' = B.snoc b '0'
-  | otherwise                         = b
-
-- | Removes underscores in numeric literals. Python 3 and Ruby support this, whereas Python 2, JS, and Go do not.
-removeUnderscores :: ByteString -> ByteString
-removeUnderscores = B.filter (/= '_')
-
-- | Strip suffixes from floating-point literals so as to handle Python's
--   TODO: tree-sitter-python needs some love so that it parses j-suffixed floats as complexen
-dropAlphaSuffix :: ByteString -> ByteString
-dropAlphaSuffix = B.takeWhile (\x -> x `notElem` ("lLjJiI" :: Prelude.String))
-
-- | This is the shared function that munges a bytestring representation of a float
--   so that it can be parsed to a @Scientific@ later. It takes as its arguments a list of functions, which
--   will be some combination of the above 'ByteString -> ByteString' functions. This is meant
--   to be called from an @Assignment@, hence the @MonadFail@ constraint. Caveat: the list is
--   order-dependent; the rightmost function will be applied first.
-normalizeFloatString :: MonadFail m => [ByteString -> ByteString] -> ByteString -> m (Float a)
-normalizeFloatString preds val =
-  let munger = appEndo (foldMap Endo preds)
-  in case readMaybe @Scientific (unpack (munger val)) of
-    Nothing -> fail ("Invalid floating-point value: " <> show val)
-    Just _  -> pure (Float val)
-
 instance Evaluatable Data.Syntax.Literal.Float where
-  eval (Float s) = do
-    sci <- case readMaybe (unpack s) of
-      Just s  -> pure s
-      Nothing -> fail ("Bug: non-normalized float string: " <> show s)
-    float sci
+  eval (Float s) =
+    float =<< case parseScientific s of
+      Right num -> pure num
+      Left  err -> fail ("Parse error: " <> err)

 -- Rational literals e.g. `2/3r`
 newtype Rational a = Rational ByteString
--- a/src/Language/Go/Assignment.hs
+++ b/src/Language/Go/Assignment.hs
@ -228,7 +228,7 @@ fieldIdentifier :: Assignment
 fieldIdentifier = makeTerm <$> symbol FieldIdentifier <*> (Syntax.Identifier <$> (name <$> source))

 floatLiteral :: Assignment
-floatLiteral = makeTerm <$> symbol FloatLiteral <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.dropAlphaSuffix])
+floatLiteral = makeTerm <$> symbol FloatLiteral <*> (Literal.Float <$> source)

 identifier :: Assignment
 identifier =  makeTerm <$> (symbol Identifier <|> symbol Identifier') <*> (Syntax.Identifier <$> (name <$> source))
--- a/src/Language/JSON/Assignment.hs
+++ b/src/Language/JSON/Assignment.hs
@ -50,7 +50,7 @@ array :: Assignment
 array = makeTerm <$> symbol Array <*> children (Literal.Array <$> many jsonValue)

 number :: Assignment
-number = makeTerm <$> symbol Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
+number = makeTerm <$> symbol Number <*> (Literal.Float <$> source)

 string :: Assignment
 string = makeTerm <$> symbol String <*> (Literal.TextElement <$> source)
--- a/src/Language/PHP/Assignment.hs
+++ b/src/Language/PHP/Assignment.hs
@ -471,7 +471,7 @@ literal :: Assignment
 literal = integer <|> float <|> string

 float :: Assignment
-float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
+float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)

 integer :: Assignment
 integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)
--- a/src/Language/Python/Assignment.hs
+++ b/src/Language/Python/Assignment.hs
@ -365,11 +365,7 @@ concatenatedString :: Assignment
 concatenatedString = makeTerm <$> symbol ConcatenatedString <*> children (manyTerm string)

 float :: Assignment
-float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [ Literal.padWithLeadingZero
-                                                                               , Literal.padWithTrailingZero
-                                                                               , Literal.dropAlphaSuffix
-                                                                               , Literal.removeUnderscores
-                                                                               ])
+float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)

 integer :: Assignment
 integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)
--- a/src/Language/Ruby/Assignment.hs
+++ b/src/Language/Ruby/Assignment.hs
@ -168,7 +168,7 @@ literal =
  <|> makeTerm <$> token  Grammar.False    <*> pure Literal.false
  <|> makeTerm <$> token  Grammar.Nil      <*> pure Literal.Null
  <|> makeTerm <$> symbol Grammar.Integer  <*> (Literal.Integer <$> source)
-  <|> makeTerm <$> symbol Grammar.Float    <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.removeUnderscores])
+  <|> makeTerm <$> symbol Grammar.Float    <*> (Literal.Float <$> source)
  <|> makeTerm <$> symbol Grammar.Rational <*> (Literal.Rational <$> source)
  <|> makeTerm <$> symbol Grammar.Complex  <*> (Literal.Complex <$> source)
   -- TODO: Do we want to represent the difference between .. and ...
--- a/src/Language/TypeScript/Assignment.hs
+++ b/src/Language/TypeScript/Assignment.hs
@ -326,7 +326,7 @@ importAlias' :: Assignment
 importAlias' = makeTerm <$> symbol Grammar.ImportAlias <*> children (TypeScript.Syntax.ImportAlias <$> term identifier <*> term (identifier <|> nestedIdentifier))

 number :: Assignment
-number = makeTerm <$> symbol Grammar.Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
+number = makeTerm <$> symbol Grammar.Number <*> (Literal.Float <$> source)

 string :: Assignment
 string = makeTerm <$> symbol Grammar.String <*> (Literal.TextElement <$> source)
--- a/test/Data/Scientific/Spec.hs
+++ b/test/Data/Scientific/Spec.hs
@ -0,0 +1,49 @@
+module Data.Scientific.Spec where
+
+import Data.Scientific.Exts
+import Data.Either
+import SpecHelpers
+
+spec :: Spec
+spec = describe "Scientific parsing" $ do
+
+  let go cases = forM_ cases $ \(s, v) -> parseScientific s `shouldBe` Right v
+
+  -- TODO: hexadecimal floats, someday (0x1.999999999999ap-4)
+
+  it "should handle Python floats" $
+    go [ ("-.6_6", -0.66)
+       , ("+.1_1", 0.11)
+       , ("123.4123", 123.4123)
+       , ("123.123J", 123.123) -- TODO: handle complex values separately in the parser
+       , ("1_1.3_1", 11.31)
+       , ("1_1.", 11.0)
+       , ("99E+01", 99e1)
+       , ("1e+3_4j", 1e34)
+       , ("3.e14", 3e14)
+       , (".3e1_4", 0.3e14)
+       , ("1_0.l", 10) -- this and the subsequent ones don't actually seem to be valid syntax, we should fix this in tree-sitter
+       , (".3", 0.3)
+       , (".1l", 0.1)  -- omitting a leading 0 is deprecated in python 3, also note that the -l suffix is not valid in Python 3
+       ]
+
+  it "should handle Ruby floats" $
+    go [ ("1.234_5e1_0", 1.2345e10)
+       , ("1E30", 1e30)
+       , ("1.2i", 1.2)
+       , ("1.0e+6", 1.0e6)
+       , ("1.0e-6", 1.0e-6)
+       ]
+
+  it "should handle JS numbers, including multiple bases" $
+    go [ ("101", 101)
+       , ("3.14", 3.14)
+       , ("3.14e+1", 3.14e1)
+       , ("0x1ABCDEFabcdef", 470375954370031)
+       , ("0o7632157312", 1047060170)
+       , ("0b1010101001", 681)
+       ]
+
+  it "should not accept truly bad input" $ do
+    parseScientific "." `shouldSatisfy` isLeft
+    parseScientific ""  `shouldSatisfy` isLeft
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -9,6 +9,7 @@ import qualified Assigning.Assignment.Spec
 import qualified Data.Diff.Spec
 import qualified Data.Functor.Classes.Generic.Spec
 import qualified Data.Mergeable.Spec
+import qualified Data.Scientific.Spec
 import qualified Data.Source.Spec
 import qualified Data.Term.Spec
 import qualified Diffing.Algorithm.RWS.Spec
@ -37,6 +38,7 @@ main = hspec $ do
    describe "Data.Diff" Data.Diff.Spec.spec
    describe "Data.Functor.Classes.Generic" Data.Functor.Classes.Generic.Spec.spec
    describe "Data.Mergeable" Data.Mergeable.Spec.spec
+    describe "Data.Scientific" Data.Scientific.Spec.spec
    describe "Data.Source" Data.Source.Spec.spec
    describe "Data.Term" Data.Term.Spec.spec
    describe "Diffing.Algorithm.RWS" Diffing.Algorithm.RWS.Spec.spec