mirror of
https://github.com/github/semantic.git
synced 2024-11-28 01:47:01 +03:00
Standardize and fix floating-point parsing.
The existence of #1705 showed me that it's time to remove the bandaid of a fix that was `normalizeFloatString` in #1537. This patch introduces a proper Attoparsec parser for Scientific values that handles the vaguaries of cross-language floating-point syntax. We already depended on Attoparsec indirectly, so adding it as an explicit dependency is fine. A unit test is included, with examples taken from the tree-sitter corpora.
This commit is contained in:
parent
0e64794172
commit
c36256e943
@ -81,6 +81,7 @@ library
|
||||
, Data.Range
|
||||
, Data.Record
|
||||
, Data.Semigroup.App
|
||||
, Data.Scientific.Exts
|
||||
, Data.Source
|
||||
, Data.Span
|
||||
, Data.SplitDiff
|
||||
@ -151,6 +152,7 @@ library
|
||||
, ansi-terminal
|
||||
, array
|
||||
, async
|
||||
, attoparsec
|
||||
, bifunctors
|
||||
, bytestring
|
||||
, cmark-gfm
|
||||
@ -233,6 +235,7 @@ test-suite test
|
||||
, Data.Functor.Classes.Generic.Spec
|
||||
, Data.Functor.Listable
|
||||
, Data.Mergeable.Spec
|
||||
, Data.Scientific.Spec
|
||||
, Data.Source.Spec
|
||||
, Data.Term.Spec
|
||||
, Diffing.Algorithm.RWS.Spec
|
||||
|
98
src/Data/Scientific/Exts.hs
Normal file
98
src/Data/Scientific/Exts.hs
Normal file
@ -0,0 +1,98 @@
|
||||
module Data.Scientific.Exts
|
||||
( module Data.Scientific
|
||||
, parseScientific
|
||||
, parser
|
||||
) where
|
||||
|
||||
import Prelude hiding (filter, null, takeWhile)
|
||||
|
||||
import Control.Applicative
|
||||
import Control.Monad
|
||||
import Data.Attoparsec.ByteString.Char8
|
||||
import Data.ByteString.Char8 hiding (readInt, takeWhile)
|
||||
import Data.Char (isOctDigit)
|
||||
import Data.Scientific
|
||||
import Data.Semigroup
|
||||
import Numeric
|
||||
import Text.Read (readMaybe)
|
||||
|
||||
parseScientific :: ByteString -> Either String Scientific
|
||||
parseScientific = parseOnly parser
|
||||
|
||||
-- | This is a very flexible and forgiving parser for Scientific values.
|
||||
-- Unlike 'scientificP' or Scientific's 'Read' instance, this handles the myriad
|
||||
-- array of floating-point syntaxes across languages:
|
||||
-- * omitted whole parts, e.g. @.5@
|
||||
-- * omitted decimal parts, e.g. @5.@
|
||||
-- * numbers with trailing imaginary/length specifiers, @1.7j, 20L@
|
||||
-- * numeric parts, in whole or decimal or exponent parts, with @_@ characters
|
||||
-- * hexadecimal, octal, and binary literals (TypeScript needs this because all numbers are floats)
|
||||
-- You may either omit the whole or the leading part, not both; this parser also rejects the empty string.
|
||||
-- It does /not/ handle hexadecimal floating-point numbers yet, as no language we parse supports them.
|
||||
-- This will need to be changed when we support Java.
|
||||
-- Please note there are extant parser bugs where complex literals (e.g. @123j@) are parsed
|
||||
-- as floating-point rather than complex quantities. This parser discards all suffixes.
|
||||
-- This parser is unit-tested in Data.Scientific.Spec.
|
||||
|
||||
parser :: Parser Scientific
|
||||
parser = signed (choice [hex, oct, bin, dec]) where
|
||||
-- The ending stanza. Note the explicit endOfInput call to ensure we haven't left any dangling input.
|
||||
done = skipWhile (inClass "iIjJlL") *> endOfInput
|
||||
|
||||
-- Wrapper around readMaybe. Analogous to maybeFail in the Prologue, but no need to pull that in.
|
||||
attempt :: Read a => String -> Parser a
|
||||
attempt str = maybe (fail ("No parse: " <> str)) pure (readMaybe str)
|
||||
|
||||
-- Parse a hex value, leaning on the parser provided by Attoparsec.
|
||||
hex = fromIntegral <$> (string "0x" *> hexadecimal @Integer)
|
||||
|
||||
-- Here's where things start getting icky. We lean on Haskell's octal integer support, parsing
|
||||
-- the given string as an integer then coercing it to a Scientific.
|
||||
oct = do
|
||||
void (char '0' <* optional (char 'o'))
|
||||
digs <- takeWhile1 isOctDigit <* done
|
||||
fromIntegral <$> attempt @Integer (unpack ("0o" <> digs))
|
||||
|
||||
-- This is where it starts getting really horrible. Despite having binary literal support, Integer's
|
||||
-- Read instance does not handle binary literals. So we have to shell out to Numeric.readInt, which is
|
||||
-- in all respects a miserable excuse for an API, and apply a ReadS manually. This sucks so much.
|
||||
-- The use of 'error' looks partial, but I really promise you it isn't.
|
||||
bin = do
|
||||
void (string "0b")
|
||||
let isBin = inClass "01"
|
||||
digs <- unpack <$> (takeWhile1 isBin <* done)
|
||||
let c2b c = case c of
|
||||
'0' -> 0
|
||||
'1' -> 1
|
||||
x -> error ("Invariant violated: both Attoparsec and readInt let a bad digit through: " <> [x])
|
||||
let res = readInt 2 isBin c2b digs
|
||||
case res of
|
||||
[] -> fail ("No parse of binary literal: " <> digs)
|
||||
[(x, "")] -> pure x
|
||||
others -> fail ("Too many parses of binary literal: " <> show others)
|
||||
|
||||
-- Compared to the binary parser, this is positively breezy.
|
||||
dec = do
|
||||
let notUnder = filter (/= '_')
|
||||
let decOrUnder c = isDigit c || (c == '_')
|
||||
|
||||
-- Try getting the whole part of a floating literal.
|
||||
leadings <- notUnder <$> takeWhile decOrUnder
|
||||
|
||||
-- Try reading a dot.
|
||||
void (optional (char '.'))
|
||||
|
||||
-- The trailing part...
|
||||
trailings <- notUnder <$> takeWhile decOrUnder
|
||||
|
||||
-- ...and the exponent.
|
||||
exponent <- notUnder <$> takeWhile (inClass "eE_0123456789+-")
|
||||
|
||||
-- Ensure we don't read an empty string, or one consisting only of a dot and/or an exponent.
|
||||
when (null trailings && null leadings) (fail "Does not accept a single dot")
|
||||
|
||||
-- Replace empty parts with a zero.
|
||||
let leads = if null leadings then "0" else leadings
|
||||
let trail = if null trailings then "0" else trailings
|
||||
|
||||
attempt (unpack (leads <> "." <> trail <> exponent))
|
@ -5,8 +5,7 @@ import Control.Arrow ((>>>))
|
||||
import Data.Abstract.Evaluatable
|
||||
import Data.ByteString.Char8 (readInteger, unpack)
|
||||
import qualified Data.ByteString.Char8 as B
|
||||
import Data.Monoid (Endo (..), appEndo)
|
||||
import Data.Scientific (Scientific)
|
||||
import Data.Scientific.Exts
|
||||
import Diffing.Algorithm
|
||||
import Prelude hiding (Float, fail, null)
|
||||
import Prologue hiding (Set, hash, null)
|
||||
@ -58,45 +57,11 @@ instance Eq1 Data.Syntax.Literal.Float where liftEq = genericLiftEq
|
||||
instance Ord1 Data.Syntax.Literal.Float where liftCompare = genericLiftCompare
|
||||
instance Show1 Data.Syntax.Literal.Float where liftShowsPrec = genericLiftShowsPrec
|
||||
|
||||
-- | Ensures that numbers of the form '.52' are parsed correctly. Most languages need this.
|
||||
padWithLeadingZero :: ByteString -> ByteString
|
||||
padWithLeadingZero b
|
||||
| fmap fst (B.uncons b) == Just '.' = B.cons '0' b
|
||||
| otherwise = b
|
||||
|
||||
-- | As @padWithLeadingZero@, but on the end. Not all languages need this.
|
||||
padWithTrailingZero :: ByteString -> ByteString
|
||||
padWithTrailingZero b
|
||||
| fmap snd (B.unsnoc b) == Just '.' = B.snoc b '0'
|
||||
| otherwise = b
|
||||
|
||||
-- | Removes underscores in numeric literals. Python 3 and Ruby support this, whereas Python 2, JS, and Go do not.
|
||||
removeUnderscores :: ByteString -> ByteString
|
||||
removeUnderscores = B.filter (/= '_')
|
||||
|
||||
-- | Strip suffixes from floating-point literals so as to handle Python's
|
||||
-- TODO: tree-sitter-python needs some love so that it parses j-suffixed floats as complexen
|
||||
dropAlphaSuffix :: ByteString -> ByteString
|
||||
dropAlphaSuffix = B.takeWhile (\x -> x `notElem` ("lLjJiI" :: Prelude.String))
|
||||
|
||||
-- | This is the shared function that munges a bytestring representation of a float
|
||||
-- so that it can be parsed to a @Scientific@ later. It takes as its arguments a list of functions, which
|
||||
-- will be some combination of the above 'ByteString -> ByteString' functions. This is meant
|
||||
-- to be called from an @Assignment@, hence the @MonadFail@ constraint. Caveat: the list is
|
||||
-- order-dependent; the rightmost function will be applied first.
|
||||
normalizeFloatString :: MonadFail m => [ByteString -> ByteString] -> ByteString -> m (Float a)
|
||||
normalizeFloatString preds val =
|
||||
let munger = appEndo (foldMap Endo preds)
|
||||
in case readMaybe @Scientific (unpack (munger val)) of
|
||||
Nothing -> fail ("Invalid floating-point value: " <> show val)
|
||||
Just _ -> pure (Float val)
|
||||
|
||||
instance Evaluatable Data.Syntax.Literal.Float where
|
||||
eval (Float s) = do
|
||||
sci <- case readMaybe (unpack s) of
|
||||
Just s -> pure s
|
||||
Nothing -> fail ("Bug: non-normalized float string: " <> show s)
|
||||
float sci
|
||||
eval (Float s) =
|
||||
float =<< case parseScientific s of
|
||||
Right num -> pure num
|
||||
Left err -> fail ("Parse error: " <> err)
|
||||
|
||||
-- Rational literals e.g. `2/3r`
|
||||
newtype Rational a = Rational ByteString
|
||||
|
@ -228,7 +228,7 @@ fieldIdentifier :: Assignment
|
||||
fieldIdentifier = makeTerm <$> symbol FieldIdentifier <*> (Syntax.Identifier <$> (name <$> source))
|
||||
|
||||
floatLiteral :: Assignment
|
||||
floatLiteral = makeTerm <$> symbol FloatLiteral <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.dropAlphaSuffix])
|
||||
floatLiteral = makeTerm <$> symbol FloatLiteral <*> (Literal.Float <$> source)
|
||||
|
||||
identifier :: Assignment
|
||||
identifier = makeTerm <$> (symbol Identifier <|> symbol Identifier') <*> (Syntax.Identifier <$> (name <$> source))
|
||||
|
@ -50,7 +50,7 @@ array :: Assignment
|
||||
array = makeTerm <$> symbol Array <*> children (Literal.Array <$> many jsonValue)
|
||||
|
||||
number :: Assignment
|
||||
number = makeTerm <$> symbol Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
|
||||
number = makeTerm <$> symbol Number <*> (Literal.Float <$> source)
|
||||
|
||||
string :: Assignment
|
||||
string = makeTerm <$> symbol String <*> (Literal.TextElement <$> source)
|
||||
|
@ -471,7 +471,7 @@ literal :: Assignment
|
||||
literal = integer <|> float <|> string
|
||||
|
||||
float :: Assignment
|
||||
float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
|
||||
float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)
|
||||
|
||||
integer :: Assignment
|
||||
integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)
|
||||
|
@ -365,11 +365,7 @@ concatenatedString :: Assignment
|
||||
concatenatedString = makeTerm <$> symbol ConcatenatedString <*> children (manyTerm string)
|
||||
|
||||
float :: Assignment
|
||||
float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [ Literal.padWithLeadingZero
|
||||
, Literal.padWithTrailingZero
|
||||
, Literal.dropAlphaSuffix
|
||||
, Literal.removeUnderscores
|
||||
])
|
||||
float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)
|
||||
|
||||
integer :: Assignment
|
||||
integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)
|
||||
|
@ -168,7 +168,7 @@ literal =
|
||||
<|> makeTerm <$> token Grammar.False <*> pure Literal.false
|
||||
<|> makeTerm <$> token Grammar.Nil <*> pure Literal.Null
|
||||
<|> makeTerm <$> symbol Grammar.Integer <*> (Literal.Integer <$> source)
|
||||
<|> makeTerm <$> symbol Grammar.Float <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.removeUnderscores])
|
||||
<|> makeTerm <$> symbol Grammar.Float <*> (Literal.Float <$> source)
|
||||
<|> makeTerm <$> symbol Grammar.Rational <*> (Literal.Rational <$> source)
|
||||
<|> makeTerm <$> symbol Grammar.Complex <*> (Literal.Complex <$> source)
|
||||
-- TODO: Do we want to represent the difference between .. and ...
|
||||
|
@ -326,7 +326,7 @@ importAlias' :: Assignment
|
||||
importAlias' = makeTerm <$> symbol Grammar.ImportAlias <*> children (TypeScript.Syntax.ImportAlias <$> term identifier <*> term (identifier <|> nestedIdentifier))
|
||||
|
||||
number :: Assignment
|
||||
number = makeTerm <$> symbol Grammar.Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
|
||||
number = makeTerm <$> symbol Grammar.Number <*> (Literal.Float <$> source)
|
||||
|
||||
string :: Assignment
|
||||
string = makeTerm <$> symbol Grammar.String <*> (Literal.TextElement <$> source)
|
||||
|
49
test/Data/Scientific/Spec.hs
Normal file
49
test/Data/Scientific/Spec.hs
Normal file
@ -0,0 +1,49 @@
|
||||
module Data.Scientific.Spec where
|
||||
|
||||
import Data.Scientific.Exts
|
||||
import Data.Either
|
||||
import SpecHelpers
|
||||
|
||||
spec :: Spec
|
||||
spec = describe "Scientific parsing" $ do
|
||||
|
||||
let go cases = forM_ cases $ \(s, v) -> parseScientific s `shouldBe` Right v
|
||||
|
||||
-- TODO: hexadecimal floats, someday (0x1.999999999999ap-4)
|
||||
|
||||
it "should handle Python floats" $
|
||||
go [ ("-.6_6", -0.66)
|
||||
, ("+.1_1", 0.11)
|
||||
, ("123.4123", 123.4123)
|
||||
, ("123.123J", 123.123) -- TODO: handle complex values separately in the parser
|
||||
, ("1_1.3_1", 11.31)
|
||||
, ("1_1.", 11.0)
|
||||
, ("99E+01", 99e1)
|
||||
, ("1e+3_4j", 1e34)
|
||||
, ("3.e14", 3e14)
|
||||
, (".3e1_4", 0.3e14)
|
||||
, ("1_0.l", 10) -- this and the subsequent ones don't actually seem to be valid syntax, we should fix this in tree-sitter
|
||||
, (".3", 0.3)
|
||||
, (".1l", 0.1) -- omitting a leading 0 is deprecated in python 3, also note that the -l suffix is not valid in Python 3
|
||||
]
|
||||
|
||||
it "should handle Ruby floats" $
|
||||
go [ ("1.234_5e1_0", 1.2345e10)
|
||||
, ("1E30", 1e30)
|
||||
, ("1.2i", 1.2)
|
||||
, ("1.0e+6", 1.0e6)
|
||||
, ("1.0e-6", 1.0e-6)
|
||||
]
|
||||
|
||||
it "should handle JS numbers, including multiple bases" $
|
||||
go [ ("101", 101)
|
||||
, ("3.14", 3.14)
|
||||
, ("3.14e+1", 3.14e1)
|
||||
, ("0x1ABCDEFabcdef", 470375954370031)
|
||||
, ("0o7632157312", 1047060170)
|
||||
, ("0b1010101001", 681)
|
||||
]
|
||||
|
||||
it "should not accept truly bad input" $ do
|
||||
parseScientific "." `shouldSatisfy` isLeft
|
||||
parseScientific "" `shouldSatisfy` isLeft
|
@ -9,6 +9,7 @@ import qualified Assigning.Assignment.Spec
|
||||
import qualified Data.Diff.Spec
|
||||
import qualified Data.Functor.Classes.Generic.Spec
|
||||
import qualified Data.Mergeable.Spec
|
||||
import qualified Data.Scientific.Spec
|
||||
import qualified Data.Source.Spec
|
||||
import qualified Data.Term.Spec
|
||||
import qualified Diffing.Algorithm.RWS.Spec
|
||||
@ -37,6 +38,7 @@ main = hspec $ do
|
||||
describe "Data.Diff" Data.Diff.Spec.spec
|
||||
describe "Data.Functor.Classes.Generic" Data.Functor.Classes.Generic.Spec.spec
|
||||
describe "Data.Mergeable" Data.Mergeable.Spec.spec
|
||||
describe "Data.Scientific" Data.Scientific.Spec.spec
|
||||
describe "Data.Source" Data.Source.Spec.spec
|
||||
describe "Data.Term" Data.Term.Spec.spec
|
||||
describe "Diffing.Algorithm.RWS" Diffing.Algorithm.RWS.Spec.spec
|
||||
|
Loading…
Reference in New Issue
Block a user