1
1
mirror of https://github.com/github/semantic.git synced 2024-11-28 01:47:01 +03:00

Standardize and fix floating-point parsing.

The existence of #1705 showed me that it's time to remove the bandaid
of a fix that was `normalizeFloatString` in #1537. This patch
introduces a proper Attoparsec parser for Scientific values that
handles the vaguaries of cross-language floating-point syntax. We
already depended on Attoparsec indirectly, so adding it as an explicit
dependency is fine.

A unit test is included, with examples taken from the tree-sitter corpora.
This commit is contained in:
Patrick Thomson 2018-04-03 17:02:56 -04:00
parent 0e64794172
commit c36256e943
11 changed files with 163 additions and 50 deletions

View File

@ -81,6 +81,7 @@ library
, Data.Range
, Data.Record
, Data.Semigroup.App
, Data.Scientific.Exts
, Data.Source
, Data.Span
, Data.SplitDiff
@ -151,6 +152,7 @@ library
, ansi-terminal
, array
, async
, attoparsec
, bifunctors
, bytestring
, cmark-gfm
@ -233,6 +235,7 @@ test-suite test
, Data.Functor.Classes.Generic.Spec
, Data.Functor.Listable
, Data.Mergeable.Spec
, Data.Scientific.Spec
, Data.Source.Spec
, Data.Term.Spec
, Diffing.Algorithm.RWS.Spec

View File

@ -0,0 +1,98 @@
module Data.Scientific.Exts
( module Data.Scientific
, parseScientific
, parser
) where
import Prelude hiding (filter, null, takeWhile)
import Control.Applicative
import Control.Monad
import Data.Attoparsec.ByteString.Char8
import Data.ByteString.Char8 hiding (readInt, takeWhile)
import Data.Char (isOctDigit)
import Data.Scientific
import Data.Semigroup
import Numeric
import Text.Read (readMaybe)
parseScientific :: ByteString -> Either String Scientific
parseScientific = parseOnly parser
-- | This is a very flexible and forgiving parser for Scientific values.
-- Unlike 'scientificP' or Scientific's 'Read' instance, this handles the myriad
-- array of floating-point syntaxes across languages:
-- * omitted whole parts, e.g. @.5@
-- * omitted decimal parts, e.g. @5.@
-- * numbers with trailing imaginary/length specifiers, @1.7j, 20L@
-- * numeric parts, in whole or decimal or exponent parts, with @_@ characters
-- * hexadecimal, octal, and binary literals (TypeScript needs this because all numbers are floats)
-- You may either omit the whole or the leading part, not both; this parser also rejects the empty string.
-- It does /not/ handle hexadecimal floating-point numbers yet, as no language we parse supports them.
-- This will need to be changed when we support Java.
-- Please note there are extant parser bugs where complex literals (e.g. @123j@) are parsed
-- as floating-point rather than complex quantities. This parser discards all suffixes.
-- This parser is unit-tested in Data.Scientific.Spec.
parser :: Parser Scientific
parser = signed (choice [hex, oct, bin, dec]) where
-- The ending stanza. Note the explicit endOfInput call to ensure we haven't left any dangling input.
done = skipWhile (inClass "iIjJlL") *> endOfInput
-- Wrapper around readMaybe. Analogous to maybeFail in the Prologue, but no need to pull that in.
attempt :: Read a => String -> Parser a
attempt str = maybe (fail ("No parse: " <> str)) pure (readMaybe str)
-- Parse a hex value, leaning on the parser provided by Attoparsec.
hex = fromIntegral <$> (string "0x" *> hexadecimal @Integer)
-- Here's where things start getting icky. We lean on Haskell's octal integer support, parsing
-- the given string as an integer then coercing it to a Scientific.
oct = do
void (char '0' <* optional (char 'o'))
digs <- takeWhile1 isOctDigit <* done
fromIntegral <$> attempt @Integer (unpack ("0o" <> digs))
-- This is where it starts getting really horrible. Despite having binary literal support, Integer's
-- Read instance does not handle binary literals. So we have to shell out to Numeric.readInt, which is
-- in all respects a miserable excuse for an API, and apply a ReadS manually. This sucks so much.
-- The use of 'error' looks partial, but I really promise you it isn't.
bin = do
void (string "0b")
let isBin = inClass "01"
digs <- unpack <$> (takeWhile1 isBin <* done)
let c2b c = case c of
'0' -> 0
'1' -> 1
x -> error ("Invariant violated: both Attoparsec and readInt let a bad digit through: " <> [x])
let res = readInt 2 isBin c2b digs
case res of
[] -> fail ("No parse of binary literal: " <> digs)
[(x, "")] -> pure x
others -> fail ("Too many parses of binary literal: " <> show others)
-- Compared to the binary parser, this is positively breezy.
dec = do
let notUnder = filter (/= '_')
let decOrUnder c = isDigit c || (c == '_')
-- Try getting the whole part of a floating literal.
leadings <- notUnder <$> takeWhile decOrUnder
-- Try reading a dot.
void (optional (char '.'))
-- The trailing part...
trailings <- notUnder <$> takeWhile decOrUnder
-- ...and the exponent.
exponent <- notUnder <$> takeWhile (inClass "eE_0123456789+-")
-- Ensure we don't read an empty string, or one consisting only of a dot and/or an exponent.
when (null trailings && null leadings) (fail "Does not accept a single dot")
-- Replace empty parts with a zero.
let leads = if null leadings then "0" else leadings
let trail = if null trailings then "0" else trailings
attempt (unpack (leads <> "." <> trail <> exponent))

View File

@ -5,8 +5,7 @@ import Control.Arrow ((>>>))
import Data.Abstract.Evaluatable
import Data.ByteString.Char8 (readInteger, unpack)
import qualified Data.ByteString.Char8 as B
import Data.Monoid (Endo (..), appEndo)
import Data.Scientific (Scientific)
import Data.Scientific.Exts
import Diffing.Algorithm
import Prelude hiding (Float, fail, null)
import Prologue hiding (Set, hash, null)
@ -58,45 +57,11 @@ instance Eq1 Data.Syntax.Literal.Float where liftEq = genericLiftEq
instance Ord1 Data.Syntax.Literal.Float where liftCompare = genericLiftCompare
instance Show1 Data.Syntax.Literal.Float where liftShowsPrec = genericLiftShowsPrec
-- | Ensures that numbers of the form '.52' are parsed correctly. Most languages need this.
padWithLeadingZero :: ByteString -> ByteString
padWithLeadingZero b
| fmap fst (B.uncons b) == Just '.' = B.cons '0' b
| otherwise = b
-- | As @padWithLeadingZero@, but on the end. Not all languages need this.
padWithTrailingZero :: ByteString -> ByteString
padWithTrailingZero b
| fmap snd (B.unsnoc b) == Just '.' = B.snoc b '0'
| otherwise = b
-- | Removes underscores in numeric literals. Python 3 and Ruby support this, whereas Python 2, JS, and Go do not.
removeUnderscores :: ByteString -> ByteString
removeUnderscores = B.filter (/= '_')
-- | Strip suffixes from floating-point literals so as to handle Python's
-- TODO: tree-sitter-python needs some love so that it parses j-suffixed floats as complexen
dropAlphaSuffix :: ByteString -> ByteString
dropAlphaSuffix = B.takeWhile (\x -> x `notElem` ("lLjJiI" :: Prelude.String))
-- | This is the shared function that munges a bytestring representation of a float
-- so that it can be parsed to a @Scientific@ later. It takes as its arguments a list of functions, which
-- will be some combination of the above 'ByteString -> ByteString' functions. This is meant
-- to be called from an @Assignment@, hence the @MonadFail@ constraint. Caveat: the list is
-- order-dependent; the rightmost function will be applied first.
normalizeFloatString :: MonadFail m => [ByteString -> ByteString] -> ByteString -> m (Float a)
normalizeFloatString preds val =
let munger = appEndo (foldMap Endo preds)
in case readMaybe @Scientific (unpack (munger val)) of
Nothing -> fail ("Invalid floating-point value: " <> show val)
Just _ -> pure (Float val)
instance Evaluatable Data.Syntax.Literal.Float where
eval (Float s) = do
sci <- case readMaybe (unpack s) of
Just s -> pure s
Nothing -> fail ("Bug: non-normalized float string: " <> show s)
float sci
eval (Float s) =
float =<< case parseScientific s of
Right num -> pure num
Left err -> fail ("Parse error: " <> err)
-- Rational literals e.g. `2/3r`
newtype Rational a = Rational ByteString

View File

@ -228,7 +228,7 @@ fieldIdentifier :: Assignment
fieldIdentifier = makeTerm <$> symbol FieldIdentifier <*> (Syntax.Identifier <$> (name <$> source))
floatLiteral :: Assignment
floatLiteral = makeTerm <$> symbol FloatLiteral <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.dropAlphaSuffix])
floatLiteral = makeTerm <$> symbol FloatLiteral <*> (Literal.Float <$> source)
identifier :: Assignment
identifier = makeTerm <$> (symbol Identifier <|> symbol Identifier') <*> (Syntax.Identifier <$> (name <$> source))

View File

@ -50,7 +50,7 @@ array :: Assignment
array = makeTerm <$> symbol Array <*> children (Literal.Array <$> many jsonValue)
number :: Assignment
number = makeTerm <$> symbol Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
number = makeTerm <$> symbol Number <*> (Literal.Float <$> source)
string :: Assignment
string = makeTerm <$> symbol String <*> (Literal.TextElement <$> source)

View File

@ -471,7 +471,7 @@ literal :: Assignment
literal = integer <|> float <|> string
float :: Assignment
float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)
integer :: Assignment
integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)

View File

@ -365,11 +365,7 @@ concatenatedString :: Assignment
concatenatedString = makeTerm <$> symbol ConcatenatedString <*> children (manyTerm string)
float :: Assignment
float = makeTerm <$> symbol Float <*> (source >>= Literal.normalizeFloatString [ Literal.padWithLeadingZero
, Literal.padWithTrailingZero
, Literal.dropAlphaSuffix
, Literal.removeUnderscores
])
float = makeTerm <$> symbol Float <*> (Literal.Float <$> source)
integer :: Assignment
integer = makeTerm <$> symbol Integer <*> (Literal.Integer <$> source)

View File

@ -168,7 +168,7 @@ literal =
<|> makeTerm <$> token Grammar.False <*> pure Literal.false
<|> makeTerm <$> token Grammar.Nil <*> pure Literal.Null
<|> makeTerm <$> symbol Grammar.Integer <*> (Literal.Integer <$> source)
<|> makeTerm <$> symbol Grammar.Float <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero, Literal.removeUnderscores])
<|> makeTerm <$> symbol Grammar.Float <*> (Literal.Float <$> source)
<|> makeTerm <$> symbol Grammar.Rational <*> (Literal.Rational <$> source)
<|> makeTerm <$> symbol Grammar.Complex <*> (Literal.Complex <$> source)
-- TODO: Do we want to represent the difference between .. and ...

View File

@ -326,7 +326,7 @@ importAlias' :: Assignment
importAlias' = makeTerm <$> symbol Grammar.ImportAlias <*> children (TypeScript.Syntax.ImportAlias <$> term identifier <*> term (identifier <|> nestedIdentifier))
number :: Assignment
number = makeTerm <$> symbol Grammar.Number <*> (source >>= Literal.normalizeFloatString [Literal.padWithLeadingZero])
number = makeTerm <$> symbol Grammar.Number <*> (Literal.Float <$> source)
string :: Assignment
string = makeTerm <$> symbol Grammar.String <*> (Literal.TextElement <$> source)

View File

@ -0,0 +1,49 @@
module Data.Scientific.Spec where
import Data.Scientific.Exts
import Data.Either
import SpecHelpers
spec :: Spec
spec = describe "Scientific parsing" $ do
let go cases = forM_ cases $ \(s, v) -> parseScientific s `shouldBe` Right v
-- TODO: hexadecimal floats, someday (0x1.999999999999ap-4)
it "should handle Python floats" $
go [ ("-.6_6", -0.66)
, ("+.1_1", 0.11)
, ("123.4123", 123.4123)
, ("123.123J", 123.123) -- TODO: handle complex values separately in the parser
, ("1_1.3_1", 11.31)
, ("1_1.", 11.0)
, ("99E+01", 99e1)
, ("1e+3_4j", 1e34)
, ("3.e14", 3e14)
, (".3e1_4", 0.3e14)
, ("1_0.l", 10) -- this and the subsequent ones don't actually seem to be valid syntax, we should fix this in tree-sitter
, (".3", 0.3)
, (".1l", 0.1) -- omitting a leading 0 is deprecated in python 3, also note that the -l suffix is not valid in Python 3
]
it "should handle Ruby floats" $
go [ ("1.234_5e1_0", 1.2345e10)
, ("1E30", 1e30)
, ("1.2i", 1.2)
, ("1.0e+6", 1.0e6)
, ("1.0e-6", 1.0e-6)
]
it "should handle JS numbers, including multiple bases" $
go [ ("101", 101)
, ("3.14", 3.14)
, ("3.14e+1", 3.14e1)
, ("0x1ABCDEFabcdef", 470375954370031)
, ("0o7632157312", 1047060170)
, ("0b1010101001", 681)
]
it "should not accept truly bad input" $ do
parseScientific "." `shouldSatisfy` isLeft
parseScientific "" `shouldSatisfy` isLeft

View File

@ -9,6 +9,7 @@ import qualified Assigning.Assignment.Spec
import qualified Data.Diff.Spec
import qualified Data.Functor.Classes.Generic.Spec
import qualified Data.Mergeable.Spec
import qualified Data.Scientific.Spec
import qualified Data.Source.Spec
import qualified Data.Term.Spec
import qualified Diffing.Algorithm.RWS.Spec
@ -37,6 +38,7 @@ main = hspec $ do
describe "Data.Diff" Data.Diff.Spec.spec
describe "Data.Functor.Classes.Generic" Data.Functor.Classes.Generic.Spec.spec
describe "Data.Mergeable" Data.Mergeable.Spec.spec
describe "Data.Scientific" Data.Scientific.Spec.spec
describe "Data.Source" Data.Source.Spec.spec
describe "Data.Term" Data.Term.Spec.spec
describe "Diffing.Algorithm.RWS" Diffing.Algorithm.RWS.Spec.spec