Merge pull request #227 from gren-lang/fix-parser-for-non-ascii-variable-names

Fix parser for non ascii variable names
This commit is contained in:
Robin Heggelund Hansen 2023-09-21 14:06:41 +02:00 committed by GitHub
commit 58ba88c002
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 63 additions and 23 deletions

View File

@ -223,8 +223,8 @@ fromTypeVariable name@(Utf8.Utf8 ba#) index =
then name
else
let len# = sizeofByteArray# ba#
end# = indexWord8Array# ba# (len# -# 1#)
in if isTrue# (leWord8# (wordToWord8# 0x30##) end#) && isTrue# (leWord8# end# (wordToWord8# 0x39##))
end# = word8ToWord# (indexWord8Array# ba# (len# -# 1#))
in if isTrue# (leWord# 0x30## end#) && isTrue# (leWord# end# 0x39##)
then
runST
( do

View File

@ -121,11 +121,11 @@ startsWithChar isGood bytes@(Utf8 ba#) =
if isEmpty bytes
then False
else
let !w# = indexWord8Array# ba# 0#
let !w# = word8ToWord# (indexWord8Array# ba# 0#)
!char
| isTrue# (ltWord8# w# (wordToWord8# 0xC0##)) = C# (chr# (word8ToInt# w#))
| isTrue# (ltWord8# w# (wordToWord8# 0xE0##)) = chr2 ba# 0# w#
| isTrue# (ltWord8# w# (wordToWord8# 0xF0##)) = chr3 ba# 0# w#
| isTrue# (ltWord# w# 0xC0##) = C# (chr# (word2Int# w#))
| isTrue# (ltWord# w# 0xE0##) = chr2 ba# 0# w#
| isTrue# (ltWord# w# 0xF0##) = chr3 ba# 0# w#
| True = chr4 ba# 0# w#
in isGood char
@ -247,22 +247,22 @@ writeChars !mba !offset chars =
char : chars
| n < 0x80 ->
do
writeWord8 mba (offset) (fromIntegral n)
writeWord8 mba offset (fromIntegral n)
writeChars mba (offset + 1) chars
| n < 0x800 ->
do
writeWord8 mba (offset) (fromIntegral ((shiftR n 6) + 0xC0))
writeWord8 mba offset (fromIntegral ((shiftR n 6) + 0xC0))
writeWord8 mba (offset + 1) (fromIntegral ((n .&. 0x3F) + 0x80))
writeChars mba (offset + 2) chars
| n < 0x10000 ->
do
writeWord8 mba (offset) (fromIntegral ((shiftR n 12) + 0xE0))
writeWord8 mba offset (fromIntegral ((shiftR n 12) + 0xE0))
writeWord8 mba (offset + 1) (fromIntegral ((shiftR n 6 .&. 0x3F) + 0x80))
writeWord8 mba (offset + 2) (fromIntegral ((n .&. 0x3F) + 0x80))
writeChars mba (offset + 3) chars
| otherwise ->
do
writeWord8 mba (offset) (fromIntegral ((shiftR n 18) + 0xF0))
writeWord8 mba offset (fromIntegral ((shiftR n 18) + 0xF0))
writeWord8 mba (offset + 1) (fromIntegral ((shiftR n 12 .&. 0x3F) + 0x80))
writeWord8 mba (offset + 2) (fromIntegral ((shiftR n 6 .&. 0x3F) + 0x80))
writeWord8 mba (offset + 3) (fromIntegral ((n .&. 0x3F) + 0x80))
@ -290,27 +290,27 @@ toCharsHelp ba# offset# len# =
if isTrue# (offset# >=# len#)
then []
else
let !w# = indexWord8Array# ba# offset#
let !w# = word8ToWord# (indexWord8Array# ba# offset#)
!(# char, width# #)
| isTrue# (ltWord8# w# (wordToWord8# 0xC0##)) = (# C# (chr# (word8ToInt# w#)), 1# #)
| isTrue# (ltWord8# w# (wordToWord8# 0xE0##)) = (# chr2 ba# offset# w#, 2# #)
| isTrue# (ltWord8# w# (wordToWord8# 0xF0##)) = (# chr3 ba# offset# w#, 3# #)
| isTrue# (ltWord# w# 0xC0##) = (# C# (chr# (word2Int# w#)), 1# #)
| isTrue# (ltWord# w# 0xE0##) = (# chr2 ba# offset# w#, 2# #)
| isTrue# (ltWord# w# 0xF0##) = (# chr3 ba# offset# w#, 3# #)
| True = (# chr4 ba# offset# w#, 4# #)
!newOffset# = offset# +# width#
in char : toCharsHelp ba# newOffset# len#
chr2 :: ByteArray# -> Int# -> Word8# -> Char
chr2 :: ByteArray# -> Int# -> Word# -> Char
chr2 ba# offset# firstWord# =
let !i1# = word8ToInt# firstWord#
let !i1# = word2Int# firstWord#
!i2# = word8ToInt# (indexWord8Array# ba# (offset# +# 1#))
!c1# = uncheckedIShiftL# (i1# -# 0xC0#) 6#
!c2# = i2# -# 0x80#
in C# (chr# (c1# +# c2#))
chr3 :: ByteArray# -> Int# -> Word8# -> Char
chr3 :: ByteArray# -> Int# -> Word# -> Char
chr3 ba# offset# firstWord# =
let !i1# = word8ToInt# firstWord#
let !i1# = word2Int# firstWord#
!i2# = word8ToInt# (indexWord8Array# ba# (offset# +# 1#))
!i3# = word8ToInt# (indexWord8Array# ba# (offset# +# 2#))
!c1# = uncheckedIShiftL# (i1# -# 0xE0#) 12#
@ -318,9 +318,9 @@ chr3 ba# offset# firstWord# =
!c3# = i3# -# 0x80#
in C# (chr# (c1# +# c2# +# c3#))
chr4 :: ByteArray# -> Int# -> Word8# -> Char
chr4 :: ByteArray# -> Int# -> Word# -> Char
chr4 ba# offset# firstWord# =
let !i1# = word8ToInt# firstWord#
let !i1# = word2Int# firstWord#
!i2# = word8ToInt# (indexWord8Array# ba# (offset# +# 1#))
!i3# = word8ToInt# (indexWord8Array# ba# (offset# +# 2#))
!i4# = word8ToInt# (indexWord8Array# ba# (offset# +# 3#))
@ -332,7 +332,7 @@ chr4 ba# offset# firstWord# =
word8ToInt# :: Word8# -> Int#
word8ToInt# word8 =
int8ToInt# (word8ToInt8# word8)
word2Int# (word8ToWord# word8)
-- TO TEXT

View File

@ -39,6 +39,7 @@ data Decl
| Alias (Maybe Src.DocComment) (A.Located Src.Alias)
| Port (Maybe Src.DocComment) Src.Port
| TopLevelComments (NonEmpty Src.Comment)
deriving (Show)
declaration :: Space.Parser E.Decl (Decl, [Src.Comment])
declaration =

View File

@ -27,7 +27,7 @@ import Data.Name qualified as Name
import Data.Set qualified as Set
import Data.Word (Word8)
import Foreign.Ptr (Ptr, plusPtr)
import GHC.Exts (Char (C#), Int#, chr#, int8ToInt#, uncheckedIShiftL#, word8ToInt8#, (+#), (-#))
import GHC.Exts (Char (C#), Int#, chr#, uncheckedIShiftL#, word2Int#, word8ToWord#, (+#), (-#))
import GHC.Word (Word8 (W8#))
import Parse.Primitives (Col, Parser, Row, unsafeIndex)
import Parse.Primitives qualified as P
@ -301,4 +301,4 @@ chr4 pos firstWord =
unpack :: Word8 -> Int#
unpack (W8# word#) =
int8ToInt# (word8ToInt8# word#)
word2Int# (word8ToWord# word#)

View File

@ -263,6 +263,7 @@ Test-Suite gren-tests
Parse.SpaceSpec
Parse.UnderscorePatternSpec
Parse.MultilineStringSpec
Parse.DeclSpec
Build-Depends:
hspec >= 2.7.10 && < 3

38
tests/Parse/DeclSpec.hs Normal file
View File

@ -0,0 +1,38 @@
module Parse.DeclSpec where
import Data.ByteString.UTF8 qualified as Utf8
import Helpers.Instances ()
import Parse.Declaration (declaration)
import Parse.Primitives qualified as P
import Test.Hspec (Spec, describe, it, shouldSatisfy)
data ParseError
= DeclError P.Row P.Col
| OtherError String P.Row P.Col
deriving (Show, Eq)
spec :: Spec
spec = do
describe "Top Level Valeus" $ do
it "regression test" $
parse "test = 1"
it "Value names can contain non-ascii characters" $ do
parse "vålue = 1"
it "Value names can be only non-ascii characters" $ do
parse "æøå = 1"
parse :: String -> IO ()
parse str =
P.fromByteString
(P.specialize (\_ row col -> DeclError row col) declaration)
(OtherError "fromByteString failed")
(Utf8.fromString str)
`shouldSatisfy` valid
valid :: Either x y -> Bool
valid result =
case result of
Right _ -> True
Left _ -> False