mirror of
https://github.com/composewell/streamly.git
synced 2024-09-11 19:47:10 +03:00
Implement UTF-16 encoding/decoding routines
- Add identity tests - Add benchmarks - Add Word16 to Word8 helpers
This commit is contained in:
parent
febaeaac09
commit
86e8accfb0
@ -29,6 +29,7 @@ import Streamly.Data.Stream (Stream)
|
|||||||
import Streamly.Data.Fold (Fold)
|
import Streamly.Data.Fold (Fold)
|
||||||
import Prelude hiding (last, length)
|
import Prelude hiding (last, length)
|
||||||
import System.IO (Handle)
|
import System.IO (Handle)
|
||||||
|
import Streamly.Internal.System.IO (arrayPayloadSize)
|
||||||
|
|
||||||
import qualified Streamly.Data.Array as Array
|
import qualified Streamly.Data.Array as Array
|
||||||
import qualified Streamly.Data.Fold as Fold
|
import qualified Streamly.Data.Fold as Fold
|
||||||
@ -37,6 +38,7 @@ import qualified Streamly.Internal.Data.Unfold as Unfold
|
|||||||
import qualified Streamly.Internal.FileSystem.Handle as Handle
|
import qualified Streamly.Internal.FileSystem.Handle as Handle
|
||||||
import qualified Streamly.Internal.Unicode.Array as UnicodeArr
|
import qualified Streamly.Internal.Unicode.Array as UnicodeArr
|
||||||
import qualified Streamly.Internal.Unicode.Stream as Unicode
|
import qualified Streamly.Internal.Unicode.Stream as Unicode
|
||||||
|
import qualified Streamly.Internal.Data.Array as Array
|
||||||
|
|
||||||
import Test.Tasty.Bench hiding (env)
|
import Test.Tasty.Bench hiding (env)
|
||||||
import Streamly.Benchmark.Common
|
import Streamly.Benchmark.Common
|
||||||
@ -45,7 +47,6 @@ import Streamly.Benchmark.Common.Handle
|
|||||||
#ifdef INSPECTION
|
#ifdef INSPECTION
|
||||||
import Streamly.Internal.Data.MutByteArray (Unbox)
|
import Streamly.Internal.Data.MutByteArray (Unbox)
|
||||||
import Streamly.Internal.Data.Stream (Step(..))
|
import Streamly.Internal.Data.Stream (Step(..))
|
||||||
import qualified Streamly.Internal.Data.Array as Array
|
|
||||||
import qualified Streamly.Internal.Data.MutArray as MutArray
|
import qualified Streamly.Internal.Data.MutArray as MutArray
|
||||||
import qualified Streamly.Internal.Data.Fold as Fold
|
import qualified Streamly.Internal.Data.Fold as Fold
|
||||||
import qualified Streamly.Internal.Data.Tuple.Strict as Strict
|
import qualified Streamly.Internal.Data.Tuple.Strict as Strict
|
||||||
@ -258,6 +259,17 @@ _copyStreamUtf8' inh outh =
|
|||||||
$ Unicode.decodeUtf8'
|
$ Unicode.decodeUtf8'
|
||||||
$ Stream.unfold Handle.reader inh
|
$ Stream.unfold Handle.reader inh
|
||||||
|
|
||||||
|
-- | Copy file
|
||||||
|
{-# NOINLINE copyStreamUtf16 #-}
|
||||||
|
copyStreamUtf16 :: Handle -> Handle -> IO ()
|
||||||
|
copyStreamUtf16 inh outh =
|
||||||
|
Stream.fold (Handle.writeChunks outh)
|
||||||
|
$ fmap Array.castUnsafe $ Array.chunksOf (arrayPayloadSize (16 * 1024))
|
||||||
|
$ Unicode.encodeUtf16le'
|
||||||
|
$ Unicode.decodeUtf16le
|
||||||
|
$ Array.concat $ fmap Array.castUnsafe $ Unicode.mkEvenW8Chunks
|
||||||
|
$ Handle.readChunks inh
|
||||||
|
|
||||||
#ifdef INSPECTION
|
#ifdef INSPECTION
|
||||||
inspect $ hasNoTypeClasses '_copyStreamUtf8'
|
inspect $ hasNoTypeClasses '_copyStreamUtf8'
|
||||||
-- inspect $ '_copyStreamUtf8 `hasNoType` ''Step
|
-- inspect $ '_copyStreamUtf8 `hasNoType` ''Step
|
||||||
@ -319,6 +331,8 @@ o_1_space_decode_encode_read env =
|
|||||||
$ \inh outh -> _copyStreamUtf8Parser inh outh
|
$ \inh outh -> _copyStreamUtf8Parser inh outh
|
||||||
, mkBenchSmall "encodeUtf8 . decodeUtf8" env $ \inh outh ->
|
, mkBenchSmall "encodeUtf8 . decodeUtf8" env $ \inh outh ->
|
||||||
copyStreamUtf8 inh outh
|
copyStreamUtf8 inh outh
|
||||||
|
, mkBenchSmall "encodeUtf16 . decodeUtf16" env $ \inh outh ->
|
||||||
|
copyStreamUtf16 inh outh
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -28,7 +28,10 @@ module Streamly.Internal.Unicode.Stream
|
|||||||
, decodeUtf8
|
, decodeUtf8
|
||||||
, decodeUtf8'
|
, decodeUtf8'
|
||||||
, decodeUtf8_
|
, decodeUtf8_
|
||||||
|
|
||||||
|
-- ** UTF-16 Decoding
|
||||||
, decodeUtf16le'
|
, decodeUtf16le'
|
||||||
|
, decodeUtf16le
|
||||||
|
|
||||||
-- ** Resumable UTF-8 Decoding
|
-- ** Resumable UTF-8 Decoding
|
||||||
, DecodeError(..)
|
, DecodeError(..)
|
||||||
@ -56,7 +59,10 @@ module Streamly.Internal.Unicode.Stream
|
|||||||
, encodeUtf8'
|
, encodeUtf8'
|
||||||
, encodeUtf8_
|
, encodeUtf8_
|
||||||
, encodeStrings
|
, encodeStrings
|
||||||
|
|
||||||
|
-- ** UTF-16 Encoding
|
||||||
, encodeUtf16le'
|
, encodeUtf16le'
|
||||||
|
, encodeUtf16le
|
||||||
{-
|
{-
|
||||||
-- * Operations on character strings
|
-- * Operations on character strings
|
||||||
, strip -- (dropAround isSpace)
|
, strip -- (dropAround isSpace)
|
||||||
@ -83,6 +89,10 @@ module Streamly.Internal.Unicode.Stream
|
|||||||
-- * Decoding String Literals
|
-- * Decoding String Literals
|
||||||
, fromStr#
|
, fromStr#
|
||||||
|
|
||||||
|
-- * Word16 Utilities
|
||||||
|
, mkEvenW8Chunks
|
||||||
|
, swapByteOrder
|
||||||
|
|
||||||
-- * Deprecations
|
-- * Deprecations
|
||||||
, decodeUtf8Lax
|
, decodeUtf8Lax
|
||||||
, encodeLatin1Lax
|
, encodeLatin1Lax
|
||||||
@ -92,6 +102,10 @@ where
|
|||||||
|
|
||||||
#include "inline.hs"
|
#include "inline.hs"
|
||||||
|
|
||||||
|
-- MachDeps.h includes ghcautoconf.h that defines WORDS_BIGENDIAN for big endian
|
||||||
|
-- systems.
|
||||||
|
#include "MachDeps.h"
|
||||||
|
|
||||||
import Control.Monad (void)
|
import Control.Monad (void)
|
||||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
import Data.Bits (shiftR, shiftL, (.|.), (.&.))
|
import Data.Bits (shiftR, shiftL, (.|.), (.&.))
|
||||||
@ -124,7 +138,7 @@ import Streamly.Internal.System.IO (unsafeInlineIO)
|
|||||||
|
|
||||||
import qualified Streamly.Data.Fold as Fold
|
import qualified Streamly.Data.Fold as Fold
|
||||||
import qualified Streamly.Data.Unfold as Unfold
|
import qualified Streamly.Data.Unfold as Unfold
|
||||||
import qualified Streamly.Internal.Data.Array.Type as Array
|
import qualified Streamly.Internal.Data.Array as Array
|
||||||
import qualified Streamly.Internal.Data.Parser as Parser (Parser)
|
import qualified Streamly.Internal.Data.Parser as Parser (Parser)
|
||||||
import qualified Streamly.Internal.Data.Parser as ParserD
|
import qualified Streamly.Internal.Data.Parser as ParserD
|
||||||
import qualified Streamly.Internal.Data.Stream as Stream
|
import qualified Streamly.Internal.Data.Stream as Stream
|
||||||
@ -682,14 +696,183 @@ decodeUtf8Lax = decodeUtf8
|
|||||||
-- Decoding Utf16
|
-- Decoding Utf16
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
|
|
||||||
-- | Decode a UTF-16 little endian encoded bytestream to a stream of Unicode
|
data MkEvenW8ChunksState s w8 arr
|
||||||
-- characters. The function throws an error if an invalid codepoint is
|
= MECSInit s
|
||||||
|
| MECSBuffer w8 s
|
||||||
|
| MECSYieldAndInit arr s
|
||||||
|
| MECSYieldAndBuffer arr w8 s
|
||||||
|
|
||||||
|
-- | Ensure chunks of even length. This can be used before casting the arrays to
|
||||||
|
-- Word16. Use this API when interacting with external data.
|
||||||
|
--
|
||||||
|
-- The chunks are split and merged accordingly to create arrays of even length.
|
||||||
|
-- If the sum of length of all the arrays in the stream is odd then the trailing
|
||||||
|
-- byte of the last array is dropped.
|
||||||
|
--
|
||||||
|
{-# INLINE_NORMAL mkEvenW8Chunks #-}
|
||||||
|
mkEvenW8Chunks :: Monad m => Stream m (Array Word8) -> Stream m (Array Word8)
|
||||||
|
mkEvenW8Chunks (D.Stream step state) = D.Stream step1 (MECSInit state)
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
{-# INLINE_LATE step1 #-}
|
||||||
|
step1 gst (MECSInit st) = do
|
||||||
|
r <- step (adaptState gst) st
|
||||||
|
return $
|
||||||
|
case r of
|
||||||
|
Yield arr st1 ->
|
||||||
|
let len = Array.length arr
|
||||||
|
in if (len .&. 1) == 1
|
||||||
|
then let arr1 = Array.getSliceUnsafe 0 (len - 1) arr
|
||||||
|
remElem = Array.getIndexUnsafe (len - 1) arr
|
||||||
|
in Yield arr1 (MECSBuffer remElem st1)
|
||||||
|
else Yield arr (MECSInit st1)
|
||||||
|
Skip s -> Skip (MECSInit s)
|
||||||
|
Stop -> Stop
|
||||||
|
step1 gst (MECSBuffer remElem st) = do
|
||||||
|
r <- step (adaptState gst) st
|
||||||
|
return $
|
||||||
|
case r of
|
||||||
|
Yield arr st1 | Array.length arr == 0 ->
|
||||||
|
Skip (MECSBuffer remElem st1)
|
||||||
|
Yield arr st1 | Array.length arr == 1 ->
|
||||||
|
let fstElem = Array.getIndexUnsafe 0 arr
|
||||||
|
w16 = Array.fromList [remElem, fstElem]
|
||||||
|
in Yield w16 (MECSInit st1)
|
||||||
|
Yield arr st1 ->
|
||||||
|
let len = Array.length arr
|
||||||
|
in if (len .&. 1) == 1
|
||||||
|
then let arr1 = Array.getSliceUnsafe 1 (len - 1) arr
|
||||||
|
fstElem = Array.getIndexUnsafe 0 arr
|
||||||
|
w16 = Array.fromList [remElem, fstElem]
|
||||||
|
in Yield w16 (MECSYieldAndInit arr1 st1)
|
||||||
|
else let arr1 = Array.getSliceUnsafe 1 (len - 2) arr
|
||||||
|
fstElem = Array.getIndexUnsafe 0 arr
|
||||||
|
lstElem = Array.getIndexUnsafe (len - 1) arr
|
||||||
|
w16 = Array.fromList [remElem, fstElem]
|
||||||
|
in Yield w16
|
||||||
|
(MECSYieldAndBuffer arr1 lstElem st1)
|
||||||
|
Skip s -> Skip (MECSBuffer remElem s)
|
||||||
|
Stop -> Stop -- Here the last Word8 is lost
|
||||||
|
step1 _ (MECSYieldAndInit arr st) =
|
||||||
|
pure $ Yield arr (MECSInit st)
|
||||||
|
step1 _ (MECSYieldAndBuffer arr lastElem st) =
|
||||||
|
pure $ Yield arr (MECSBuffer lastElem st)
|
||||||
|
|
||||||
|
-- | Swap the byte order of Word16
|
||||||
|
--
|
||||||
|
-- > swapByteOrder 0xABCD == 0xCDAB
|
||||||
|
-- > swapByteOrder . swapByteOrder == id
|
||||||
|
{-# INLINE swapByteOrder #-}
|
||||||
|
swapByteOrder :: Word16 -> Word16
|
||||||
|
swapByteOrder w = (w `shiftL` 8) .|. (w `shiftR` 8)
|
||||||
|
|
||||||
|
data DecodeUtf16WithState w c s
|
||||||
|
= U16NoSurrogate s
|
||||||
|
| U16HighSurrogate w s
|
||||||
|
| U16D
|
||||||
|
| U16YAndC c (DecodeUtf16WithState w c s)
|
||||||
|
|
||||||
|
{-# INLINE_NORMAL decodeUtf16With #-}
|
||||||
|
decodeUtf16With ::
|
||||||
|
Monad m
|
||||||
|
=> CodingFailureMode
|
||||||
|
-> D.Stream m Word16
|
||||||
|
-> D.Stream m Char
|
||||||
|
decodeUtf16With cfm (D.Stream step state) =
|
||||||
|
D.Stream step1 (U16NoSurrogate state)
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
prefix = "Streamly.Internal.Unicode.Stream.decodeUtf16With: "
|
||||||
|
|
||||||
|
{-# INLINE combineSurrogates #-}
|
||||||
|
combineSurrogates hi lo =
|
||||||
|
let first10 = fromIntegral (hi - utf16HighSurrogate) `shiftL` 10
|
||||||
|
second10 = fromIntegral (lo - utf16LowSurrogate)
|
||||||
|
in unsafeChr (0x10000 + (first10 .|. second10))
|
||||||
|
|
||||||
|
{-# INLINE transliterateOrError #-}
|
||||||
|
transliterateOrError e s =
|
||||||
|
case cfm of
|
||||||
|
ErrorOnCodingFailure -> error e
|
||||||
|
TransliterateCodingFailure -> U16YAndC replacementChar s
|
||||||
|
DropOnCodingFailure -> s
|
||||||
|
|
||||||
|
{-# INLINE inputUnderflow #-}
|
||||||
|
inputUnderflow =
|
||||||
|
case cfm of
|
||||||
|
ErrorOnCodingFailure -> error $ prefix ++ "Input Underflow"
|
||||||
|
TransliterateCodingFailure -> U16YAndC replacementChar U16D
|
||||||
|
DropOnCodingFailure -> U16D
|
||||||
|
|
||||||
|
{-# INLINE_LATE step1 #-}
|
||||||
|
step1 gst (U16NoSurrogate st) = do
|
||||||
|
r <- step (adaptState gst) st
|
||||||
|
pure $
|
||||||
|
case r of
|
||||||
|
Yield x st1
|
||||||
|
| x < 0xD800 || x > 0xDFFF ->
|
||||||
|
Yield (unsafeChr (fromIntegral x)) (U16NoSurrogate st1)
|
||||||
|
| x >= 0xD800 && x <= 0xDBFF ->
|
||||||
|
Skip (U16HighSurrogate x st1)
|
||||||
|
| otherwise ->
|
||||||
|
let msg = prefix
|
||||||
|
++ "Invalid first UTF16 word " ++ show x
|
||||||
|
in Skip $
|
||||||
|
transliterateOrError msg (U16NoSurrogate st1)
|
||||||
|
Skip st1 -> Skip (U16NoSurrogate st1)
|
||||||
|
Stop -> Stop
|
||||||
|
step1 gst (U16HighSurrogate hi st) = do
|
||||||
|
r <- step (adaptState gst) st
|
||||||
|
pure $
|
||||||
|
case r of
|
||||||
|
Yield x st1
|
||||||
|
| x >= 0xDC00 && x <= 0xDFFF ->
|
||||||
|
Yield (combineSurrogates hi x) (U16NoSurrogate st1)
|
||||||
|
| otherwise ->
|
||||||
|
let msg = prefix
|
||||||
|
++ "Invalid subsequent UTF16 word " ++ show x
|
||||||
|
++ " in state " ++ show hi
|
||||||
|
in Skip $
|
||||||
|
transliterateOrError msg (U16NoSurrogate st1)
|
||||||
|
Skip st1 -> Skip (U16HighSurrogate hi st1)
|
||||||
|
Stop -> Skip inputUnderflow
|
||||||
|
step1 _ (U16YAndC x st) = pure $ Yield x st
|
||||||
|
step1 _ U16D = pure Stop
|
||||||
|
|
||||||
|
{-# INLINE decodeUtf16' #-}
|
||||||
|
decodeUtf16' :: Monad m => Stream m Word16 -> Stream m Char
|
||||||
|
decodeUtf16' = decodeUtf16With ErrorOnCodingFailure
|
||||||
|
|
||||||
|
{-# INLINE decodeUtf16 #-}
|
||||||
|
decodeUtf16 :: Monad m => Stream m Word16 -> Stream m Char
|
||||||
|
decodeUtf16 = decodeUtf16With TransliterateCodingFailure
|
||||||
|
|
||||||
|
-- | Similar to 'decodeUtf16le' but throws an error if an invalid codepoint is
|
||||||
-- encountered.
|
-- encountered.
|
||||||
--
|
--
|
||||||
-- /Unimplemented/
|
|
||||||
{-# INLINE decodeUtf16le' #-}
|
{-# INLINE decodeUtf16le' #-}
|
||||||
decodeUtf16le' :: Stream m Word16 -> Stream m Char
|
decodeUtf16le' :: Monad m => Stream m Word16 -> Stream m Char
|
||||||
decodeUtf16le' = undefined
|
decodeUtf16le' =
|
||||||
|
decodeUtf16'
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
. fmap swapByteOrder
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-- | Decode a UTF-16 encoded stream to a stream of Unicode characters. Any
|
||||||
|
-- invalid codepoint encountered is replaced with the unicode replacement
|
||||||
|
-- character.
|
||||||
|
--
|
||||||
|
-- The Word16s are expected to be in the little-endian byte order.
|
||||||
|
--
|
||||||
|
{-# INLINE decodeUtf16le #-}
|
||||||
|
decodeUtf16le :: Monad m => Stream m Word16 -> Stream m Char
|
||||||
|
decodeUtf16le =
|
||||||
|
decodeUtf16
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
. fmap swapByteOrder
|
||||||
|
#endif
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
-- Decoding Array Streams
|
-- Decoding Array Streams
|
||||||
@ -850,12 +1033,12 @@ decodeUtf8Chunks_ = decodeUtf8ArraysWithD DropOnCodingFailure
|
|||||||
-- Encoding Unicode (UTF-8) Characters
|
-- Encoding Unicode (UTF-8) Characters
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
|
|
||||||
data WList = WCons !Word8 !WList | WNil
|
data WList a = WCons !a !(WList a) | WNil
|
||||||
|
|
||||||
-- UTF-8 primitives, Lifted from GHC.IO.Encoding.UTF8.
|
-- UTF-8 primitives, Lifted from GHC.IO.Encoding.UTF8.
|
||||||
|
|
||||||
{-# INLINE ord2 #-}
|
{-# INLINE ord2 #-}
|
||||||
ord2 :: Char -> WList
|
ord2 :: Char -> (WList Word8)
|
||||||
ord2 c = assert (n >= 0x80 && n <= 0x07ff) (WCons x1 (WCons x2 WNil))
|
ord2 c = assert (n >= 0x80 && n <= 0x07ff) (WCons x1 (WCons x2 WNil))
|
||||||
where
|
where
|
||||||
n = ord c
|
n = ord c
|
||||||
@ -863,7 +1046,7 @@ ord2 c = assert (n >= 0x80 && n <= 0x07ff) (WCons x1 (WCons x2 WNil))
|
|||||||
x2 = fromIntegral $ (n .&. 0x3F) + 0x80
|
x2 = fromIntegral $ (n .&. 0x3F) + 0x80
|
||||||
|
|
||||||
{-# INLINE ord3 #-}
|
{-# INLINE ord3 #-}
|
||||||
ord3 :: Char -> WList
|
ord3 :: Char -> (WList Word8)
|
||||||
ord3 c = assert (n >= 0x0800 && n <= 0xffff) (WCons x1 (WCons x2 (WCons x3 WNil)))
|
ord3 c = assert (n >= 0x0800 && n <= 0xffff) (WCons x1 (WCons x2 (WCons x3 WNil)))
|
||||||
where
|
where
|
||||||
n = ord c
|
n = ord c
|
||||||
@ -872,7 +1055,7 @@ ord3 c = assert (n >= 0x0800 && n <= 0xffff) (WCons x1 (WCons x2 (WCons x3 WNil)
|
|||||||
x3 = fromIntegral $ (n .&. 0x3F) + 0x80
|
x3 = fromIntegral $ (n .&. 0x3F) + 0x80
|
||||||
|
|
||||||
{-# INLINE ord4 #-}
|
{-# INLINE ord4 #-}
|
||||||
ord4 :: Char -> WList
|
ord4 :: Char -> (WList Word8)
|
||||||
ord4 c = assert (n >= 0x10000) (WCons x1 (WCons x2 (WCons x3 (WCons x4 WNil))))
|
ord4 c = assert (n >= 0x10000) (WCons x1 (WCons x2 (WCons x3 (WCons x4 WNil))))
|
||||||
where
|
where
|
||||||
n = ord c
|
n = ord c
|
||||||
@ -882,7 +1065,7 @@ ord4 c = assert (n >= 0x10000) (WCons x1 (WCons x2 (WCons x3 (WCons x4 WNil))))
|
|||||||
x4 = fromIntegral $ (n .&. 0x3F) + 0x80
|
x4 = fromIntegral $ (n .&. 0x3F) + 0x80
|
||||||
|
|
||||||
{-# INLINE_NORMAL readCharUtf8With #-}
|
{-# INLINE_NORMAL readCharUtf8With #-}
|
||||||
readCharUtf8With :: Monad m => WList -> Unfold m Char Word8
|
readCharUtf8With :: Monad m => (WList Word8) -> Unfold m Char Word8
|
||||||
readCharUtf8With surr = Unfold step inject
|
readCharUtf8With surr = Unfold step inject
|
||||||
|
|
||||||
where
|
where
|
||||||
@ -965,13 +1148,74 @@ encodeUtf8Lax = encodeUtf8
|
|||||||
-- Encoding to Utf16
|
-- Encoding to Utf16
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
|
|
||||||
-- | Encode a stream of Unicode characters to a UTF-16 little endian encoded
|
{-# INLINE utf16LowSurrogate #-}
|
||||||
-- bytestream.
|
utf16LowSurrogate :: Word16
|
||||||
|
utf16LowSurrogate = 0xDC00
|
||||||
|
|
||||||
|
{-# INLINE utf16HighSurrogate #-}
|
||||||
|
utf16HighSurrogate :: Word16
|
||||||
|
utf16HighSurrogate = 0xD800
|
||||||
|
|
||||||
|
{-# INLINE_NORMAL readCharUtf16With #-}
|
||||||
|
readCharUtf16With :: Monad m => WList Word16 -> Unfold m Char Word16
|
||||||
|
readCharUtf16With invalidReplacement = Unfold step inject
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
inject c =
|
||||||
|
return $ case ord c of
|
||||||
|
x | x < 0xD800 -> fromIntegral x `WCons` WNil
|
||||||
|
| x > 0xDFFF && x <= 0xFFFF -> fromIntegral x `WCons` WNil
|
||||||
|
| x >= 0x10000 && x <= 0x10FFFF ->
|
||||||
|
let u = x - 0x10000 -- 20 bits
|
||||||
|
h = utf16HighSurrogate
|
||||||
|
+ fromIntegral (u `shiftR` 10) -- 10 bits
|
||||||
|
l = utf16LowSurrogate
|
||||||
|
+ fromIntegral (u .&. 0x3FF) -- 10 bits
|
||||||
|
in WCons h $ WCons l WNil
|
||||||
|
| otherwise -> invalidReplacement
|
||||||
|
|
||||||
|
{-# INLINE_LATE step #-}
|
||||||
|
step WNil = return Stop
|
||||||
|
step (WCons x xs) = return $ Yield x xs
|
||||||
|
|
||||||
|
{-# INLINE encodeUtf16' #-}
|
||||||
|
encodeUtf16' :: Monad m => Stream m Char -> Stream m Word16
|
||||||
|
encodeUtf16' = D.unfoldMany (readCharUtf16With errString)
|
||||||
|
where
|
||||||
|
errString =
|
||||||
|
error
|
||||||
|
$ "Streamly.Internal.Unicode.encodeUtf16': Encountered an \
|
||||||
|
invalid character"
|
||||||
|
|
||||||
|
{-# INLINE encodeUtf16 #-}
|
||||||
|
encodeUtf16 :: Monad m => Stream m Char -> Stream m Word16
|
||||||
|
encodeUtf16 = D.unfoldMany (readCharUtf16With WNil)
|
||||||
|
|
||||||
|
-- | Similar to 'encodeUtf16le' but throws an error if any invalid character is
|
||||||
|
-- encountered.
|
||||||
--
|
--
|
||||||
-- /Unimplemented/
|
|
||||||
{-# INLINE encodeUtf16le' #-}
|
{-# INLINE encodeUtf16le' #-}
|
||||||
encodeUtf16le' :: Stream m Char -> Stream m Word16
|
encodeUtf16le' :: Monad m => Stream m Char -> Stream m Word16
|
||||||
encodeUtf16le' = undefined
|
encodeUtf16le' =
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
fmap swapByteOrder .
|
||||||
|
#endif
|
||||||
|
encodeUtf16'
|
||||||
|
|
||||||
|
-- | Encode a stream of Unicode characters to a UTF-16 encoded stream. Any
|
||||||
|
-- invalid characters in the input stream are replaced by the Unicode
|
||||||
|
-- replacement character U+FFFD.
|
||||||
|
--
|
||||||
|
-- The resulting Word16s are encoded in little-endian byte order.
|
||||||
|
--
|
||||||
|
{-# INLINE encodeUtf16le #-}
|
||||||
|
encodeUtf16le :: Monad m => Stream m Char -> Stream m Word16
|
||||||
|
encodeUtf16le =
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
fmap swapByteOrder .
|
||||||
|
#endif
|
||||||
|
encodeUtf16
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
-- Decoding string literals
|
-- Decoding string literals
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
module Streamly.Test.Unicode.Stream (main) where
|
module Streamly.Test.Unicode.Stream (main) where
|
||||||
|
|
||||||
import Data.Char (ord, chr)
|
import Data.Char (ord, chr)
|
||||||
import Data.Word (Word8)
|
import Data.Word (Word8, Word16)
|
||||||
import Test.QuickCheck
|
import Test.QuickCheck
|
||||||
( Property
|
( Property
|
||||||
, forAll
|
, forAll
|
||||||
@ -17,6 +17,7 @@ import Test.QuickCheck
|
|||||||
, choose
|
, choose
|
||||||
)
|
)
|
||||||
import Test.QuickCheck.Monadic (run, monadicIO, assert)
|
import Test.QuickCheck.Monadic (run, monadicIO, assert)
|
||||||
|
import Streamly.Data.Stream (Stream)
|
||||||
|
|
||||||
import qualified Streamly.Data.Array as A
|
import qualified Streamly.Data.Array as A
|
||||||
import qualified Streamly.Data.Stream as Stream
|
import qualified Streamly.Data.Stream as Stream
|
||||||
@ -43,8 +44,11 @@ maxTestCount = 10
|
|||||||
genUnicode :: Gen String
|
genUnicode :: Gen String
|
||||||
genUnicode = listOf arbitraryUnicodeChar
|
genUnicode = listOf arbitraryUnicodeChar
|
||||||
|
|
||||||
genWord8 :: Gen [Word8]
|
genWord8List :: Gen [Word8]
|
||||||
genWord8 = listOf arbitrary
|
genWord8List = listOf arbitrary
|
||||||
|
|
||||||
|
genListOfW8List :: Gen [[Word8]]
|
||||||
|
genListOfW8List = listOf (listOf arbitrary)
|
||||||
|
|
||||||
propDecodeEncodeId' :: Property
|
propDecodeEncodeId' :: Property
|
||||||
propDecodeEncodeId' =
|
propDecodeEncodeId' =
|
||||||
@ -54,6 +58,33 @@ propDecodeEncodeId' =
|
|||||||
chrs <- run $ Stream.toList $ SS.decodeUtf8' wrds
|
chrs <- run $ Stream.toList $ SS.decodeUtf8' wrds
|
||||||
assert (chrs == list)
|
assert (chrs == list)
|
||||||
|
|
||||||
|
propDecodeEncodeUtf16Id
|
||||||
|
:: (Stream IO Char -> Stream IO Word16)
|
||||||
|
-> (Stream IO Word16 -> Stream IO Char)
|
||||||
|
-> Property
|
||||||
|
propDecodeEncodeUtf16Id encoder decoder =
|
||||||
|
forAll genUnicode $ \list ->
|
||||||
|
monadicIO $ do
|
||||||
|
let wrds = encoder $ Stream.fromList list
|
||||||
|
chrs <- run $ Stream.toList $ decoder wrds
|
||||||
|
assert (chrs == list)
|
||||||
|
|
||||||
|
propMkEvenW8Chunks :: Property
|
||||||
|
propMkEvenW8Chunks =
|
||||||
|
forAll genListOfW8List $ \list ->
|
||||||
|
monadicIO $ do
|
||||||
|
list1 <-
|
||||||
|
run $ Stream.toList
|
||||||
|
$ fmap A.toList
|
||||||
|
$ IUS.mkEvenW8Chunks
|
||||||
|
$ fmap A.fromList $ Stream.fromList list
|
||||||
|
let concatedList = concat list
|
||||||
|
concatedList1 = concat list1
|
||||||
|
assert (and (map (even . length) list1))
|
||||||
|
if (odd (length concatedList))
|
||||||
|
then assert (concatedList1 == init concatedList)
|
||||||
|
else assert (concatedList1 == concatedList)
|
||||||
|
|
||||||
-- XXX need to use invalid characters
|
-- XXX need to use invalid characters
|
||||||
propDecodeEncodeId :: Property
|
propDecodeEncodeId :: Property
|
||||||
propDecodeEncodeId =
|
propDecodeEncodeId =
|
||||||
@ -120,7 +151,7 @@ testLines =
|
|||||||
|
|
||||||
testLinesArray :: Property
|
testLinesArray :: Property
|
||||||
testLinesArray =
|
testLinesArray =
|
||||||
forAll genWord8 $ \list ->
|
forAll genWord8List $ \list ->
|
||||||
monadicIO $ do
|
monadicIO $ do
|
||||||
xs <- Stream.toList
|
xs <- Stream.toList
|
||||||
$ fmap A.toList
|
$ fmap A.toList
|
||||||
@ -184,6 +215,13 @@ main = H.hspec
|
|||||||
"Streamly.Data.String.unwords . Streamly.Data.String.words == unwords . words"
|
"Streamly.Data.String.unwords . Streamly.Data.String.words == unwords . words"
|
||||||
testUnwords
|
testUnwords
|
||||||
|
|
||||||
|
H.describe "UTF16 - Encoding / Decoding" $ do
|
||||||
|
prop "decodeUtf16le' . encodeUtf16le' == id"
|
||||||
|
(propDecodeEncodeUtf16Id IUS.encodeUtf16le' IUS.decodeUtf16le')
|
||||||
|
prop "decodeUtf16le . encodeUtf16le == id"
|
||||||
|
(propDecodeEncodeUtf16Id IUS.encodeUtf16le' IUS.decodeUtf16le)
|
||||||
|
prop "mkEvenW8Chunks" propMkEvenW8Chunks
|
||||||
|
|
||||||
H.describe "Latin1 - Encoding / Decoding" $ do
|
H.describe "Latin1 - Encoding / Decoding" $ do
|
||||||
prop "ASCII to Latin1" propASCIIToLatin1
|
prop "ASCII to Latin1" propASCIIToLatin1
|
||||||
prop "Unicode to Latin1" propUnicodeToLatin1
|
prop "Unicode to Latin1" propUnicodeToLatin1
|
||||||
|
Loading…
Reference in New Issue
Block a user