Document and expose sequence based splitting ops

This commit is contained in:
Harendra Kumar 2024-09-23 05:28:10 +05:30
parent 6c9c641c9f
commit 946d0649de
11 changed files with 131 additions and 56 deletions

View File

@ -201,7 +201,7 @@ toarr = Array.fromList . map (fromIntegral . ord)
fileInfixTakeEndBy_ :: Handle -> IO Int
fileInfixTakeEndBy_ inh =
Stream.fold Fold.length
$ Stream.foldManyPost (FL.takeEndBy_ (== lf) Fold.drain)
$ Stream.foldMany1 (FL.takeEndBy_ (== lf) Fold.drain)
$ Handle.read inh -- >>= print
#ifdef INSPECTION
@ -256,7 +256,7 @@ inspect $ 'fileSuffixTakeEndBy `hasNoType` ''MutArray.ArrayUnsafe -- FH.read/A.
splitOnSeq :: String -> Handle -> IO Int
splitOnSeq str inh =
Stream.fold Fold.length
$ Stream.foldManyPost (Fold.takeEndBySeq_ (toarr str) Fold.drain)
$ Stream.foldMany1 (Fold.takeEndBySeq_ (toarr str) Fold.drain)
$ Handle.read inh -- >>= print
#ifdef INSPECTION
@ -269,7 +269,7 @@ splitOnSeq100k :: Handle -> IO Int
splitOnSeq100k inh = do
arr <- Stream.fold Array.create $ Stream.replicate 100000 123
Stream.fold Fold.length
$ Stream.foldManyPost (Fold.takeEndBySeq_ arr Fold.drain)
$ Stream.foldMany1 (Fold.takeEndBySeq_ arr Fold.drain)
$ Handle.read inh -- >>= print
-- | Split on suffix sequence.
@ -356,8 +356,7 @@ o_1_space_reduce_read_split env =
splitOnSeqUtf8 :: String -> Handle -> IO Int
splitOnSeqUtf8 str inh =
Stream.fold Fold.length
$ Stream.foldManyPost
(Fold.takeEndBySeq_ (Array.fromList str) Fold.drain)
$ Stream.foldMany1 (Fold.takeEndBySeq_ (Array.fromList str) Fold.drain)
$ Unicode.decodeUtf8Chunks
$ Handle.readChunks inh -- >>= print

View File

@ -135,12 +135,12 @@ foldMany =
. S.foldMany (FL.take 2 FL.mconcat)
. fmap Sum
{-# INLINE foldManyPost #-}
foldManyPost :: Monad m => Stream m Int -> m ()
foldManyPost =
{-# INLINE foldMany1 #-}
foldMany1 :: Monad m => Stream m Int -> m ()
foldMany1 =
Common.drain
. fmap getSum
. S.foldManyPost (FL.take 2 FL.mconcat)
. S.foldMany1 (FL.take 2 FL.mconcat)
. fmap Sum
{-# INLINE refoldMany #-}
@ -185,7 +185,7 @@ o_1_space_grouping value =
-- modules we can bring those here. chunksOf benchmarks are in
-- Parser/ParserD/Array.Stream/FileSystem.Handle.
benchIOSink value "foldMany" foldMany
, benchIOSink value "foldManyPost" foldManyPost
, benchIOSink value "foldMany1" foldMany1
, benchIOSink value "refoldMany" refoldMany
, benchIOSink value "foldIterateM" foldIterateM
, benchIOSink value "refoldIterateM" refoldIterateM

View File

@ -216,10 +216,10 @@ chunksOfSum :: Int -> Handle -> IO Int
chunksOfSum n inh =
S.fold Fold.length $ IP.groupsOf n FL.sum (S.unfold FH.reader inh)
foldManyPostChunksOfSum :: Int -> Handle -> IO Int
foldManyPostChunksOfSum n inh =
foldMany1ChunksOfSum :: Int -> Handle -> IO Int
foldMany1ChunksOfSum n inh =
S.fold Fold.length
$ IP.foldManyPost (FL.take n FL.sum) (S.unfold FH.reader inh)
$ IP.foldMany1 (FL.take n FL.sum) (S.unfold FH.reader inh)
foldManyChunksOfSum :: Int -> Handle -> IO Int
foldManyChunksOfSum n inh =
@ -263,13 +263,13 @@ o_1_space_reduce_read_grouped env =
-- XXX investigate why we need inline/noinline in these cases (GHC)
-- Chunk using parsers
, mkBench
("S.foldManyPost (FL.take " ++ show (bigSize env) ++ " FL.sum)")
("S.foldMany1 (FL.take " ++ show (bigSize env) ++ " FL.sum)")
env
$ \inh _ -> noinline foldManyPostChunksOfSum (bigSize env) inh
$ \inh _ -> noinline foldMany1ChunksOfSum (bigSize env) inh
, mkBench
"S.foldManyPost (FL.take 1 FL.sum)"
"S.foldMany1 (FL.take 1 FL.sum)"
env
$ \inh _ -> inline foldManyPostChunksOfSum 1 inh
$ \inh _ -> inline foldMany1ChunksOfSum 1 inh
, mkBench
("S.foldMany (FL.take " ++ show (bigSize env) ++ " FL.sum)")
env

View File

@ -323,6 +323,8 @@ module Streamly.Data.Fold
, take
, takeEndBy
, takeEndBy_
, takeEndBySeq
, takeEndBySeq_
-- ** Key-value Collectors
, toMap

View File

@ -549,21 +549,25 @@ module Streamly.Data.Stream
-- >>> groupBy eq = Stream.parseMany (Parser.groupBy eq Fold.toList)
-- >>> groupsByRolling eq = Stream.parseMany (Parser.groupByRolling eq Fold.toList)
-- >>> groups = groupBy (==)
, foldMany -- XXX Rename to foldRepeat
, foldMany
, foldMany1
, groupsOf
, parseMany
-- * Splitting
-- | Idioms and equivalents of Data.List APIs:
--
-- >>> splitWithSuffix p f = Stream.foldMany (Fold.takeEndBy p f)
-- >>> splitOnSuffix p f = Stream.foldMany (Fold.takeEndBy_ p f)
-- >>> lines = splitOnSuffix (== '\n')
-- >>> splitEndBy p f = Stream.foldMany (Fold.takeEndBy p f)
-- >>> splitEndBy_ p f = Stream.foldMany (Fold.takeEndBy_ p f)
-- >>> lines = splitEndBy_ (== '\n')
-- >>> words = Stream.wordsBy isSpace
-- >>> splitAt n = Stream.fold (Fold.splitAt n Fold.toList Fold.toList)
-- >>> span p = Parser.splitWith (,) (Parser.takeWhile p Fold.toList) (Parser.fromFold Fold.toList)
-- >>> break p = span (not . p)
, splitOn
, splitOnSeq
, splitEndBySeq
, splitEndBySeq_
, wordsBy
-- XXX Should use scanr instead

View File

@ -1426,13 +1426,18 @@ data SplitOnSeqState mba acc a rh w ck =
-- sequence, taking the supplied sequence as well. If the pattern is empty this
-- acts as an identity fold.
--
-- >>> s = Stream.fromList "hello there. How are you?"
-- >>> f = Fold.takeEndBySeq (Array.fromList "re") Fold.toList
-- >>> s = Stream.fromList "Gauss---Euler---Noether"
-- >>> f = Fold.takeEndBySeq (Array.fromList "---") Fold.toList
-- >>> Stream.fold f s
-- "hello there"
-- "Gauss---"
--
-- >>> Stream.fold Fold.toList $ Stream.foldMany f s
-- ["hello there",". How are"," you?"]
-- ["Gauss---","Euler---","Noether"]
--
-- Uses Rabin-Karp algorithm for substring search.
--
-- See also: 'Streamly.Data.Stream.splitOnSeq' and
-- 'Streamly.Data.Stream.splitEndBySeq'.
--
-- /Pre-release/
{-# INLINE takeEndBySeq #-}
@ -1588,6 +1593,17 @@ takeEndBySeq patArr (Fold fstep finitial fextract ffinal) =
-- | Like 'takeEndBySeq' but discards the matched sequence.
--
-- >>> s = Stream.fromList "Gauss---Euler---Noether"
-- >>> f = Fold.takeEndBySeq_ (Array.fromList "---") Fold.toList
-- >>> Stream.fold f s
-- "Gauss"
--
-- >>> Stream.fold Fold.toList $ Stream.foldMany f s
-- ["Gauss","Euler","Noether"]
--
-- See also: 'Streamly.Data.Stream.splitOnSeq' and
-- 'Streamly.Data.Stream.splitEndBySeq_'.
--
-- /Pre-release/
--
{-# INLINE takeEndBySeq_ #-}

View File

@ -131,10 +131,10 @@ module Streamly.Internal.Data.Stream.Nesting
-- ** Splitting
-- | A special case of parsing.
, wordsBy
-- XXX these are currently not being used/tested
, splitOnSeq -- XXX splitOnSeg
, splitOnSuffixSeq -- XXX splitOnSegSuffix, splitOnTrailer
, splitOnSeq
, splitOnSuffixSeq
, splitEndBySeq
, splitEndBySeq_
-- XXX Implement these as folds or parsers instead.
, splitOnSuffixSeqAny
@ -2112,6 +2112,17 @@ data SplitOnSeqState mba rb rh ck w fs s b x =
| SplitOnSeqReinit (fs -> SplitOnSeqState mba rb rh ck w fs s b x)
-- | Like 'splitOn' but splits the stream on a sequence of elements rather than
-- a single element. Parses a sequence of tokens separated by an infixed
-- separator e.g. @a;b;c@ is parsed as @a@, @b@, @c@. If the pattern is empty
-- the stream is returned as it is.
--
-- Equivalent to the following:
--
-- >>> splitOnSeq pat f = Stream.foldMany1 (Fold.takeEndBySeq_ pat f)
--
-- Uses Rabin-Karp algorithm for substring search.
--
{-# INLINE_NORMAL splitOnSeq #-}
splitOnSeq
:: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
@ -2455,6 +2466,10 @@ data SplitOnSuffixSeqState mba rb rh ck w fs s b x =
| SplitOnSuffixSeqReinit
(fs -> SplitOnSuffixSeqState mba rb rh ck w fs s b x)
-- | @splitOnSuffixSeq withSep pat fld input@ splits the input using @pat@ as a
-- suffixed separator; the resulting split segments are fed to the fold @fld@.
-- If @withSep@ is True then the separator sequence is retained as a suffix of
-- each split segment, otherwise it is dropped.
{-# INLINE_NORMAL splitOnSuffixSeq #-}
splitOnSuffixSeq
:: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
@ -2818,6 +2833,42 @@ splitOnSuffixSeq withSep patArr (Fold fstep initial _ final) (Stream step state)
let jump c = SplitOnSuffixSeqKRDone (len - SIZE_OF(a)) c rb1
yieldProceed jump b
-- | Parses a sequence of tokens suffixed by a separator e.g. @a;b;c;@ is
-- parsed as @a;@, @b;@, @c;@. If the pattern is empty the input stream is
-- returned as it is.
--
-- This is 'splitOnSuffixSeq' with @withSep@ set to 'True', so each emitted
-- segment retains its trailing separator sequence.
--
-- Equivalent to the following:
--
-- >>> splitEndBySeq pat f = Stream.foldMany (Fold.takeEndBySeq pat f)
--
-- Uses Rabin-Karp algorithm for substring search.
--
{-# INLINE_NORMAL splitEndBySeq #-}
splitEndBySeq
:: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
=> Array a
-> Fold m a b
-> Stream m a
-> Stream m b
splitEndBySeq = splitOnSuffixSeq True
-- | Like 'splitEndBySeq' but drops the separators and returns only the tokens.
--
-- This is 'splitOnSuffixSeq' with @withSep@ set to 'False'.
--
-- Equivalent to the following:
--
-- >>> splitEndBySeq_ pat f = Stream.foldMany (Fold.takeEndBySeq_ pat f)
--
-- Uses Rabin-Karp algorithm for substring search.
--
{-# INLINE_NORMAL splitEndBySeq_ #-}
splitEndBySeq_
:: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
=> Array a
-> Fold m a b
-> Stream m a
-> Stream m b
splitEndBySeq_ = splitOnSuffixSeq False
-- Implement this as a fold or a parser instead.
-- This can be implemented easily using Rabin Karp
-- | Split post any one of the given patterns.

View File

@ -2024,6 +2024,10 @@ catEithers = fmap (either id id)
-- separator elements determined by the supplied predicate, separator is
-- considered as infixed between two segments:
--
-- Definition:
--
-- >>> splitOn p f = Stream.foldMany1 (Fold.takeEndBy_ p f)
--
-- >>> splitOn' p xs = Stream.fold Fold.toList $ Stream.splitOn p Fold.toList (Stream.fromList xs)
-- >>> splitOn' (== '.') "a.b"
-- ["a","b"]
@ -2071,4 +2075,4 @@ splitOn predicate f =
--
-- Since a suffix split fold can be easily expressed using a
-- non-backtracking fold, we use that.
foldManyPost (FL.takeEndBy_ predicate f)
foldMany1 (FL.takeEndBy_ predicate f)

View File

@ -129,6 +129,7 @@ module Streamly.Internal.Data.Stream.Type
, FoldMany (..) -- for inspection testing
, FoldManyPost (..)
, foldMany
, foldMany1
, foldManyPost
, groupsOf
, refoldMany
@ -463,7 +464,7 @@ foldBreak fld strm = do
nil = Stream (\_ _ -> return Stop) ()
-- >>> fold f = Fold.extractM . Stream.foldAddLazy f
-- >>> fold f = Stream.fold Fold.one . Stream.foldManyPost f
-- >>> fold f = Stream.fold Fold.one . Stream.foldMany1 f
-- >>> fold f = Fold.extractM <=< Stream.foldAdd f
-- | Fold a stream using the supplied left 'Fold' and reducing the resulting
@ -1786,24 +1787,21 @@ data FoldManyPost s fs b a
| FoldManyPostYield b (FoldManyPost s fs b a)
| FoldManyPostDone
-- XXX Need a more intuitive name, and need to reconcile the names
-- foldMany/fold/parse/parseMany/parseManyPost etc.
-- XXX foldManyPost keeps the last fold always partial. If the last fold is
-- complete then another fold is applied on empty input. This is used for
-- applying folds like takeEndBy such that the last element is not the
-- separator (infix style). But that looks like a hack. We should remove this
-- and use a custom combinator for infix parsing.
-- Note that using a closed fold e.g. @Fold.take 0@, would result in an
-- infinite stream without consuming the input.
--
-- Like foldMany1, "scan" should ideally be "scan1" always resulting in a
-- non-empty stream, and "postscan" should be called just "scan" because it is
-- much more common. But those names cannot be changed now.
-- | Like 'foldMany' but evaluates the fold even if the fold did not receive
-- any input, therefore, always results in a non-empty output even on an empty
-- stream (default result of the fold). 'foldMany' is like 'scan' which always
-- includes the initial value of the accumulator.
-- stream (default result of the fold).
--
-- Example, empty stream, compare with 'foldMany':
--
-- >>> f = Fold.take 2 Fold.toList
-- >>> fmany = Stream.fold Fold.toList . Stream.foldManyPost f
-- >>> fmany = Stream.fold Fold.toList . Stream.foldMany1 f
-- >>> fmany $ Stream.fromList []
-- [[]]
--
@ -1817,14 +1815,11 @@ data FoldManyPost s fs b a
-- >>> fmany $ Stream.fromList [1..5]
-- [[1,2],[3,4],[5]]
--
-- Note that using a closed fold e.g. @Fold.take 0@, would result in an
-- infinite stream without consuming the input.
--
-- /Pre-release/
--
{-# INLINE_NORMAL foldManyPost #-}
foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b
foldManyPost (Fold fstep initial _ final) (Stream step state) =
{-# INLINE_NORMAL foldMany1 #-}
foldMany1 :: Monad m => Fold m a b -> Stream m a -> Stream m b
foldMany1 (Fold fstep initial _ final) (Stream step state) =
Stream step' (FoldManyPostStart state)
where
@ -1857,6 +1852,11 @@ foldManyPost (Fold fstep initial _ final) (Stream step state) =
step' _ (FoldManyPostYield b next) = return $ Yield b next
step' _ FoldManyPostDone = return Stop
-- Backward-compatible alias kept under the old name; it forwards directly to
-- 'foldMany1' and has the same type.
{-# DEPRECATED foldManyPost "Please use foldMany1 instead." #-}
{-# INLINE foldManyPost #-}
foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b
foldManyPost = foldMany1
{-# ANN type FoldMany Fuse #-}
data FoldMany s fs b a
= FoldManyStart s
@ -1866,13 +1866,12 @@ data FoldMany s fs b a
| FoldManyDone
-- XXX Nested foldMany does not fuse.
-- XXX Rename fondMany/foldManyPost to keep the behavior and naming consistent
-- with scan, postscan?
-- | Apply a terminating 'Fold' repeatedly on a stream and emit the results in
-- the output stream. Like 'postscan', foldMany omits the initial (default)
-- value of the accumulator, if the input stream segment is empty the result is
-- also empty. Using a non-terminating fold in 'foldMany' will result in a hang.
-- the output stream. If the last fold is empty, its result is not emitted.
-- This means if the input stream is empty the result is also an empty stream.
-- See 'foldMany1' for an alternate behavior which always results in a
-- non-empty stream even if the input stream is empty.
--
-- Definition:
--

View File

@ -59,7 +59,7 @@ toList = Stream.toList
-- XXX Where are the tests for "takeEndBy"?
splitOn :: Monad m =>
(a -> Bool) -> Fold m a b -> Stream m a -> Stream m b
splitOn predicate f = Stream.foldManyPost (Fold.takeEndBy_ predicate f)
splitOn predicate f = Stream.foldMany1 (Fold.takeEndBy_ predicate f)
splitOnSuffix :: Monad m =>
(a -> Bool) -> Fold m a b -> Stream m a -> Stream m b
@ -68,7 +68,7 @@ splitOnSuffix predicate f = Stream.foldMany (Fold.takeEndBy_ predicate f)
-- XXX Where are the tests for "takeEndBySeq"?
splitOnSeqFold :: (MonadIO m, Unbox a, Enum a, Eq a) =>
Array.Array a -> Fold m a b -> Stream m a -> Stream m b
splitOnSeqFold patt f = Stream.foldManyPost (Fold.takeEndBySeq_ patt f)
splitOnSeqFold patt f = Stream.foldMany1 (Fold.takeEndBySeq_ patt f)
splitOnSeqStream :: (MonadIO m, Unbox a, Enum a, Eq a) =>
Array.Array a -> Fold m a b -> Stream m a -> Stream m b

View File

@ -72,7 +72,7 @@ checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKD (c1, c2, c3, c4, c5) =
checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]
splitOn predicate f = S.foldManyPost (FL.takeEndBy_ predicate f)
splitOn predicate f = S.foldMany1 (FL.takeEndBy_ predicate f)
checkAllTestCases :: Int -> String -> IO ()
checkAllTestCases lineno line = do