From 946d0649def51cc1a6d41f84d1fb0f04cc7b48a9 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Mon, 23 Sep 2024 05:28:10 +0530 Subject: [PATCH] Document and expose sequence based splitting ops --- benchmark/Streamly/Benchmark/Data/Fold.hs | 9 ++- .../Streamly/Benchmark/Data/Stream/Reduce.hs | 10 ++-- .../Benchmark/FileSystem/Handle/Read.hs | 14 ++--- core/src/Streamly/Data/Fold.hs | 2 + core/src/Streamly/Data/Stream.hs | 12 ++-- .../Internal/Data/Fold/Combinators.hs | 24 ++++++-- .../Streamly/Internal/Data/Stream/Nesting.hs | 59 +++++++++++++++++-- .../Internal/Data/Stream/Transform.hs | 6 +- .../src/Streamly/Internal/Data/Stream/Type.hs | 45 +++++++------- test/Streamly/Test/Data/Stream.hs | 4 +- test/Streamly/Test/Unicode/Char.hs | 2 +- 11 files changed, 131 insertions(+), 56 deletions(-) diff --git a/benchmark/Streamly/Benchmark/Data/Fold.hs b/benchmark/Streamly/Benchmark/Data/Fold.hs index 17a081321..fcf6c1672 100644 --- a/benchmark/Streamly/Benchmark/Data/Fold.hs +++ b/benchmark/Streamly/Benchmark/Data/Fold.hs @@ -201,7 +201,7 @@ toarr = Array.fromList . map (fromIntegral . ord) fileInfixTakeEndBy_ :: Handle -> IO Int fileInfixTakeEndBy_ inh = Stream.fold Fold.length - $ Stream.foldManyPost (FL.takeEndBy_ (== lf) Fold.drain) + $ Stream.foldMany1 (FL.takeEndBy_ (== lf) Fold.drain) $ Handle.read inh -- >>= print #ifdef INSPECTION @@ -256,7 +256,7 @@ inspect $ 'fileSuffixTakeEndBy `hasNoType` ''MutArray.ArrayUnsafe -- FH.read/A. 
splitOnSeq :: String -> Handle -> IO Int splitOnSeq str inh = Stream.fold Fold.length - $ Stream.foldManyPost (Fold.takeEndBySeq_ (toarr str) Fold.drain) + $ Stream.foldMany1 (Fold.takeEndBySeq_ (toarr str) Fold.drain) $ Handle.read inh -- >>= print #ifdef INSPECTION @@ -269,7 +269,7 @@ splitOnSeq100k :: Handle -> IO Int splitOnSeq100k inh = do arr <- Stream.fold Array.create $ Stream.replicate 100000 123 Stream.fold Fold.length - $ Stream.foldManyPost (Fold.takeEndBySeq_ arr Fold.drain) + $ Stream.foldMany1 (Fold.takeEndBySeq_ arr Fold.drain) $ Handle.read inh -- >>= print -- | Split on suffix sequence. @@ -356,8 +356,7 @@ o_1_space_reduce_read_split env = splitOnSeqUtf8 :: String -> Handle -> IO Int splitOnSeqUtf8 str inh = Stream.fold Fold.length - $ Stream.foldManyPost - (Fold.takeEndBySeq_ (Array.fromList str) Fold.drain) + $ Stream.foldMany1 (Fold.takeEndBySeq_ (Array.fromList str) Fold.drain) $ Unicode.decodeUtf8Chunks $ Handle.readChunks inh -- >>= print diff --git a/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs b/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs index 9ab57c071..52787dbd7 100644 --- a/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs +++ b/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs @@ -135,12 +135,12 @@ foldMany = . S.foldMany (FL.take 2 FL.mconcat) . fmap Sum -{-# INLINE foldManyPost #-} -foldManyPost :: Monad m => Stream m Int -> m () -foldManyPost = +{-# INLINE foldMany1 #-} +foldMany1 :: Monad m => Stream m Int -> m () +foldMany1 = Common.drain . fmap getSum - . S.foldManyPost (FL.take 2 FL.mconcat) + . S.foldMany1 (FL.take 2 FL.mconcat) . fmap Sum {-# INLINE refoldMany #-} @@ -185,7 +185,7 @@ o_1_space_grouping value = -- modules we can bring those here. chunksOf benchmarks are in -- Parser/ParserD/Array.Stream/FileSystem.Handle. 
benchIOSink value "foldMany" foldMany - , benchIOSink value "foldManyPost" foldManyPost + , benchIOSink value "foldMany1" foldMany1 , benchIOSink value "refoldMany" refoldMany , benchIOSink value "foldIterateM" foldIterateM , benchIOSink value "refoldIterateM" refoldIterateM diff --git a/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs b/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs index aa34c3c5a..0e47be3e2 100644 --- a/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs +++ b/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs @@ -216,10 +216,10 @@ chunksOfSum :: Int -> Handle -> IO Int chunksOfSum n inh = S.fold Fold.length $ IP.groupsOf n FL.sum (S.unfold FH.reader inh) -foldManyPostChunksOfSum :: Int -> Handle -> IO Int -foldManyPostChunksOfSum n inh = +foldMany1ChunksOfSum :: Int -> Handle -> IO Int +foldMany1ChunksOfSum n inh = S.fold Fold.length - $ IP.foldManyPost (FL.take n FL.sum) (S.unfold FH.reader inh) + $ IP.foldMany1 (FL.take n FL.sum) (S.unfold FH.reader inh) foldManyChunksOfSum :: Int -> Handle -> IO Int foldManyChunksOfSum n inh = @@ -263,13 +263,13 @@ o_1_space_reduce_read_grouped env = -- XXX investigate why we need inline/noinline in these cases (GHC) -- Chunk using parsers , mkBench - ("S.foldManyPost (FL.take " ++ show (bigSize env) ++ " FL.sum)") + ("S.foldMany1 (FL.take " ++ show (bigSize env) ++ " FL.sum)") env - $ \inh _ -> noinline foldManyPostChunksOfSum (bigSize env) inh + $ \inh _ -> noinline foldMany1ChunksOfSum (bigSize env) inh , mkBench - "S.foldManyPost (FL.take 1 FL.sum)" + "S.foldMany1 (FL.take 1 FL.sum)" env - $ \inh _ -> inline foldManyPostChunksOfSum 1 inh + $ \inh _ -> inline foldMany1ChunksOfSum 1 inh , mkBench ("S.foldMany (FL.take " ++ show (bigSize env) ++ " FL.sum)") env diff --git a/core/src/Streamly/Data/Fold.hs b/core/src/Streamly/Data/Fold.hs index 210e2c333..f9993cc27 100644 --- a/core/src/Streamly/Data/Fold.hs +++ b/core/src/Streamly/Data/Fold.hs @@ -323,6 +323,8 @@ module 
Streamly.Data.Fold , take , takeEndBy , takeEndBy_ + , takeEndBySeq + , takeEndBySeq_ -- ** Key-value Collectors , toMap diff --git a/core/src/Streamly/Data/Stream.hs b/core/src/Streamly/Data/Stream.hs index fdd3b51d0..ff10456b3 100644 --- a/core/src/Streamly/Data/Stream.hs +++ b/core/src/Streamly/Data/Stream.hs @@ -549,21 +549,25 @@ module Streamly.Data.Stream -- >>> groupBy eq = Stream.parseMany (Parser.groupBy eq Fold.toList) -- >>> groupsByRolling eq = Stream.parseMany (Parser.groupByRolling eq Fold.toList) -- >>> groups = groupBy (==) - , foldMany -- XXX Rename to foldRepeat + , foldMany + , foldMany1 , groupsOf , parseMany -- * Splitting -- | Idioms and equivalents of Data.List APIs: -- - -- >>> splitWithSuffix p f = Stream.foldMany (Fold.takeEndBy p f) - -- >>> splitOnSuffix p f = Stream.foldMany (Fold.takeEndBy_ p f) - -- >>> lines = splitOnSuffix (== '\n') + -- >>> splitEndBy p f = Stream.foldMany (Fold.takeEndBy p f) + -- >>> splitEndBy_ p f = Stream.foldMany (Fold.takeEndBy_ p f) + -- >>> lines = splitEndBy_ (== '\n') -- >>> words = Stream.wordsBy isSpace -- >>> splitAt n = Stream.fold (Fold.splitAt n Fold.toList Fold.toList) -- >>> span p = Parser.splitWith (,) (Parser.takeWhile p Fold.toList) (Parser.fromFold Fold.toList) -- >>> break p = span (not . p) , splitOn + , splitOnSeq + , splitEndBySeq + , splitEndBySeq_ , wordsBy -- XXX Should use scanr instead diff --git a/core/src/Streamly/Internal/Data/Fold/Combinators.hs b/core/src/Streamly/Internal/Data/Fold/Combinators.hs index 55b85c5d4..7bfe9fda6 100644 --- a/core/src/Streamly/Internal/Data/Fold/Combinators.hs +++ b/core/src/Streamly/Internal/Data/Fold/Combinators.hs @@ -1426,13 +1426,18 @@ data SplitOnSeqState mba acc a rh w ck = -- sequence, taking the supplied sequence as well. If the pattern is empty this -- acts as an identity fold. -- --- >>> s = Stream.fromList "hello there. How are you?" 
--- >>> f = Fold.takeEndBySeq (Array.fromList "re") Fold.toList +-- >>> s = Stream.fromList "Gauss---Euler---Noether" +-- >>> f = Fold.takeEndBySeq (Array.fromList "---") Fold.toList -- >>> Stream.fold f s --- "hello there" +-- "Gauss---" -- -- >>> Stream.fold Fold.toList $ Stream.foldMany f s --- ["hello there",". How are"," you?"] +-- ["Gauss---","Euler---","Noether"] +-- +-- Uses Rabin-Karp algorithm for substring search. +-- +-- See also: 'Streamly.Data.Stream.splitOnSeq' and +-- 'Streamly.Data.Stream.splitEndBySeq'. -- -- /Pre-release/ {-# INLINE takeEndBySeq #-} @@ -1588,6 +1593,17 @@ takeEndBySeq patArr (Fold fstep finitial fextract ffinal) = -- | Like 'takeEndBySeq' but discards the matched sequence. -- +-- >>> s = Stream.fromList "Gauss---Euler---Noether" +-- >>> f = Fold.takeEndBySeq_ (Array.fromList "---") Fold.toList +-- >>> Stream.fold f s +-- "Gauss" +-- +-- >>> Stream.fold Fold.toList $ Stream.foldMany f s +-- ["Gauss","Euler","Noether"] +-- +-- See also: 'Streamly.Data.Stream.splitOnSeq' and +-- 'Streamly.Data.Stream.splitEndBySeq_'. +-- -- /Pre-release/ -- {-# INLINE takeEndBySeq_ #-} diff --git a/core/src/Streamly/Internal/Data/Stream/Nesting.hs b/core/src/Streamly/Internal/Data/Stream/Nesting.hs index b9e6711d3..d5003722b 100644 --- a/core/src/Streamly/Internal/Data/Stream/Nesting.hs +++ b/core/src/Streamly/Internal/Data/Stream/Nesting.hs @@ -131,10 +131,10 @@ module Streamly.Internal.Data.Stream.Nesting -- ** Splitting -- | A special case of parsing. , wordsBy - - -- XXX these are currently not being used/tested - , splitOnSeq -- XXX splitOnSeg - , splitOnSuffixSeq -- XXX splitOnSegSuffix, splitOnTrailer + , splitOnSeq + , splitOnSuffixSeq + , splitEndBySeq + , splitEndBySeq_ -- XXX Implement these as folds or parsers instead. 
, splitOnSuffixSeqAny @@ -2112,6 +2112,17 @@ data SplitOnSeqState mba rb rh ck w fs s b x = | SplitOnSeqReinit (fs -> SplitOnSeqState mba rb rh ck w fs s b x) +-- | Like 'splitOn' but splits the stream on a sequence of elements rather than +-- a single element. Parses a sequence of tokens separated by an infixed +-- separator e.g. @a;b;c@ is parsed as @a@, @b@, @c@. If the pattern is empty +-- the stream is returned as it is. +-- +-- Equivalent to the following: +-- +-- >>> splitOnSeq pat f = Stream.foldMany1 (Fold.takeEndBySeq_ pat f) +-- +-- Uses Rabin-Karp algorithm for substring search. +-- {-# INLINE_NORMAL splitOnSeq #-} splitOnSeq :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a) @@ -2455,6 +2466,10 @@ data SplitOnSuffixSeqState mba rb rh ck w fs s b x = | SplitOnSuffixSeqReinit (fs -> SplitOnSuffixSeqState mba rb rh ck w fs s b x) +-- | @splitOnSuffixSeq withSep pat fld input@ splits the input using @pat@ as a +-- suffixed separator, the resulting split segments are fed to the fold @fld@. +-- If @withSep@ is True then the separator sequence is kept as a suffix of the +-- split segments. {-# INLINE_NORMAL splitOnSuffixSeq #-} splitOnSuffixSeq :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a) @@ -2818,6 +2833,42 @@ splitOnSuffixSeq withSep patArr (Fold fstep initial _ final) (Stream step state) let jump c = SplitOnSuffixSeqKRDone (len - SIZE_OF(a)) c rb1 yieldProceed jump b +-- | Parses a sequence of tokens suffixed by a separator e.g. @a;b;c;@ is +-- parsed as @a;@, @b;@, @c;@. If the pattern is empty the input stream is +-- returned as it is. +-- +-- Equivalent to the following: +-- +-- >>> splitEndBySeq pat f = Stream.foldMany (Fold.takeEndBySeq pat f) +-- +-- Uses Rabin-Karp algorithm for substring search. +-- +{-# INLINE_NORMAL splitEndBySeq #-} +splitEndBySeq + :: forall m a b. 
(MonadIO m, Unbox a, Enum a, Eq a) + => Array a + -> Fold m a b + -> Stream m a + -> Stream m b +splitEndBySeq = splitOnSuffixSeq True + +-- | Like 'splitEndBySeq' but drops the separators and returns only the tokens. +-- +-- Equivalent to the following: +-- +-- >>> splitEndBySeq_ pat f = Stream.foldMany (Fold.takeEndBySeq_ pat f) +-- +-- Uses Rabin-Karp algorithm for substring search. +-- +{-# INLINE_NORMAL splitEndBySeq_ #-} +splitEndBySeq_ + :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a) + => Array a + -> Fold m a b + -> Stream m a + -> Stream m b +splitEndBySeq_ = splitOnSuffixSeq False + -- Implement this as a fold or a parser instead. -- This can be implemented easily using Rabin Karp -- | Split post any one of the given patterns. diff --git a/core/src/Streamly/Internal/Data/Stream/Transform.hs b/core/src/Streamly/Internal/Data/Stream/Transform.hs index e74dd2094..765d047e7 100644 --- a/core/src/Streamly/Internal/Data/Stream/Transform.hs +++ b/core/src/Streamly/Internal/Data/Stream/Transform.hs @@ -2024,6 +2024,10 @@ catEithers = fmap (either id id) -- separator elements determined by the supplied predicate, separator is -- considered as infixed between two segments: -- +-- Definition: +-- +-- >>> splitOn p f = Stream.foldMany1 (Fold.takeEndBy_ p f) +-- -- >>> splitOn' p xs = Stream.fold Fold.toList $ Stream.splitOn p Fold.toList (Stream.fromList xs) -- >>> splitOn' (== '.') "a.b" -- ["a","b"] @@ -2071,4 +2075,4 @@ splitOn predicate f = -- -- Since a suffix split fold can be easily expressed using a -- non-backtracking fold, we use that. - foldManyPost (FL.takeEndBy_ predicate f) + foldMany1 (FL.takeEndBy_ predicate f) diff --git a/core/src/Streamly/Internal/Data/Stream/Type.hs b/core/src/Streamly/Internal/Data/Stream/Type.hs index 723539918..bd9afb116 100644 --- a/core/src/Streamly/Internal/Data/Stream/Type.hs +++ b/core/src/Streamly/Internal/Data/Stream/Type.hs @@ -129,6 +129,7 @@ module Streamly.Internal.Data.Stream.Type , FoldMany (..) 
-- for inspection testing , FoldManyPost (..) , foldMany + , foldMany1 , foldManyPost , groupsOf , refoldMany @@ -463,7 +464,7 @@ foldBreak fld strm = do nil = Stream (\_ _ -> return Stop) () -- >>> fold f = Fold.extractM . Stream.foldAddLazy f --- >>> fold f = Stream.fold Fold.one . Stream.foldManyPost f +-- >>> fold f = Stream.fold Fold.one . Stream.foldMany1 f -- >>> fold f = Fold.extractM <=< Stream.foldAdd f -- | Fold a stream using the supplied left 'Fold' and reducing the resulting @@ -1786,24 +1787,21 @@ data FoldManyPost s fs b a | FoldManyPostYield b (FoldManyPost s fs b a) | FoldManyPostDone --- XXX Need a more intuitive name, and need to reconcile the names --- foldMany/fold/parse/parseMany/parseManyPost etc. - --- XXX foldManyPost keeps the last fold always partial. if the last fold is --- complete then another fold is applied on empty input. This is used for --- applying folds like takeEndBy such that the last element is not the --- separator (infix style). But that looks like a hack. We should remove this --- and use a custom combinator for infix parsing. +-- Note that using a closed fold e.g. @Fold.take 0@, would result in an +-- infinite stream without consuming the input. +-- +-- Like foldMany1, "scan" should ideally be "scan1" always resulting in a +-- non-empty stream, and "postscan" should be called just "scan" because it is +-- much more common. But those names cannot be changed now. -- | Like 'foldMany' but evaluates the fold even if the fold did not receive -- any input, therefore, always results in a non-empty output even on an empty --- stream (default result of the fold). 'foldMany' is like 'scan' which always --- includes the initial value of the accumulator. +-- stream (default result of the fold). -- -- Example, empty stream, compare with 'foldMany': -- -- >>> f = Fold.take 2 Fold.toList --- >>> fmany = Stream.fold Fold.toList . Stream.foldManyPost f +-- >>> fmany = Stream.fold Fold.toList . 
Stream.foldMany1 f -- >>> fmany $ Stream.fromList [] -- [[]] -- @@ -1817,14 +1815,11 @@ data FoldManyPost s fs b a -- >>> fmany $ Stream.fromList [1..5] -- [[1,2],[3,4],[5]] -- --- Note that using a closed fold e.g. @Fold.take 0@, would result in an --- infinite stream without consuming the input. --- -- /Pre-release/ -- -{-# INLINE_NORMAL foldManyPost #-} -foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b -foldManyPost (Fold fstep initial _ final) (Stream step state) = +{-# INLINE_NORMAL foldMany1 #-} +foldMany1 :: Monad m => Fold m a b -> Stream m a -> Stream m b +foldMany1 (Fold fstep initial _ final) (Stream step state) = Stream step' (FoldManyPostStart state) where @@ -1857,6 +1852,11 @@ foldManyPost (Fold fstep initial _ final) (Stream step state) = step' _ (FoldManyPostYield b next) = return $ Yield b next step' _ FoldManyPostDone = return Stop +{-# DEPRECATED foldManyPost "Please use foldMany1 instead." #-} +{-# INLINE foldManyPost #-} +foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b +foldManyPost = foldMany1 + {-# ANN type FoldMany Fuse #-} data FoldMany s fs b a = FoldManyStart s @@ -1866,13 +1866,12 @@ data FoldMany s fs b a | FoldManyDone -- XXX Nested foldMany does not fuse. --- XXX Rename fondMany/foldManyPost to keep the behavior and naming consistent --- with scan, postscan? -- | Apply a terminating 'Fold' repeatedly on a stream and emit the results in --- the output stream. Like 'postscan', foldMany omits the initial (default) --- value of the accumulator, if the input stream segment is empty the result is --- also empty. Using a non-terminating fold in 'foldMany' will result in a hang. +-- the output stream. If the last fold is empty, its result is not emitted. +-- This means if the input stream is empty the result is also an empty stream. +-- See 'foldMany1' for an alternate behavior which always results in a +-- non-empty stream even if the input stream is empty. 
-- -- Definition: -- diff --git a/test/Streamly/Test/Data/Stream.hs b/test/Streamly/Test/Data/Stream.hs index e680332f0..5fc555526 100644 --- a/test/Streamly/Test/Data/Stream.hs +++ b/test/Streamly/Test/Data/Stream.hs @@ -59,7 +59,7 @@ toList = Stream.toList -- XXX Where are the tests for "takeEndBy"? splitOn :: Monad m => (a -> Bool) -> Fold m a b -> Stream m a -> Stream m b -splitOn predicate f = Stream.foldManyPost (Fold.takeEndBy_ predicate f) +splitOn predicate f = Stream.foldMany1 (Fold.takeEndBy_ predicate f) splitOnSuffix :: Monad m => (a -> Bool) -> Fold m a b -> Stream m a -> Stream m b @@ -68,7 +68,7 @@ splitOnSuffix predicate f = Stream.foldMany (Fold.takeEndBy_ predicate f) -- XXX Where are the tests for "takeEndBySeq"? splitOnSeqFold :: (MonadIO m, Unbox a, Enum a, Eq a) => Array.Array a -> Fold m a b -> Stream m a -> Stream m b -splitOnSeqFold patt f = Stream.foldManyPost (Fold.takeEndBySeq_ patt f) +splitOnSeqFold patt f = Stream.foldMany1 (Fold.takeEndBySeq_ patt f) splitOnSeqStream :: (MonadIO m, Unbox a, Enum a, Eq a) => Array.Array a -> Fold m a b -> Stream m a -> Stream m b diff --git a/test/Streamly/Test/Unicode/Char.hs b/test/Streamly/Test/Unicode/Char.hs index cd3c5961f..c71f68e27 100644 --- a/test/Streamly/Test/Unicode/Char.hs +++ b/test/Streamly/Test/Unicode/Char.hs @@ -72,7 +72,7 @@ checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool checkNFKD (c1, c2, c3, c4, c5) = checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5] -splitOn predicate f = S.foldManyPost (FL.takeEndBy_ predicate f) +splitOn predicate f = S.foldMany1 (FL.takeEndBy_ predicate f) checkAllTestCases :: Int -> String -> IO () checkAllTestCases lineno line = do