Document and expose sequence based splitting ops

2024-10-03 22:38:16 +03:00 · 2024-09-23 05:28:10 +05:30 · 2024-09-23 05:28:10 +05:30 · 946d0649de
commit 946d0649de
parent 6c9c641c9f
11 changed files with 131 additions and 56 deletions
--- a/benchmark/Streamly/Benchmark/Data/Fold.hs
+++ b/benchmark/Streamly/Benchmark/Data/Fold.hs
@ -201,7 +201,7 @@ toarr = Array.fromList . map (fromIntegral . ord)
 fileInfixTakeEndBy_ :: Handle -> IO Int
 fileInfixTakeEndBy_ inh =
    Stream.fold Fold.length
-        $ Stream.foldManyPost (FL.takeEndBy_ (== lf) Fold.drain)
+        $ Stream.foldMany1 (FL.takeEndBy_ (== lf) Fold.drain)
        $ Handle.read inh -- >>= print

 #ifdef INSPECTION
@ -256,7 +256,7 @@ inspect $ 'fileSuffixTakeEndBy `hasNoType` ''MutArray.ArrayUnsafe  -- FH.read/A.
 splitOnSeq :: String -> Handle -> IO Int
 splitOnSeq str inh =
    Stream.fold Fold.length
-        $ Stream.foldManyPost (Fold.takeEndBySeq_ (toarr str) Fold.drain)
+        $ Stream.foldMany1 (Fold.takeEndBySeq_ (toarr str) Fold.drain)
        $ Handle.read inh -- >>= print

 #ifdef INSPECTION
@ -269,7 +269,7 @@ splitOnSeq100k :: Handle -> IO Int
 splitOnSeq100k inh = do
    arr <- Stream.fold Array.create $ Stream.replicate 100000 123
    Stream.fold Fold.length
-        $ Stream.foldManyPost (Fold.takeEndBySeq_ arr Fold.drain)
+        $ Stream.foldMany1 (Fold.takeEndBySeq_ arr Fold.drain)
        $ Handle.read inh -- >>= print

 -- | Split on suffix sequence.
@ -356,8 +356,7 @@ o_1_space_reduce_read_split env =
 splitOnSeqUtf8 :: String -> Handle -> IO Int
 splitOnSeqUtf8 str inh =
    Stream.fold Fold.length
-        $ Stream.foldManyPost
-            (Fold.takeEndBySeq_ (Array.fromList str) Fold.drain)
+        $ Stream.foldMany1 (Fold.takeEndBySeq_ (Array.fromList str) Fold.drain)
        $ Unicode.decodeUtf8Chunks
        $ Handle.readChunks inh -- >>= print

--- a/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs
+++ b/benchmark/Streamly/Benchmark/Data/Stream/Reduce.hs
@ -135,12 +135,12 @@ foldMany =
    . S.foldMany (FL.take 2 FL.mconcat)
    . fmap Sum

-{-# INLINE foldManyPost #-}
-foldManyPost :: Monad m => Stream m Int -> m ()
-foldManyPost =
+{-# INLINE foldMany1 #-}
+foldMany1 :: Monad m => Stream m Int -> m ()
+foldMany1 =
      Common.drain
    . fmap getSum
-    . S.foldManyPost (FL.take 2 FL.mconcat)
+    . S.foldMany1 (FL.take 2 FL.mconcat)
    . fmap Sum

 {-# INLINE refoldMany #-}
@ -185,7 +185,7 @@ o_1_space_grouping value =
        -- modules we can bring those here. chunksOf benchmarks are in
        -- Parser/ParserD/Array.Stream/FileSystem.Handle.
          benchIOSink value "foldMany" foldMany
-        , benchIOSink value "foldManyPost" foldManyPost
+        , benchIOSink value "foldMany1" foldMany1
        , benchIOSink value "refoldMany" refoldMany
        , benchIOSink value "foldIterateM" foldIterateM
        , benchIOSink value "refoldIterateM" refoldIterateM
--- a/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs
+++ b/benchmark/Streamly/Benchmark/FileSystem/Handle/Read.hs
@ -216,10 +216,10 @@ chunksOfSum :: Int -> Handle -> IO Int
 chunksOfSum n inh =
    S.fold Fold.length $ IP.groupsOf n FL.sum (S.unfold FH.reader inh)

-foldManyPostChunksOfSum :: Int -> Handle -> IO Int
-foldManyPostChunksOfSum n inh =
+foldMany1ChunksOfSum :: Int -> Handle -> IO Int
+foldMany1ChunksOfSum n inh =
    S.fold Fold.length
-        $ IP.foldManyPost (FL.take n FL.sum) (S.unfold FH.reader inh)
+        $ IP.foldMany1 (FL.take n FL.sum) (S.unfold FH.reader inh)

 foldManyChunksOfSum :: Int -> Handle -> IO Int
 foldManyChunksOfSum n inh =
@ -263,13 +263,13 @@ o_1_space_reduce_read_grouped env =
        -- XXX investigate why we need inline/noinline in these cases (GHC)
        -- Chunk using parsers
        , mkBench
-            ("S.foldManyPost (FL.take " ++ show (bigSize env) ++ " FL.sum)")
+            ("S.foldMany1 (FL.take " ++ show (bigSize env) ++ " FL.sum)")
            env
-            $ \inh _ -> noinline foldManyPostChunksOfSum (bigSize env) inh
+            $ \inh _ -> noinline foldMany1ChunksOfSum (bigSize env) inh
        , mkBench
-            "S.foldManyPost (FL.take 1 FL.sum)"
+            "S.foldMany1 (FL.take 1 FL.sum)"
            env
-            $ \inh _ -> inline foldManyPostChunksOfSum 1 inh
+            $ \inh _ -> inline foldMany1ChunksOfSum 1 inh
        , mkBench
            ("S.foldMany (FL.take " ++ show (bigSize env) ++ " FL.sum)")
            env
--- a/core/src/Streamly/Data/Fold.hs
+++ b/core/src/Streamly/Data/Fold.hs
@ -323,6 +323,8 @@ module Streamly.Data.Fold
    , take
    , takeEndBy
    , takeEndBy_
+    , takeEndBySeq
+    , takeEndBySeq_

    -- ** Key-value Collectors
    , toMap
--- a/core/src/Streamly/Data/Stream.hs
+++ b/core/src/Streamly/Data/Stream.hs
@ -549,21 +549,25 @@ module Streamly.Data.Stream
    -- >>> groupBy eq = Stream.parseMany (Parser.groupBy eq Fold.toList)
    -- >>> groupsByRolling eq = Stream.parseMany (Parser.groupByRolling eq Fold.toList)
    -- >>> groups = groupBy (==)
-    , foldMany -- XXX Rename to foldRepeat
+    , foldMany
+    , foldMany1
    , groupsOf
    , parseMany

    -- * Splitting
    -- | Idioms and equivalents of Data.List APIs:
    --
-    -- >>> splitWithSuffix p f = Stream.foldMany (Fold.takeEndBy p f)
-    -- >>> splitOnSuffix p f = Stream.foldMany (Fold.takeEndBy_ p f)
-    -- >>> lines = splitOnSuffix (== '\n')
+    -- >>> splitEndBy p f = Stream.foldMany (Fold.takeEndBy p f)
+    -- >>> splitEndBy_ p f = Stream.foldMany (Fold.takeEndBy_ p f)
+    -- >>> lines = splitEndBy_ (== '\n')
    -- >>> words = Stream.wordsBy isSpace
    -- >>> splitAt n = Stream.fold (Fold.splitAt n Fold.toList Fold.toList)
    -- >>> span p = Parser.splitWith (,) (Parser.takeWhile p Fold.toList) (Parser.fromFold Fold.toList)
    -- >>> break p = span (not . p)
    , splitOn
+    , splitOnSeq
+    , splitEndBySeq
+    , splitEndBySeq_
    , wordsBy

    -- XXX Should use scanr instead
--- a/core/src/Streamly/Internal/Data/Fold/Combinators.hs
+++ b/core/src/Streamly/Internal/Data/Fold/Combinators.hs
@ -1426,13 +1426,18 @@ data SplitOnSeqState mba acc a rh w ck =
 -- sequence, taking the supplied sequence as well. If the pattern is empty this
 -- acts as an identity fold.
 --
-- >>> s = Stream.fromList "hello there. How are you?"
-- >>> f = Fold.takeEndBySeq (Array.fromList "re") Fold.toList
+-- >>> s = Stream.fromList "Gauss---Euler---Noether"
+-- >>> f = Fold.takeEndBySeq (Array.fromList "---") Fold.toList
 -- >>> Stream.fold f s
-- "hello there"
+-- "Gauss---"
 --
 -- >>> Stream.fold Fold.toList $ Stream.foldMany f s
-- ["hello there",". How are"," you?"]
+-- ["Gauss---","Euler---","Noether"]
+--
+-- Uses Rabin-Karp algorithm for substring search.
+--
+-- See also: 'Streamly.Data.Stream.splitOnSeq' and
+-- 'Streamly.Data.Stream.splitEndBySeq'.
 --
 -- /Pre-release/
 {-# INLINE takeEndBySeq #-}
@ -1588,6 +1593,17 @@ takeEndBySeq patArr (Fold fstep finitial fextract ffinal) =

 -- | Like 'takeEndBySeq' but discards the matched sequence.
 --
+-- >>> s = Stream.fromList "Gauss---Euler---Noether"
+-- >>> f = Fold.takeEndBySeq_ (Array.fromList "---") Fold.toList
+-- >>> Stream.fold f s
+-- "Gauss"
+--
+-- >>> Stream.fold Fold.toList $ Stream.foldMany f s
+-- ["Gauss","Euler","Noether"]
+--
+-- See also: 'Streamly.Data.Stream.splitOnSeq' and
+-- 'Streamly.Data.Stream.splitEndBySeq_'.
+--
 -- /Pre-release/
 --
 {-# INLINE takeEndBySeq_ #-}
--- a/core/src/Streamly/Internal/Data/Stream/Nesting.hs
+++ b/core/src/Streamly/Internal/Data/Stream/Nesting.hs
@ -131,10 +131,10 @@ module Streamly.Internal.Data.Stream.Nesting
    -- ** Splitting
    -- | A special case of parsing.
    , wordsBy
-
-    -- XXX these are currently not being used/tested
-    , splitOnSeq -- XXX splitOnSeg
-    , splitOnSuffixSeq -- XXX splitOnSegSuffix, splitOnTrailer
+    , splitOnSeq
+    , splitOnSuffixSeq
+    , splitEndBySeq
+    , splitEndBySeq_

    -- XXX Implement these as folds or parsers instead.
    , splitOnSuffixSeqAny
@ -2112,6 +2112,17 @@ data SplitOnSeqState mba rb rh ck w fs s b x =

    | SplitOnSeqReinit (fs -> SplitOnSeqState mba rb rh ck w fs s b x)

+-- | Like 'splitOn' but splits the stream on a sequence of elements rather than
+-- a single element. Parses a sequence of tokens separated by an infixed
+-- separator e.g. @a;b;c@ is parsed as @a@, @b@, @c@. If the pattern is empty
+-- the stream is returned as it is.
+--
+-- Equivalent to the following:
+--
+-- >>> splitOnSeq pat f = Stream.foldMany1 (Fold.takeEndBySeq_ pat f)
+--
+-- Uses Rabin-Karp algorithm for substring search.
+--
 {-# INLINE_NORMAL splitOnSeq #-}
 splitOnSeq
    :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
@ -2455,6 +2466,10 @@ data SplitOnSuffixSeqState mba rb rh ck w fs s b x =
    | SplitOnSuffixSeqReinit
          (fs -> SplitOnSuffixSeqState mba rb rh ck w fs s b x)

+-- | @splitOnSuffixSeq withSep pat fld input@ splits the input using @pat@ as a
+-- suffixed separator; the resulting split segments are fed to the fold @fld@.
+-- If @withSep@ is True then the separator sequence is retained as the suffix
+-- of each split segment, otherwise it is dropped.
 {-# INLINE_NORMAL splitOnSuffixSeq #-}
 splitOnSuffixSeq
    :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
@ -2818,6 +2833,42 @@ splitOnSuffixSeq withSep patArr (Fold fstep initial _ final) (Stream step state)
                let jump c = SplitOnSuffixSeqKRDone (len - SIZE_OF(a)) c rb1
                yieldProceed jump b

+-- | Parses a sequence of tokens suffixed by a separator e.g. @a;b;c;@ is
+-- parsed as @a;@, @b;@, @c;@. If the pattern is empty the input stream is
+-- returned as it is.
+--
+-- Equivalent to the following:
+--
+-- >>> splitEndBySeq pat f = Stream.foldMany (Fold.takeEndBySeq pat f)
+--
+-- Uses Rabin-Karp algorithm for substring search.
+--
+{-# INLINE_NORMAL splitEndBySeq #-}
+splitEndBySeq
+    :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
+    => Array a
+    -> Fold m a b
+    -> Stream m a
+    -> Stream m b
+splitEndBySeq = splitOnSuffixSeq True
+
+-- | Like 'splitEndBySeq' but drops the separators and returns only the tokens.
+--
+-- Equivalent to the following:
+--
+-- >>> splitEndBySeq_ pat f = Stream.foldMany (Fold.takeEndBySeq_ pat f)
+--
+-- Uses Rabin-Karp algorithm for substring search.
+--
+{-# INLINE_NORMAL splitEndBySeq_ #-}
+splitEndBySeq_
+    :: forall m a b. (MonadIO m, Unbox a, Enum a, Eq a)
+    => Array a
+    -> Fold m a b
+    -> Stream m a
+    -> Stream m b
+splitEndBySeq_ = splitOnSuffixSeq False
+
 -- Implement this as a fold or a parser instead.
 -- This can be implemented easily using Rabin Karp
 -- | Split post any one of the given patterns.
--- a/core/src/Streamly/Internal/Data/Stream/Transform.hs
+++ b/core/src/Streamly/Internal/Data/Stream/Transform.hs
@ -2024,6 +2024,10 @@ catEithers = fmap (either id id)
 -- separator elements determined by the supplied predicate, separator is
 -- considered as infixed between two segments:
 --
+-- Definition:
+--
+-- >>> splitOn p f = Stream.foldMany1 (Fold.takeEndBy_ p f)
+--
 -- >>> splitOn' p xs = Stream.fold Fold.toList $ Stream.splitOn p Fold.toList (Stream.fromList xs)
 -- >>> splitOn' (== '.') "a.b"
 -- ["a","b"]
@ -2071,4 +2075,4 @@ splitOn predicate f =
    --
    -- Since a suffix split fold can be easily expressed using a
    -- non-backtracking fold, we use that.
-    foldManyPost (FL.takeEndBy_ predicate f)
+    foldMany1 (FL.takeEndBy_ predicate f)
--- a/core/src/Streamly/Internal/Data/Stream/Type.hs
+++ b/core/src/Streamly/Internal/Data/Stream/Type.hs
@ -129,6 +129,7 @@ module Streamly.Internal.Data.Stream.Type
    , FoldMany (..) -- for inspection testing
    , FoldManyPost (..)
    , foldMany
+    , foldMany1
    , foldManyPost
    , groupsOf
    , refoldMany
@ -463,7 +464,7 @@ foldBreak fld strm = do
    nil = Stream (\_ _ -> return Stop) ()

 -- >>> fold f = Fold.extractM . Stream.foldAddLazy f
-- >>> fold f = Stream.fold Fold.one . Stream.foldManyPost f
+-- >>> fold f = Stream.fold Fold.one . Stream.foldMany1 f
 -- >>> fold f = Fold.extractM <=< Stream.foldAdd f

 -- | Fold a stream using the supplied left 'Fold' and reducing the resulting
@ -1786,24 +1787,21 @@ data FoldManyPost s fs b a
    | FoldManyPostYield b (FoldManyPost s fs b a)
    | FoldManyPostDone

-- XXX Need a more intuitive name, and need to reconcile the names
-- foldMany/fold/parse/parseMany/parseManyPost etc.
-
-- XXX foldManyPost keeps the last fold always partial. if the last fold is
-- complete then another fold is applied on empty input. This is used for
-- applying folds like takeEndBy such that the last element is not the
-- separator (infix style). But that looks like a hack. We should remove this
-- and use a custom combinator for infix parsing.
+-- Note that using a closed fold e.g. @Fold.take 0@, would result in an
+-- infinite stream without consuming the input.
+--
+-- Like foldMany1, "scan" should ideally be "scan1" always resulting in a
+-- non-empty stream, and "postscan" should be called just "scan" because it is
+-- much more common. But those names cannot be changed now.

 -- | Like 'foldMany' but evaluates the fold even if the fold did not receive
 -- any input, therefore, always results in a non-empty output even on an empty
-- stream (default result of the fold). 'foldMany' is like 'scan' which always
-- includes the initial value of the accumulator.
+-- stream (default result of the fold).
 --
 -- Example, empty stream, compare with 'foldMany':
 --
 -- >>> f = Fold.take 2 Fold.toList
-- >>> fmany = Stream.fold Fold.toList . Stream.foldManyPost f
+-- >>> fmany = Stream.fold Fold.toList . Stream.foldMany1 f
 -- >>> fmany $ Stream.fromList []
 -- [[]]
 --
@ -1817,14 +1815,11 @@ data FoldManyPost s fs b a
 -- >>> fmany $ Stream.fromList [1..5]
 -- [[1,2],[3,4],[5]]
 --
-- Note that using a closed fold e.g. @Fold.take 0@, would result in an
-- infinite stream without consuming the input.
--
 -- /Pre-release/
 --
-{-# INLINE_NORMAL foldManyPost #-}
-foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b
-foldManyPost (Fold fstep initial _ final) (Stream step state) =
+{-# INLINE_NORMAL foldMany1 #-}
+foldMany1 :: Monad m => Fold m a b -> Stream m a -> Stream m b
+foldMany1 (Fold fstep initial _ final) (Stream step state) =
    Stream step' (FoldManyPostStart state)

    where
@ -1857,6 +1852,11 @@ foldManyPost (Fold fstep initial _ final) (Stream step state) =
    step' _ (FoldManyPostYield b next) = return $ Yield b next
    step' _ FoldManyPostDone = return Stop

+{-# DEPRECATED foldManyPost "Please use foldMany1 instead." #-}
+{-# INLINE foldManyPost #-}
+foldManyPost :: Monad m => Fold m a b -> Stream m a -> Stream m b
+foldManyPost = foldMany1
+
 {-# ANN type FoldMany Fuse #-}
 data FoldMany s fs b a
    = FoldManyStart s
@ -1866,13 +1866,12 @@ data FoldMany s fs b a
    | FoldManyDone

 -- XXX Nested foldMany does not fuse.
-- XXX Rename fondMany/foldManyPost to keep the behavior and naming consistent
-- with scan, postscan?

 -- | Apply a terminating 'Fold' repeatedly on a stream and emit the results in
-- the output stream. Like 'postscan', foldMany omits the initial (default)
-- value of the accumulator, if the input stream segment is empty the result is
-- also empty. Using a non-terminating fold in 'foldMany' will result in a hang.
+-- the output stream. If the last fold is empty, its result is not emitted.
+-- This means if the input stream is empty the result is also an empty stream.
+-- See 'foldMany1' for an alternate behavior which always results in a
+-- non-empty stream even if the input stream is empty.
 --
 -- Definition:
 --
--- a/test/Streamly/Test/Data/Stream.hs
+++ b/test/Streamly/Test/Data/Stream.hs
@ -59,7 +59,7 @@ toList = Stream.toList
 -- XXX Where are the tests for "takeEndBy"?
 splitOn :: Monad m =>
    (a -> Bool) -> Fold m a b -> Stream m a -> Stream m b
-splitOn predicate f = Stream.foldManyPost (Fold.takeEndBy_ predicate f)
+splitOn predicate f = Stream.foldMany1 (Fold.takeEndBy_ predicate f)

 splitOnSuffix :: Monad m =>
    (a -> Bool) -> Fold m a b -> Stream m a -> Stream m b
@ -68,7 +68,7 @@ splitOnSuffix predicate f = Stream.foldMany (Fold.takeEndBy_ predicate f)
 -- XXX Where are the tests for "takeEndBySeq"?
 splitOnSeqFold :: (MonadIO m, Unbox a, Enum a, Eq a) =>
   Array.Array a -> Fold m a b -> Stream m a -> Stream m b
-splitOnSeqFold patt f = Stream.foldManyPost (Fold.takeEndBySeq_ patt f)
+splitOnSeqFold patt f = Stream.foldMany1 (Fold.takeEndBySeq_ patt f)

 splitOnSeqStream :: (MonadIO m, Unbox a, Enum a, Eq a) =>
   Array.Array a -> Fold m a b -> Stream m a -> Stream m b
--- a/test/Streamly/Test/Unicode/Char.hs
+++ b/test/Streamly/Test/Unicode/Char.hs
@ -72,7 +72,7 @@ checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
 checkNFKD (c1, c2, c3, c4, c5) =
    checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]

-splitOn predicate f = S.foldManyPost (FL.takeEndBy_ predicate f)
+splitOn predicate f = S.foldMany1 (FL.takeEndBy_ predicate f)

 checkAllTestCases :: Int -> String -> IO ()
 checkAllTestCases lineno line = do