Refactor Prelude benchmarks

* Now benchmark modules correspond to source modules. The Prelude module in
  source corresponds to several modules, one for each stream type.

* Benchmarks in the same order/groupings as they appear in source

* All benchmarks now have division according to space complexity

* Refactoring reduces a lot of code duplication especially the stream
  generation and elimination functions.

* The RTS options are now completely set in the shell script to run the
  benchmarks.

* RTS options can be set on a per benchmark basis. RTS options work correctly
  now.

* The set of streaming/infinite stream benchmarks is now complete and we can
  run all such benchmarks conveniently.

* Benchmark "quick"/"speed" options can now be specified on a per benchmark
  basis. Longer benchmarks can have fewer iterations/quick run time.

* Benchmarks are grouped in several groups which can be run on a per group
  basis. Comparison groups are also defined for convenient comparisons of
  different modules (e.g. arrays or streamD/K).

* The benchmark namespaces are grouped in a consistent manner. Benchmark
  executables have a consistent naming based on module names.
This commit is contained in:
Harendra Kumar 2020-06-07 00:51:33 +05:30
parent ed799d6b51
commit 129ebaf82c
25 changed files with 3456 additions and 3800 deletions

View File

@ -17,6 +17,7 @@
- ignore: {name: "Reduce duplication"}
- ignore: {name: "Use <>"}
- ignore: {name: "Use fewer imports"}
- ignore: {name: "Use camelCase"}
# Specify additional command line arguments

156
bench.sh
View File

@ -3,94 +3,91 @@
# Note "_grp" and "_cmp" suffixes are special, do not rename them to something
# else.
#------------------------------------------------------------------------------
# Benchmark groups
#------------------------------------------------------------------------------
base_stream_grp="\
Data.Stream.StreamD \
Data.Stream.StreamK \
Data.Stream.StreamDK"
SERIAL_O_1="linear"
SERIAL_O_n="serial-o-n-heap serial-o-n-stack serial-o-n-space"
serial_grp="$SERIAL_O_1 $SERIAL_O_n"
# parallel benchmark-suite is separated because we run it with a higher
# heap size limit.
concurrent_grp="linear-async linear-rate nested-concurrent parallel concurrent adaptive"
prelude_serial_grp="\
Prelude.Serial \
Prelude.WSerial \
Prelude.ZipSerial"
prelude_concurrent_grp="\
Prelude.Async \
Prelude.WAsync \
Prelude.Ahead \
Prelude.Parallel \
Prelude.ZipAsync"
prelude_other_grp="\
Prelude.Rate \
Prelude.Concurrent \
Prelude.Adaptive"
array_grp="Memory.Array Data.Array Data.Prim.Array Data.SmallArray"
base_parser_grp="Data.Parser.ParserD Data.Parser.ParserK"
parser_grp="Data.Fold Data.Parser"
# XXX We can include SERIAL_O_1 here once "base" also supports --stream-size
infinite_grp="linear linear-async linear-rate nested-concurrent"
finite_grp="$SERIAL_O_n $array_grp fileio parallel concurrent adaptive"
#------------------------------------------------------------------------------
# Streaming vs non-streaming
#------------------------------------------------------------------------------
# The "o-1-space" groups of these benchmarks are run with long stream
# sizes when --long option is used.
# Benchmarks that take long time per iteration must run fewer iterations to
# finish in reasonable time.
QUICK_BENCHMARKS="linear-rate concurrent adaptive fileio"
infinite_grp="\
$prelude_serial_grp \
$prelude_concurrent_grp \
Prelude.Rate"
#------------------------------------------------------------------------------
# Comparison groups
#------------------------------------------------------------------------------
# *_cmp denotes a comparison group; the benchmarks listed in a *_cmp
# variable are compared with each other
array_cmp="Memory.Array Data.Prim.Array Data.Array"
base_stream_cmp="Data.Stream.StreamD Data.Stream.StreamK"
serial_wserial_cmp="Prelude.Serial Prelude.WSerial"
serial_async_cmp="Prelude.Serial Prelude.Async"
concurrent_cmp="Prelude.Async Prelude.WAsync Prelude.Ahead Prelude.Parallel"
array_cmp="Memory.Array Data.Prim.Array Data.Array"
base_parser_cmp=$base_parser_grp
COMPARISONS="array_cmp base_stream_cmp base_parser_cmp"
COMPARISONS="\
base_stream_cmp \
serial_wserial_cmp \
serial_async_cmp \
concurrent_cmp \
array_cmp \
base_parser_cmp"
#------------------------------------------------------------------------------
# All
#------------------------------------------------------------------------------
# All high level benchmarks
all_grp="\
$serial_grp \
$concurrent_grp \
$prelude_serial_grp \
$prelude_concurrent_grp \
$array_grp \
$parser_grp \
Data.Unfold"
ALL_BENCH_GROUPS="\
all_grp \
serial_grp \
concurrent_grp \
prelude_serial_grp \
prelude_concurrent_grp \
array_grp \
infinite_grp \
finite_grp \
base_stream_grp \
base_parser_grp"
# RTS options that go inside +RTS and -RTS while running the benchmark.
# -T enables runtime stats collection (needed for maxrss/cputime measures),
# -K caps the per-thread stack size, -M caps the total heap size. The caps
# double as regressions tests: a benchmark that suddenly needs more stack or
# heap than its cap will fail rather than silently regress.
bench_rts_opts () {
case "$1" in
# Pseudo-benchmarks carved out of the "serial" executable get fixed caps.
"linear") echo -n "-T -K36K -M16M" ;;
"serial-o-n-stack") echo -n "-T -K1M -M16M" ;;
"serial-o-n-heap") echo -n "-T -K36K -M128M" ;;
"serial-o-n-space") echo -n "-T -K16M -M64M" ;;
# Per-module override must come before the generic */o-1-sp* pattern since
# case matches the first arm that applies.
Data.SmallArray/o-1-sp*) echo -n "-T -K128K -M16M" ;;
# Generic caps keyed on the space-complexity component of the benchmark name.
*/o-1-sp*) echo -n "-T -K36K -M16M" ;;
*/o-n-h*) echo -n "-T -K36K -M32M" ;;
*/o-n-st*) echo -n "-T -K1M -M16M" ;;
*/o-n-sp*) echo -n "-T -K1M -M32M" ;;
#------------------------------------------------------------------------------
# Script
#------------------------------------------------------------------------------
# NOTE(review): the "Script" banner above appears to be misplaced here by the
# diff rendering; it likely belongs outside this function — confirm against
# the actual file.
*) echo -n "" ;;
esac
}
# Map a benchmark name to the executable that implements it. The "linear"
# and "serial-o-n-*" pseudo-benchmarks are all sections of the single
# "serial" executable; every other benchmark name is its own executable.
bench_exec () {
case "$1" in
linear|serial-o-n-stack|serial-o-n-heap|serial-o-n-space)
printf '%s' serial ;;
*)
printf '%s' "$1" ;;
esac
}
# Extra gauge options for the given benchmark name. The pseudo-benchmarks
# select their slice of the "serial" executable via a benchmark-name prefix
# match; anything else gets no extra options.
bench_gauge_opts () {
local name=$1
case "$name" in
linear)
printf '%s' "-m prefix o-1-space" ;;
serial-o-n-*)
# serial-o-n-X selects the o-n-X prefix, e.g. o-n-stack, o-n-heap.
printf '%s' "-m prefix o-n-${name#serial-o-n-}" ;;
*)
;;
esac
}
BENCH_SH_DIR=$(dirname $0)
list_benches () {
echo "Individual benchmarks:"
@ -125,7 +122,7 @@ list_comparisons () {
print_help () {
echo "Usage: $0 "
echo " [--benchmarks <"bench1 bench2 ..." | ?>]"
echo " [--benchmarks <"bench1 bench2 ..." | help>]"
echo " [--graphs]"
echo " [--no-measure]"
echo " [--append]"
@ -136,7 +133,7 @@ print_help () {
echo " [--compare] [--base <commit>] [--candidate <commit>]"
echo " -- <gauge options or benchmarks>"
echo
echo "--benchmarks: benchmarks to run, use '?' for list of benchmarks"
echo "--benchmarks: benchmarks to run, use 'help' for list of benchmarks"
echo "--graphs: Generate graphical reports"
echo "--no-measure: Don't run benchmarks, run reports from previous results"
echo "--append: Don't overwrite previous results, append for comparison"
@ -280,10 +277,9 @@ function run_verbose() {
run_bench () {
local bench_name=$1
local bench_exe=$(bench_exec $bench_name)
local bench_exe=$bench_name
local output_file=$(bench_output_file $bench_name)
local bench_prog
local quick_bench=0
bench_prog=$($GET_BENCH_PROG $bench_exe) || \
die "Cannot find benchmark executable for benchmark $bench_name"
@ -291,41 +287,38 @@ run_bench () {
echo "Running benchmark $bench_name ..."
for i in $QUICK_BENCHMARKS
do
if test "$(has_benchmark $i)" = "$bench_name"
then
quick_bench=1
fi
done
local QUICK_OPTS="--quick --time-limit 1 --min-duration 0"
local SPEED_OPTIONS
if test "$LONG" -eq 0
then
if test "$SLOW" -eq 0
then
if test "$QUICK" -eq 0 -a "$quick_bench" -eq 0
export QUICK_MODE
if test "$QUICK_MODE" -eq 0
then
# reasonably quick
# default mode, not super quick, not slow
SPEED_OPTIONS="$QUICK_OPTS --min-samples 10"
else
# super quick but less accurate
SPEED_OPTIONS="$QUICK_OPTS --include-first-iter"
fi
else
# Slow but more accurate mode
SPEED_OPTIONS="--min-duration 0"
fi
else
# large stream size, always super quick
GAUGE_ARGS="$GAUGE_ARGS $bench_name/o-1-space"
SPEED_OPTIONS="--stream-size 10000000 $QUICK_OPTS --include-first-iter"
fi
export BENCH_EXEC_PATH=$bench_prog
export RTS_OPTIONS
run_verbose $bench_prog $SPEED_OPTIONS \
+RTS $(bench_rts_opts $GAUGE_ARGS) -RTS \
--csvraw=$output_file \
-v 2 \
--measure-with $bench_prog $GAUGE_ARGS \
$(bench_gauge_opts $bench_name) || die "Benchmarking failed"
--measure-with "$BENCH_SH_DIR/bin/bench-exec-one.sh" \
$GAUGE_ARGS || die "Benchmarking failed"
}
run_benches() {
@ -407,7 +400,7 @@ run_reports() {
# Execution starts here
#-----------------------------------------------------------------------------
DEFAULT_BENCHMARKS="linear"
DEFAULT_BENCHMARKS="$all_grp"
COMPARE=0
BASE=
@ -415,13 +408,14 @@ CANDIDATE=
APPEND=0
SLOW=0
QUICK=0
QUICK_MODE=0
LONG=0
RAW=0
GRAPH=0
MEASURE=1
GAUGE_ARGS=
RTS_OPTIONS=
BUILD_ONCE=0
USE_STACK=0
CABAL_BUILD_FLAGS=""
@ -445,9 +439,10 @@ do
--base) shift; BASE=$1; shift ;;
--candidate) shift; CANDIDATE=$1; shift ;;
--cabal-build-flags) shift; CABAL_BUILD_FLAGS=$1; shift ;;
--rtsopts) shift; RTS_OPTIONS=$1; shift ;;
# flags
--slow) SLOW=1; shift ;;
--quick) QUICK=1; shift ;;
--quick) QUICK_MODE=1; shift ;;
--compare) COMPARE=1; shift ;;
--raw) RAW=1; shift ;;
--append) APPEND=1; shift ;;
@ -479,13 +474,6 @@ only_real_benchmarks () {
done
}
proper_executables () {
for i in $BENCHMARKS
do
echo -n "$(bench_exec $i) "
done
}
has_benchmark () {
for i in $BENCHMARKS_ORIG
do
@ -519,7 +507,7 @@ fi
BENCHMARKS=$(set_benchmarks)
BENCHMARKS_ORIG=$BENCHMARKS
BENCHMARKS=$(only_real_benchmarks)
EXECUTABLES=$(proper_executables)
EXECUTABLES=$BENCHMARKS
echo "Using benchmark suites [$BENCHMARKS]"

View File

@ -1,18 +1,13 @@
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import Control.Exception (handle, catch, SomeException, ErrorCall(..))
import Control.Exception (catch, ErrorCall(..))
import Control.Monad.Trans.State
import Control.Monad.Trans.Maybe
import Data.Char (toLower)
import Data.Function (on, (&))
import Data.List
import Data.List.Split
import Data.Maybe (mapMaybe)
import Data.Ord (comparing)
import System.Environment (getArgs)
import Control.Monad.IO.Class (liftIO)
import Control.Monad (mzero)
@ -24,32 +19,27 @@ import BenchShow
------------------------------------------------------------------------------
data BenchType
= Linear
| LinearAsync
| LinearRate
| NestedConcurrent
| FileIO
| Concurrent
| Parallel
| Adaptive
| Compare String
= Compare String
| Standard String
deriving Show
data Options = Options
{ genGraphs :: Bool
, benchType :: BenchType
, benchType :: Maybe BenchType
} deriving Show
defaultOptions = Options False Linear
defaultOptions :: Options
defaultOptions = Options False Nothing
setGenGraphs :: Monad m => Bool -> StateT (a, Options) m ()
setGenGraphs val = do
(args, opts) <- get
put (args, opts { genGraphs = val })
setBenchType :: Monad m => BenchType -> StateT (a, Options) m ()
setBenchType val = do
(args, opts) <- get
put (args, opts { benchType = val })
put (args, opts { benchType = Just val })
-- Like the shell "shift" to shift the command line arguments
shift :: StateT ([String], Options) (MaybeT IO) (Maybe String)
@ -63,14 +53,6 @@ parseBench :: StateT ([String], Options) (MaybeT IO) ()
parseBench = do
x <- shift
case x of
Just "linear" -> setBenchType Linear
Just "linear-async" -> setBenchType LinearAsync
Just "linear-rate" -> setBenchType LinearRate
Just "nested-concurrent" -> setBenchType NestedConcurrent
Just "fileio" -> setBenchType FileIO
Just "concurrent" -> setBenchType Concurrent
Just "parallel" -> setBenchType Parallel
Just "adaptive" -> setBenchType Adaptive
Just str | "_cmp" `isSuffixOf` str -> setBenchType (Compare str)
Just str -> setBenchType (Standard str)
Nothing -> do
@ -101,102 +83,10 @@ parseOptions = do
Just opt -> parseOpt opt >> parseLoop
Nothing -> return ()
ignoringErr :: IO () -> IO ()
ignoringErr a = catch a (\(ErrorCall err :: ErrorCall) ->
putStrLn $ "Failed with error:\n" <> err <> "\nSkipping.")
------------------------------------------------------------------------------
-- Linear composition charts
------------------------------------------------------------------------------
makeLinearGraphs :: Config -> String -> IO ()
makeLinearGraphs cfg@Config{..} inputFile = do
ignoringErr $ graph inputFile "generation" $ cfg
{ title = (++) <$> title <*> Just " generation"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/generation/"
}
ignoringErr $ graph inputFile "elimination" $ cfg
{ title = (++) <$> title <*> Just " Elimination"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/elimination/"
}
ignoringErr $ graph inputFile "transformation-zip" $ cfg
{ title = (++) <$> title <*> Just " Transformation & Zip"
, classifyBenchmark = \b ->
if "serially/transformation/" `isPrefixOf` b
|| "serially/zipping" `isPrefixOf` b
then Just ("Streamly", last $ splitOn "/" b)
else Nothing
}
ignoringErr $ graph inputFile "filtering" $ cfg
{ title = (++) <$> title <*> Just " Filtering"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/filtering/"
}
ignoringErr $ graph inputFile "transformationX4" $ cfg
{ title = (++) <$> title <*> Just " Transformation x 4"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/transformationX4/"
}
ignoringErr $ graph inputFile "filteringX4"
$ cfg
{ title = (++) <$> title <*> Just " Filtering x 4"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/filteringX4/"
}
ignoringErr $ graph inputFile "mixedX4"
$ cfg
{ title = (++) <$> title <*> Just " Mixed x 4"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/mixedX4/"
}
ignoringErr $ graph inputFile "iterated"
$ cfg
{ title = Just "iterate 10,000 times over 10 elems"
, classifyBenchmark =
fmap ("Streamly",) . stripPrefix "serially/iterated/"
}
------------------------------------------------------------------------------
-- Stream type based comparison charts
------------------------------------------------------------------------------
makeStreamComparisonGraphs :: String -> [String] -> Config -> String -> IO ()
makeStreamComparisonGraphs outputPrefix benchPrefixes cfg inputFile =
ignoringErr $ graph inputFile outputPrefix $ cfg
{ presentation = Groups Absolute
, classifyBenchmark = classifyNested
, selectGroups = \gs ->
groupBy ((==) `on` snd) gs
& fmap (\xs -> mapMaybe (\x -> (x,) <$> lookup x xs) benchPrefixes)
& concat
}
where
classifyNested b
| "serially/" `isPrefixOf` b =
("serially",) <$> stripPrefix "serially/" b
| "asyncly/" `isPrefixOf` b =
("asyncly",) <$> stripPrefix "asyncly/" b
| "wAsyncly/" `isPrefixOf` b =
("wAsyncly",) <$> stripPrefix "wAsyncly/" b
| "aheadly/" `isPrefixOf` b =
("aheadly",) <$> stripPrefix "aheadly/" b
| "parallely/" `isPrefixOf` b =
("parallely",) <$> stripPrefix "parallely/" b
| otherwise = Nothing
linearAsyncPrefixes = ["asyncly", "wAsyncly", "aheadly", "parallely"]
nestedBenchPrefixes = ["serially"] ++ linearAsyncPrefixes
------------------------------------------------------------------------------
-- Generic
------------------------------------------------------------------------------
@ -209,6 +99,7 @@ makeGraphs name cfg@Config{..} inputFile =
-- Arrays
------------------------------------------------------------------------------
showComparisons :: Options -> Config -> FilePath -> FilePath -> IO ()
showComparisons Options{..} cfg inp out =
let cfg1 = cfg { classifyBenchmark = classifyComparison }
in if genGraphs
@ -222,7 +113,7 @@ showComparisons Options{..} cfg inp out =
dropComponent = dropWhile (== '/') . dropWhile (/= '/')
classifyComparison b = Just $
classifyComparison b = Just
( takeWhile (/= '/') b
, dropComponent b
)
@ -242,6 +133,13 @@ selectBench f =
(sortOn snd)
$ f (ColumnIndex 1) (Just PercentDiff)
benchShow ::
Options
-> Config
-> (Config -> String -> IO ())
-> String
-> FilePath
-> IO ()
benchShow Options{..} cfg func inp out =
if genGraphs
then func cfg {outputDir = Just out} inp
@ -256,7 +154,7 @@ main = do
( flip elem ["time" , "mean"
, "maxrss", "cputime"
]
. map toLower
. fmap toLower
)
}
res <- parseOptions
@ -267,52 +165,16 @@ main = do
return ()
Just opts@Options{..} ->
case benchType of
Linear -> benchShow opts cfg
{ title = Just "Linear" }
makeLinearGraphs
"charts/linear/results.csv"
"charts/linear"
LinearRate -> benchShow opts cfg
{ title = Just "Linear Rate" }
(makeGraphs "linear-rate")
"charts/linear-rate/results.csv"
"charts/linear-rate"
LinearAsync -> benchShow opts cfg
{ title = Just "Linear Async" }
(makeStreamComparisonGraphs "linear-async" linearAsyncPrefixes)
"charts/linear-async/results.csv"
"charts/linear-async"
NestedConcurrent -> benchShow opts cfg
{ title = Just "Nested concurrent loops" }
(makeStreamComparisonGraphs "nested-concurrent" nestedBenchPrefixes)
"charts/nested-concurrent/results.csv"
"charts/nested-concurrent"
FileIO -> benchShow opts cfg
{ title = Just "File IO" }
(makeGraphs "fileIO")
"charts/fileio/results.csv"
"charts/fileio"
Concurrent -> benchShow opts cfg
{ title = Just "Concurrent Ops" }
(makeGraphs "Concurrent")
"charts/concurrent/results.csv"
"charts/concurrent"
Parallel -> benchShow opts cfg
{ title = Just "Parallel" }
(makeGraphs "parallel")
"charts/parallel/results.csv"
"charts/parallel"
Adaptive -> benchShow opts cfg
{ title = Just "Adaptive" }
(makeGraphs "adaptive")
"charts/adaptive/results.csv"
"charts/adaptive"
Compare str -> showComparisons opts cfg
{ title = Just $ str }
("charts/" ++ str ++ "/results.csv")
("charts/" ++ str)
Standard str -> benchShow opts cfg
{ title = Just str }
(makeGraphs str)
("charts/" ++ str ++ "/results.csv")
("charts/" ++ str)
Just (Compare str) ->
showComparisons opts cfg
{ title = Just str }
("charts/" ++ str ++ "/results.csv")
("charts/" ++ str)
Just (Standard str) ->
benchShow opts cfg
{ title = Just str }
(makeGraphs str)
("charts/" ++ str ++ "/results.csv")
("charts/" ++ str)
Nothing ->
error "Please specify a benchmark using --benchmark."

View File

@ -0,0 +1,122 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Prelude hiding (mapM)
import Streamly (aheadly, ahead, maxBuffer, maxThreads)
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
-- Name under which this module's benchmark groups are registered; used to
-- build the "o-1-space"/"o-n-space" group prefixes in main below.
moduleName :: String
moduleName = "Prelude.Ahead"
-------------------------------------------------------------------------------
-- Benchmark groups
-------------------------------------------------------------------------------
-- unfoldr and fromFoldable are always serial and therefore the same for
-- all stream types. They can be removed to reduce the number of benchmarks.
-- O(1)-space stream generation benchmarks for the aheadly stream type.
-- The maxThreads/maxBuffer variants constrain concurrency to measure the
-- scheduling overhead under those limits.
o_1_space_generation :: Int -> [Benchmark]
o_1_space_generation value =
[ bgroup "generation"
[ benchIOSrc aheadly "unfoldr" (sourceUnfoldr value)
, benchIOSrc aheadly "unfoldrM" (sourceUnfoldrM value)
-- , benchIOSrc aheadly "fromFoldable" (sourceFromFoldable value)
, benchIOSrc aheadly "fromFoldableM" (sourceFromFoldableM value)
, benchIOSrc aheadly "unfoldrM maxThreads 1"
(maxThreads 1 . sourceUnfoldrM value)
, benchIOSrc aheadly "unfoldrM maxBuffer 1 (x/10 ops)"
(maxBuffer 1 . sourceUnfoldrMN (value `div` 10))
]
]
-- O(1)-space map/fmap/mapM over an aheadly stream (one pass each).
o_1_space_mapping :: Int -> [Benchmark]
o_1_space_mapping value =
[ bgroup "mapping"
[ benchIOSink value "map" $ mapN aheadly 1
, benchIOSink value "fmap" $ fmapN aheadly 1
, benchIOSink value "mapM" $ mapM aheadly 1
]
]
-- Concatenating a Foldable container of streams via foldMap variants.
o_1_space_concatFoldable :: Int -> [Benchmark]
o_1_space_concatFoldable value =
[ bgroup
"concat-foldable"
[ benchIOSrc aheadly "foldMapWith" (sourceFoldMapWith value)
, benchIOSrc aheadly "foldMapWithM" (sourceFoldMapWithM value)
, benchIOSrc aheadly "foldMapM" (sourceFoldMapM value)
]
]
-- concatMapWith with different (outer, inner) stream-size splits of the
-- total work: few-wide, balanced (sqrt x by sqrt x), and skewed.
o_1_space_concatMap :: Int -> [Benchmark]
o_1_space_concatMap value =
-- Force value2 up front so the sqrt is not charged to the benchmark body.
value2 `seq`
[ bgroup "concat"
[ benchIO "concatMapWith (2,x/2)"
(concatStreamsWith ahead 2 (value `div` 2))
, benchIO "concatMapWith (sqrt x,sqrt x)"
(concatStreamsWith ahead value2 value2)
, benchIO "concatMapWith (sqrt x * 2,sqrt x / 2)"
(concatStreamsWith ahead (value2 * 2) (value2 `div` 2))
]
]
where
value2 = round $ sqrt (fromIntegral value :: Double)
-------------------------------------------------------------------------------
-- Monadic outer product
-------------------------------------------------------------------------------
-- Nested monadic loops (outer products) that run in O(1) space.
o_1_space_outerProduct :: Int -> [Benchmark]
o_1_space_outerProduct value =
[ bgroup "monad-outer-product"
[ benchIO "toNullAp" $ toNullAp value aheadly
, benchIO "toNull" $ toNullM value aheadly
, benchIO "toNull3" $ toNullM3 value aheadly
, benchIO "filterAllOut" $ filterAllOutM value aheadly
, benchIO "filterAllIn" $ filterAllInM value aheadly
, benchIO "filterSome" $ filterSome value aheadly
, benchIO "breakAfterSome" $ breakAfterSome value aheadly
]
]
-- Outer-product benchmarks that accumulate results and therefore need
-- O(n) space; grouped separately so they can get different RTS limits.
o_n_space_outerProduct :: Int -> [Benchmark]
o_n_space_outerProduct value =
[ bgroup "monad-outer-product"
[ benchIO "toList" $ toListM value aheadly
, benchIO "toListSome" $ toListSome value aheadly
]
]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
-- Parse the stream size and gauge config from the CLI, then run all groups
-- under space-complexity prefixes derived from moduleName.
main :: IO ()
main = do
(value, cfg, benches) <- parseCLIOpts defaultStreamSize
value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
where
allBenchmarks value =
[ bgroup (o_1_space_prefix moduleName) $ concat
[ o_1_space_generation value
, o_1_space_mapping value
, o_1_space_concatFoldable value
, o_1_space_concatMap value
, o_1_space_outerProduct value
]
, bgroup (o_n_space_prefix moduleName) (o_n_space_outerProduct value)
]

View File

@ -0,0 +1,127 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Prelude hiding (mapM)
import Streamly (asyncly, async, maxBuffer, maxThreads)
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
-- Name under which this module's benchmark groups are registered; used to
-- build the "o-1-space"/"o-n-space" group prefixes in main below.
moduleName :: String
moduleName = "Prelude.Async"
-------------------------------------------------------------------------------
-- Generation
-------------------------------------------------------------------------------
-- O(1)-space stream generation benchmarks for the asyncly stream type.
-- The maxThreads/maxBuffer variants constrain concurrency to measure the
-- scheduling overhead under those limits.
o_1_space_generation :: Int -> [Benchmark]
o_1_space_generation value =
[ bgroup "generation"
[ benchIOSrc asyncly "unfoldr" (sourceUnfoldr value)
, benchIOSrc asyncly "unfoldrM" (sourceUnfoldrM value)
, benchIOSrc asyncly "fromFoldable" (sourceFromFoldable value)
, benchIOSrc asyncly "fromFoldableM" (sourceFromFoldableM value)
, benchIOSrc asyncly "unfoldrM maxThreads 1"
(maxThreads 1 . sourceUnfoldrM value)
, benchIOSrc asyncly "unfoldrM maxBuffer 1 (x/10 ops)"
(maxBuffer 1 . sourceUnfoldrMN (value `div` 10))
]
]
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- O(1)-space map/fmap/mapM over an asyncly stream (one pass each).
o_1_space_mapping :: Int -> [Benchmark]
o_1_space_mapping value =
[ bgroup "mapping"
[ benchIOSink value "map" $ mapN asyncly 1
, benchIOSink value "fmap" $ fmapN asyncly 1
, benchIOSink value "mapM" $ mapM asyncly 1
]
]
-------------------------------------------------------------------------------
-- Concat
-------------------------------------------------------------------------------
-- Concatenating a Foldable container of streams via foldMap variants.
o_1_space_concatFoldable :: Int -> [Benchmark]
o_1_space_concatFoldable value =
[ bgroup "concat-foldable"
[ benchIOSrc asyncly "foldMapWith" (sourceFoldMapWith value)
, benchIOSrc asyncly "foldMapWithM" (sourceFoldMapWithM value)
, benchIOSrc asyncly "foldMapM" (sourceFoldMapM value)
]
]
-- concatMapWith with different (outer, inner) stream-size splits of the
-- total work: few-wide, balanced (sqrt x by sqrt x), and skewed.
o_1_space_concatMap :: Int -> [Benchmark]
o_1_space_concatMap value =
-- Force value2 up front so the sqrt is not charged to the benchmark body.
value2 `seq`
[ bgroup "concatMap"
[ benchIO "concatMapWith (2,x/2)"
(concatStreamsWith async 2 (value `div` 2))
, benchIO "concatMapWith (sqrt x,sqrt x)"
(concatStreamsWith async value2 value2)
, benchIO "concatMapWith (sqrt x * 2,sqrt x / 2)"
(concatStreamsWith async (value2 * 2) (value2 `div` 2))
]
]
where
value2 = round $ sqrt (fromIntegral value :: Double)
-------------------------------------------------------------------------------
-- Monadic outer product
-------------------------------------------------------------------------------
-- Nested monadic loops (outer products) that run in O(1) space.
o_1_space_outerProduct :: Int -> [Benchmark]
o_1_space_outerProduct value =
[ bgroup "monad-outer-product"
[ benchIO "toNullAp" $ toNullAp value asyncly
, benchIO "toNull" $ toNullM value asyncly
, benchIO "toNull3" $ toNullM3 value asyncly
, benchIO "filterAllOut" $ filterAllOutM value asyncly
, benchIO "filterAllIn" $ filterAllInM value asyncly
, benchIO "filterSome" $ filterSome value asyncly
, benchIO "breakAfterSome" $ breakAfterSome value asyncly
]
]
-- Outer-product benchmarks that accumulate results and therefore need
-- O(n) space; grouped separately so they can get different RTS limits.
o_n_space_outerProduct :: Int -> [Benchmark]
o_n_space_outerProduct value =
[ bgroup "monad-outer-product"
[ benchIO "toList" $ toListM value asyncly
, benchIO "toListSome" $ toListSome value asyncly
]
]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
-- Parse the stream size and gauge config from the CLI, then run all groups
-- under space-complexity prefixes derived from moduleName.
main :: IO ()
main = do
(value, cfg, benches) <- parseCLIOpts defaultStreamSize
value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
where
allBenchmarks value =
[ bgroup (o_1_space_prefix moduleName) $ concat
[ o_1_space_generation value
, o_1_space_mapping value
, o_1_space_concatFoldable value
, o_1_space_concatMap value
, o_1_space_outerProduct value
]
, bgroup (o_n_space_prefix moduleName) (o_n_space_outerProduct value)
]

View File

@ -1,7 +1,7 @@
{-# LANGUAGE RankNTypes #-}
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
@ -30,7 +30,7 @@ append buflen tcount d t =
$ t
$ maxBuffer buflen
$ maxThreads (-1)
$ S.fromFoldableM $ map work [1..tcount]
$ S.fromFoldableM $ fmap work [1..tcount]
-- | Run @threads@ concurrently, each producing streams of @elems@ elements
-- with a delay of @d@ microseconds between successive elements, and merge
@ -46,8 +46,7 @@ concated
-> (forall a. SerialT IO a -> SerialT IO a -> SerialT IO a)
-> IO ()
concated buflen threads d elems t =
let work = \i -> S.replicateM i
((when (d /= 0) (threadDelay d)) >> return i)
let work = \i -> S.replicateM i (when (d /= 0) (threadDelay d) >> return i)
in S.drain
$ adapt
$ maxThreads (-1)
@ -74,7 +73,7 @@ concatGroup buflen threads delay n =
]
main :: IO ()
main = do
main =
defaultMainWith (defaultConfig
{ timeLimit = Just 0
, minSamples = Just 1

View File

@ -1,46 +0,0 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
main :: IO ()
main = do
(value, cfg, benches) <- parseCLIOpts defaultStreamSize
value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
where
allBenchmarks value =
concat
[ async value
, wAsync value
, ahead value
, zipAsync value
]
async value =
concat
[ o_1_space_async_generation value
, o_1_space_async_concatFoldable value
, o_1_space_async_concatMap value
, o_1_space_async_transformation value
]
wAsync value =
concat
[ o_1_space_wAsync_generation value
, o_1_space_wAsync_concatFoldable value
, o_1_space_wAsync_concatMap value
, o_1_space_wAsync_transformation value
]
ahead value =
concat
[ o_1_space_ahead_generation value
, o_1_space_ahead_concatFoldable value
, o_1_space_ahead_concatMap value
, o_1_space_ahead_transformation value
]
zipAsync = o_1_space_async_zip

View File

@ -1,24 +0,0 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
main :: IO ()
main = do
(value, cfg, benches) <- parseCLIOpts defaultStreamSize
value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
where
allBenchmarks value =
concat
[ o_1_space_async_avgRate value
, o_1_space_ahead_avgRate value
]

View File

@ -1,84 +0,0 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Control.DeepSeq (NFData)
import Control.Monad (when)
import Data.Functor.Identity (Identity, runIdentity)
import System.Random (randomRIO)
import Streamly.Benchmark.Common (parseCLIOpts)
import Streamly
import Gauge
import qualified Streamly.Benchmark.Prelude.NestedOps as Ops
benchIO :: (NFData b) => String -> (Int -> IO b) -> Benchmark
benchIO name f = bench name $ nfIO $ randomRIO (1,1) >>= f
_benchId :: (NFData b) => String -> (Int -> Identity b) -> Benchmark
_benchId name f = bench name $ nf (\g -> runIdentity (g 1)) f
defaultStreamSize :: Int
defaultStreamSize = 100000
main :: IO ()
main = do
-- XXX Fix indentation
(linearCount, cfg, benches) <- parseCLIOpts defaultStreamSize
let finiteCount = min linearCount defaultStreamSize
when (finiteCount /= linearCount) $
putStrLn $ "Limiting stream size to "
++ show defaultStreamSize
++ " for finite stream operations only"
finiteCount `seq` linearCount `seq` runMode (mode cfg) cfg benches
[
bgroup "aheadly"
[ benchIO "toNullAp" $ Ops.toNullAp linearCount aheadly
, benchIO "toNull" $ Ops.toNull linearCount aheadly
, benchIO "toNull3" $ Ops.toNull3 linearCount aheadly
-- , benchIO "toList" $ Ops.toList linearCount aheadly
-- XXX consumes too much stack space
, benchIO "toListSome" $ Ops.toListSome linearCount aheadly
, benchIO "filterAllOut" $ Ops.filterAllOut linearCount aheadly
, benchIO "filterAllIn" $ Ops.filterAllIn linearCount aheadly
, benchIO "filterSome" $ Ops.filterSome linearCount aheadly
, benchIO "breakAfterSome" $ Ops.breakAfterSome linearCount aheadly
]
, bgroup "asyncly"
[ benchIO "toNullAp" $ Ops.toNullAp linearCount asyncly
, benchIO "toNull" $ Ops.toNull linearCount asyncly
, benchIO "toNull3" $ Ops.toNull3 linearCount asyncly
-- , benchIO "toList" $ Ops.toList linearCount asyncly
, benchIO "toListSome" $ Ops.toListSome linearCount asyncly
, benchIO "filterAllOut" $ Ops.filterAllOut linearCount asyncly
, benchIO "filterAllIn" $ Ops.filterAllIn linearCount asyncly
, benchIO "filterSome" $ Ops.filterSome linearCount asyncly
, benchIO "breakAfterSome" $ Ops.breakAfterSome linearCount asyncly
]
, bgroup "zipAsyncly"
[ benchIO "toNullAp" $ Ops.toNullAp linearCount zipAsyncly
]
-- Operations that are not scalable to infinite streams
, bgroup "finite"
[ bgroup "wAsyncly"
[ benchIO "toNullAp" $ Ops.toNullAp finiteCount wAsyncly
, benchIO "toNull" $ Ops.toNull finiteCount wAsyncly
, benchIO "toNull3" $ Ops.toNull3 finiteCount wAsyncly
-- , benchIO "toList" $ Ops.toList finiteCount wAsyncly
, benchIO "toListSome" $ Ops.toListSome finiteCount wAsyncly
, benchIO "filterAllOut" $ Ops.filterAllOut finiteCount wAsyncly
-- , benchIO "filterAllIn" $ Ops.filterAllIn finiteCount wAsyncly
, benchIO "filterSome" $ Ops.filterSome finiteCount wAsyncly
, benchIO "breakAfterSome" $ Ops.breakAfterSome finiteCount wAsyncly
]
]
]

View File

@ -1,174 +0,0 @@
-- |
-- Module : BenchmarkOps
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : MIT
-- Maintainer : streamly@composewell.com
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Streamly.Benchmark.Prelude.NestedOps where
import Control.Exception (try)
import GHC.Exception (ErrorCall)
import qualified Streamly as S hiding (runStream)
import qualified Streamly.Prelude as S
-------------------------------------------------------------------------------
-- Stream generation and elimination
-------------------------------------------------------------------------------
type Stream m a = S.SerialT m a
-- | Default monadic stream source for the nested benchmark ops.
{-# INLINE source #-}
source :: (S.MonadAsync m, S.IsStream t) => Int -> Int -> t m Int
source = sourceUnfoldrM
-- Change this to "sourceUnfoldrM value n" for consistency
{-# INLINE sourceUnfoldrM #-}
-- | Monadic unfold yielding @n, n+1, ... n+value@ as a serial stream.
-- Note the argument order is (start, count), unlike the common-module
-- sourceUnfoldrM which takes (count, start) — see the comment above.
sourceUnfoldrM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
sourceUnfoldrM n value = S.serially $ S.unfoldrM step n
    where
    step cnt =
        if cnt > n + value
        then return Nothing
        else return (Just (cnt, cnt + 1))
{-# INLINE sourceUnfoldr #-}
-- | Pure unfold yielding @start, start+1, ... start+n@.
sourceUnfoldr :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceUnfoldr start n = S.unfoldr gen start
    where
    limit = start + n
    gen cnt
        | cnt > limit = Nothing
        | otherwise = Just (cnt, cnt + 1)
-- | Drain a serial stream, discarding all elements.
{-# INLINE runStream #-}
runStream :: Monad m => Stream m a -> m ()
runStream = S.drain
-- | Materialize a serial stream as a list (O(n) space).
{-# INLINE runToList #-}
runToList :: Monad m => Stream m a -> m [a]
runToList = S.toList
-------------------------------------------------------------------------------
-- Benchmark ops
-------------------------------------------------------------------------------
-- | Applicative product of two sqrt-sized streams, drained. The two
-- nested streams are each sqrt of the linear count so the total work
-- stays proportional to the linear count.
{-# INLINE toNullAp #-}
toNullAp
    :: (S.IsStream t, S.MonadAsync m, Applicative (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNullAp linearCount t start = runStream . t $
    (+) <$> source start nestedCount2 <*> source start nestedCount2
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Monadic (two-level) outer product, drained.
{-# INLINE toNull #-}
toNull
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNull linearCount t start = runStream . t $ do
    x <- source start nestedCount2
    y <- source start nestedCount2
    return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Three-level monadic outer product; each stream is cube root of the
-- linear count.
{-# INLINE toNull3 #-}
toNull3
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNull3 linearCount t start = runStream . t $ do
    x <- source start nestedCount3
    y <- source start nestedCount3
    z <- source start nestedCount3
    return $ x + y + z
    where
    nestedCount3 = round (fromIntegral linearCount**(1/3::Double))
-- | Two-level outer product collected into a list (O(n) space).
{-# INLINE toList #-}
toList
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m [Int]
toList linearCount t start = runToList . t $ do
    x <- source start nestedCount2
    y <- source start nestedCount2
    return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- Taking a specified number of elements is very expensive in logict so we have
-- a test to measure the same.
-- | Outer product limited to the first 10000 results via 'S.take',
-- collected into a list.
{-# INLINE toListSome #-}
toListSome
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m [Int]
toListSome linearCount t start =
    runToList . t $ S.take 10000 $ do
        x <- source start nestedCount2
        y <- source start nestedCount2
        return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Outer product where the filter condition (s < 0) rejects every
-- element, since both sources yield non-negative sums here.
{-# INLINE filterAllOut #-}
filterAllOut
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterAllOut linearCount t start = runStream . t $ do
    x <- source start nestedCount2
    y <- source start nestedCount2
    let s = x + y
    if s < 0
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Outer product where the filter condition (s > 0) keeps every
-- element.
{-# INLINE filterAllIn #-}
filterAllIn
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterAllIn linearCount t start = runStream . t $ do
    x <- source start nestedCount2
    y <- source start nestedCount2
    let s = x + y
    if s > 0
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Outer product keeping only sums above a fixed threshold, i.e. only
-- some of the elements pass.
{-# INLINE filterSome #-}
filterSome
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterSome linearCount t start = runStream . t $ do
    x <- source start nestedCount2
    y <- source start nestedCount2
    let s = x + y
    if s > 1100000
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Outer product that aborts via 'error' once the sum crosses the
-- threshold; the resulting ErrorCall is caught with 'try' so the
-- benchmark measures the cost of breaking out early.
{-# INLINE breakAfterSome #-}
breakAfterSome
    :: (S.IsStream t, Monad (t IO))
    => Int -> (t IO Int -> S.SerialT IO Int) -> Int -> IO ()
breakAfterSome linearCount t start = do
    (_ :: Either ErrorCall ()) <- try $ runStream . t $ do
        x <- source start nestedCount2
        y <- source start nestedCount2
        let s = x + y
        if s > 1100000
        then error "break"
        else return s
    return ()
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))

View File

@ -1,15 +1,182 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
{-# LANGUAGE FlexibleContexts #-}
import Prelude hiding (mapM)
import Streamly (SerialT, parallely, parallel, serially, maxBuffer, maxThreads)
import qualified Streamly as S
import qualified Streamly.Prelude as S
import qualified Streamly.Internal.Data.Fold as FL
import qualified Streamly.Internal.Data.Stream.Parallel as Par
import qualified Streamly.Internal.Prelude as Internal
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.Parallel"
-------------------------------------------------------------------------------
-- Merging
-------------------------------------------------------------------------------
-- | Concurrently merge two half-sized streams with a monadic comparison.
{-# INLINE mergeAsyncByM #-}
mergeAsyncByM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
mergeAsyncByM count n =
    S.mergeAsyncByM
        (\a b -> return (a `compare` b))
        (sourceUnfoldrMN count n)
        (sourceUnfoldrMN count (n + 1))
-- | Concurrently merge two half-sized streams with a pure comparison.
{-# INLINE mergeAsyncBy #-}
mergeAsyncBy :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
mergeAsyncBy count n =
    S.mergeAsyncBy
        compare
        (sourceUnfoldrMN count n)
        (sourceUnfoldrMN count (n + 1))
-------------------------------------------------------------------------------
-- Application/fold
-------------------------------------------------------------------------------
-- | Map over a stream in a parallel pipeline stage ('S.|$').
{-# INLINE parAppMap #-}
parAppMap :: S.MonadAsync m => SerialT m Int -> m ()
parAppMap src = S.drain $ S.map (+1) S.|$ src
-- | Fold (sum) a stream in a parallel pipeline stage ('S.|$.').
{-# INLINE parAppSum #-}
parAppSum :: S.MonadAsync m => SerialT m Int -> m ()
parAppSum src = (S.sum S.|$. src) >>= \x -> seq x (return ())
-------------------------------------------------------------------------------
-- Tapping
-------------------------------------------------------------------------------
-- | Tap the stream concurrently with a stream-based sum (Parallel module).
{-# INLINE tapAsyncS #-}
tapAsyncS :: S.MonadAsync m => Int -> SerialT m Int -> m ()
tapAsyncS n = composeN n $ Par.tapAsync S.sum
-- | Tap the stream concurrently with a fold-based sum (Internal.tapAsync).
{-# INLINE tapAsync #-}
tapAsync :: S.MonadAsync m => Int -> SerialT m Int -> m ()
tapAsync n = composeN n $ Internal.tapAsync FL.sum
-- | O(1)-space benchmarks: merging, parallel application and tapping.
o_1_space_merge_app_tap :: Int -> [Benchmark]
o_1_space_merge_app_tap value =
    [ bgroup "merge-app-tap"
        [ benchIOSrc serially "mergeAsyncBy (2,x/2)"
            (mergeAsyncBy (value `div` 2))
        , benchIOSrc serially "mergeAsyncByM (2,x/2)"
            (mergeAsyncByM (value `div` 2))
        -- Parallel stages in a pipeline
        , benchIOSink value "parAppMap" parAppMap
        , benchIOSink value "parAppSum" parAppSum
        , benchIOSink value "tapAsync" (tapAsync 1)
        , benchIOSink value "tapAsyncS" (tapAsyncS 1)
        ]
    ]
-------------------------------------------------------------------------------
-- Generation
-------------------------------------------------------------------------------
-- | Stream generation with the parallel style; parallel evaluation
-- buffers the whole stream, hence the O(n) heap classification.
o_n_heap_generation :: Int -> [Benchmark]
o_n_heap_generation value =
    [ bgroup
          "generation"
          [ benchIOSrc parallely "unfoldr" (sourceUnfoldr value)
          , benchIOSrc parallely "unfoldrM" (sourceUnfoldrM value)
          , benchIOSrc parallely "fromFoldable" (sourceFromFoldable value)
          , benchIOSrc parallely "fromFoldableM" (sourceFromFoldableM value)
          , benchIOSrc parallely "unfoldrM maxThreads 1"
                (maxThreads 1 . sourceUnfoldrM value)
          , benchIOSrc parallely "unfoldrM maxBuffer 1 (x/10 ops)"
                (maxBuffer 1 . sourceUnfoldrMN (value `div` 10))
          ]
    ]
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- | map/fmap/mapM over a parallel stream.
o_n_heap_mapping :: Int -> [Benchmark]
o_n_heap_mapping value =
    [ bgroup "mapping"
        [ benchIOSink value "map" $ mapN parallely 1
        , benchIOSink value "fmap" $ fmapN parallely 1
        , benchIOSink value "mapM" $ mapM parallely 1
        ]
    ]
-------------------------------------------------------------------------------
-- Concat
-------------------------------------------------------------------------------
-- | Concatenating a Foldable container of parallel streams.
o_n_heap_concatFoldable :: Int -> [Benchmark]
o_n_heap_concatFoldable value =
    [ bgroup
          "concat-foldable"
          [ benchIOSrc parallely "foldMapWith" (sourceFoldMapWith value)
          , benchIOSrc parallely "foldMapWithM" (sourceFoldMapWithM value)
          , benchIOSrc parallely "foldMapM" (sourceFoldMapM value)
          ]
    ]
-- | concatMapWith over parallel streams at three (outer, inner) size
-- splits of the same total element count.
o_n_heap_concat :: Int -> [Benchmark]
o_n_heap_concat value =
    value2 `seq`
        [ bgroup "concat"
              [ benchIO "concatMapWith (2,x/2)"
                    (concatStreamsWith parallel 2 (value `div` 2))
              , benchIO "concatMapWith (sqrt x,sqrt x)"
                    (concatStreamsWith parallel value2 value2)
              , benchIO "concatMapWith (sqrt x * 2,sqrt x / 2)"
                    (concatStreamsWith parallel (value2 * 2) (value2 `div` 2))
              ]
        ]
    where
    -- forced with `seq` above so the sqrt is not recomputed per iteration
    value2 = round $ sqrt (fromIntegral value :: Double)
-------------------------------------------------------------------------------
-- Monadic outer product
-------------------------------------------------------------------------------
-- | Monadic outer-product benchmarks for the parallel style (O(n) heap).
o_n_heap_outerProduct :: Int -> [Benchmark]
o_n_heap_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toNullAp" $ toNullAp value parallely
        , benchIO "toNull" $ toNullM value parallely
        , benchIO "toNull3" $ toNullM3 value parallely
        , benchIO "filterAllOut" $ filterAllOutM value parallely
        , benchIO "filterAllIn" $ filterAllInM value parallely
        , benchIO "filterSome" $ filterSome value parallely
        , benchIO "breakAfterSome" $ breakAfterSome value parallely
        ]
    ]
-- | Outer-product benchmarks that build a full list (O(n) space).
o_n_space_outerProduct :: Int -> [Benchmark]
o_n_space_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toList" $ toListM value parallely
        -- XXX disabled due to a bug for now
        -- , benchIO "toListSome" $ toListSome value parallely
        ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
main :: IO ()
main = do
(value, cfg, benches) <- parseCLIOpts defaultStreamSize
@ -18,18 +185,13 @@ main = do
where
allBenchmarks value =
concat
[ linear value
, nested value
[ bgroup (o_1_space_prefix moduleName) (o_1_space_merge_app_tap value)
, bgroup (o_n_heap_prefix moduleName) $ concat
[ o_n_heap_generation value
, o_n_heap_mapping value
, o_n_heap_concatFoldable value
, o_n_heap_concat value
, o_n_heap_outerProduct value
]
linear value =
concat
[ o_n_space_parallel_generation value
, o_n_space_parallel_concatFoldable value
-- , o_n_space_parallel_outerProductStreams2
, o_n_space_parallel_concatMap value
, o_n_space_parallel_transformation value
]
nested = o_n_space_parallel_outerProductStreams
, bgroup (o_n_space_prefix moduleName) (o_n_space_outerProduct value)
]

View File

@ -0,0 +1,73 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Streamly (asyncly, aheadly, maxThreads)
import qualified Streamly as S
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.Rate"
-------------------------------------------------------------------------------
-- Average Rate
-------------------------------------------------------------------------------
-- XXX arbitrarily large rate should be the same as rate Nothing
-- | Rate-limited generation with asyncly at several average rates,
-- from unlimited (rate Nothing) up to 20M elements/sec.
o_1_space_async_avgRate :: Int -> [Benchmark]
o_1_space_async_avgRate value =
    [ bgroup "asyncly"
        [ bgroup "avgRate"
          -- benchIO "unfoldr" $ toNull asyncly
          -- benchIOSrc asyncly "unfoldrM" (sourceUnfoldrM value)
          [ benchIOSrc asyncly "unfoldrM/Nothing"
                (S.rate Nothing . sourceUnfoldrM value)
          , benchIOSrc asyncly "unfoldrM/1,000,000"
                (S.avgRate 1000000 . sourceUnfoldrM value)
          , benchIOSrc asyncly "unfoldrM/3,000,000"
                (S.avgRate 3000000 . sourceUnfoldrM value)
          , benchIOSrc asyncly "unfoldrM/10,000,000/maxThreads1"
                (maxThreads 1 . S.avgRate 10000000 . sourceUnfoldrM value)
          , benchIOSrc asyncly "unfoldrM/10,000,000"
                (S.avgRate 10000000 . sourceUnfoldrM value)
          , benchIOSrc asyncly "unfoldrM/20,000,000"
                (S.avgRate 20000000 . sourceUnfoldrM value)
          ]
        ]
    ]
-- | Rate-limited generation with aheadly (single rate point).
o_1_space_ahead_avgRate :: Int -> [Benchmark]
o_1_space_ahead_avgRate value =
    [ bgroup "aheadly"
        [ bgroup "avgRate"
          [ benchIOSrc aheadly "unfoldrM/1,000,000"
                (S.avgRate 1000000 . sourceUnfoldrM value)
          ]
        ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
main :: IO ()
main = do
    -- stream size comes from --stream-size (or the STREAM_SIZE env var)
    (value, cfg, benches) <- parseCLIOpts defaultStreamSize
    value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
    where
    allBenchmarks value =
        [ bgroup (o_1_space_prefix moduleName) $ concat
            [ o_1_space_async_avgRate value
            , o_1_space_ahead_avgRate value
            ]
        ]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,135 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
import Prelude hiding (mapM)
import Streamly (wAsyncly, wAsync, maxBuffer, maxThreads)
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.WAsync"
-------------------------------------------------------------------------------
-- Generation
-------------------------------------------------------------------------------
-- | O(1)-space stream generation benchmarks for the wAsync style.
o_1_space_generation :: Int -> [Benchmark]
o_1_space_generation value =
    [ bgroup "generation"
        [ benchIOSrc wAsyncly "unfoldr" (sourceUnfoldr value)
        , benchIOSrc wAsyncly "unfoldrM" (sourceUnfoldrM value)
        , benchIOSrc wAsyncly "fromFoldable" (sourceFromFoldable value)
        , benchIOSrc wAsyncly "fromFoldableM" (sourceFromFoldableM value)
        , benchIOSrc wAsyncly "unfoldrM maxThreads 1"
              (maxThreads 1 . sourceUnfoldrM value)
        , benchIOSrc wAsyncly "unfoldrM maxBuffer 1 (x/10 ops)"
              (maxBuffer 1 . sourceUnfoldrMN (value `div` 10))
        ]
    ]
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- | map/fmap/mapM over a wAsync stream.
o_1_space_mapping :: Int -> [Benchmark]
o_1_space_mapping value =
    [ bgroup "mapping"
        [ benchIOSink value "map" $ mapN wAsyncly 1
        , benchIOSink value "fmap" $ fmapN wAsyncly 1
        , benchIOSink value "mapM" $ mapM wAsyncly 1
        ]
    ]
-------------------------------------------------------------------------------
-- Concat
-------------------------------------------------------------------------------
-- | Concatenating a Foldable container of wAsync streams.
o_1_space_concatFoldable :: Int -> [Benchmark]
o_1_space_concatFoldable value =
    [ bgroup "concat-foldable"
        [ benchIOSrc wAsyncly "foldMapWith" (sourceFoldMapWith value)
        , benchIOSrc wAsyncly "foldMapWithM" (sourceFoldMapWithM value)
        , benchIOSrc wAsyncly "foldMapM" (sourceFoldMapM value)
        ]
    ]
-- When we merge streams using wAsync the size of the queue increases
-- slowly because of the binary composition adding just one more item
-- to the work queue only after every scheduling pass through the
-- work queue.
--
-- We should see the memory consumption increasing slowly if these
-- benchmarks are left to run on infinite number of streams of infinite
-- sizes.
o_1_space_concatMap :: Int -> [Benchmark]
o_1_space_concatMap value =
    value2 `seq`
        [ bgroup "concat"
              [ benchIO "concatMapWith (2,x/2)"
                    (concatStreamsWith wAsync 2 (value `div` 2))
              , benchIO "concatMapWith (sqrt x,sqrt x)"
                    (concatStreamsWith wAsync value2 value2)
              , benchIO "concatMapWith (sqrt x * 2,sqrt x / 2)"
                    (concatStreamsWith wAsync (value2 * 2) (value2 `div` 2))
              ]
        ]
    where
    -- forced with `seq` above so the sqrt is not recomputed per iteration
    value2 = round $ sqrt (fromIntegral value :: Double)
-------------------------------------------------------------------------------
-- Monadic outer product
-------------------------------------------------------------------------------
-- | Monadic outer-product benchmarks for wAsync (O(n) heap).
o_n_heap_outerProduct :: Int -> [Benchmark]
o_n_heap_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toNullAp" $ toNullAp value wAsyncly
        , benchIO "toNull" $ toNullM value wAsyncly
        , benchIO "toNull3" $ toNullM3 value wAsyncly
        , benchIO "filterAllOut" $ filterAllOutM value wAsyncly
        , benchIO "filterAllIn" $ filterAllInM value wAsyncly
        , benchIO "filterSome" $ filterSome value wAsyncly
        , benchIO "breakAfterSome" $ breakAfterSome value wAsyncly
        ]
    ]
-- | Outer-product benchmarks that build a full list (O(n) space).
o_n_space_outerProduct :: Int -> [Benchmark]
o_n_space_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toList" $ toListM value wAsyncly
        , benchIO "toListSome" $ toListSome value wAsyncly
        ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
main :: IO ()
main = do
    -- stream size comes from --stream-size (or the STREAM_SIZE env var)
    (value, cfg, benches) <- parseCLIOpts defaultStreamSize
    value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
    where
    allBenchmarks value =
        [ bgroup (o_1_space_prefix moduleName) $ concat
            [ o_1_space_generation value
            , o_1_space_mapping value
            , o_1_space_concatFoldable value
            , o_1_space_concatMap value
            ]
        , bgroup (o_n_heap_prefix moduleName) (o_n_heap_outerProduct value)
        , bgroup (o_n_space_prefix moduleName) (o_n_space_outerProduct value)
        ]

View File

@ -0,0 +1,212 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-}
#ifdef __HADDOCK_VERSION__
#undef INSPECTION
#endif
#ifdef INSPECTION
{-# LANGUAGE TemplateHaskell #-}
{-# OPTIONS_GHC -fplugin Test.Inspection.Plugin #-}
#endif
import Streamly
import qualified Streamly.Prelude as S
import qualified Streamly.Internal.Prelude as Internal
import qualified Streamly.Internal.Data.Unfold as UF
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.WSerial"
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- | fmap over a wSerial stream (O(1) space).
o_1_space_mapping :: Int -> [Benchmark]
o_1_space_mapping value =
    [ bgroup "mapping"
        [ benchIOSink value "fmap" $ fmapN wSerially 1 ]
    ]
-------------------------------------------------------------------------------
-- Interleaving
-------------------------------------------------------------------------------
{-# INLINE wSerial2 #-}
-- | Drain the binary 'wSerial' interleaving of two monadic unfoldr
-- streams, each half of the requested total size.
wSerial2 :: Int -> Int -> IO ()
wSerial2 value n = S.drain (wSerial left right)
    where
    half = value `div` 2
    left = sourceUnfoldrMN half n
    right = sourceUnfoldrMN half (n + 1)
-- | Drain the 'Internal.interleave' of two half-sized streams.
{-# INLINE interleave2 #-}
interleave2 :: Int -> Int -> IO ()
interleave2 value n =
    S.drain $
    Internal.interleave
        (sourceUnfoldrMN (value `div` 2) n)
        (sourceUnfoldrMN (value `div` 2) (n + 1))
#ifdef INSPECTION
-- NOTE(review): ''D.InterleaveState refers to a qualified module D that
-- is not in the visible import list here — confirm the import exists.
inspect $ hasNoTypeClasses 'interleave2
inspect $ 'interleave2 `hasNoType` ''SPEC
inspect $ 'interleave2 `hasNoType` ''D.InterleaveState
#endif
-- | Drain the 'Internal.roundrobin' of two half-sized streams.
{-# INLINE roundRobin2 #-}
roundRobin2 :: Int -> Int -> IO ()
roundRobin2 value n =
    S.drain $
    Internal.roundrobin
        (sourceUnfoldrMN (value `div` 2) n)
        (sourceUnfoldrMN (value `div` 2) (n + 1))
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'roundRobin2
inspect $ 'roundRobin2 `hasNoType` ''SPEC
inspect $ 'roundRobin2 `hasNoType` ''D.InterleaveState
#endif
-- | O(1)-space binary joining benchmarks: wSerial, interleave,
-- roundrobin.
o_1_space_joining :: Int -> [Benchmark]
o_1_space_joining value =
    [ bgroup "joining"
        [ benchIOSrc1 "wSerial (2,x/2)" (wSerial2 value)
        , benchIOSrc1 "interleave (2,x/2)" (interleave2 value)
        , benchIOSrc1 "roundRobin (2,x/2)" (roundRobin2 value)
        ]
    ]
-------------------------------------------------------------------------------
-- Concat
-------------------------------------------------------------------------------
-- | concatStreamsWith specialized to 'wSerial' composition.
{-# INLINE concatMapWithWSerial #-}
concatMapWithWSerial :: Int -> Int -> Int -> IO ()
concatMapWithWSerial = concatStreamsWith wSerial
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'concatMapWithWSerial
-- Fixed: previously referenced the undefined name 'concatMapWithSerial
-- (missing W), which breaks the build when INSPECTION is defined.
inspect $ 'concatMapWithWSerial `hasNoType` ''SPEC
#endif
-- | O(1)-space concatMapWith benchmarks at two (outer, inner) splits.
o_1_space_concat :: Int -> [Benchmark]
o_1_space_concat value =
    [ bgroup "concat"
        [ benchIOSrc1
              "concatMapWithWSerial (2,x/2)"
              (concatMapWithWSerial 2 (value `div` 2))
        , benchIOSrc1
              "concatMapWithWSerial (x/2,2)"
              (concatMapWithWSerial (value `div` 2) 2)
        ]
    ]
-- | Interleaved concatUnfold over x/4 streams of 4 replicated elements.
{-# INLINE concatUnfoldInterleaveRepl4xN #-}
concatUnfoldInterleaveRepl4xN :: Int -> Int -> IO ()
concatUnfoldInterleaveRepl4xN value n =
    S.drain $ Internal.concatUnfoldInterleave
        (UF.replicateM 4)
        (sourceUnfoldrMN (value `div` 4) n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'concatUnfoldInterleaveRepl4xN
-- inspect $ 'concatUnfoldInterleaveRepl4xN `hasNoType` ''SPEC
-- inspect $ 'concatUnfoldInterleaveRepl4xN `hasNoType`
--     ''D.ConcatUnfoldInterleaveState
#endif
-- | Round-robin concatUnfold over x/4 streams of 4 replicated elements.
{-# INLINE concatUnfoldRoundrobinRepl4xN #-}
concatUnfoldRoundrobinRepl4xN :: Int -> Int -> IO ()
concatUnfoldRoundrobinRepl4xN value n =
    S.drain $ Internal.concatUnfoldRoundrobin
        (UF.replicateM 4)
        (sourceUnfoldrMN (value `div` 4) n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'concatUnfoldRoundrobinRepl4xN
-- inspect $ 'concatUnfoldRoundrobinRepl4xN `hasNoType` ''SPEC
-- inspect $ 'concatUnfoldRoundrobinRepl4xN `hasNoType`
--     ''D.ConcatUnfoldInterleaveState
#endif
o_n_heap_concat :: Int -> [Benchmark]
o_n_heap_concat value =
    [ bgroup "concat"
        [
          -- interleave x/4 streams of 4 elements each. Needs to buffer
          -- proportional to x/4. This is different from WSerial because
          -- WSerial expands slowly because of binary interleave behavior and
          -- this expands immediately because of Nary interleave behavior.
          benchIOSrc1
              "concatUnfoldInterleaveRepl (x/4,4)"
              (concatUnfoldInterleaveRepl4xN value)
        , benchIOSrc1
              "concatUnfoldRoundrobinRepl (x/4,4)"
              (concatUnfoldRoundrobinRepl4xN value)
        ]
    ]
-------------------------------------------------------------------------------
-- Monad
-------------------------------------------------------------------------------
-- | Monadic outer-product benchmarks for wSerial (O(1) space).
-- NOTE(review): benchmark labels here use the M-suffixed names
-- ("toNullM", "filterAllOutM") while sibling modules label the same ops
-- "toNull"/"filterAllOut" — confirm whether the labels should match.
o_1_space_outerProduct :: Int -> [Benchmark]
o_1_space_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toNullAp" $ toNullAp value wSerially
        , benchIO "toNullM" $ toNullM value wSerially
        , benchIO "toNullM3" $ toNullM3 value wSerially
        , benchIO "filterAllOutM" $ filterAllOutM value wSerially
        , benchIO "filterAllInM" $ filterAllInM value wSerially
        , benchIO "filterSome" $ filterSome value wSerially
        , benchIO "breakAfterSome" $ breakAfterSome value wSerially
        ]
    ]
-- | Outer-product benchmarks that build a full list (O(n) space).
o_n_space_outerProduct :: Int -> [Benchmark]
o_n_space_outerProduct value =
    [ bgroup
          "monad-outer-product"
          [ benchIO "toList" $ toListM value wSerially
          , benchIO "toListSome" $ toListSome value wSerially
          ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
-- In addition to gauge options, the number of elements in the stream can be
-- passed using the --stream-size option.
--
main :: IO ()
main = do
    (value, cfg, benches) <- parseCLIOpts defaultStreamSize
    value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
    where
    allBenchmarks size =
        [ bgroup (o_1_space_prefix moduleName) $ concat
            [ o_1_space_mapping size
            , o_1_space_joining size
            , o_1_space_concat size
            , o_1_space_outerProduct size
            ]
        , bgroup (o_n_heap_prefix moduleName) (o_n_heap_concat size)
        , bgroup (o_n_space_prefix moduleName) (o_n_space_outerProduct size)
        ]

View File

@ -0,0 +1,82 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
{-# LANGUAGE FlexibleContexts #-}
import Streamly (serially)
import qualified Streamly as S
import qualified Streamly.Prelude as S
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.ZipAsync"
-------------------------------------------------------------------------------
-- Zipping
-------------------------------------------------------------------------------
-- | Concurrently zip two half-sized streams with a pure pairing function.
{-# INLINE zipAsync #-}
zipAsync :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m (Int, Int)
zipAsync count n =
    S.zipAsyncWith (,) (sourceUnfoldrMN count n) (sourceUnfoldrMN count (n + 1))
-- | Concurrently zip two half-sized streams with a monadic pairing.
{-# INLINE zipAsyncM #-}
zipAsyncM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m (Int, Int)
zipAsyncM count n =
    S.zipAsyncWithM
        (curry return)
        (sourceUnfoldrMN count n)
        (sourceUnfoldrMN count (n + 1))
-- | Zip via the applicative instance of the async-zip stream type.
{-# INLINE zipAsyncAp #-}
zipAsyncAp :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m (Int, Int)
zipAsyncAp count n =
    S.zipAsyncly $
        (,) <$> sourceUnfoldrMN count n <*> sourceUnfoldrMN count (n + 1)
-- | O(1)-space async-zip joining benchmarks.
o_1_space_joining :: Int -> [Benchmark]
o_1_space_joining value =
    [ bgroup "joining"
        [ benchIOSrc serially "zipAsync (2,x/2)" (zipAsync (value `div` 2))
        , benchIOSrc serially "zipAsyncM (2,x/2)" (zipAsyncM (value `div` 2))
        , benchIOSrc serially "zipAsyncAp (2,x/2)" (zipAsyncAp (value `div` 2))
        , benchIOSink value "fmap zipAsyncly" $ fmapN S.zipAsyncly 1
        ]
    ]
-------------------------------------------------------------------------------
-- Monad outer product
-------------------------------------------------------------------------------
-- | Applicative outer product in the async-zip style.
o_1_space_outerProduct :: Int -> [Benchmark]
o_1_space_outerProduct value =
    [ bgroup "monad-outer-product"
        [ benchIO "toNullAp" $ toNullAp value S.zipAsyncly
        ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
main :: IO ()
main = do
    -- stream size comes from --stream-size (or the STREAM_SIZE env var)
    (value, cfg, benches) <- parseCLIOpts defaultStreamSize
    value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
    where
    allBenchmarks size =
        [ bgroup (o_1_space_prefix moduleName) $ concat
            [ o_1_space_joining size
            , o_1_space_outerProduct size
            ]
        ]

View File

@ -0,0 +1,118 @@
-- |
-- Module : Main
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : BSD3
-- Maintainer : streamly@composewell.com
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-}
#ifdef __HADDOCK_VERSION__
#undef INSPECTION
#endif
#ifdef INSPECTION
{-# LANGUAGE TemplateHaskell #-}
{-# OPTIONS_GHC -fplugin Test.Inspection.Plugin #-}
#endif
import Prelude hiding (zip)
import Streamly (zipSerially)
import qualified Streamly.Prelude as S
import Streamly.Benchmark.Common
import Streamly.Benchmark.Prelude
import Gauge
moduleName :: String
moduleName = "Prelude.ZipSerial"
-------------------------------------------------------------------------------
-- Zipping
-------------------------------------------------------------------------------
-- | Serially zip two half-sized streams with a pure pairing function
-- and drain the result. Shadows 'Prelude.zip' (hidden in the imports).
{-# INLINE zip #-}
zip :: Int -> Int -> IO ()
zip count n =
    S.drain $
    S.zipWith (,) (sourceUnfoldrMN count n) (sourceUnfoldrMN count (n + 1))
#ifdef INSPECTION
-- NOTE(review): ''D.Step refers to a qualified module D that is not in
-- the visible import list here — confirm the import exists.
inspect $ hasNoTypeClasses 'zip
inspect $ 'zip `hasNoType` ''SPEC
inspect $ 'zip `hasNoType` ''D.Step
#endif
-- | Serially zip two half-sized streams with a monadic pairing.
{-# INLINE zipM #-}
zipM :: Int -> Int -> IO ()
zipM count n =
    S.drain $
    S.zipWithM
        (curry return)
        (sourceUnfoldrMN count n)
        (sourceUnfoldrMN count (n + 1))
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'zipM
inspect $ 'zipM `hasNoType` ''SPEC
inspect $ 'zipM `hasNoType` ''D.Step
#endif
-- | O(1)-space serial-zip joining benchmarks.
o_1_space_joining :: Int -> [Benchmark]
o_1_space_joining value =
    [ bgroup "joining"
        [ benchIOSrc1 "zip (2,x/2)" (zip (value `div` 2))
        , benchIOSrc1 "zipM (2,x/2)" (zipM (value `div` 2))
        ]
    ]
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- | fmap over a zipSerially stream (O(1) space).
o_1_space_mapping :: Int -> [Benchmark]
o_1_space_mapping value =
    [ bgroup "mapping"
        [ benchIOSink value "fmap" $ fmapN zipSerially 1
        ]
    ]
-------------------------------------------------------------------------------
-- Monad outer product
-------------------------------------------------------------------------------
-- | Applicative outer product in the serial-zip style.
o_1_space_outerProduct :: Int -> [Benchmark]
o_1_space_outerProduct value =
    [ bgroup "monad-outer-product"
        -- XXX needs fixing
        [ benchIO "toNullAp" $ toNullAp value zipSerially
        ]
    ]
-------------------------------------------------------------------------------
-- Main
-------------------------------------------------------------------------------
-- In addition to gauge options, the number of elements in the stream can be
-- passed using the --stream-size option.
--
main :: IO ()
main = do
    (value, cfg, benches) <- parseCLIOpts defaultStreamSize
    value `seq` runMode (mode cfg) cfg benches (allBenchmarks value)
    where
    allBenchmarks size =
        [ bgroup (o_1_space_prefix moduleName) $ concat
            [ o_1_space_joining size
            , o_1_space_mapping size
            , o_1_space_outerProduct size
            ]
        ]

View File

@ -31,7 +31,6 @@ module Streamly.Benchmark.Common
, mkListString
, defaultStreamSize
, limitStreamSize
)
where
@ -40,7 +39,7 @@ import Control.Exception (evaluate)
import Control.Monad (when)
import Data.Functor.Identity (Identity, runIdentity)
import Data.List (scanl')
import Data.Maybe (catMaybes)
import Data.Maybe (mapMaybe)
import System.Console.GetOpt
(OptDescr(..), ArgDescr(..), ArgOrder(..), getOpt')
import System.Environment (getArgs, lookupEnv, setEnv)
@ -131,20 +130,11 @@ mkList value = [1..value]
defaultStreamSize :: Int
defaultStreamSize = 100000
limitStreamSize :: Int -> IO Int
limitStreamSize value = do
let val = min value defaultStreamSize
when (val /= value) $
putStrLn $ "Limiting stream size to "
++ show defaultStreamSize
++ " for non O(1) space operations"
return val
-------------------------------------------------------------------------------
-- Parse custom CLI options
-------------------------------------------------------------------------------
data BenchOpts = StreamSize Int deriving Show
newtype BenchOpts = StreamSize Int deriving Show
getStreamSize :: String -> Int
getStreamSize size =
@ -186,6 +176,7 @@ parseCLIOpts defStreamSize = do
(streamSize, args') <-
case opts of
StreamSize x : _ -> do
putStrLn $ "Stream size: " ++ show x
-- When using the gauge "--measure-with" option we need to make
-- sure that we pass the stream size to child process forked by
-- gauge. So we use this env var for that purpose.
@ -195,8 +186,7 @@ parseCLIOpts defStreamSize = do
-- correct order.
newArgs <-
evaluate
$ catMaybes
$ map snd
$ mapMaybe snd
$ scanl' deleteOptArgs (Nothing, Nothing) args
return (x, newArgs)
_ -> do
@ -205,7 +195,9 @@ parseCLIOpts defStreamSize = do
Just x -> do
s <- evaluate $ getStreamSize x
return (s, args)
Nothing -> return (defStreamSize, args)
Nothing -> do
setEnv "STREAM_SIZE" (show defStreamSize)
return (defStreamSize, args)
-- Parse gauge options
let config = defaultConfig

View File

@ -1,16 +1,467 @@
-- |
-- Module : Streamly.Benchmark.Prelude
-- Copyright : (c) 2018 Harendra Kumar
-- Copyright : (c) 2018 Composewell Technologies
--
-- License : MIT
-- Maintainer : streamly@composewell.com
module Streamly.Benchmark.Prelude
( module Streamly.Benchmark.Prelude.Generation
, module Streamly.Benchmark.Prelude.Elimination
, module Streamly.Benchmark.Prelude.Transformation
) where
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-}
import Streamly.Benchmark.Prelude.Generation
import Streamly.Benchmark.Prelude.Elimination
import Streamly.Benchmark.Prelude.Transformation
module Streamly.Benchmark.Prelude where
import Control.DeepSeq (NFData(..))
import Control.Exception (try)
import GHC.Exception (ErrorCall)
import System.Random (randomRIO)
import qualified Data.Foldable as F
import qualified Streamly as S
import qualified Streamly.Prelude as S
import qualified Streamly.Internal.Prelude as Internal
import qualified Streamly.Internal.Data.Pipe as Pipe
import Gauge
import Streamly.Internal.Data.Time.Units
-- Common polymorphic stream APIs used across multiple stream modules
-------------------------------------------------------------------------------
-- Generation
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
-- enumerate
-------------------------------------------------------------------------------
-- | Enumerate the @count + 1@ consecutive 'Int's from @start@ up to
-- @start + count@.
{-# INLINE sourceIntFromTo #-}
sourceIntFromTo :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceIntFromTo count start = S.enumerateFromTo start (start + count)
-- | Enumerate 'Int's from @start@ to @start + count@ using the
-- explicit-step (@from/then/to@) enumeration path with a step of 1.
{-# INLINE sourceIntFromThenTo #-}
sourceIntFromThenTo :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceIntFromThenTo count start =
    S.enumerateFromThenTo start (start + 1) (start + count)
-- | Enumerate 'Double's from @n@ to @n + value@ (default unit step),
-- exercising the fractional enumeration instance.
{-# INLINE sourceFracFromTo #-}
sourceFracFromTo :: (Monad m, S.IsStream t) => Int -> Int -> t m Double
sourceFracFromTo value n =
    S.enumerateFromTo (fromIntegral n) (fromIntegral (n + value))
-- | Enumerate 'Double's from @n@ to @n + value@ with a non-integral step
-- (1.0001) so the truly fractional stepping code path is measured.
{-# INLINE sourceFracFromThenTo #-}
sourceFracFromThenTo :: (Monad m, S.IsStream t) => Int -> Int -> t m Double
sourceFracFromThenTo value n = S.enumerateFromThenTo (fromIntegral n)
    (fromIntegral n + 1.0001) (fromIntegral (n + value))
-- | Enumerate the unbounded 'Integer' type from @n@ with step 1;
-- 'S.take' caps the otherwise infinite stream at @value@ elements.
{-# INLINE sourceIntegerFromStep #-}
sourceIntegerFromStep :: (Monad m, S.IsStream t) => Int -> Int -> t m Integer
sourceIntegerFromStep value n =
    S.take value $ S.enumerateFromThen (fromIntegral n) (fromIntegral n + 1)
-------------------------------------------------------------------------------
-- unfold
-------------------------------------------------------------------------------
-- | Pure unfold yielding the 'Int's from @start@ to @start + count@
-- inclusive (i.e. @count + 1@ elements).
{-# INLINE sourceUnfoldr #-}
sourceUnfoldr :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceUnfoldr count start = S.unfoldr step start
    where
    step cnt =
        if cnt > start + count
        then Nothing
        else Just (cnt, cnt + 1)
-- | Same pure unfold as 'sourceUnfoldr'. The separate name exists for
-- call sites that pass an explicit element count (e.g. nested/concat
-- benchmarks); the body was a byte-for-byte duplicate of
-- 'sourceUnfoldr', so it is defined as an alias to avoid drift between
-- the two copies. The INLINE pragma preserves fusion at use sites.
{-# INLINE sourceUnfoldrN #-}
sourceUnfoldrN :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceUnfoldrN = sourceUnfoldr
-- | Monadic unfold yielding the 'Int's from @start@ to @start + count@
-- inclusive. This is the default source used by the IO benchmark
-- drivers below ('benchIOSink').
{-# INLINE sourceUnfoldrM #-}
sourceUnfoldrM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
sourceUnfoldrM count start = S.unfoldrM step start
    where
    step cnt =
        if cnt > start + count
        then return Nothing
        else return (Just (cnt, cnt + 1))
-- | Same monadic unfold as 'sourceUnfoldrM'. The separate name exists
-- for call sites that pass an explicit element count (see
-- 'concatStreamsWith'); the body was a byte-for-byte duplicate of
-- 'sourceUnfoldrM', so it is defined as an alias to avoid drift. The
-- INLINE pragma preserves fusion at use sites.
{-# INLINE sourceUnfoldrMN #-}
sourceUnfoldrMN :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
sourceUnfoldrMN = sourceUnfoldrM
-------------------------------------------------------------------------------
-- fromIndices
-------------------------------------------------------------------------------
-- | Stream of @value@ 'Int's generated from their indices offset by @n@.
-- Underscore-prefixed: currently unused, kept to avoid an unused-binding
-- warning while remaining available to benchmarks.
{-# INLINE _sourceFromIndices #-}
_sourceFromIndices :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
_sourceFromIndices value n = S.take value $ S.fromIndices (+ n)
-- | Monadic variant of '_sourceFromIndices' (each index is produced by a
-- trivial monadic action). Underscore-prefixed: currently unused.
{-# INLINE _sourceFromIndicesM #-}
_sourceFromIndicesM :: (S.MonadAsync m, S.IsStream t) => Int -> Int -> t m Int
_sourceFromIndicesM value n = S.take value $ S.fromIndicesM (fmap return (+ n))
-------------------------------------------------------------------------------
-- fromList
-------------------------------------------------------------------------------
-- | Convert the list @[n .. n + value]@ (@value + 1@ elements) to a
-- stream.
{-# INLINE sourceFromList #-}
sourceFromList :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceFromList value n = S.fromList [n..n+value]
-- | Convert a list of trivial monadic actions to a stream; measures the
-- effectful 'S.fromListM' path.
{-# INLINE sourceFromListM #-}
sourceFromListM :: (S.MonadAsync m, S.IsStream t) => Int -> Int -> t m Int
sourceFromListM value n = S.fromListM (fmap return [n..n+value])
-------------------------------------------------------------------------------
-- fromFoldable
-------------------------------------------------------------------------------
-- | Build a stream from a 'Foldable' container (here a list) via
-- 'S.fromFoldable'.
{-# INLINE sourceFromFoldable #-}
sourceFromFoldable :: S.IsStream t => Int -> Int -> t m Int
sourceFromFoldable value n = S.fromFoldable [n..n+value]
-- | Build a stream from a 'Foldable' container of trivial monadic
-- actions via 'S.fromFoldableM'.
{-# INLINE sourceFromFoldableM #-}
sourceFromFoldableM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
sourceFromFoldableM value n = S.fromFoldableM (fmap return [n..n+value])
-------------------------------------------------------------------------------
-- Time enumeration
-------------------------------------------------------------------------------
-- | Take @value@ absolute timestamps from the (infinite) clock stream.
-- The second argument (the start value supplied by the common bench
-- drivers) is ignored.
{-# INLINE absTimes #-}
absTimes :: (S.IsStream t, S.MonadAsync m, Functor (t m))
    => Int -> Int -> t m AbsTime
absTimes value _ = S.take value Internal.absTimes
-------------------------------------------------------------------------------
-- Elimination
-------------------------------------------------------------------------------
-- | Drain a stream after converting it to 'S.SerialT' with the supplied
-- stream-type adapter @t@ (e.g. @serially@, @asyncly@).
{-# INLINE toNull #-}
toNull :: Monad m => (t m a -> S.SerialT m a) -> t m a -> m ()
toNull t = S.drain . t
-- We need a monadic bind here to make sure that the function f does not get
-- completely optimized out by the compiler in some cases.

-- | Takes a fold method, and uses it with a default source
-- ('sourceUnfoldrM' of @value@ elements).
{-# INLINE benchIOSink #-}
benchIOSink
    :: (S.IsStream t, NFData b)
    => Int -> String -> (t IO Int -> IO b) -> Benchmark
benchIOSink value name f =
    -- randomRIO (1,1) always yields 1; its only purpose is the monadic
    -- bind described in the note above.
    bench name $ nfIO $ randomRIO (1,1) >>= f . sourceUnfoldrM value
-- | Takes a source, and uses it with a default drain/fold method
-- ('toNull' through the supplied stream-type adapter @t@).
{-# INLINE benchIOSrc #-}
benchIOSrc
    :: (t IO a -> S.SerialT IO a)
    -> String
    -> (Int -> t IO a)
    -> Benchmark
benchIOSrc t name f =
    -- The randomRIO bind prevents the source from being optimized out;
    -- it always yields 1, which becomes the source's start value.
    bench name $ nfIO $ randomRIO (1,1) >>= toNull t . f
-- | Benchmark an arbitrary IO action parameterized by a start value
-- (always 1, produced by randomRIO to defeat over-optimization).
{-# INLINE benchIO #-}
benchIO :: (NFData b) => String -> (Int -> IO b) -> Benchmark
benchIO name f = bench name $ nfIO $ randomRIO (1,1) >>= f
-------------------------------------------------------------------------------
-- Mapping
-------------------------------------------------------------------------------
-- | A stream of @value + 1@ monadic actions, each merely returning its
-- index; used by the sequence/mapM style benchmarks.
{-# INLINE sourceUnfoldrAction #-}
sourceUnfoldrAction :: (S.IsStream t, Monad m, Monad m1)
    => Int -> Int -> t m (m1 Int)
sourceUnfoldrAction value n = S.serially $ S.unfoldr step n
    where
    step cnt =
        if cnt > n + value
        then Nothing
        else Just (return cnt, cnt + 1)
-- | Apply the transformation @f@ @n@ times (supported range 1-4) and
-- drain the result. 'S.adapt' converts the 'S.SerialT' produced by one
-- stage back to the polymorphic stream type expected by the next.
{-# INLINE composeN #-}
composeN ::
       (S.IsStream t, Monad m)
    => Int
    -> (t m Int -> S.SerialT m Int)
    -> t m Int
    -> m ()
composeN n f =
    case n of
        1 -> S.drain . f
        2 -> S.drain . f . S.adapt . f
        3 -> S.drain . f . S.adapt . f . S.adapt . f
        4 -> S.drain . f . S.adapt . f . S.adapt . f . S.adapt . f
        -- Fail with a locatable message instead of an anonymous
        -- 'undefined' so a miswritten benchmark is easy to diagnose.
        _ -> error "composeN: only 1 to 4 compositions are supported"
-- | Compose @n@ applications of 'fmap' (+1) via 'composeN'.
{-# INLINE fmapN #-}
fmapN ::
       (S.IsStream t, S.MonadAsync m, Functor (t m))
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
fmapN t n = composeN n $ t . fmap (+ 1)
-- | Compose @n@ applications of 'S.map' (+1) via 'composeN'.
{-# INLINE mapN #-}
mapN ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
mapN t n = composeN n $ t . S.map (+ 1)
-- | Compose @n@ applications of 'S.mapM' return via 'composeN'.
-- NOTE(review): this top-level name shadows Prelude's @mapM@ for
-- unqualified users of this module.
{-# INLINE mapM #-}
mapM ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
mapM t n = composeN n $ t . S.mapM return
-------------------------------------------------------------------------------
-- Pipes
-------------------------------------------------------------------------------
-- | Run a single effectful pipe stage ('Pipe.mapM' return) @n@ times
-- through 'Internal.transform'.
{-# INLINE transformMapM #-}
transformMapM ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
transformMapM t n = composeN n $ t . Internal.transform (Pipe.mapM return)
-- | Two effectful pipe stages combined serially with 'Pipe.compose',
-- applied @n@ times; measures the cost of pipe composition.
{-# INLINE transformComposeMapM #-}
transformComposeMapM ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
transformComposeMapM t n =
    composeN n $
    t .
    Internal.transform
        (Pipe.mapM (\x -> return (x + 1)) `Pipe.compose`
         Pipe.mapM (\x -> return (x + 2)))
-- | Two effectful pipe stages combined in parallel with 'Pipe.tee',
-- applied @n@ times; measures the cost of distributing to two pipes.
{-# INLINE transformTeeMapM #-}
transformTeeMapM ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
transformTeeMapM t n =
    composeN n $
    t .
    Internal.transform
        (Pipe.mapM (\x -> return (x + 1)) `Pipe.tee`
         Pipe.mapM (\x -> return (x + 2)))
-- | Two effectful pipe stages whose outputs are zipped with (+),
-- applied @n@ times; measures the cost of 'Pipe.zipWith'.
{-# INLINE transformZipMapM #-}
transformZipMapM ::
       (S.IsStream t, S.MonadAsync m)
    => (t m Int -> S.SerialT m Int)
    -> Int
    -> t m Int
    -> m ()
transformZipMapM t n =
    composeN n $
    t .
    Internal.transform
        (Pipe.zipWith
             (+)
             (Pipe.mapM (\x -> return (x + 1)))
             (Pipe.mapM (\x -> return (x + 2))))
-------------------------------------------------------------------------------
-- Streams of streams
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
-- Concat foldable
-------------------------------------------------------------------------------
-- | Build a stream by appending @value + 1@ singleton streams with the
-- stream semigroup operation ('S.<>').
{-# INLINE sourceFoldMapWith #-}
sourceFoldMapWith :: (S.IsStream t, S.Semigroup (t m Int))
    => Int -> Int -> t m Int
sourceFoldMapWith value n = S.foldMapWith (S.<>) S.yield [n..n+value]
-- | Like 'sourceFoldMapWith' but each singleton stream wraps a trivial
-- monadic action ('S.yieldM').
{-# INLINE sourceFoldMapWithM #-}
sourceFoldMapWithM :: (S.IsStream t, Monad m, S.Semigroup (t m Int))
    => Int -> Int -> t m Int
sourceFoldMapWithM value n =
    S.foldMapWith (S.<>) (S.yieldM . return) [n..n+value]
-- | Build a stream with 'Data.Foldable.foldMap' using the stream's
-- 'Monoid' instance (contrast with 'sourceFoldMapWithM' which uses
-- streamly's own 'S.foldMapWith').
{-# INLINE sourceFoldMapM #-}
sourceFoldMapM :: (S.IsStream t, Monad m, Monoid (t m Int))
    => Int -> Int -> t m Int
sourceFoldMapM value n = F.foldMap (S.yieldM . return) [n..n+value]
-------------------------------------------------------------------------------
-- Concat
-------------------------------------------------------------------------------
-- | Flatten a stream of @value + 1@ singleton streams with
-- @concatMap id@.
{-# INLINE sourceConcatMapId #-}
sourceConcatMapId :: (S.IsStream t, Monad m)
    => Int -> Int -> t m Int
sourceConcatMapId value n =
    S.concatMap id $ S.fromFoldable $ fmap (S.yieldM . return) [n..n+value]
-- concatMapWith
-- | Drain @outer@ streams of @inner@ elements each, merged with the
-- given binary stream-combining operation via 'S.concatMapWith'.
-- The rank-2 @op@ must work at any element type.
{-# INLINE concatStreamsWith #-}
concatStreamsWith
    :: (forall c. S.SerialT IO c -> S.SerialT IO c -> S.SerialT IO c)
    -> Int
    -> Int
    -> Int
    -> IO ()
concatStreamsWith op outer inner n =
    S.drain $ S.concatMapWith op
        (sourceUnfoldrMN inner)
        (sourceUnfoldrMN outer n)
-------------------------------------------------------------------------------
-- Monadic outer product
-------------------------------------------------------------------------------
-- | Materialize a serial stream as a list (O(n) space).
{-# INLINE runToList #-}
runToList :: Monad m => S.SerialT m a -> m [a]
runToList = S.toList
-- | Applicative product of two streams, drained. Each side is sized to
-- the square root of @linearCount@ so the nested total stays comparable
-- to the linear benchmarks.
{-# INLINE toNullAp #-}
toNullAp
    :: (S.IsStream t, S.MonadAsync m, Applicative (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNullAp linearCount t start = S.drain . t $
    (+) <$> sourceUnfoldrM nestedCount2 start
        <*> sourceUnfoldrM nestedCount2 start
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Two-level monadic (outer-product) composition, drained. Each level
-- is sized to the square root of @linearCount@.
{-# INLINE toNullM #-}
toNullM
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNullM linearCount t start = S.drain . t $ do
    x <- sourceUnfoldrM nestedCount2 start
    y <- sourceUnfoldrM nestedCount2 start
    return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Three-level monadic composition, drained. Each level is sized to
-- the cube root of @linearCount@ to keep total work comparable.
{-# INLINE toNullM3 #-}
toNullM3
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
toNullM3 linearCount t start = S.drain . t $ do
    x <- sourceUnfoldrM nestedCount3 start
    y <- sourceUnfoldrM nestedCount3 start
    z <- sourceUnfoldrM nestedCount3 start
    return $ x + y + z
    where
    nestedCount3 = round (fromIntegral linearCount**(1/3::Double))
-- | Two-level monadic composition collected into a list (O(n) space).
{-# INLINE toListM #-}
toListM
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m [Int]
toListM linearCount t start = runToList . t $ do
    x <- sourceUnfoldrM nestedCount2 start
    y <- sourceUnfoldrM nestedCount2 start
    return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- Taking a specified number of elements is very expensive in logict so we have
-- a test to measure the same.

-- | Like 'toListM' but only the first 10000 results are taken; measures
-- early termination of a nested composition.
{-# INLINE toListSome #-}
toListSome
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m [Int]
toListSome linearCount t start =
    runToList . t $ S.take 10000 $ do
        x <- sourceUnfoldrM nestedCount2 start
        y <- sourceUnfoldrM nestedCount2 start
        return $ x + y
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Nested composition where the predicate rejects every element
-- (x + y of non-negative starts is never < 0), so nothing is yielded.
{-# INLINE filterAllOutM #-}
filterAllOutM
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterAllOutM linearCount t start = S.drain . t $ do
    x <- sourceUnfoldrM nestedCount2 start
    y <- sourceUnfoldrM nestedCount2 start
    let s = x + y
    if s < 0
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Nested composition where the predicate accepts every element
-- (x + y of positive starts is always > 0), so everything is yielded.
{-# INLINE filterAllInM #-}
filterAllInM
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterAllInM linearCount t start = S.drain . t $ do
    x <- sourceUnfoldrM nestedCount2 start
    y <- sourceUnfoldrM nestedCount2 start
    let s = x + y
    if s > 0
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Nested composition where only sums above a fixed threshold
-- (1100000) pass, i.e. a partial filter between the all-in/all-out
-- extremes. The threshold presumably matches the default stream size --
-- TODO confirm against the benchmark driver.
{-# INLINE filterSome #-}
filterSome
    :: (S.IsStream t, S.MonadAsync m, Monad (t m))
    => Int -> (t m Int -> S.SerialT m Int) -> Int -> m ()
filterSome linearCount t start = S.drain . t $ do
    x <- sourceUnfoldrM nestedCount2 start
    y <- sourceUnfoldrM nestedCount2 start
    let s = x + y
    if s > 1100000
    then return s
    else S.nil
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))
-- | Abort a nested composition partway by raising an 'ErrorCall' once
-- the sum crosses the threshold; the exception is caught with 'try' so
-- only the early-termination cost is measured.
{-# INLINE breakAfterSome #-}
breakAfterSome
    :: (S.IsStream t, Monad (t IO))
    => Int -> (t IO Int -> S.SerialT IO Int) -> Int -> IO ()
breakAfterSome linearCount t start = do
    (_ :: Either ErrorCall ()) <- try $ S.drain . t $ do
        x <- sourceUnfoldrM nestedCount2 start
        y <- sourceUnfoldrM nestedCount2 start
        let s = x + y
        if s > 1100000
        then error "break"
        else return s
    return ()
    where
    nestedCount2 = round (fromIntegral linearCount**(1/2::Double))

View File

@ -1,795 +0,0 @@
-- |
-- Module : Streamly.Benchmark.Prelude
-- Copyright : (c) 2018 Harendra Kumar
--
-- License : MIT
-- Maintainer : streamly@composewell.com
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE RankNTypes #-}
#ifdef __HADDOCK_VERSION__
#undef INSPECTION
#endif
#ifdef INSPECTION
{-# LANGUAGE TemplateHaskell #-}
{-# OPTIONS_GHC -fplugin Test.Inspection.Plugin #-}
#endif
module Streamly.Benchmark.Prelude.Elimination
( o_1_space_serial_pure
, o_1_space_serial_foldable
, o_1_space_serial_elimination
, o_1_space_serial_foldMultiStream
, o_n_space_serial_toList
, o_n_space_serial_foldr
, o_n_heap_serial_foldl
) where
import Control.DeepSeq (NFData(..))
import Control.Monad.IO.Class (MonadIO(..))
import Data.Functor.Identity (Identity, runIdentity)
import System.Random (randomRIO)
import Prelude
(Monad, String, Int, (+), ($), (.), return, (>), (<=), (==),
undefined, Maybe(..), Bool, (>>=), IO, compare, flip)
import qualified Prelude as P
import qualified Data.Foldable as F
import qualified GHC.Exts as GHC
#ifdef INSPECTION
import GHC.Types (SPEC(..))
import Test.Inspection
import qualified Streamly.Internal.Data.Stream.StreamD as D
#endif
import qualified Streamly as S hiding (runStream)
import qualified Streamly.Prelude as S
import qualified Streamly.Internal.Prelude as Internal
import qualified Streamly.Internal.Prelude as IP
import Gauge
import Streamly hiding (runStream)
import Streamly.Benchmark.Common
type Stream m a = S.SerialT m a
-------------------------------------------------------------------------------
-- Stream generation
-------------------------------------------------------------------------------
-- unfoldr
{-# INLINE sourceUnfoldr #-}
sourceUnfoldr :: (Monad m, S.IsStream t) => Int -> Int -> t m Int
sourceUnfoldr value n = S.unfoldr step n
where
step cnt =
if cnt > n + value
then Nothing
else Just (cnt, cnt + 1)
{-# INLINE sourceUnfoldrM #-}
sourceUnfoldrM :: (S.IsStream t, S.MonadAsync m) => Int -> Int -> t m Int
sourceUnfoldrM value n = S.unfoldrM step n
where
step cnt =
if cnt > n + value
then return Nothing
else return (Just (cnt, cnt + 1))
{-# INLINE source #-}
source :: (S.MonadAsync m, S.IsStream t) => Int -> Int -> t m Int
source = sourceUnfoldrM
{-# INLINE sourceUnfoldrAction #-}
sourceUnfoldrAction :: (S.IsStream t, Monad m, Monad m1)
=> Int -> Int -> t m (m1 Int)
sourceUnfoldrAction value n = S.serially $ S.unfoldr step n
where
step cnt =
if cnt > n + value
then Nothing
else (Just (return cnt, cnt + 1))
-- fromIndices
{-# INLINE sourceIsList #-}
sourceIsList :: Int -> Int -> S.SerialT Identity Int
sourceIsList value n = GHC.fromList [n..n+value]
{-# INLINE sourceIsString #-}
sourceIsString :: Int -> Int -> S.SerialT Identity P.Char
sourceIsString value n = GHC.fromString (P.replicate (n + value) 'a')
-------------------------------------------------------------------------------
-- Elimination
-------------------------------------------------------------------------------
{-# INLINE runStream #-}
runStream :: Monad m => Stream m a -> m ()
runStream = S.drain
{-# INLINE toNull #-}
toNull :: Monad m => (t m a -> S.SerialT m a) -> t m a -> m ()
toNull t = runStream . t
{-# INLINE uncons #-}
uncons :: Monad m => Stream m Int -> m ()
uncons s = do
r <- S.uncons s
case r of
Nothing -> return ()
Just (_, t) -> uncons t
{-# INLINE init #-}
init :: Monad m => Stream m a -> m ()
init s = S.init s >>= P.mapM_ S.drain
{-# INLINE mapM_ #-}
mapM_ :: Monad m => Stream m Int -> m ()
mapM_ = S.mapM_ (\_ -> return ())
{-# INLINE toList #-}
toList :: Monad m => Stream m Int -> m [Int]
toList = S.toList
{-# INLINE toListRev #-}
toListRev :: Monad m => Stream m Int -> m [Int]
toListRev = Internal.toListRev
{-# INLINE foldrMElem #-}
foldrMElem :: Monad m => Int -> Stream m Int -> m Bool
foldrMElem e m =
S.foldrM
(\x xs ->
if x == e
then return P.True
else xs)
(return P.False)
m
{-# INLINE foldrMToStream #-}
foldrMToStream :: Monad m => Stream m Int -> m (Stream Identity Int)
foldrMToStream = S.foldr S.cons S.nil
{-# INLINE foldrMBuild #-}
foldrMBuild :: Monad m => Stream m Int -> m [Int]
foldrMBuild = S.foldrM (\x xs -> xs >>= return . (x :)) (return [])
{-# INLINE foldl'Build #-}
foldl'Build :: Monad m => Stream m Int -> m [Int]
foldl'Build = S.foldl' (flip (:)) []
{-# INLINE foldlM'Build #-}
foldlM'Build :: Monad m => Stream m Int -> m [Int]
foldlM'Build = S.foldlM' (\xs x -> return $ x : xs) []
{-# INLINE foldrMReduce #-}
foldrMReduce :: Monad m => Stream m Int -> m Int
foldrMReduce = S.foldrM (\x xs -> xs >>= return . (x +)) (return 0)
{-# INLINE foldl'Reduce #-}
foldl'Reduce :: Monad m => Stream m Int -> m Int
foldl'Reduce = S.foldl' (+) 0
{-# INLINE foldl1'Reduce #-}
foldl1'Reduce :: Monad m => Stream m Int -> m (Maybe Int)
foldl1'Reduce = S.foldl1' (+)
{-# INLINE foldlM'Reduce #-}
foldlM'Reduce :: Monad m => Stream m Int -> m Int
foldlM'Reduce = S.foldlM' (\xs a -> return $ a + xs) 0
{-# INLINE last #-}
last :: Monad m => Stream m Int -> m (Maybe Int)
last = S.last
{-# INLINE _null #-}
_null :: Monad m => Stream m Int -> m Bool
_null = S.null
{-# INLINE _head #-}
_head :: Monad m => Stream m Int -> m (Maybe Int)
_head = S.head
{-# INLINE elem #-}
elem :: Monad m => Int -> Stream m Int -> m Bool
elem value = S.elem (value + 1)
{-# INLINE notElem #-}
notElem :: Monad m => Int -> Stream m Int -> m Bool
notElem value = S.notElem (value + 1)
{-# INLINE length #-}
length :: Monad m => Stream m Int -> m Int
length = S.length
{-# INLINE all #-}
all :: Monad m => Int -> Stream m Int -> m Bool
all value = S.all (<= (value + 1))
{-# INLINE any #-}
any :: Monad m => Int -> Stream m Int -> m Bool
any value = S.any (> (value + 1))
{-# INLINE and #-}
and :: Monad m => Int -> Stream m Int -> m Bool
and value = S.and . S.map (<= (value + 1))
{-# INLINE or #-}
or :: Monad m => Int -> Stream m Int -> m Bool
or value = S.or . S.map (> (value + 1))
{-# INLINE find #-}
find :: Monad m => Int -> Stream m Int -> m (Maybe Int)
find value = S.find (== (value + 1))
{-# INLINE findIndex #-}
findIndex :: Monad m => Int -> Stream m Int -> m (Maybe Int)
findIndex value = S.findIndex (== (value + 1))
{-# INLINE elemIndex #-}
elemIndex :: Monad m => Int -> Stream m Int -> m (Maybe Int)
elemIndex value = S.elemIndex (value + 1)
{-# INLINE maximum #-}
maximum :: Monad m => Stream m Int -> m (Maybe Int)
maximum = S.maximum
{-# INLINE minimum #-}
minimum :: Monad m => Stream m Int -> m (Maybe Int)
minimum = S.minimum
{-# INLINE sum #-}
sum :: Monad m => Stream m Int -> m Int
sum = S.sum
{-# INLINE product #-}
product :: Monad m => Stream m Int -> m Int
product = S.product
{-# INLINE minimumBy #-}
minimumBy :: Monad m => Stream m Int -> m (Maybe Int)
minimumBy = S.minimumBy compare
{-# INLINE maximumBy #-}
maximumBy :: Monad m => Stream m Int -> m (Maybe Int)
maximumBy = S.maximumBy compare
-------------------------------------------------------------------------------
-- Transformation
-------------------------------------------------------------------------------
{-# INLINE transform #-}
transform :: Monad m => Stream m a -> m ()
transform = runStream
{-# INLINE composeN #-}
composeN ::
MonadIO m
=> Int
-> (Stream m Int -> Stream m Int)
-> Stream m Int
-> m ()
composeN n f =
case n of
1 -> transform . f
2 -> transform . f . f
3 -> transform . f . f . f
4 -> transform . f . f . f . f
_ -> undefined
{-# INLINE reverse #-}
reverse :: MonadIO m => Int -> Stream m Int -> m ()
reverse n = composeN n $ S.reverse
{-# INLINE reverse' #-}
reverse' :: MonadIO m => Int -> Stream m Int -> m ()
reverse' n = composeN n $ Internal.reverse'
-------------------------------------------------------------------------------
-- Multi-stream folds
-------------------------------------------------------------------------------
{-# INLINE isPrefixOf #-}
isPrefixOf :: Monad m => Stream m Int -> m Bool
isPrefixOf src = S.isPrefixOf src src
{-# INLINE isSubsequenceOf #-}
isSubsequenceOf :: Monad m => Stream m Int -> m Bool
isSubsequenceOf src = S.isSubsequenceOf src src
{-# INLINE stripPrefix #-}
stripPrefix :: Monad m => Stream m Int -> m ()
stripPrefix src = do
_ <- S.stripPrefix src src
return ()
{-# INLINE eqBy' #-}
eqBy' :: (Monad m, P.Eq a) => Stream m a -> m P.Bool
eqBy' src = S.eqBy (==) src src
{-# INLINE eqBy #-}
eqBy :: Int -> Int -> IO Bool
eqBy value n = eqBy' (source value n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'eqBy
inspect $ 'eqBy `hasNoType` ''SPEC
inspect $ 'eqBy `hasNoType` ''D.Step
#endif
{-# INLINE eqByPure #-}
eqByPure :: Int -> Int -> Identity Bool
eqByPure value n = eqBy' (sourceUnfoldr value n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'eqByPure
inspect $ 'eqByPure `hasNoType` ''SPEC
inspect $ 'eqByPure `hasNoType` ''D.Step
#endif
{-# INLINE cmpBy' #-}
cmpBy' :: (Monad m, P.Ord a) => Stream m a -> m P.Ordering
cmpBy' src = S.cmpBy P.compare src src
{-# INLINE cmpBy #-}
cmpBy :: Int -> Int -> IO P.Ordering
cmpBy value n = cmpBy' (source value n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'cmpBy
inspect $ 'cmpBy `hasNoType` ''SPEC
inspect $ 'cmpBy `hasNoType` ''D.Step
#endif
{-# INLINE cmpByPure #-}
cmpByPure :: Int -> Int -> Identity P.Ordering
cmpByPure value n = cmpBy' (sourceUnfoldr value n)
#ifdef INSPECTION
inspect $ hasNoTypeClasses 'cmpByPure
inspect $ 'cmpByPure `hasNoType` ''SPEC
inspect $ 'cmpByPure `hasNoType` ''D.Step
#endif
-------------------------------------------------------------------------------
-- Type class instances
-------------------------------------------------------------------------------
{-# INLINE eqInstance #-}
eqInstance :: Stream Identity Int -> Bool
eqInstance src = src == src
{-# INLINE eqInstanceNotEq #-}
eqInstanceNotEq :: Stream Identity Int -> Bool
eqInstanceNotEq src = src P./= src
{-# INLINE ordInstance #-}
ordInstance :: Stream Identity Int -> Bool
ordInstance src = src P.< src
{-# INLINE ordInstanceMin #-}
ordInstanceMin :: Stream Identity Int -> Stream Identity Int
ordInstanceMin src = P.min src src
{-# INLINE showInstance #-}
showInstance :: Stream Identity Int -> P.String
showInstance src = P.show src
-------------------------------------------------------------------------------
-- Pure (Identity) streams
-------------------------------------------------------------------------------
{-# INLINE pureFoldl' #-}
pureFoldl' :: Stream Identity Int -> Int
pureFoldl' = runIdentity . S.foldl' (+) 0
-------------------------------------------------------------------------------
-- Foldable Instance
-------------------------------------------------------------------------------
{-# INLINE foldableFoldl' #-}
foldableFoldl' :: Int -> Int -> Int
foldableFoldl' value n =
F.foldl' (+) 0 (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableFoldrElem #-}
foldableFoldrElem :: Int -> Int -> Bool
foldableFoldrElem value n =
F.foldr (\x xs -> if x == value then P.True else xs)
(P.False)
(sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableSum #-}
foldableSum :: Int -> Int -> Int
foldableSum value n =
P.sum (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableProduct #-}
foldableProduct :: Int -> Int -> Int
foldableProduct value n =
P.product (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE _foldableNull #-}
_foldableNull :: Int -> Int -> Bool
_foldableNull value n =
P.null (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableElem #-}
foldableElem :: Int -> Int -> Bool
foldableElem value n =
P.elem value (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableNotElem #-}
foldableNotElem :: Int -> Int -> Bool
foldableNotElem value n =
P.notElem value (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableFind #-}
foldableFind :: Int -> Int -> Maybe Int
foldableFind value n =
F.find (== (value + 1)) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableAll #-}
foldableAll :: Int -> Int -> Bool
foldableAll value n =
P.all (<= (value + 1)) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableAny #-}
foldableAny :: Int -> Int -> Bool
foldableAny value n =
P.any (> (value + 1)) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableAnd #-}
foldableAnd :: Int -> Int -> Bool
foldableAnd value n =
P.and $ S.map (<= (value + 1)) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableOr #-}
foldableOr :: Int -> Int -> Bool
foldableOr value n =
P.or $ S.map (> (value + 1)) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableLength #-}
foldableLength :: Int -> Int -> Int
foldableLength value n =
P.length (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableMin #-}
foldableMin :: Int -> Int -> Int
foldableMin value n =
P.minimum (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableMax #-}
foldableMax :: Int -> Int -> Int
foldableMax value n =
P.maximum (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableMinBy #-}
foldableMinBy :: Int -> Int -> Int
foldableMinBy value n =
F.minimumBy compare (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableListMinBy #-}
foldableListMinBy :: Int -> Int -> Int
foldableListMinBy value n = F.minimumBy compare [1..value+n]
{-# INLINE foldableMaxBy #-}
foldableMaxBy :: Int -> Int -> Int
foldableMaxBy value n =
F.maximumBy compare (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableToList #-}
foldableToList :: Int -> Int -> [Int]
foldableToList value n =
F.toList (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableMapM_ #-}
foldableMapM_ :: Monad m => Int -> Int -> m ()
foldableMapM_ value n =
F.mapM_ (\_ -> return ()) (sourceUnfoldr value n :: S.SerialT Identity Int)
{-# INLINE foldableSequence_ #-}
foldableSequence_ :: Int -> Int -> IO ()
foldableSequence_ value n =
F.sequence_ (sourceUnfoldrAction value n :: S.SerialT Identity (IO Int))
{-# INLINE _foldableMsum #-}
_foldableMsum :: Int -> Int -> IO Int
_foldableMsum value n =
F.msum (sourceUnfoldrAction value n :: S.SerialT Identity (IO Int))
-------------------------------------------------------------------------------
-- Benchmark groups
-------------------------------------------------------------------------------
-- We need a monadic bind here to make sure that the function f does not get
-- completely optimized out by the compiler in some cases.
-- | Takes a fold method, and uses it with a default source.
{-# INLINE benchIOSink #-}
benchIOSink
:: (IsStream t, NFData b)
=> Int -> String -> (t IO Int -> IO b) -> Benchmark
benchIOSink value name f = bench name $ nfIO $ randomRIO (1,1) >>= f . source value
{-# INLINE benchHoistSink #-}
benchHoistSink
:: (IsStream t, NFData b)
=> Int -> String -> (t Identity Int -> IO b) -> Benchmark
benchHoistSink value name f =
bench name $ nfIO $ randomRIO (1,1) >>= f . sourceUnfoldr value
-- XXX We should be using sourceUnfoldrM for fair comparison with IO monad, but
-- we can't use it as it requires MonadAsync constraint.
{-# INLINE benchIdentitySink #-}
benchIdentitySink
:: (IsStream t, NFData b)
=> Int -> String -> (t Identity Int -> Identity b) -> Benchmark
benchIdentitySink value name f = bench name $ nf (f . sourceUnfoldr value) 1
{-# INLINE benchPureSink #-}
benchPureSink :: NFData b => Int -> String -> (SerialT Identity Int -> b) -> Benchmark
benchPureSink value name f = benchPure name (sourceUnfoldr value) f
-------------------------------------------------------------------------------
-- Benchmark groups
-------------------------------------------------------------------------------
o_1_space_serial_pure :: Int -> [Benchmark]
o_1_space_serial_pure value =
[ bgroup
"serially"
[ bgroup
"pure"
[ benchPureSink value "id" P.id
, benchPureSink1 "eqBy" (eqByPure value)
, benchPureSink value "==" eqInstance
, benchPureSink value "/=" eqInstanceNotEq
, benchPureSink1 "cmpBy" (cmpByPure value)
, benchPureSink value "<" ordInstance
, benchPureSink value "min" ordInstanceMin
, benchPureSrc "IsList.fromList" (sourceIsList value)
-- length is used to check for foldr/build fusion
, benchPureSink
value
"length . IsList.toList"
(P.length . GHC.toList)
, benchPureSrc "IsString.fromString" (sourceIsString value)
, benchPureSink value "showsPrec pure streams" showInstance
, benchPureSink value "foldl'" pureFoldl'
]
]
]
o_1_space_serial_foldable :: Int -> [Benchmark]
o_1_space_serial_foldable value =
[ bgroup
"serially"
[ bgroup
"foldable"
-- Foldable instance
-- type class operations
[ bench "foldl'" $ nf (foldableFoldl' value) 1
, bench "foldrElem" $ nf (foldableFoldrElem value) 1
-- , bench "null" $ nf (_foldableNull value) 1
, bench "elem" $ nf (foldableElem value) 1
, bench "length" $ nf (foldableLength value) 1
, bench "sum" $ nf (foldableSum value) 1
, bench "product" $ nf (foldableProduct value) 1
, bench "minimum" $ nf (foldableMin value) 1
, bench "maximum" $ nf (foldableMax value) 1
, bench "length . toList" $
nf (P.length . foldableToList value) 1
-- folds
, bench "notElem" $ nf (foldableNotElem value) 1
, bench "find" $ nf (foldableFind value) 1
, bench "all" $ nf (foldableAll value) 1
, bench "any" $ nf (foldableAny value) 1
, bench "and" $ nf (foldableAnd value) 1
, bench "or" $ nf (foldableOr value) 1
-- Note: minimumBy/maximumBy do not work in constant memory they are in
-- the O(n) group of benchmarks down below in this file.
-- Applicative and Traversable operations
-- TBD: traverse_
, benchIOSink1 "mapM_" (foldableMapM_ value)
-- TBD: for_
-- TBD: forM_
, benchIOSink1 "sequence_" (foldableSequence_ value)
-- TBD: sequenceA_
-- TBD: asum
-- , benchIOSink1 "msum" (_foldableMsum value)
]
]
]
o_1_space_serial_elimination :: Int -> [Benchmark]
o_1_space_serial_elimination value =
[ bgroup
"serially"
[ bgroup
"elimination"
[ bgroup
"reduce"
[ bgroup
"IO"
[ benchIOSink value "foldl'" foldl'Reduce
, benchIOSink value "foldl1'" foldl1'Reduce
, benchIOSink value "foldlM'" foldlM'Reduce
]
, bgroup
"Identity"
[ benchIdentitySink value "foldl'" foldl'Reduce
, benchIdentitySink
value
"foldl1'"
foldl1'Reduce
, benchIdentitySink
value
"foldlM'"
foldlM'Reduce
]
]
, bgroup
"build"
[ bgroup
"IO"
[ benchIOSink
value
"foldrMElem"
(foldrMElem value)
]
, bgroup
"Identity"
[ benchIdentitySink
value
"foldrMElem"
(foldrMElem value)
, benchIdentitySink
value
"foldrMToStreamLength"
(S.length . runIdentity . foldrMToStream)
, benchPureSink
value
"foldrMToListLength"
(P.length . runIdentity . foldrMBuild)
]
]
, benchIOSink value "uncons" uncons
, benchIOSink value "toNull" $ toNull serially
, benchIOSink value "mapM_" mapM_
, benchIOSink value "init" init
-- this is too low and causes all benchmarks reported in ns
-- , benchIOSink value "head" head
, benchIOSink value "last" last
-- , benchIOSink value "lookup" lookup
, benchIOSink value "find" (find value)
, benchIOSink value "findIndex" (findIndex value)
, benchIOSink value "elemIndex" (elemIndex value)
-- this is too low and causes all benchmarks reported in ns
-- , benchIOSink value "null" null
, benchIOSink value "elem" (elem value)
, benchIOSink value "notElem" (notElem value)
, benchIOSink value "all" (all value)
, benchIOSink value "any" (any value)
, benchIOSink value "and" (and value)
, benchIOSink value "or" (or value)
, benchIOSink value "length" length
, benchHoistSink
value
"length . generally"
(length . IP.generally)
, benchIOSink value "sum" sum
, benchIOSink value "product" product
, benchIOSink value "maximumBy" maximumBy
, benchIOSink value "maximum" maximum
, benchIOSink value "minimumBy" minimumBy
, benchIOSink value "minimum" minimum
]
]
]
o_1_space_serial_foldMultiStream :: Int -> [Benchmark]
o_1_space_serial_foldMultiStream value =
[ bgroup
"serially"
[ bgroup
"fold-multi-stream"
[ benchIOSink1 "eqBy" (eqBy value)
, benchIOSink1 "cmpBy" (cmpBy value)
, benchIOSink value "isPrefixOf" isPrefixOf
, benchIOSink value "isSubsequenceOf" isSubsequenceOf
, benchIOSink value "stripPrefix" stripPrefix
]
]
]
o_n_space_serial_toList :: Int -> [Benchmark]
o_n_space_serial_toList value =
[ bgroup
"serially"
[ bgroup
"toList" -- < 2MB
-- Converting the stream to a list or pure stream in a strict monad
[ benchIOSink value "foldrMToList" foldrMBuild
, benchIOSink value "toList" toList
, benchIOSink value "toListRev" toListRev
-- , benchIOSink value "toPure" toPure
-- , benchIOSink value "toPureRev" toPureRev
]
]
]
o_n_space_serial_foldr :: Int -> [Benchmark]
o_n_space_serial_foldr value =
[ bgroup
"serially"
-- Head recursive strict right folds.
[ bgroup
"foldr"
-- < 2MB
-- accumulation due to strictness of IO monad
[ benchIOSink value "foldrM/build/IO" foldrMBuild
-- Right folds for reducing are inherently non-streaming as the
-- expression needs to be fully built before it can be reduced.
, benchIdentitySink
value
"foldrM/reduce/Identity"
foldrMReduce
-- takes < 4MB
, benchIOSink value "foldrM/reduce/IO" foldrMReduce
-- XXX the definitions of minimumBy and maximumBy in Data.Foldable use
-- foldl1 which does not work in constant memory for our implementation.
-- It works in constant memory for lists but even for lists it takes 15x
-- more time compared to our foldl' based implementation.
-- XXX these take < 16M stack space
, bench "minimumBy" $ nf (flip foldableMinBy 1) value
, bench "maximumBy" $ nf (flip foldableMaxBy 1) value
, bench "minimumByList" $ nf (flip foldableListMinBy 1) value
]
]
]
o_n_heap_serial_foldl :: Int -> [Benchmark]
o_n_heap_serial_foldl value =
[ bgroup
"serially"
[ bgroup
"foldl"
-- Left folds for building a structure are inherently non-streaming
-- as the structure cannot be lazily consumed until fully built.
[ benchIOSink value "foldl'/build/IO" foldl'Build
, benchIdentitySink value "foldl'/build/Identity" foldl'Build
, benchIOSink value "foldlM'/build/IO" foldlM'Build
, benchIdentitySink
value
"foldlM'/build/Identity"
foldlM'Build
-- Reversing/sorting a stream
, benchIOSink value "reverse" (reverse 1)
, benchIOSink value "reverse'" (reverse' 1)
]
]
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -128,27 +128,9 @@ common bench-depends
library
import: lib-options, bench-depends
hs-source-dirs: lib
exposed-modules:
Streamly.Benchmark.Common
library lib-prelude
import: lib-options, bench-depends
hs-source-dirs: lib, .
exposed-modules:
Streamly.Benchmark.Prelude
other-modules: Streamly.Benchmark.Common
, Streamly.Benchmark.Prelude.NestedOps
, Streamly.Benchmark.Prelude.Generation
, Streamly.Benchmark.Prelude.Elimination
, Streamly.Benchmark.Prelude.Transformation
-- XXX GHCJS build fails for this library.
if impl(ghcjs)
buildable: False
else
build-depends: ghc-prim
buildable: True
hs-source-dirs: lib
exposed-modules: Streamly.Benchmark.Common
, Streamly.Benchmark.Prelude
-------------------------------------------------------------------------------
-- Benchmarks
@ -161,25 +143,21 @@ library lib-prelude
common bench-options
import: compile-options, optimization-options, bench-depends
ghc-options: -rtsopts
build-depends: streamly-benchmarks
build-depends: streamly-benchmarks == 0.0.0
-- Some benchmarks are threaded some are not
common bench-options-threaded
import: compile-options, optimization-options, bench-depends
-- -threaded and -N2 is important because some GC and space leak issues
-- trigger only with these options.
ghc-options: -threaded -with-rtsopts "-T -N2 -K32K -M16M"
build-depends: streamly-benchmarks
-- XXX the individual modules can just export a bunch of gauge Benchmark
-- grouped by space usage and then we can combine the groups in just four
-- different top level drivers.
ghc-options: -threaded -rtsopts -with-rtsopts "-N2"
build-depends: streamly-benchmarks == 0.0.0
-------------------------------------------------------------------------------
-- Serial Streams
-------------------------------------------------------------------------------
benchmark serial
benchmark Prelude.Serial
import: bench-options
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
@ -187,7 +165,76 @@ benchmark serial
if impl(ghcjs)
buildable: False
else
build-depends: lib-prelude
buildable: True
benchmark Prelude.WSerial
import: bench-options
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: WSerial.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.ZipSerial
import: bench-options
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: ZipSerial.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.ZipAsync
import: bench-options
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: ZipAsync.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.Ahead
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: Ahead.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.Async
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: Async.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.WAsync
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: WAsync.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Prelude.Parallel
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: Parallel.hs
if impl(ghcjs)
buildable: False
else
buildable: True
benchmark Data.Unfold
@ -291,47 +338,13 @@ executable nano-bench
-- Concurrent Streams
-------------------------------------------------------------------------------
benchmark linear-async
import: bench-options-threaded
type: exitcode-stdio-1.0
ghc-options: -with-rtsopts "-T -N2 -K64K -M16M"
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: LinearAsync.hs
if impl(ghcjs)
buildable: False
else
build-depends: lib-prelude
buildable: True
benchmark nested-concurrent
import: bench-options-threaded
type: exitcode-stdio-1.0
-- XXX this can be lowered once we split out the finite benchmarks
ghc-options: -with-rtsopts "-T -N2 -K256K -M128M"
hs-source-dirs: ., Streamly/Benchmark/Prelude
main-is: NestedConcurrent.hs
other-modules: Streamly.Benchmark.Prelude.NestedOps
benchmark parallel
import: bench-options-threaded
type: exitcode-stdio-1.0
ghc-options: -with-rtsopts "-T -N2 -K128K -M256M"
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: Parallel.hs
if impl(ghcjs)
buildable: False
else
build-depends: lib-prelude
buildable: True
benchmark concurrent
benchmark Prelude.Concurrent
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: Concurrent.hs
ghc-options: -with-rtsopts "-T -N2 -K256K -M384M"
benchmark adaptive
benchmark Prelude.Adaptive
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
@ -341,15 +354,14 @@ benchmark adaptive
else
buildable: True
benchmark linear-rate
benchmark Prelude.Rate
import: bench-options-threaded
type: exitcode-stdio-1.0
hs-source-dirs: Streamly/Benchmark/Prelude
main-is: LinearRate.hs
main-is: Rate.hs
if impl(ghcjs)
buildable: False
else
build-depends: lib-prelude
buildable: True
-------------------------------------------------------------------------------
@ -412,7 +424,6 @@ executable chart
build-Depends:
base >= 4.8 && < 5
, bench-show >= 0.3 && < 0.4
, split
, transformers >= 0.4 && < 0.6
else
buildable: False

133
bin/bench-exec-one.sh Executable file
View File

@ -0,0 +1,133 @@
#!/bin/bash
# Environment passed:
# BENCH_EXEC_PATH: the benchmark executable
# RTS_OPTIONS: additional RTS options
# QUICK_MODE: whether we are in quick mode
#------------------------------------------------------------------------------
# RTS Options
#------------------------------------------------------------------------------
# RTS options based on the benchmark executable
# RTS options chosen per benchmark executable.
# $1: executable name (basename). Prints the options without a trailing
# newline; prints nothing when the executable needs no special settings.
bench_exe_rts_opts () {
    local exe_name="$1"
    if [[ "$exe_name" == Prelude.Concurrent* ]]
    then
        printf '%s' "-K256K -M384M"
    fi
}
# General RTS options for different classes of benchmarks
# Default RTS options derived from the space-complexity bucket embedded
# in the benchmark name (e.g. "Prelude.Serial/o-1-space/...").
# $1: full benchmark name. Prints the options without a trailing newline.
bench_rts_opts_default () {
    local bench_name="$1"
    case "$bench_name" in
        */o-1-sp*) printf '%s' "-K36K -M16M" ;; # O(1) space
        */o-n-h*)  printf '%s' "-K36K -M32M" ;; # O(n) heap
        */o-n-st*) printf '%s' "-K1M -M16M" ;;  # O(n) stack
        */o-n-sp*) printf '%s' "-K1M -M32M" ;;  # O(n) space
        *)         printf '%s' "" ;;
    esac
}
# Overrides for specific benchmarks
# Per-benchmark RTS overrides; these take precedence over the defaults
# from bench_rts_opts_default. The first matching pattern wins, so more
# specific patterns must stay above more general ones.
# $1: full benchmark name. Prints the options without a trailing newline.
bench_rts_opts_specific () {
    local bench_name="$1"
    case "$bench_name" in
        Prelude.Parallel/o-n-heap/mapping/mapM) printf '%s' "-M256M" ;;
        #Prelude.Parallel/o-n-heap/monad-outer-product/toList) printf '%s' "-K-M256M" ;;
        Prelude.Parallel/o-n-heap/monad-outer-product/*) printf '%s' "-M256M" ;;
        Prelude.Parallel/o-n-space/monad-outer-product/*) printf '%s' "-K4M -M256M" ;;

        # grouping keeps the default limits, so it is listed before the
        # generic o-n-space override below
        Prelude.Serial/o-n-space/grouping/*) printf '%s' "" ;;
        Prelude.Serial/o-n-space/*) printf '%s' "-K4M" ;;
        Prelude.WSerial/o-n-space/*) printf '%s' "-K4M" ;;

        Prelude.Async/o-n-space/monad-outer-product/*) printf '%s' "-K4M" ;;
        Prelude.Ahead/o-n-space/monad-outer-product/*) printf '%s' "-K4M" ;;

        Prelude.WAsync/o-n-heap/monad-outer-product/toNull3) printf '%s' "-M64M" ;;
        Prelude.WAsync/o-n-space/monad-outer-product/*) printf '%s' "-K4M" ;;

        # XXX need to investigate these, taking too much stack
        Data.Parser.ParserD/o-1-space/some) printf '%s' "-K1M" ;;
        Data.Parser/o-1-space/some) printf '%s' "-K1M" ;;
        Data.Parser.ParserD/o-1-space/manyTill) printf '%s' "-K4M" ;;
        Data.Parser/o-1-space/manyTill) printf '%s' "-K4M" ;;

        Data.SmallArray/o-1-sp*) printf '%s' "-K128K" ;;
        *) printf '%s' "" ;;
    esac
}
#------------------------------------------------------------------------------
# Speed options
#------------------------------------------------------------------------------
# Gauge sampling options.
#
# QUICK_MODE is passed in the environment; default it to 0 (full mode)
# when unset/empty so that "test -eq" does not fail on a missing operand.
if test "${QUICK_MODE:-0}" -eq 0
then
  QUICK_OPTIONS="--min-samples 3"
fi
# Single iteration, including the first (cold) one -- for benchmarks that
# are too slow or too noisy for regular sampling.
SUPER_QUICK_OPTIONS="--min-samples 1 --include-first-iter"
# Extra gauge options applied to an entire benchmark executable; the
# executables listed here always run with minimal sampling.
# $1: executable name (basename). Prints the options without a newline.
bench_exe_quick_opts () {
    local exe_name="$1"
    case "$exe_name" in
        Prelude.Concurrent | Prelude.Rate | Prelude.Adaptive | fileio)
            printf '%s' "$SUPER_QUICK_OPTIONS" ;;
        *)
            printf '%s' "" ;;
    esac
}
# Use quick options for benchmarks that take too long
# Per-benchmark gauge options for benchmarks that take too long: the
# worst offenders get SUPER_QUICK_OPTIONS, the rest get QUICK_OPTIONS.
# The mapM pattern must stay above the generic mapping/* pattern.
# $1: full benchmark name. Prints the options without a trailing newline.
bench_quick_opts () {
    local bench_name="$1"
    case "$bench_name" in
        Prelude.Parallel/o-n-heap/mapping/mapM | \
        Prelude.Parallel/o-n-heap/monad-outer-product/* | \
        Prelude.Parallel/o-n-space/monad-outer-product/*)
            printf '%s' "$SUPER_QUICK_OPTIONS" ;;
        Prelude.Parallel/o-n-heap/generation/* | \
        Prelude.Parallel/o-n-heap/mapping/* | \
        Prelude.Parallel/o-n-heap/concat-foldable/* | \
        Prelude.Async/o-1-space/monad-outer-product/* | \
        Prelude.Async/o-n-space/monad-outer-product/* | \
        Prelude.Ahead/o-1-space/monad-outer-product/* | \
        Prelude.Ahead/o-n-space/monad-outer-product/* | \
        Prelude.WAsync/o-n-heap/monad-outer-product/* | \
        Prelude.WAsync/o-n-space/monad-outer-product/*)
            printf '%s' "$QUICK_OPTIONS" ;;
        *)
            printf '%s' "" ;;
    esac
}
# Pick the benchmark name off the command line: after this loop
# BENCH_NAME is the second-to-last argument and "last" the final one.
# NOTE(review): this assumes the caller appends one trailing argument
# after the benchmark name -- confirm against the invocation in bench.sh.
last=""
for i in "$@"
do
    BENCH_NAME="$last"
    last="$i"
done

# Resolve the executable name once; quoted so paths with spaces work.
BENCH_EXE_NAME=$(basename "$BENCH_EXEC_PATH")

# Compose the RTS options: executable-wide settings, then the
# space-complexity defaults, then per-benchmark overrides, then whatever
# the caller passed in RTS_OPTIONS (later words override earlier ones).
RTS_OPTIONS=\
"+RTS -T \
$(bench_exe_rts_opts "$BENCH_EXE_NAME") \
$(bench_rts_opts_default "$BENCH_NAME") \
$(bench_rts_opts_specific "$BENCH_NAME") \
$RTS_OPTIONS \
-RTS"

QUICK_BENCH_OPTIONS="\
$(bench_exe_quick_opts "$BENCH_EXE_NAME") \
$(bench_quick_opts "$BENCH_NAME")"

if test -n "$STREAM_SIZE"
then
    # "--" ends printf's option parsing so the literal "--stream-size"
    # in the format is not mistaken for an option (the previous "\--"
    # escape printed a stray backslash). %'.f needs a locale with
    # thousands grouping, hence the explicit LC_ALL.
    STREAM_LEN=$(env LC_ALL=en_US.UTF-8 printf -- "--stream-size %'.f\n" "$STREAM_SIZE")
fi

echo "$BENCH_NAME: \
$STREAM_LEN \
$QUICK_BENCH_OPTIONS \
$RTS_OPTIONS"

# Intentionally unquoted: RTS_OPTIONS and QUICK_BENCH_OPTIONS are word
# lists and must split into individual arguments.
$BENCH_EXEC_PATH $RTS_OPTIONS "$@" $QUICK_BENCH_OPTIONS

View File

@ -138,7 +138,6 @@ extra-source-files:
benchmark/README.md
benchmark/*.hs
benchmark/lib/Streamly/Benchmark/*.hs
benchmark/lib/Streamly/Benchmark/Prelude/*.hs
benchmark/Streamly/Benchmark/Memory/*.hs
benchmark/Streamly/Benchmark/Data/*.hs
benchmark/Streamly/Benchmark/Data/Prim/*.hs