Cleanup, fix issues in bench.sh

2024-09-11 08:25:40 +03:00 · 2021-06-04 14:09:19 +05:30 · 2021-06-04 14:09:19 +05:30 · 03218ad71b
commit 03218ad71b
parent b68baf3c51
15 changed files with 232 additions and 177 deletions
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -1,3 +1,12 @@
+# Benchmarking
+
+## Benchmark Drivers
+
+Two benchmark drivers are supported:
+
+* `tasty-bench` (default)
+* `gauge` (enabled by the `use-gauge` cabal flag, or by `bench.sh --use-gauge`)
+
 ## Building a single benchmark suite

 ```
@ -30,15 +39,18 @@ $ cabal build --flag "-opt" ...

 ## Build and run single benchmarks:

-For quick results you may have to use `--quick` and possibly some other
-`gauge` options, or better use `bench.sh --quick` as described in the
-following sections.
+The benchmark executables are `tasty-bench` executables unless you
+passed `--flag use-gauge` to cabal when building, in which case they
+are `gauge` executables.
+
+For quick results you may have to use a large value for `--stdev` or use
+`bench.sh --quick` as described in the following sections.

 ```
-$ cabal run bench:Prelude.Serial -- --quick
+$ cabal run bench:Prelude.Serial -- --stdev 100000
 ```

-`cabal bench` can be used but you cannot pass arguments (like --quick):
+`cabal bench` can be used but you cannot pass arguments (like --stdev):

 ```
 $ cabal bench Prelude.Serial
@ -104,7 +116,7 @@ $ bench.sh --benchmarks "Prelude.Serial Data.Parser"
 Run all O(1) space complexity benchmarks in `Prelude.Serial` suite:

 ```
-$ bench.sh --benchmarks Prelude.Serial -- Prelude.Serial/o-1-space
+$ bench.sh --benchmarks Prelude.Serial --prefix Prelude.Serial/o-1-space
 ```

 Anything after a `--` is passed to the benchmark executable,
@ -114,13 +126,13 @@ it basically selects all benchmarks starting with
 Run a specific benchmark in `Prelude.Serial` suite:

 ```
-$ bench.sh --benchmarks Prelude.Serial -- Prelude.Serial/o-1-space/generation/unfoldr
+$ bench.sh --benchmarks Prelude.Serial --prefix Prelude.Serial/o-1-space/generation/unfoldr
 ```

 Run a benchmark directly instead of running it through `bench.sh`:

 ```
-$ cabal run bench:Prelude.Serial -- --quick Prelude.Serial/o-1-space/generation/unfoldr
+$ cabal run bench:Prelude.Serial -- Prelude.Serial/o-1-space/generation/unfoldr
 ```

 The options after `--` are the benchmark executable options.
@ -212,7 +224,7 @@ You can specify the stream size (default is 100000) to be used for
 benchmarking:

 ```
-$ cabal run bench:Prelude.Serial -- --quick --stream-size 1000000
+$ cabal run bench:Prelude.Serial -- --stream-size 1000000
 ```

 ### Unicode input
@ -222,7 +234,7 @@ environment variable:

 ```
 $ export Benchmark_FileSystem_Handle_InputFile=./gutenberg-500.txt
-$ cabal run FileSystem.Handle -- --quick FileSystem.Handle/o-1-space/reduce/read/S.splitOnSeq
+$ cabal run FileSystem.Handle -- FileSystem.Handle/o-1-space/reduce/read/S.splitOnSeq
 ```

 The automatic tests do not test unicode input, this option is useful to specify
@ -230,10 +242,9 @@ a unicode text file manually.

 ## Benchmarking notes

-We use gauge instead of criterion for benchmarking. We have fixed
-several issues in gauge inherited from criterion. We have added several
-features as well e.g. rusage stats, running benchmarks in an isolated
-process etc which are crucial to our benchmarking analysis process.
+We run each benchmark in an isolated process to minimize interference
+between benchmarks and to be able to control the RTS memory
+restrictions per benchmark.

 ### Gotchas

--- a/benchmark/Streamly/Benchmark/Data/Array/Stream/Foreign.hs
+++ b/benchmark/Streamly/Benchmark/Data/Array/Stream/Foreign.hs
@ -193,11 +193,11 @@ moduleName = "Data.Array.Stream.Foreign"
 main :: IO ()
 main = do
    env <- mkHandleBenchEnv
-    runWithCLIOpts defaultStreamSize (allBenchmarks env)
+    defaultMain (allBenchmarks env)

    where

-    allBenchmarks env _ =
+    allBenchmarks env =
        [ bgroup (o_1_space_prefix moduleName)
          ( o_1_space_read_chunked env
          ++ o_1_space_copy_toChunks_group_ungroup env
--- a/benchmark/Streamly/Benchmark/Data/Parser.hs
+++ b/benchmark/Streamly/Benchmark/Data/Parser.hs
@ -395,25 +395,15 @@ o_n_heap_serial value =
 -- Driver
 -------------------------------------------------------------------------------

-getArray :: (Int -> [Benchmark]) -> IO [Array.Array Int]
-#ifndef MIN_VERSION_gauge
-getArray f = do
-    (value, _) <- parseCLIOpts defaultStreamSize $ bgroup "All" (f 0)
-    IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
-#else
-getArray _ = do
-    (value, _, _) <- parseCLIOpts defaultStreamSize
-    IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
-#endif
-
 main :: IO ()
 main = do
    env <- mkHandleBenchEnv
-    arrays <- getArray (allBenchmarks env [])
-    runWithCLIOpts defaultStreamSize (allBenchmarks env arrays)
+    runWithCLIOptsEnv defaultStreamSize alloc (allBenchmarks env)

    where

+    alloc value = IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
+
    allBenchmarks env arrays value =
        [ bgroup (o_1_space_prefix moduleName) (o_1_space_serial value)
        , bgroup
--- a/benchmark/Streamly/Benchmark/Data/Parser/ParserD.hs
+++ b/benchmark/Streamly/Benchmark/Data/Parser/ParserD.hs
@ -5,7 +5,6 @@
 -- License     : BSD-3-Clause
 -- Maintainer  : streamly@composewell.com

-{-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# OPTIONS_GHC -fspec-constr-recursive=4 #-}
@ -359,24 +358,13 @@ o_n_space_serial value =
 -- Driver
 -------------------------------------------------------------------------------

-getArray :: (Int -> [Benchmark]) -> IO [Array.Array Int]
-#ifndef MIN_VERSION_gauge
-getArray f = do
-    (value, _) <- parseCLIOpts defaultStreamSize $ bgroup "All" (f 0)
-    IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
-#else
-getArray _ = do
-    (value, _, _) <- parseCLIOpts defaultStreamSize
-    IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
-#endif
-
 main :: IO ()
-main = do
-    arrays <- getArray (allBenchmarks [])
-    runWithCLIOpts defaultStreamSize (allBenchmarks arrays)
+main = runWithCLIOptsEnv defaultStreamSize alloc allBenchmarks

    where

+    alloc value = IP.toList $ IP.arraysOf 100 $ sourceUnfoldrM value 0
+
    allBenchmarks arraysSmall value =
        [ bgroup (o_1_space_prefix moduleName) (o_1_space_serial value)
        , bgroup (o_1_space_prefix moduleName) (o_1_space_serial_spanning value)
--- a/benchmark/Streamly/Benchmark/FileSystem/Handle.hs
+++ b/benchmark/Streamly/Benchmark/FileSystem/Handle.hs
@ -44,11 +44,11 @@ moduleName = "FileSystem.Handle"
 main :: IO ()
 main = do
    env <- mkHandleBenchEnv
-    runWithCLIOpts defaultStreamSize (allBenchmarks env)
+    defaultMain (allBenchmarks env)

    where

-    allBenchmarks env _ =
+    allBenchmarks env =
        [ bgroup (o_1_space_prefix moduleName) $ Prelude.concat
            [ RO.allBenchmarks env
            , RW.allBenchmarks env
--- a/benchmark/Streamly/Benchmark/Prelude/Concurrent.hs
+++ b/benchmark/Streamly/Benchmark/Prelude/Concurrent.hs
@ -1,3 +1,5 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE RankNTypes #-}
 -- |
 -- Module      : Main
 -- Copyright   : (c) 2018 Composewell Technologies
@ -5,8 +7,6 @@
 -- License     : BSD3
 -- Maintainer  : streamly@composewell.com

-{-# LANGUAGE RankNTypes #-}
-
 import Control.Concurrent
 import Control.Monad (when, replicateM)
 import Streamly.Prelude
@ -78,7 +78,17 @@ concatGroup buflen threads usec n =

 main :: IO ()
 main =
+#ifdef MIN_VERSION_gauge
+  defaultMainWith (defaultConfig
+    { timeLimit = Just 0
+    , minSamples = Just 1
+    , minDuration = 0
+    , includeFirstIter = True
+    , quickMode = True
+    })
+#else
    defaultMain
+#endif

    [ -- bgroup "append/buf-1-threads-10k-0sec"  (appendGroup 1 10000 0)
    -- , bgroup "append/buf-100-threads-100k-0sec"  (appendGroup 100 100000 0)
--- a/benchmark/Streamly/Benchmark/Prelude/Rate.hs
+++ b/benchmark/Streamly/Benchmark/Prelude/Rate.hs
@ -1,5 +1,4 @@
 {-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE CPP #-}

 -- |
 -- Module      : Main
--- a/benchmark/Streamly/Benchmark/Prelude/WAsync.hs
+++ b/benchmark/Streamly/Benchmark/Prelude/WAsync.hs
@ -5,8 +5,6 @@
 -- License     : BSD3
 -- Maintainer  : streamly@composewell.com

-{-# LANGUAGE CPP #-}
-
 import Prelude hiding (mapM)

 import Streamly.Prelude (fromWAsync, fromSerial, wAsync, maxBuffer, maxThreads)
@ -170,7 +168,7 @@ o_n_space_outerProduct value =

 main :: IO ()
 main = runWithCLIOpts defaultStreamSize allBenchmarks
-    
+
    where

    allBenchmarks value =
--- a/benchmark/Streamly/Benchmark/Prelude/ZipAsync.hs
+++ b/benchmark/Streamly/Benchmark/Prelude/ZipAsync.hs
@ -6,7 +6,6 @@
 -- Maintainer  : streamly@composewell.com

 {-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE CPP #-}

 import Streamly.Prelude (fromSerial)
 import qualified Streamly.Prelude  as S
--- a/benchmark/Streamly/Benchmark/Prelude/ZipSerial.hs
+++ b/benchmark/Streamly/Benchmark/Prelude/ZipSerial.hs
@ -60,8 +60,8 @@ zipWith count n =
    S.drain $
    S.zipWith
        (,)
-        (S.fromSerial $ Main.sourceUnfoldrM count n)
-        (S.fromSerial $ Main.sourceUnfoldrM count (n + 1))
+        (S.fromSerial $ sourceUnfoldrM count n)
+        (S.fromSerial $ sourceUnfoldrM count (n + 1))

 #ifdef INSPECTION
 inspect $ hasNoTypeClasses 'zipWith
@ -75,8 +75,8 @@ zipWithM count n =
    S.drain $
    S.zipWithM
        (curry return)
-        (Main.sourceUnfoldrM count n)
-        (Main.sourceUnfoldrM count (n + 1))
+        (sourceUnfoldrM count n)
+        (sourceUnfoldrM count (n + 1))

 #ifdef INSPECTION
 inspect $ hasNoTypeClasses 'zipWithM
@ -125,7 +125,7 @@ o_1_space_outerProduct value =
 -- passed using the --stream-size option.
 --
 main :: IO ()
-main = runWithCLIOpts defaultStreamSize allBenchmarks    
+main = runWithCLIOpts defaultStreamSize allBenchmarks

    where

--- a/benchmark/lib/Streamly/Benchmark/Common.hs
+++ b/benchmark/lib/Streamly/Benchmark/Common.hs
@ -16,8 +16,7 @@ module Streamly.Benchmark.Common
    , o_n_heap_prefix
    , o_n_stack_prefix

-   -- , parseEnvOpts
-    , parseCLIOpts
+    , runWithCLIOptsEnv
    , runWithCLIOpts

    , benchIOSink1
@ -33,14 +32,6 @@ module Streamly.Benchmark.Common
    , mkListString

    , defaultStreamSize
-    , BenchOpts(..)
-#ifndef MIN_VERSION_gauge
-    , OptionDescription(..)
-    , includingOptions
-    , lookupOption
-    , defaultMainWithIngredients
-    , parseOptions
-#endif
    )
 where

@ -54,10 +45,11 @@ import System.Console.GetOpt
       (OptDescr(..), ArgDescr(..), ArgOrder(..), getOpt')
 import System.Environment (getArgs, lookupEnv, setEnv)
 #else
-import Data.Proxy
-import Test.Tasty.Ingredients.Basic
+import Data.Proxy (Proxy(..))
+import Test.Tasty.Ingredients.Basic (includingOptions)
 import Test.Tasty.Options
-import Test.Tasty.Runners
+    (IsOption(..), OptionDescription(..), lookupOption, safeRead)
+import Test.Tasty.Runners (Ingredient, defaultMainWithIngredients, parseOptions)
 #endif
 import Control.DeepSeq (NFData(..))
 import Data.Functor.Identity (Identity, runIdentity)
@ -230,27 +222,37 @@ instance IsOption BenchOpts where
    defaultValue = StreamSize defaultStreamSize
    parseValue = fmap StreamSize . safeRead
    optionName = pure "stream-size"
-    optionHelp = pure "StreamSize used in benchmarks"
+    optionHelp = pure "Size of the stream to be used in benchmarks"

 parseCLIOpts :: Int -> Benchmark -> IO (Int, [Ingredient])
-parseCLIOpts cDefSize benches = do
+parseCLIOpts defStreamSize benches = do
    let customOpts  = [Test.Tasty.Options.Option (Proxy :: Proxy BenchOpts)]
        ingredients = includingOptions customOpts : benchIngredients
-    opts <- parseOptions ingredients benches
+    opts <- parseOptions ingredients benches -- (TestGroup "" [])
    let StreamSize size = lookupOption opts
-    print $ "Stream-Size = " ++ show size
-    if size == defaultStreamSize        -- LONG option is not set
-    then return (cDefSize, ingredients) -- use custom defaut size of Benchmark
-    else return (size, ingredients)     -- LONG option is set use large stream size
+    -- putStrLn $ "Stream size: " ++ show size
+    if size == defaultStreamSize
+    then return (defStreamSize, ingredients)
+    else return (size, ingredients)
+#endif
+
+runWithCLIOptsEnv :: Int -> (Int -> IO a) -> (a -> Int -> [Benchmark]) -> IO ()
+runWithCLIOptsEnv defStreamSize alloc mkBench = do
+
+#ifdef MIN_VERSION_gauge
+    (value, cfg, benches) <- parseCLIOpts defStreamSize
+    r <- alloc value
+    value `seq` runMode (mode cfg) cfg benches (mkBench r value)
+#else
+    (value, ingredients) <-
+        parseCLIOpts defStreamSize $ bgroup "All" (mkBench undefined 0)
+    r <- alloc value
+    value `seq` defaultMainWithIngredients ingredients
+        $ bgroup "All" (mkBench r value)
 #endif

 runWithCLIOpts :: Int -> (Int -> [Benchmark]) -> IO ()
-runWithCLIOpts cDefSize f = do
-
-#ifdef MIN_VERSION_gauge
-    (value, cfg, benches) <- parseCLIOpts cDefSize
-    value `seq` runMode (mode cfg) cfg benches (f value)
-#else
-    (value, ingredients) <- parseCLIOpts cDefSize $ bgroup "All" (f 0)
-    value `seq` defaultMainWithIngredients ingredients $ bgroup "All" (f value)
-#endif
+runWithCLIOpts defStreamSize f =
+    runWithCLIOptsEnv defStreamSize
+        (const $ return undefined)
+        (\_ v -> f v)
--- a/benchmark/lib/Streamly/Benchmark/Common/Handle.hs
+++ b/benchmark/lib/Streamly/Benchmark/Common/Handle.hs
@ -106,7 +106,7 @@ getHandles env mkHandles = do
            }

    -- update
-    writeIORef (href env) $ refHandles
+    writeIORef (href env) refHandles
    return $ mkHandles refHandles

 mkBenchCommon ::
--- a/benchmark/streamly-benchmarks.cabal
+++ b/benchmark/streamly-benchmarks.cabal
@ -124,13 +124,13 @@ common bench-depends
    , directory         >= 1.2.2 && < 1.4
    , ghc-prim          >= 0.4   && < 0.8

-  if flag(use-gauge)    
+  if flag(use-gauge)
    build-depends:  gauge >= 0.2.4 && < 0.3
-  else    
+  else
    build-depends:    tasty-bench >= 0.2.5 && < 0.3
                    , tasty     >= 1.4.1
    mixins: tasty-bench
-      (Test.Tasty.Bench as Gauge     
+      (Test.Tasty.Bench as Gauge
      , Test.Tasty.Bench as Gauge.Main
      )

--- a/bin/bench-exec-one.sh
+++ b/bin/bench-exec-one.sh
@ -4,6 +4,8 @@
 # BENCH_EXEC_PATH: the benchmark executable
 # RTS_OPTIONS: additional RTS options
 # QUICK_MODE: whether we are in quick mode
+# USE_GAUGE: whether to use gauge or tasty-bench
+# LONG: whether to use a large stream size

 # $1: message
 die () {
@ -42,6 +44,9 @@ bench_rts_opts_default () {
 }

 # Overrides for specific benchmarks
+# XXX Note: for tasty-bench we replace the "." separator in the benchmark names
+# with "/" for matching with this. It may not work reliably if the benchmark
+# name already contains ".".
 bench_rts_opts_specific () {
  case "$1" in
    Data.Stream.StreamD/o-n-space/elimination/toList) echo -n "-K2M" ;;
@ -103,43 +108,46 @@ bench_rts_opts_specific () {
 if test "$USE_GAUGE" -eq 0
 then
  SUPER_QUICK_OPTIONS="--stdev 1000000"
-  QUICKER_OPTIONS="--stdev 1000"
+  QUICKER_OPTIONS="--stdev 100"
 else
  # Do not keep time limit as 0 otherwise GC stats may remain 0 in some cases.
  SUPER_QUICK_OPTIONS="--quick --min-duration 0 --time-limit 0.01 --include-first-iter"
  QUICKER_OPTIONS="--min-samples 3 --time-limit 1"
 fi

+# tasty-bench does not like an option set twice
+set_super_quick_mode () {
+    echo -n super_quick
+}
+
 # For certain long benchmarks if the user has not requested super quick
 # mode we anyway use a slightly quicker mode.
 use_quicker_mode () {
-  if test -n "$QUICK_MODE"
+  if test "$QUICK_MODE" -eq 0
  then
-    if test "$QUICK_MODE" -eq 0
-    then
-      echo $QUICKER_OPTIONS
-    fi
+    echo quicker
  fi
 }

 bench_exe_quick_opts () {
  case "$1" in
-    Prelude.Concurrent) echo -n "$SUPER_QUICK_OPTIONS" ;;
-    Prelude.Rate) echo -n "$SUPER_QUICK_OPTIONS" ;;
-    Prelude.Adaptive) echo -n "$SUPER_QUICK_OPTIONS" ;;
+    Prelude.Concurrent) set_super_quick_mode ;;
+    Prelude.Rate) set_super_quick_mode ;;
+    Prelude.Adaptive) set_super_quick_mode;;
    *) echo -n "" ;;
  esac
 }

+# XXX Note: for tasty-bench we replace the "." separator in the benchmark names
+# with "/" for matching with this. It may not work reliably if the benchmark
+# name already contains ".".
+
 # Use quick options for benchmarks that take too long
 bench_quick_opts () {
  case "$1" in
-    Prelude.Parallel/o-n-heap/mapping/mapM)
-        echo -n "$SUPER_QUICK_OPTIONS" ;;
-    Prelude.Parallel/o-n-heap/monad-outer-product/*)
-        echo -n "$SUPER_QUICK_OPTIONS" ;;
-    Prelude.Parallel/o-n-space/monad-outer-product/*)
-        echo -n "$SUPER_QUICK_OPTIONS" ;;
+    Prelude.Parallel/o-n-heap/mapping/mapM) set_super_quick_mode ;;
+    Prelude.Parallel/o-n-heap/monad-outer-product/*) set_super_quick_mode ;;
+    Prelude.Parallel/o-n-space/monad-outer-product/*) set_super_quick_mode ;;
    Prelude.Parallel/o-n-heap/generation/*) use_quicker_mode ;;
    Prelude.Parallel/o-n-heap/mapping/*) use_quicker_mode ;;
    Prelude.Parallel/o-n-heap/concat-foldable/*) use_quicker_mode ;;
@ -163,11 +171,12 @@ bench_output_file() {
    echo "charts/$bench_name/results.csv"
 }

-BENCH_NAME_ORIG=""
-for i in "$@"
-do
-    BENCH_NAME_ORIG="$i"
-done
+#------------------------------------------------------------------------------
+# Determine options from benchmark name
+#------------------------------------------------------------------------------
+
+BENCH_NAME_ORIG="$1"
+shift

 if test "$USE_GAUGE" -eq 0
 then
@ -178,16 +187,8 @@ then
  BENCH_NAME1=$(echo $BENCH_NAME | cut -f1 -d '/')
  BENCH_NAME2=$(echo $BENCH_NAME | cut -f2- -d '/' | sed -e 's/\./\//g')
  BENCH_NAME="$BENCH_NAME1/$BENCH_NAME2"
-  JOB_OPT=" -j 1"
 else
  BENCH_NAME=$BENCH_NAME_ORIG
-  JOB_OPT=""
-fi
-if test "$LONG" -eq 0
-then
-  SIZE_OPT=""
-else
-  SIZE_OPT="--stream-size 1000000"
 fi

 RTS_OPTIONS=\
@ -198,28 +199,65 @@ $(bench_rts_opts_specific $BENCH_NAME) \
 $RTS_OPTIONS \
 -RTS"

-QUICK_BENCH_OPTIONS="\
-$(if test "$QUICK_MODE" -ne 0; then echo $SUPER_QUICK_OPTIONS; else :; fi)
+QUICK_MODE_TYPE="\
+$(if test "$QUICK_MODE" -ne 0; then set_super_quick_mode; fi) \
 $(bench_exe_quick_opts $(basename $BENCH_EXEC_PATH)) \
 $(bench_quick_opts $BENCH_NAME)"

-output_file=$(bench_output_file $(basename $BENCH_EXEC_PATH))
-mkdir -p `dirname $output_file`
+for i in $QUICK_MODE_TYPE
+do
+  case "$i" in
+    super_quick) QUICK_BENCH_OPTIONS="$SUPER_QUICK_OPTIONS"; break ;;
+    quicker) QUICK_BENCH_OPTIONS="$QUICKER_OPTIONS"; break ;;
+  esac
+done
+
+if test "$LONG" -ne 0
+then
+  STREAM_SIZE=10000000
+  STREAM_LEN=$(env LC_ALL=en_US.UTF-8 printf "--stream-size %'.f\n" $STREAM_SIZE)
+  STREAM_SIZE_OPT="--stream-size $STREAM_SIZE"
+fi

 echo "$BENCH_NAME: \
-$QUICK_BENCH_OPTIONS \
-$RTS_OPTIONS"
+$RTS_OPTIONS \
+$STREAM_LEN \
+$QUICK_BENCH_OPTIONS" \
+"$@"

+#------------------------------------------------------------------------------
+# Run benchmark with options and collect results
+#------------------------------------------------------------------------------
+
+output_file=$(bench_output_file $(basename $BENCH_EXEC_PATH))
+mkdir -p `dirname $output_file`
 rm -f ${output_file}.tmp
+
 if test $USE_GAUGE -eq 0
 then
+  # Escape "\" and double quotes in benchmark names
  BENCH_NAME_ESC=$(echo "$BENCH_NAME_ORIG" | sed -e 's/\\/\\\\/g' | sed -e 's/"/\\"/g')
-  $BENCH_EXEC_PATH $SIZE_OPT $JOB_OPT $RTS_OPTIONS $QUICK_BENCH_OPTIONS --csv=${output_file}.tmp \
+  $BENCH_EXEC_PATH \
+    -j 1 \
+    $RTS_OPTIONS \
+    $STREAM_SIZE_OPT \
+    $QUICK_BENCH_OPTIONS \
+    "$@" \
+    --csv=${output_file}.tmp \
    -p '$0 == "'"$BENCH_NAME_ESC"'"'
+
+  # Convert cpuTime field from picoseconds to seconds
  tail -n +2 ${output_file}.tmp | \
-    awk 'BEGIN {FPAT = "([^,]+)|(\"[^\"]+\")";OFS=","} {$2=$2/1000000000000;print}' >> $output_file
+    awk 'BEGIN {FPAT = "([^,]+)|(\"[^\"]+\")";OFS=","} {$2=$2/1000000000000;print}' \
+    >> $output_file
 else
-  $BENCH_EXEC_PATH $SIZE_OPT $RTS_OPTIONS $QUICK_BENCH_OPTIONS --csvraw=${output_file}.tmp \
+  $BENCH_EXEC_PATH \
+    $RTS_OPTIONS \
+    $STREAM_SIZE_OPT \
+    $QUICK_BENCH_OPTIONS \
+    "$@" \
+    --csvraw=${output_file}.tmp \
    -m exact "$BENCH_NAME"
-  tail -n +2 ${output_file}.tmp >> $output_file
+  tail -n +2 ${output_file}.tmp \
+    >> $output_file
 fi
--- a/bin/bench.sh
+++ b/bin/bench.sh
@ -8,6 +8,7 @@ source $SCRIPT_DIR/build-lib.sh
 print_help () {
  echo "Usage: $0 "
  echo "       [--benchmarks <"bench1 bench2 ..." | help>]"
+  echo "       [--prefix <benchmark name prefix to match>"
  echo "       [--fields <"field1 field2 ..." | help>]"
  echo "       [--graphs]"
  echo "       [--no-measure]"
@ -142,11 +143,58 @@ bench_output_file() {
    echo "charts/$bench_name/results.csv"
 }

+invoke_gauge () {
+    local target_prog="$1"
+    local target_name="$2"
+    local output_file="$3"
+
+    local MATCH=""
+    if test "$LONG" -ne 0
+    then
+      MATCH="$target_name/o-1-space"
+    else
+      MATCH="$BENCH_PREFIX"
+    fi
+    echo "name,iters,time,cycles,cpuTime,utime,stime,maxrss,minflt,majflt,nvcsw,nivcsw,allocated,numGcs,bytesCopied,mutatorWallSeconds,mutatorCpuSeconds,gcWallSeconds,gcCpuSeconds" >> $output_file
+    # keep only benchmark names with shortest prefix e.g. "a/b/c" and "a/b", we
+    # should only keep "a/b" otherwise benchmarks will run multiple times. why?
+    $target_prog -l \
+      | grep "^$target_name" \
+      | grep "^$MATCH" \
+      | sort \
+      | awk 'BEGIN {prev="XXX"} {if (substr($0,1,length(prev)) != prev) {print $0; prev=$0}}' \
+      | while read -r name; \
+  do bin/bench-exec-one.sh "$name" "${GAUGE_ARGS[@]}"; done
+}
+
+invoke_tasty_bench () {
+    local target_prog="$1"
+    local target_name="$2"
+    local output_file="$3"
+
+    local MATCH=""
+    if test "$LONG" -ne 0
+    then
+      MATCH="-p /$target_name\/o-1-space/"
+    else
+        if test -n "$BENCH_PREFIX"
+        then
+          # escape "/"
+          local escaped_name=$(echo "$BENCH_PREFIX" | sed -e 's/\//\\\//g')
+          MATCH="-p /$escaped_name/"
+        fi
+    fi
+    echo "Name,cpuTime,2*Stdev (ps),Allocated,bytesCopied" >> $output_file
+    $target_prog -l $MATCH \
+      | grep "^All" \
+      | while read -r name; \
+          do bin/bench-exec-one.sh "$name" "${GAUGE_ARGS[@]}"; done
+}
+
 run_bench_target () {
  local package_name=$1
  local component=$2
  local target_name=$3
-  local output_file=$(bench_output_file $target_name)

  local target_prog
  target_prog=$(cabal_target_prog $package_name $component $target_name) || \
@ -156,46 +204,16 @@ run_bench_target () {

  # Needed by bench-exec-one.sh
  export BENCH_EXEC_PATH=$target_prog
-  if test "$LONG" -ne 0
-  then
-      BENCH_ARGS="-p /$target_name\/o-1-space/"
-      STREAM_SIZE=10000000
-      export STREAM_SIZE
-  fi
+  export RTS_OPTIONS
+  export QUICK_MODE
+  export USE_GAUGE
+  export LONG

-  local MATCH=""
+  local output_file=$(bench_output_file $target_name)
+  mkdir -p `dirname $output_file`
  if test "$USE_GAUGE" -eq 0
-  then
-    if test "$LONG" -ne 0
-    then
-      MATCH="-p /$target_name\/o-1-space/"
-    else
-        if test -n "$GAUGE_ARGS"
-        then
-          local GAUGE_ARGS1=$(echo "$GAUGE_ARGS" | sed -e 's/\//\\\//g')
-          MATCH="-p /$GAUGE_ARGS1/"
-        fi
-    fi
-    echo "Name,cpuTime,2*Stdev (ps),Allocated,bytesCopied" >> $output_file
-    $target_prog -l $MATCH \
-      | grep "^All" \
-      | while read -r name; do bin/bench-exec-one.sh "$name"; done
-  else
-    if test "$LONG" -ne 0
-    then
-      MATCH="$target_name/o-1-space"
-    else
-      MATCH="$GAUGE_ARGS"
-    fi
-    echo "name,iters,time,cycles,cpuTime,utime,stime,maxrss,minflt,majflt,nvcsw,nivcsw,allocated,numGcs,bytesCopied,mutatorWallSeconds,mutatorCpuSeconds,gcWallSeconds,gcCpuSeconds" >> $output_file
-    # XXX We may have to use "sort | awk" to keep only benchmark names with
-    # shortest prefix e.g. "a/b/c" and "a/b", we should only keep "a/b"
-    # otherwise benchmarks will run multiple times.
-    $target_prog -l \
-      | grep "^$target_name" \
-      | grep "^$MATCH" \
-      | sort | paste -sd "," - | awk 'BEGIN {FS=","} {t="XU987"; for(i=1;i<=NF;i++) if (substr($i,1,length(t)) != t) {print $i; t=$i}}' \
-      | while read -r name; do bin/bench-exec-one.sh "$name"; done
+  then invoke_tasty_bench "$target_prog" "$target_name" "$output_file"
+  else invoke_gauge "$target_prog" "$target_name" "$output_file"
  fi
 }

@ -284,7 +302,6 @@ run_reports() {
 cd $SCRIPT_DIR/..

 USE_GAUGE=0
-export USE_GAUGE
 USE_GIT_CABAL=1
 set_common_vars

@ -318,6 +335,7 @@ do
    # options with arguments
    --benchmarks) shift; TARGETS=$1; shift ;;
    --targets) shift; TARGETS=$1; shift ;;
+    --prefix) shift; BENCH_PREFIX="$1" shift ;;
    --fields) shift; FIELDS=$1; shift ;;
    --base) shift; BASE=$1; shift ;;
    --candidate) shift; CANDIDATE=$1; shift ;;
@ -334,13 +352,13 @@ do
    --long) LONG=1; shift ;;
    --graphs) GRAPH=1; shift ;;
    --no-measure) MEASURE=0; shift ;;
-    --dev-build) RUNNING_DEVBUILD=1; shift ;;    
+    --dev-build) RUNNING_DEVBUILD=1; shift ;;
+    --use-gauge) USE_GAUGE=1; shift ;;
    --) shift; break ;;
-    -*|--*) echo "Unknown flags: $*"; echo; print_help ;;
-    *) break ;;
+    *) echo "Unknown flags: $*"; echo; print_help ;;
  esac
 done
-GAUGE_ARGS=$*
+GAUGE_ARGS=("$@")

 set_derived_vars

@ -414,13 +432,15 @@ build_report_progs
 # Build and run targets
 #-----------------------------------------------------------------------------

-BUILD_BENCH="$CABAL_EXECUTABLE v2-build $CABAL_BUILD_OPTIONS --enable-benchmarks"
+if test "$USE_GAUGE" -eq 1
+then
+  BUILD_FLAGS="--flag use-gauge"
+fi
+
+BUILD_BENCH="$CABAL_EXECUTABLE v2-build $BUILD_FLAGS $CABAL_BUILD_OPTIONS --enable-benchmarks"
 if test "$MEASURE" = "1"
 then
  run_build "$BUILD_BENCH" streamly-benchmarks bench "$TARGETS"
-  export QUICK_MODE
-  export RTS_OPTIONS
-  export LONG
  run_measurements "$TARGETS"
 fi