diff --git a/.gitignore b/.gitignore
index 3a1149cf..e8e8e452 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ hie.yaml
 
 .stack-work/
 .stack-work-test/
+.stack-work-bench/
 stack.yaml.lock
 .hie/
 stan.html
diff --git a/scripts/benchmark-against-parent.sh b/scripts/benchmark-against-parent.sh
index bc2dfc34..d530d115 100755
--- a/scripts/benchmark-against-parent.sh
+++ b/scripts/benchmark-against-parent.sh
@@ -16,7 +16,8 @@ fi
 BASELINE_OUTPUT=baseline.csv
 
 git checkout HEAD~
-STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--csv $BASELINE_OUTPUT --color always"
+
+scripts/run-benchmarks.sh "--csv $BASELINE_OUTPUT"
 
 git switch -
-STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--baseline $BASELINE_OUTPUT --fail-if-slower 3 --color always"
\ No newline at end of file
+scripts/run-benchmarks.sh "--baseline $BASELINE_OUTPUT --fail-if-slower 3"
\ No newline at end of file
diff --git a/scripts/run-benchmarks.sh b/scripts/run-benchmarks.sh
new file mode 100755
index 00000000..1a5daba4
--- /dev/null
+++ b/scripts/run-benchmarks.sh
@@ -0,0 +1,7 @@
+#!/bin/bash -xe
+
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR/..
+
+STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--color always $@"
diff --git a/swarm.cabal b/swarm.cabal
index d3eef3ef..03e17946 100644
--- a/swarm.cabal
+++ b/swarm.cabal
@@ -798,8 +798,10 @@ benchmark benchmark
   type:           exitcode-stdio-1.0
   build-depends:  tasty-bench >= 0.3.1 && < 0.4,
                   base,
+                  extra,
                   lens,
                   mtl,
+                  text,
                   swarm:swarm-engine,
                   swarm:swarm-util,
                   swarm:swarm-lang,
diff --git a/test/bench/Benchmark.hs b/test/bench/Benchmark.hs
index fc15d4ff..c77e6db7 100644
--- a/test/bench/Benchmark.hs
+++ b/test/bench/Benchmark.hs
@@ -11,6 +11,8 @@ import Control.Monad (replicateM_)
 import Control.Monad.State (evalStateT, execStateT)
 import Data.Map qualified as M
 import Data.Sequence (Seq)
+import Data.Text qualified as T
+import Data.Tuple.Extra (dupe)
 import Swarm.Effect (runTimeIO)
 import Swarm.Game.CESK (emptyStore, initMachine)
 import Swarm.Game.Display (defaultRobotDisplay)
@@ -30,6 +32,7 @@ import Swarm.Language.Context qualified as Context
 import Swarm.Language.Pipeline (ProcessedTerm)
 import Swarm.Language.Pipeline.QQ (tmQ)
 import Swarm.Language.Syntax
+import Swarm.Util (parens, showT)
 import Swarm.Util.Effect (simpleErrorHandle)
 import Swarm.Util.Erasable
 import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, defaultMain, whnfAppIO)
@@ -157,38 +160,56 @@ runGame numGameTicks = evalStateT (replicateM_ numGameTicks $ runTimeIO gameTick
 
 main :: IO ()
 main = do
-  idlers <- mkGameStates idleProgram
-  trees <- mkGameStates treeProgram
-  circlers <- mkGameStates circlerProgram
-  movers <- mkGameStates moverProgram
-  wavesInlined <- mkGameStates (waveProgram True)
-  wavesWithDef <- mkGameStates (waveProgram False)
+  idlers <- mkGameStates largeRobotNumbers idleProgram
+  trees <- mkGameStates robotNumbers treeProgram
+  circlers <- mkGameStates robotNumbers circlerProgram
+  movers <- mkGameStates robotNumbers moverProgram
+  wavesInlined <- mkGameStates robotNumbers $ waveProgram True
+  wavesWithDef <- mkGameStates robotNumbers $ waveProgram False
   -- In theory we should force the evaluation of these game states to normal
   -- form before running the benchmarks. In practice, the first of the many
   -- criterion runs for each of these benchmarks doesn't look like an outlier.
   defaultMain
     [ bgroup
         "run 1000 game ticks"
-        [ bgroup "idlers" (toBenchmarks idlers)
-        , bgroup "trees" (toBenchmarks trees)
-        , bgroup "circlers" (toBenchmarks circlers)
-        , bgroup "movers" (toBenchmarks movers)
-        , bgroup "wavesInlined" (toBenchmarks wavesInlined)
+        [ bgroupTicks "idlers" 10000 idlers
+        , bgroupTicks "trees" 1000 trees
+        , bgroupTicks "circlers" 1000 circlers
+        , bgroupTicks "movers" 1000 movers
         , bgroup
-            "wavesWithDef"
-            ( zipWith (\i -> bcompare ("wavesInlined." <> show i)) robotNumbers $
-                toBenchmarks wavesWithDef
-            )
+            "waves comparison"
+            [ bgroup "wavesInlined" (toBenchmarks 1000 wavesInlined)
+            , bgroup
+                "wavesWithDef"
+                ( zipWith (\i -> bcompare ("wavesInlined." <> show i)) robotNumbers $
+                    toBenchmarks 1000 wavesWithDef
+                )
+            ]
         ]
     ]
  where
+  bgroupTicks label ticks bots =
+    bgroup newLabel $ toBenchmarks ticks bots
+   where
+    newLabel =
+      unwords
+        [ label
+        , T.unpack $
+            parens $
+              T.unwords
+                [ showT ticks
+                , "ticks"
+                ]
+        ]
+
   robotNumbers = [10, 20 .. 40]
+  largeRobotNumbers = take 4 $ iterate (* 2) 100
 
-  mkGameStates :: ProcessedTerm -> IO [(Int, GameState)]
-  mkGameStates prog = zip robotNumbers <$> mapM (mkGameState prog $ initRobot prog) robotNumbers
+  mkGameStates :: [Int] -> ProcessedTerm -> IO [(Int, GameState)]
+  mkGameStates botCounts prog = mapM (traverse (mkGameState prog $ initRobot prog) . dupe) botCounts
 
-  toBenchmarks :: [(Int, GameState)] -> [Benchmark]
-  toBenchmarks gameStates =
-    [ bench (show n) $ whnfAppIO (runGame 1000) gs
+  toBenchmarks :: Int -> [(Int, GameState)] -> [Benchmark]
+  toBenchmarks tickCount gameStates =
+    [ bench (show n) $ whnfAppIO (runGame tickCount) gs
     | (n, gs) <- gameStates
     ]