tweak benchmarks (#1754)

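In support of #1598 (for #1739). Runs the `idle` benchmark for ten times as many ticks (10000 instead of 1000) and with many more robots (100, 200, 400, and 800 instead of 10, 20, 30, and 40) to try to emphasize the effect of regressions and mitigate jitter. Also factors the common `stack bench` invocation out of the `benchmark-against-parent.sh` script into a new `scripts/run-benchmarks.sh` script.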
2024-08-16 18:10:23 +03:00 · 2024-01-29 10:38:57 -08:00 · 2024-01-29 10:38:57 -08:00 · 0c45811755
commit 0c45811755
parent e7b8cfba31
5 changed files with 54 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@ hie.yaml

 .stack-work/
 .stack-work-test/
+.stack-work-bench/
 stack.yaml.lock
 .hie/
 stan.html
--- a/scripts/benchmark-against-parent.sh
+++ b/scripts/benchmark-against-parent.sh
@ -16,7 +16,8 @@ fi
 BASELINE_OUTPUT=baseline.csv

 git checkout HEAD~
-STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--csv $BASELINE_OUTPUT --color always"
+
+scripts/run-benchmarks.sh "--csv $BASELINE_OUTPUT"

 git switch -
-STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--baseline $BASELINE_OUTPUT --fail-if-slower 3 --color always"
+scripts/run-benchmarks.sh "--baseline $BASELINE_OUTPUT --fail-if-slower 3"
--- a/scripts/run-benchmarks.sh
+++ b/scripts/run-benchmarks.sh
@ -0,0 +1,7 @@
+#!/bin/bash -xe
+
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR/..
+
+STACK_WORK=.stack-work-bench stack bench swarm:benchmark --benchmark-arguments "--color always $@"
--- a/swarm.cabal
+++ b/swarm.cabal
@ -798,8 +798,10 @@ benchmark benchmark
  type:           exitcode-stdio-1.0
  build-depends:  tasty-bench >= 0.3.1 && < 0.4,
                  base,
+                  extra,
                  lens,
                  mtl,
+                  text,
                  swarm:swarm-engine,
                  swarm:swarm-util,
                  swarm:swarm-lang,
--- a/test/bench/Benchmark.hs
+++ b/test/bench/Benchmark.hs
@ -11,6 +11,8 @@ import Control.Monad (replicateM_)
 import Control.Monad.State (evalStateT, execStateT)
 import Data.Map qualified as M
 import Data.Sequence (Seq)
+import Data.Text qualified as T
+import Data.Tuple.Extra (dupe)
 import Swarm.Effect (runTimeIO)
 import Swarm.Game.CESK (emptyStore, initMachine)
 import Swarm.Game.Display (defaultRobotDisplay)
@ -30,6 +32,7 @@ import Swarm.Language.Context qualified as Context
 import Swarm.Language.Pipeline (ProcessedTerm)
 import Swarm.Language.Pipeline.QQ (tmQ)
 import Swarm.Language.Syntax
+import Swarm.Util (parens, showT)
 import Swarm.Util.Effect (simpleErrorHandle)
 import Swarm.Util.Erasable
 import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, defaultMain, whnfAppIO)
@ -157,38 +160,56 @@ runGame numGameTicks = evalStateT (replicateM_ numGameTicks $ runTimeIO gameTick

 main :: IO ()
 main = do
-  idlers <- mkGameStates idleProgram
-  trees <- mkGameStates treeProgram
-  circlers <- mkGameStates circlerProgram
-  movers <- mkGameStates moverProgram
-  wavesInlined <- mkGameStates (waveProgram True)
-  wavesWithDef <- mkGameStates (waveProgram False)
+  idlers <- mkGameStates largeRobotNumbers idleProgram
+  trees <- mkGameStates robotNumbers treeProgram
+  circlers <- mkGameStates robotNumbers circlerProgram
+  movers <- mkGameStates robotNumbers moverProgram
+  wavesInlined <- mkGameStates robotNumbers $ waveProgram True
+  wavesWithDef <- mkGameStates robotNumbers $ waveProgram False
  -- In theory we should force the evaluation of these game states to normal
  -- form before running the benchmarks. In practice, the first of the many
  -- criterion runs for each of these benchmarks doesn't look like an outlier.
  defaultMain
    [ bgroup
        "run 1000 game ticks"
-        [ bgroup "idlers" (toBenchmarks idlers)
-        , bgroup "trees" (toBenchmarks trees)
-        , bgroup "circlers" (toBenchmarks circlers)
-        , bgroup "movers" (toBenchmarks movers)
-        , bgroup "wavesInlined" (toBenchmarks wavesInlined)
+        [ bgroupTicks "idlers" 10000 idlers
+        , bgroupTicks "trees" 1000 trees
+        , bgroupTicks "circlers" 1000 circlers
+        , bgroupTicks "movers" 1000 movers
        , bgroup
-            "wavesWithDef"
-            ( zipWith (\i -> bcompare ("wavesInlined." <> show i)) robotNumbers $
-                toBenchmarks wavesWithDef
-            )
+            "waves comparison"
+            [ bgroup "wavesInlined" (toBenchmarks 1000 wavesInlined)
+            , bgroup
+                "wavesWithDef"
+                ( zipWith (\i -> bcompare ("wavesInlined." <> show i)) robotNumbers $
+                    toBenchmarks 1000 wavesWithDef
+                )
+            ]
        ]
    ]
 where
+  bgroupTicks label ticks bots =
+    bgroup newLabel $ toBenchmarks ticks bots
+   where
+    newLabel =
+      unwords
+        [ label
+        , T.unpack $
+            parens $
+              T.unwords
+                [ showT ticks
+                , "ticks"
+                ]
+        ]
+
  robotNumbers = [10, 20 .. 40]
+  largeRobotNumbers = take 4 $ iterate (* 2) 100

-  mkGameStates :: ProcessedTerm -> IO [(Int, GameState)]
-  mkGameStates prog = zip robotNumbers <$> mapM (mkGameState prog $ initRobot prog) robotNumbers
+  mkGameStates :: [Int] -> ProcessedTerm -> IO [(Int, GameState)]
+  mkGameStates botCounts prog = mapM (traverse (mkGameState prog $ initRobot prog) . dupe) botCounts

-  toBenchmarks :: [(Int, GameState)] -> [Benchmark]
-  toBenchmarks gameStates =
-    [ bench (show n) $ whnfAppIO (runGame 1000) gs
+  toBenchmarks :: Int -> [(Int, GameState)] -> [Benchmark]
+  toBenchmarks tickCount gameStates =
+    [ bench (show n) $ whnfAppIO (runGame tickCount) gs
    | (n, gs) <- gameStates
    ]