From 81df74852edd849fff85ef169a2b65da70df5947 Mon Sep 17 00:00:00 2001
From: "kodiakhq[bot]" <49736102+kodiakhq[bot]@users.noreply.github.com>
Date: Thu, 22 Sep 2022 13:27:32 +0000
Subject: [PATCH] ci/benchmarks: tweak what we present in regression report

- Make regression report more compact, remove `min` since it doesn't behave like a limit
- It looks like the low-load variants are probably just noisier with fewer samples, so remove them from the regression view:

![samples](https://user-images.githubusercontent.com/210815/191076137-f5c3a0c6-9586-4ea4-a5a7-66000e7a2540.png)

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/5942
GitOrigin-RevId: 8a9ab00c251f04d7d5a542731696cb5f86ad2b78
---
 .../huge_schema/config.query.yaml             | 30 ++++++++-
 server/benchmarks/fabfile.py                  | 62 +++++++++++++------
 2 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml b/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml
index 985aeff96fd..41031ff0808 100644
--- a/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml
+++ b/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml
@@ -8,6 +8,7 @@ headers:
 constants:
   scalars:
     - &low_load 20
+    - &medium_load 100
     - &high_load 500
 
   k6_custom: &k6_custom
@@ -37,13 +38,26 @@ queries:
             # tune this so it's just high enough that we can expect to not need
             # to allocate during the test:
             preAllocatedVUs: 10
-    query: |
+    query: &small_query |
         query MyQuery {
           aouulefavluzmkd {
             afqqxqkiyibuccz
           }
         }
 
+  - name: small_query_high_load
+    <<: *k6_custom
+    options:
+      k6:
+        scenarios:
+          main:
+            <<: *settings
+            rate: *high_load
+            # tune this so it's just high enough that we can expect to not need
+            # to allocate during the test:
+            preAllocatedVUs: 50
+    query: *small_query
+
   ############################################################################
   # A large query returning no rows. How does this compare to above? How does
   # it compare to a query from chinook that returns little data?
@@ -58,7 +72,7 @@ queries:
             <<: *settings
             rate: *low_load
             preAllocatedVUs: 20
-    query: |
+    query: &huge_query |
         query MyQuery {
           avnnjybkglhndgc {
             bhdvbvtikfpzzzi {
@@ -168,6 +182,18 @@ queries:
             }
           }
         }
+  
+  - name: huge_query_medium_load
+    <<: *k6_custom
+    options:
+      k6:
+        scenarios:
+          main:
+            <<: *settings
+            # NOTE: we can't keep up at high_load, it seems:
+            rate: *medium_load
+            preAllocatedVUs: 100
+    query: *huge_query
 
   ############################################################################
   # The standard introspection query from server/src-rsr/introspection.json
diff --git a/server/benchmarks/fabfile.py b/server/benchmarks/fabfile.py
index 495a9ed7adf..66000e6d1bf 100755
--- a/server/benchmarks/fabfile.py
+++ b/server/benchmarks/fabfile.py
@@ -424,6 +424,14 @@ def generate_regression_report():
             # this_bench['requests']['count'] # TODO use this to normalize allocations
             name = this_bench['name']
 
+            # Skip if: this is a "low load" variation with few samples since these are 
+            #          likely redundant / less useful for the purpose of finding regressions
+            #          (see mono #5942)
+            if "low_load" in name:
+                warn(f"Skipping '{name}' which has 'low_load' in name")
+                continue
+
+            # Skip if: no result in merge base report to compare to:
             try:
                 merge_base_bench = merge_base_report_dict[name]
             except KeyError:
@@ -440,6 +448,14 @@ def generate_regression_report():
                 )
             except KeyError:
                 continue
+
+            # For now just report regressions in the stable bytes-allocated metric for adhoc
+            if name.startswith("ADHOC-"):
+                warn(f"Just reporting regressions in bytes_alloc_per_req for '{name}' which is adhoc")
+                benchmark_set_results.append((name, metrics))
+                # Skip everything else:
+                continue
+
             # Response body size:
             try:
                 merge_base_body_size = float(merge_base_bench['response']['totalBytes']) / float(merge_base_bench['requests']['count'])
@@ -455,7 +471,11 @@ def generate_regression_report():
                 pass
             # NOTE: we decided to omit higher-percentile latencies here since
             # they are noisy (which might lead to people ignoring benchmarks)
-            for m in ['min', 'p50']:
+            # NOTE: we originally had `min` here, thinking it should be an
+            # asymptote (we can only get so fast doing a particular workload),
+            # but this hasn't turned out to be a useful summary statistic (we
+            # might need several times more samples for it to stabilize)
+            for m in ['p50']:
                 try:
                     this_hist = this_bench['histogram']['json']
                     merge_base_hist = merge_base_bench['histogram']['json']
@@ -485,6 +505,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
     def out(s): f.write(s+"\n")
 
     out(f"## Benchmark Results") # NOTE: We use this header to identify benchmark reports in `hide-benchmark-reports.sh`
+    out(f"<details closed><summary>Click for detailed reports, and help docs</summary>")
     out(f"")
     out((f"The regression report below shows, for each benchmark, the **percent change** for "
          f"different metrics, between the merge base (the changes from **PR {merge_base_pr}**) and "
@@ -503,7 +524,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
             f"[:bar_chart: merge base]({graphql_bench_url([base_id])})... "
             f"[:bar_chart: both compared]({graphql_bench_url([these_id, base_id])})")
     out(f"")
-    out(f"<details open><summary>Click here for a detailed report.</summary>")
+    out(f"</details>")
     out(f"")
 
     # Return what should be the first few chars of the line, which will detemine its styling:
@@ -519,31 +540,32 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
         elif -25.0 <= val < 0: return "++  "  # GREEN
         else:                  return "+++ "  # GREEN
 
-    out(            f"``` diff                                       ")  # START DIFF SYNTAX
+    out(f"``` diff")  # START DIFF SYNTAX
     for benchmark_set_name, (mem_in_use_before_diff, live_bytes_before_diff, mem_in_use_after_diff, live_bytes_after_diff, benchmarks) in results.items():
         if benchmark_set_name[:-5] in skip_pr_report_names: continue
         l0 = live_bytes_before_diff
         l1 = live_bytes_after_diff
         u0 = mem_in_use_before_diff
-        u1 = mem_in_use_after_diff
-        out(        f"{col( )}    ┌{'─'*(len(benchmark_set_name)+4)}┐")
-        out(        f"{col( )}    │  {benchmark_set_name}  │"         )
-        out(        f"{col( )}    └{'─'*(len(benchmark_set_name)+4)}┘")
-        out(        f"{col( )}                                       ")
-        out(        f"{col( )}    ᐉ  Memory Residency (RTS-reported):")
-        out(        f"{col(u0)}        {'mem_in_use':<25}:  {u0:>6.1f}   (BEFORE benchmarks ran; baseline for schema)")
-        out(        f"{col(l0)}        {'live_bytes':<25}:  {l0:>6.1f}   (BEFORE benchmarks ran; baseline for schema)")
-        out(        f"{col(l1)}        {'live_bytes':<25}:  {l1:>6.1f}   (AFTER benchmarks ran)")
+        # u1 = mem_in_use_after_diff
+
+        out(        f"{col(u0)} {benchmark_set_name[:-5]+'  ':─<21s}{'┤ MEMORY RESIDENCY (from RTS)': <30}{'mem_in_use (BEFORE benchmarks)': >38}{u0:>12.1f} ┐")
+        out(        f"{col(l0)} {                        '  ': <21s}{'│'                            : <30}{'live_bytes (BEFORE benchmarks)': >38}{l0:>12.1f} │")
+        out(        f"{col(l1)} {                        '  ': <21s}{'│'                              }{'   live_bytes  (AFTER benchmarks)':_>67}{l1:>12.1f} ┘")
         for bench_name, metrics in benchmarks:
-            out(    f"{col( )}                                       ")
-            out(    f"{col( )}    ᐅ {bench_name.replace('-k6-custom','').replace('_',' ')}:")
+            bench_name_pretty = bench_name.replace('-k6-custom','').replace('_',' ') # need at least 40 chars
             for metric_name, d in metrics.items():
-                # For now just report regressions in the stable bytes-allocated metric for adhoc
-                if bench_name.startswith("ADHOC-") and not metric_name is "bytes_alloc_per_req": continue
-                out(f"{col(d)}        {metric_name:<25}:  {d:>6.1f}")
-        out(        f"{col( )}                                       ")
-    out(            f"```                                            ")  # END DIFF SYNTAX
-    out(f"</details>")
+              if len(list(metrics.items())) == 1:  # need to waste a line if only one metric:
+                out(f"{col(d )} {                        '  ': <21s}{'│ '+bench_name_pretty         : <40}{                     metric_name: >28}{d :>12.1f} ┐")
+                out(f"{col(l1)} {                        '  ': <21s}{'│'                                 }{                              '':_>67}{''  :>12s} ┘")
+              elif metric_name == list(metrics.items())[0][0]:  # first:
+                out(f"{col(d )} {                        '  ': <21s}{'│ '+bench_name_pretty         : <40}{                     metric_name: >28}{d :>12.1f} ┐")
+              elif metric_name == list(metrics.items())[-1][0]:  # last:
+                out(f"{col(l1)} {                        '  ': <21s}{'│'                                 }{               '   '+metric_name:_>67}{d :>12.1f} ┘")
+              else:   # middle, omit name
+                out(f"{col(d )} {                        '  ': <21s}{'│ '                           : <40}{                     metric_name: >28}{d :>12.1f} │")
+
+
+    out(f"```")  # END DIFF SYNTAX
 
     say(f"Wrote github comment to {REGRESSION_REPORT_COMMENT_FILENAME}")
     f.close()