Mirror of https://github.com/hasura/graphql-engine.git (synced 2024-12-15 01:12:56 +03:00)
ci/benchmarks: tweak what we present in regression report
- Make regression report more compact; remove `min` since it doesn't behave like a limit
- It looks like the low-load variants are probably just noisier with fewer samples, so remove them from the regression view:

![samples](https://user-images.githubusercontent.com/210815/191076137-f5c3a0c6-9586-4ea4-a5a7-66000e7a2540.png)

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/5942
GitOrigin-RevId: 8a9ab00c251f04d7d5a542731696cb5f86ad2b78
This commit is contained in: parent 37b846c28d, commit 81df74852e
@@ -8,6 +8,7 @@ headers:
constants:
  scalars:
  - &low_load 20
  - &medium_load 100
  - &high_load 500

  k6_custom: &k6_custom
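As an aside, `&low_load`, `&medium_load` and `&high_load` are ordinary YAML anchors; the scenarios below pull a rate back in with an alias such as `*high_load`, so each load level is defined in one place. A minimal sketch of how that resolution works, using PyYAML on an illustrative snippet rather than the real config:

```python
import yaml  # PyYAML

snippet = """
constants:
  scalars:
  - &low_load 20
  - &high_load 500

queries:
- name: small_query_high_load
  rate: *high_load   # the alias resolves to 500 when the document is loaded
"""

doc = yaml.safe_load(snippet)
print(doc["queries"][0]["rate"])  # -> 500
```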
@@ -37,13 +38,26 @@ queries:
            # tune this so it's just high enough that we can expect to not need
            # to allocate during the test:
            preAllocatedVUs: 10
    query: |
    query: &small_query |
      query MyQuery {
        aouulefavluzmkd {
          afqqxqkiyibuccz
        }
      }

  - name: small_query_high_load
    <<: *k6_custom
    options:
      k6:
        scenarios:
          main:
            <<: *settings
            rate: *high_load
            # tune this so it's just high enough that we can expect to not need
            # to allocate during the test:
            preAllocatedVUs: 50
    query: *small_query

  ############################################################################
  # A large query returning no rows. How does this compare to above? How does
  # it compare to a query from chinook that returns little data?
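For these k6 scenarios, `rate` is the target request arrival rate and `preAllocatedVUs` is the pool of virtual users k6 sets up before the test starts; the "tune this so it's just high enough that we can expect to not need to allocate during the test" comments are about making that pool big enough that k6 never has to allocate extra VUs mid-run. A rough sizing rule of thumb, as a sketch (this helper is not part of the benchmark code): in-flight requests, and hence VUs needed, is roughly arrival rate times expected request duration, plus headroom.

```python
import math

def estimate_preallocated_vus(rate_per_s: float, est_duration_s: float, headroom: float = 1.5) -> int:
    """Rough sizing for preAllocatedVUs under a constant arrival rate.

    By Little's law, in-flight requests ~= arrival rate * request duration,
    and each in-flight request occupies one VU, so pre-allocate at least
    that many with some headroom for latency spikes.
    """
    return math.ceil(rate_per_s * est_duration_s * headroom)

# e.g. 500 req/s with ~50ms responses -> 38 pre-allocated VUs
print(estimate_preallocated_vus(500, 0.05))
```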
@@ -58,7 +72,7 @@ queries:
            <<: *settings
            rate: *low_load
            preAllocatedVUs: 20
    query: |
    query: &huge_query |
      query MyQuery {
        avnnjybkglhndgc {
          bhdvbvtikfpzzzi {
@@ -168,6 +182,18 @@ queries:
          }
        }
      }

  - name: huge_query_medium_load
    <<: *k6_custom
    options:
      k6:
        scenarios:
          main:
            <<: *settings
            # NOTE: we can't keep up at high_load, it seems:
            rate: *medium_load
            preAllocatedVUs: 100
    query: *huge_query

  ############################################################################
  # The standard introspection query from server/src-rsr/introspection.json
server/benchmarks/fabfile.py (vendored): 62 changed lines
@@ -424,6 +424,14 @@ def generate_regression_report():
        # this_bench['requests']['count'] # TODO use this to normalize allocations
        name = this_bench['name']

        # Skip if: this is a "low load" variation with few samples since these are
        # likely redundant / less useful for the purpose of finding regressions
        # (see mono #5942)
        if "low_load" in name:
            warn(f"Skipping '{name}' which has 'low_load' in name")
            continue

        # Skip if: no result in merge base report to compare to:
        try:
            merge_base_bench = merge_base_report_dict[name]
        except KeyError:
@@ -440,6 +448,14 @@ def generate_regression_report():
            )
        except KeyError:
            continue

        # For now just report regressions in the stable bytes-allocated metric for adhoc
        if name.startswith("ADHOC-"):
            warn(f"Just reporting regressions in bytes_alloc_per_req for '{name}' which is adhoc")
            benchmark_set_results.append((name, metrics))
            # Skip everything else:
            continue

        # Response body size:
        try:
            merge_base_body_size = float(merge_base_bench['response']['totalBytes']) / float(merge_base_bench['requests']['count'])
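Dividing `totalBytes` by `requests.count` here turns a cumulative counter into a per-request figure, so two runs that happened to serve different numbers of requests stay comparable (the TODO in the earlier hunk suggests doing the same normalization for allocations). A minimal sketch of the idea, with made-up numbers but the same field layout as the report JSON accessed above:

```python
def bytes_per_request(bench: dict) -> float:
    """Total response bytes divided by request count, as in the expression above."""
    return float(bench["response"]["totalBytes"]) / float(bench["requests"]["count"])

this_bench = {"requests": {"count": 30_000}, "response": {"totalBytes": 6_300_000}}
base_bench = {"requests": {"count": 12_000}, "response": {"totalBytes": 2_520_000}}

# The raw totals differ only because the runs served different numbers of
# requests; per request the two runs are identical:
print(bytes_per_request(this_bench))  # 210.0
print(bytes_per_request(base_bench))  # 210.0
```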
@@ -455,7 +471,11 @@ def generate_regression_report():
            pass
        # NOTE: we decided to omit higher-percentile latencies here since
        # they are noisy (which might lead to people ignoring benchmarks)
        for m in ['min', 'p50']:
        # NOTE: we originally had `min` here, thinking it should be an
        # asymptote (we can only get so fast doing a particular workload),
        # but this hasn't turned out to be a useful summary statistic (we
        # might need several times more samples for it to stabilize)
        for m in ['p50']:
            try:
                this_hist = this_bench['histogram']['json']
                merge_base_hist = merge_base_bench['histogram']['json']
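The new comment's reasoning, that `min` needs far more samples than a middle quantile before it settles down, is easy to see with a toy simulation (illustrative only, not part of the fabfile): repeatedly benchmark the same synthetic workload and compare how much `min` and `p50` drift between runs.

```python
import random
import statistics

random.seed(0)

def one_run(n: int) -> tuple[float, float]:
    # Toy latency model in ms, roughly normal around 50ms; purely illustrative.
    latencies = [random.gauss(50, 10) for _ in range(n)]
    return min(latencies), statistics.median(latencies)

mins, p50s = zip(*(one_run(200) for _ in range(50)))
print("stddev of min across runs:", round(statistics.pstdev(mins), 2))  # several ms: noisy
print("stddev of p50 across runs:", round(statistics.pstdev(p50s), 2))  # about 1 ms: much more stable
```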
@@ -485,6 +505,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
    def out(s): f.write(s+"\n")

    out(f"## Benchmark Results") # NOTE: We use this header to identify benchmark reports in `hide-benchmark-reports.sh`
    out(f"<details closed><summary>Click for detailed reports, and help docs</summary>")
    out(f"")
    out((f"The regression report below shows, for each benchmark, the **percent change** for "
         f"different metrics, between the merge base (the changes from **PR {merge_base_pr}**) and "
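The **percent change** figures in the report body are presumably the relative difference of each metric against its merge-base value; a minimal sketch of that calculation (the actual computation lives elsewhere in fabfile.py and is not part of this diff):

```python
def percent_change(merge_base_value: float, this_pr_value: float) -> float:
    """Relative change vs. the merge base, in percent. For metrics where lower
    is better (latency, allocations, residency), negative means improvement."""
    return 100.0 * (this_pr_value - merge_base_value) / merge_base_value

print(f"{percent_change(120.0, 126.0):+.1f}%")  # +5.0%  (regression)
print(f"{percent_change(120.0, 114.0):+.1f}%")  # -5.0%  (improvement)
```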
@@ -503,7 +524,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
         f"[:bar_chart: merge base]({graphql_bench_url([base_id])})... "
         f"[:bar_chart: both compared]({graphql_bench_url([these_id, base_id])})")
    out(f"")
    out(f"<details open><summary>Click here for a detailed report.</summary>")
    out(f"</details>")
    out(f"")

    # Return what should be the first few chars of the line, which will determine its styling:
@@ -519,31 +540,32 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names,
        elif -25.0 <= val < 0: return "++ " # GREEN
        else: return "+++ " # GREEN

    out( f"``` diff ") # START DIFF SYNTAX
    out(f"``` diff") # START DIFF SYNTAX
    for benchmark_set_name, (mem_in_use_before_diff, live_bytes_before_diff, mem_in_use_after_diff, live_bytes_after_diff, benchmarks) in results.items():
        if benchmark_set_name[:-5] in skip_pr_report_names: continue
        l0 = live_bytes_before_diff
        l1 = live_bytes_after_diff
        u0 = mem_in_use_before_diff
        u1 = mem_in_use_after_diff
        out( f"{col( )} ┌{'─'*(len(benchmark_set_name)+4)}┐")
        out( f"{col( )} │ {benchmark_set_name} │" )
        out( f"{col( )} └{'─'*(len(benchmark_set_name)+4)}┘")
        out( f"{col( )} ")
        out( f"{col( )} ᐉ Memory Residency (RTS-reported):")
        out( f"{col(u0)} {'mem_in_use':<25}: {u0:>6.1f} (BEFORE benchmarks ran; baseline for schema)")
        out( f"{col(l0)} {'live_bytes':<25}: {l0:>6.1f} (BEFORE benchmarks ran; baseline for schema)")
        out( f"{col(l1)} {'live_bytes':<25}: {l1:>6.1f} (AFTER benchmarks ran)")
        # u1 = mem_in_use_after_diff

        out( f"{col(u0)} {benchmark_set_name[:-5]+' ':─<21s}{'┤ MEMORY RESIDENCY (from RTS)': <30}{'mem_in_use (BEFORE benchmarks)': >38}{u0:>12.1f} ┐")
        out( f"{col(l0)} { ' ': <21s}{'│' : <30}{'live_bytes (BEFORE benchmarks)': >38}{l0:>12.1f} │")
        out( f"{col(l1)} { ' ': <21s}{'│' }{' live_bytes (AFTER benchmarks)':_>67}{l1:>12.1f} ┘")
        for bench_name, metrics in benchmarks:
            out( f"{col( )} ")
            out( f"{col( )} ᐅ {bench_name.replace('-k6-custom','').replace('_',' ')}:")
            bench_name_pretty = bench_name.replace('-k6-custom','').replace('_',' ') # need at least 40 chars
            for metric_name, d in metrics.items():
                # For now just report regressions in the stable bytes-allocated metric for adhoc
                if bench_name.startswith("ADHOC-") and not metric_name is "bytes_alloc_per_req": continue
                out(f"{col(d)} {metric_name:<25}: {d:>6.1f}")
            out( f"{col( )} ")
    out( f"``` ") # END DIFF SYNTAX
    out(f"</details>")
                if len(list(metrics.items())) == 1: # need to waste a line if only one metric:
                    out(f"{col(d )} { ' ': <21s}{'│ '+bench_name_pretty : <40}{ metric_name: >28}{d :>12.1f} ┐")
                    out(f"{col(l1)} { ' ': <21s}{'│' }{ '':_>67}{'' :>12s} ┘")
                elif metric_name == list(metrics.items())[0][0]: # first:
                    out(f"{col(d )} { ' ': <21s}{'│ '+bench_name_pretty : <40}{ metric_name: >28}{d :>12.1f} ┐")
                elif metric_name == list(metrics.items())[-1][0]: # last:
                    out(f"{col(l1)} { ' ': <21s}{'│' }{ ' '+metric_name:_>67}{d :>12.1f} ┘")
                else: # middle, omit name
                    out(f"{col(d )} { ' ': <21s}{'│ ' : <40}{ metric_name: >28}{d :>12.1f} │")


    out(f"```") # END DIFF SYNTAX

    say(f"Wrote github comment to {REGRESSION_REPORT_COMMENT_FILENAME}")
    f.close()