From 81df74852edd849fff85ef169a2b65da70df5947 Mon Sep 17 00:00:00 2001 From: "kodiakhq[bot]" <49736102+kodiakhq[bot]@users.noreply.github.com> Date: Thu, 22 Sep 2022 13:27:32 +0000 Subject: [PATCH] ci/benchmarks: tweak what we present in regression report - Make regression report more compact, remove `min` since it doesn't behave like a limit - It looks like the low-load variants are probably just noisier with fewer samples, so remove them from the regression view: ![samples](https://user-images.githubusercontent.com/210815/191076137-f5c3a0c6-9586-4ea4-a5a7-66000e7a2540.png) PR-URL: https://github.com/hasura/graphql-engine-mono/pull/5942 GitOrigin-RevId: 8a9ab00c251f04d7d5a542731696cb5f86ad2b78 --- .../huge_schema/config.query.yaml | 30 ++++++++- server/benchmarks/fabfile.py | 62 +++++++++++++------ 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml b/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml index 985aeff96fd..41031ff0808 100644 --- a/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml +++ b/server/benchmarks/benchmark_sets/huge_schema/config.query.yaml @@ -8,6 +8,7 @@ headers: constants: scalars: - &low_load 20 + - &medium_load 100 - &high_load 500 k6_custom: &k6_custom @@ -37,13 +38,26 @@ queries: # tune this so it's just high enough that we can expect to not need # to allocate during the test: preAllocatedVUs: 10 - query: | + query: &small_query | query MyQuery { aouulefavluzmkd { afqqxqkiyibuccz } } + - name: small_query_high_load + <<: *k6_custom + options: + k6: + scenarios: + main: + <<: *settings + rate: *high_load + # tune this so it's just high enough that we can expect to not need + # to allocate during the test: + preAllocatedVUs: 50 + query: *small_query + ############################################################################ # A large query returning no rows. How does this compare to above? How does # it compare to a query from chinook that returns little data? @@ -58,7 +72,7 @@ queries: <<: *settings rate: *low_load preAllocatedVUs: 20 - query: | + query: &huge_query | query MyQuery { avnnjybkglhndgc { bhdvbvtikfpzzzi { @@ -168,6 +182,18 @@ queries: } } } + + - name: huge_query_medium_load + <<: *k6_custom + options: + k6: + scenarios: + main: + <<: *settings + # NOTE: we can't keep up at high_load, it seems: + rate: *medium_load + preAllocatedVUs: 100 + query: *huge_query ############################################################################ # The standard introspection query from server/src-rsr/introspection.json diff --git a/server/benchmarks/fabfile.py b/server/benchmarks/fabfile.py index 495a9ed7adf..66000e6d1bf 100755 --- a/server/benchmarks/fabfile.py +++ b/server/benchmarks/fabfile.py @@ -424,6 +424,14 @@ def generate_regression_report(): # this_bench['requests']['count'] # TODO use this to normalize allocations name = this_bench['name'] + # Skip if: this is a "low load" variation with few samples since these are + # likely redundant / less useful for the purpose of finding regressions + # (see mono #5942) + if "low_load" in name: + warn(f"Skipping '{name}' which has 'low_load' in name") + continue + + # Skip if: no result in merge base report to compare to: try: merge_base_bench = merge_base_report_dict[name] except KeyError: @@ -440,6 +448,14 @@ def generate_regression_report(): ) except KeyError: continue + + # For now just report regressions in the stable bytes-allocated metric for adhoc + if name.startswith("ADHOC-"): + warn(f"Just reporting regressions in bytes_alloc_per_req for '{name}' which is adhoc") + benchmark_set_results.append((name, metrics)) + # Skip everything else: + continue + # Response body size: try: merge_base_body_size = float(merge_base_bench['response']['totalBytes']) / float(merge_base_bench['requests']['count']) @@ -455,7 +471,11 @@ def generate_regression_report(): pass # NOTE: we decided to omit higher-percentile latencies here since # they are noisy (which might lead to people ignoring benchmarks) - for m in ['min', 'p50']: + # NOTE: we originally had `min` here, thinking it should be an + # asymptote (we can only get so fast doing a particular workload), + # but this hasn't turned out to be a useful summary statistic (we + # might need several times more samples for it to stabilize) + for m in ['p50']: try: this_hist = this_bench['histogram']['json'] merge_base_hist = merge_base_bench['histogram']['json'] @@ -485,6 +505,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names, def out(s): f.write(s+"\n") out(f"## Benchmark Results") # NOTE: We use this header to identify benchmark reports in `hide-benchmark-reports.sh` + out(f"
Click for detailed reports, and help docs") out(f"") out((f"The regression report below shows, for each benchmark, the **percent change** for " f"different metrics, between the merge base (the changes from **PR {merge_base_pr}**) and " @@ -503,7 +524,7 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names, f"[:bar_chart: merge base]({graphql_bench_url([base_id])})... " f"[:bar_chart: both compared]({graphql_bench_url([these_id, base_id])})") out(f"") - out(f"
Click here for a detailed report.") + out(f"
") out(f"") # Return what should be the first few chars of the line, which will detemine its styling: @@ -519,31 +540,32 @@ def pretty_print_regression_report_github_comment(results, skip_pr_report_names, elif -25.0 <= val < 0: return "++ " # GREEN else: return "+++ " # GREEN - out( f"``` diff ") # START DIFF SYNTAX + out(f"``` diff") # START DIFF SYNTAX for benchmark_set_name, (mem_in_use_before_diff, live_bytes_before_diff, mem_in_use_after_diff, live_bytes_after_diff, benchmarks) in results.items(): if benchmark_set_name[:-5] in skip_pr_report_names: continue l0 = live_bytes_before_diff l1 = live_bytes_after_diff u0 = mem_in_use_before_diff - u1 = mem_in_use_after_diff - out( f"{col( )} ┌{'─'*(len(benchmark_set_name)+4)}┐") - out( f"{col( )} │ {benchmark_set_name} │" ) - out( f"{col( )} └{'─'*(len(benchmark_set_name)+4)}┘") - out( f"{col( )} ") - out( f"{col( )} ᐉ Memory Residency (RTS-reported):") - out( f"{col(u0)} {'mem_in_use':<25}: {u0:>6.1f} (BEFORE benchmarks ran; baseline for schema)") - out( f"{col(l0)} {'live_bytes':<25}: {l0:>6.1f} (BEFORE benchmarks ran; baseline for schema)") - out( f"{col(l1)} {'live_bytes':<25}: {l1:>6.1f} (AFTER benchmarks ran)") + # u1 = mem_in_use_after_diff + + out( f"{col(u0)} {benchmark_set_name[:-5]+' ':─<21s}{'┤ MEMORY RESIDENCY (from RTS)': <30}{'mem_in_use (BEFORE benchmarks)': >38}{u0:>12.1f} ┐") + out( f"{col(l0)} { ' ': <21s}{'│' : <30}{'live_bytes (BEFORE benchmarks)': >38}{l0:>12.1f} │") + out( f"{col(l1)} { ' ': <21s}{'│' }{' live_bytes (AFTER benchmarks)':_>67}{l1:>12.1f} ┘") for bench_name, metrics in benchmarks: - out( f"{col( )} ") - out( f"{col( )} ᐅ {bench_name.replace('-k6-custom','').replace('_',' ')}:") + bench_name_pretty = bench_name.replace('-k6-custom','').replace('_',' ') # need at least 40 chars for metric_name, d in metrics.items(): - # For now just report regressions in the stable bytes-allocated metric for adhoc - if bench_name.startswith("ADHOC-") and not metric_name is "bytes_alloc_per_req": continue - out(f"{col(d)} {metric_name:<25}: {d:>6.1f}") - out( f"{col( )} ") - out( f"``` ") # END DIFF SYNTAX - out(f"
") + if len(list(metrics.items())) == 1: # need to waste a line if only one metric: + out(f"{col(d )} { ' ': <21s}{'│ '+bench_name_pretty : <40}{ metric_name: >28}{d :>12.1f} ┐") + out(f"{col(l1)} { ' ': <21s}{'│' }{ '':_>67}{'' :>12s} ┘") + elif metric_name == list(metrics.items())[0][0]: # first: + out(f"{col(d )} { ' ': <21s}{'│ '+bench_name_pretty : <40}{ metric_name: >28}{d :>12.1f} ┐") + elif metric_name == list(metrics.items())[-1][0]: # last: + out(f"{col(l1)} { ' ': <21s}{'│' }{ ' '+metric_name:_>67}{d :>12.1f} ┘") + else: # middle, omit name + out(f"{col(d )} { ' ': <21s}{'│ ' : <40}{ metric_name: >28}{d :>12.1f} │") + + + out(f"```") # END DIFF SYNTAX say(f"Wrote github comment to {REGRESSION_REPORT_COMMENT_FILENAME}") f.close()