graphql-engine/server/benchmarks/resource_calibration.sh
Brandon Simmons 3332189756 benchmarks: resource_calibration.sh and CI throughput benchmarks
See this earlier iteration of this work for an example of the kind of report we're producing: #7664

And related work in this repo: github.com:hasura/graphql-bench-helper

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/7923
GitOrigin-RevId: 99d2a55e2fb5b55f3f33e2570cfd0bc23e448e0c
2023-04-18 16:44:54 +00:00

210 lines
8.0 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
set -euo pipefail
shopt -s globstar
# Allow killing background process by pgid without killing self
set -m
## This is a rough script that helps us quantify the resources required for
## hasura depending on the schema and the expected load. We expect only to need
## to run this quarterly or so.
##
## see: https://hasurahq.atlassian.net/browse/PR-56
# Print an informational message on stdout: ">>> " followed by the first
# argument wrapped in the terminal's colour-2 (green) / reset sequences.
echo_pretty() {
  printf '>>> %s%s%s\n' "$(tput setaf 2)" "$1" "$(tput sgr0)"
}
# Print an error message on stdout: ">>> " followed by the first argument
# wrapped in the terminal's colour-1 (red) / reset sequences.
echo_error() {
  printf '>>> %s%s%s\n' "$(tput setaf 1)" "$1" "$(tput sgr0)"
}
# Print a warning message on stdout: ">>> " followed by the first argument
# wrapped in the terminal's colour-3 (yellow) / reset sequences.
echo_warn() {
  printf '>>> %s%s%s\n' "$(tput setaf 3)" "$1" "$(tput sgr0)"
}
# Fail fast on anything that would make us bail out, BEFORE creating temp
# files: the EXIT trap that removes them is only installed further down, so an
# early exit after mktemp would leak them.
if ! command -v gblreg &> /dev/null; then
  echo_error "Install gbutils for regression tool 'gblreg'"
  exit 1
fi

# Number of cores handed to the server via +RTS -N. Test between 1 and 3.
# Defaults to 2; can be overridden from the environment so the 1/2/3-core runs
# don't require editing this file, e.g.:
#   NUM_SERVER_CORES=3 ./resource_calibration.sh
NUM_SERVER_CORES=${NUM_SERVER_CORES:-2}
if ! [[ "$NUM_SERVER_CORES" =~ ^[1-9][0-9]*$ ]]; then
  echo_error "NUM_SERVER_CORES must be a positive integer, got: '$NUM_SERVER_CORES'"
  exit 1
fi

REPO_TOPLEVEL=$(git rev-parse --show-toplevel)
# GNU time report ("<max RSS> <CPU%>") for the most recent engine run
MEM_CPU_OUTFILE=$(mktemp)
# stdout+stderr of graphql-engine and postgres respectively
ENGINE_OUT_FILE=$(mktemp)
POSTGRES_OUT_FILE=$(mktemp)

echo_warn "Please make sure your computer has at least 8 cores and that you've disabled lower processor sleep states! "
echo_warn " $ sudo cpupower frequency-set -g performance && sudo cpupower idle-set -D10 # PERFORMANCE "
# Give the user a moment to read the warning (and abort) before we start
sleep 5
# Launch graphql-engine (optimized build, via scripts/dev.sh) as a background
# job and store the job's PID in GRAPHQL_ENGINE_PID. Engine output goes to
# $ENGINE_OUT_FILE.
#
# Arguments: $1 - optional; pass "no_wait" to return right after launching.
#                 Otherwise poll http://127.0.0.1:8181 until the server
#                 answers, exiting the script with status 42 if the job dies
#                 first.
start_engine() {
  # Wrapped in GNU time (-f "%M %P": max resident set size and CPU percentage,
  # written to $MEM_CPU_OUTFILE) to get the memory high watermark.
  # -Fd is passed to the RTS for faster memory reclamation back to baseline.
  command time -f "%M %P" -o "$MEM_CPU_OUTFILE" \
    "$REPO_TOPLEVEL/scripts/dev.sh" graphql-engine --optimized -- \
    +RTS -N"$NUM_SERVER_CORES" -Fd0.01 -RTS \
    &> "$ENGINE_OUT_FILE" &
  GRAPHQL_ENGINE_PID=$!

  if [ "${1-}" = "no_wait" ]; then
    return 0
  fi

  printf '%s' "Waiting for graphql-engine (for logs see: $ENGINE_OUT_FILE)"
  while ! curl -s "http://127.0.0.1:8181/v1/query" &>/dev/null; do
    printf '.' && sleep 0.2
    # If the server job is gone there is no point in polling any longer
    if ! kill -0 "$GRAPHQL_ENGINE_PID"; then
      echo_error "The server crashed or failed to start!!"
      exit 42
    fi
  done
  echo " Ok"
}
# Stop the graphql-engine job started by start_engine, show the RTS
# "Productivity" line from its log, and delete the log file.
#
# Safe to call when the engine was never started or has already been stopped
# (the EXIT trap does exactly that after a normal run).
#
# Globals read: GRAPHQL_ENGINE_PID (may be unset), ENGINE_OUT_FILE,
#               ENGINE_STOP_TIMEOUT_SECS (optional, default 30): how long to
#               wait for the Productivity line before giving up.
function stop_engine {
  local pgid=""
  local timeout_secs=${ENGINE_STOP_TIMEOUT_SECS:-30}
  local waited=0

  if [ -n "${GRAPHQL_ENGINE_PID-}" ]; then
    if kill -0 "$GRAPHQL_ENGINE_PID" 2>/dev/null; then
      pgid=$(ps -o '%r' "$GRAPHQL_ENGINE_PID" | tail -n1 | xargs) || true
      # ps prints only its header if the process vanished meanwhile; never
      # pass anything but a real process group id to kill
      if [[ "$pgid" =~ ^[0-9]+$ ]]; then
        # Send INT (not the default TERM) to the whole group to get output
        # from GNU time!
        kill -INT "-$pgid" || true
      fi
    fi
    wait "$GRAPHQL_ENGINE_PID" 2>/dev/null || true
  fi

  if [ -f "$ENGINE_OUT_FILE" ]; then
    echo_pretty "Productivity of engine just stopped, FYI:"
    # Bounded wait: an engine that crashed or failed to start never prints
    # this line, and we must not hang forever (especially not in the EXIT trap)
    until grep '^ Productivity' "$ENGINE_OUT_FILE" ; do
      if (( waited >= timeout_secs )); then
        echo_warn "Gave up waiting for a Productivity line in $ENGINE_OUT_FILE"
        break
      fi
      sleep 1
      waited=$(( waited + 1 ))
    done
  fi
  rm -f "$ENGINE_OUT_FILE"
}
# Launch postgres (via scripts/dev.sh) as a background job, sending all of its
# output to $POSTGRES_OUT_FILE, and store the job's PID in POSTGRES_PID.
start_postgres() {
  printf '%s\n' "Launching postgres (see logs at $POSTGRES_OUT_FILE)"
  "$REPO_TOPLEVEL/scripts/dev.sh" postgres &> "$POSTGRES_OUT_FILE" &
  POSTGRES_PID=$!
}
# Stop the postgres job started by start_postgres and delete its log file.
#
# Safe to call when postgres was never started or has already been stopped
# (the EXIT trap calls this again after a normal run).
#
# Globals read: POSTGRES_PID (may be unset), POSTGRES_OUT_FILE
function stop_postgres {
  local pgid=""

  if [ -n "${POSTGRES_PID-}" ]; then
    if kill -0 "$POSTGRES_PID" 2>/dev/null; then
      pgid=$(ps -o '%r' "$POSTGRES_PID" | tail -n1 | xargs) || true
      # ps prints only its header if the process vanished meanwhile; never
      # pass anything but a real process group id to kill
      if [[ "$pgid" =~ ^[0-9]+$ ]]; then
        # Signal the whole process group, not just the job leader
        kill -- "-$pgid" || true
      fi
    fi
    wait "$POSTGRES_PID" 2>/dev/null || true
  fi
  rm -f "$POSTGRES_OUT_FILE"
}
# EXIT handler: stop whatever we started and remove all temp files.
#
# Everything here is best-effort. Both errexit AND nounset are switched off:
# with nounset still active, a PID variable that was never assigned (e.g. one
# of the measurement sections is toggled off) would abort the handler before
# the temp files are removed.
function cleanup {
  set +eu
  echo_pretty "Cleaning up"
  # Only stop what was actually launched
  if [ -n "${GRAPHQL_ENGINE_PID-}" ]; then
    stop_engine
  fi
  if [ -n "${POSTGRES_PID-}" ]; then
    stop_postgres
  fi
  # The stop functions normally remove the two log files; remove them here as
  # well so nothing is left behind when a stop function was skipped
  rm -f "$MEM_CPU_OUTFILE" "$ENGINE_OUT_FILE" "$POSTGRES_OUT_FILE"
  echo "Done"
}
trap cleanup EXIT
# Load the benchmark set's SQL dump into postgres and issue replace_metadata
# against the engine, so that GNU time records a memory high water mark for
# replace_metadata.
#
# Must be executed from a benchmark set directory: dump.sql.gz and
# replace_metadata.json are read from the current directory.
init_and_replace_metadata() {
  local round
  echo_pretty "Initializing and doing some replace_metadata"
  gunzip -c dump.sql.gz \
    | PGPASSWORD=postgres psql -h 127.0.0.1 -p 25432 postgres -U postgres &>/dev/null
  # Three rounds: once to initialize the schema, a few more to get a good
  # high water mark
  for round in 1 2 3; do
    curl -X POST -H 'Content-Type: application/json' -d @replace_metadata.json http://127.0.0.1:8181/v1/query
  done
  echo
}
### Metadata operations and Baseline + peak memory ####################

# Measure the idle baseline and the replace_metadata peak memory of
# graphql-engine for one benchmark set.
#
# Starts a fresh postgres + engine, loads the set, lets the engine idle for 30
# seconds, samples its resident set size, then stops everything and reads the
# high water mark recorded by GNU time.
#
# Arguments:       $1 - directory name under server/benchmarks/benchmark_sets
# Globals written: MEASURED_MEM_BASELINE  - RSS as reported by ps
#                  MEASURED_MEM_HIGHWATER - first field of GNU time's report
# Side effects:    changes the current directory to the benchmark set
function measure_schema_memory {
  local benchmark_set=$1

  start_postgres
  start_engine
  cd "$REPO_TOPLEVEL/server/benchmarks/benchmark_sets/$benchmark_set"
  init_and_replace_metadata
  echo_pretty "Sleeping for 30 seconds and then checking for baseline memory usage"
  sleep 30
  # RSS of the graphql-engine process within the engine job's process group.
  # The PID is handed to awk as a variable rather than spliced into the awk
  # program text.
  MEASURED_MEM_BASELINE=$(ps -e -o pid,ppid,pgid,rss,comm \
    | awk -v pgid="$GRAPHQL_ENGINE_PID" '$3 == pgid && /graphql-engine/ { print $4 }')
  if [ -z "$MEASURED_MEM_BASELINE" ]; then
    echo_error "Could not find a running graphql-engine process for '$benchmark_set'"
    exit 1
  fi
  stop_engine
  stop_postgres
  echo "sleeping..." && sleep 30 # TODO wait for all in process group
  # tail: GNU time may print a status line before the formatted one
  MEASURED_MEM_HIGHWATER=$(tail -n1 "$MEM_CPU_OUTFILE" | awk '{print $1}')
}

if true; then
  ## huge_schema: ########
  measure_schema_memory huge_schema
  MEM_BASELINE_HUGE_SCHEMA=$MEASURED_MEM_BASELINE
  MEM_HIGHWATER_HUGE_SCHEMA=$MEASURED_MEM_HIGHWATER
  ## chinook: ########
  measure_schema_memory chinook
  MEM_BASELINE_CHINOOK=$MEASURED_MEM_BASELINE
  MEM_HIGHWATER_CHINOOK=$MEASURED_MEM_HIGHWATER
fi
### Throughput limit and Peak memory under load ####################
if true; then
  cd "$REPO_TOPLEVEL/server/benchmarks"
  # Launch the engine without polling for readiness, then run the chinook
  # throughput benchmark set against it
  start_engine no_wait
  ./bench.sh chinook_throughput
  stop_engine
  echo "sleeping..." && sleep 30 # TODO wait for all in process group
  # Last line of GNU time's report: "<max RSS> <CPU%>"
  under_load_stats=$(tail -n1 "$MEM_CPU_OUTFILE")
  MEM_HIGHWATER_CHINOOK_UNDER_LOAD=$(awk '{print $1}' <<< "$under_load_stats")
  CPU_CHINOOK_UNDER_LOAD=$(awk '{print $2}' <<< "$under_load_stats")
fi
# From here on we only print the report: keep going even if a single command
# fails.
set +e

echo_pretty "####################### RAW MEASUREMENTS ###########################"
echo_pretty ""
echo_pretty "Memory usage in KB:"
# One header row plus one row per schema, right-aligned into a table
{
  echo "| SCHEMA_BASELINE REPLACE_METADATA_PEAK UNDER_LOAD_PEAK"
  echo "huge_schema $MEM_BASELINE_HUGE_SCHEMA $MEM_HIGHWATER_HUGE_SCHEMA N/A"
  echo "chinook $MEM_BASELINE_CHINOOK $MEM_HIGHWATER_CHINOOK $MEM_HIGHWATER_CHINOOK_UNDER_LOAD"
} | column --table -R1,2,3,4

echo_pretty ""
echo_pretty "Avg CPU During Chinook throughput tests: $CPU_CHINOOK_UNDER_LOAD "
printf '%s\n' \
  "NOTE: The utility of the script relies on the assumption that the throughput " \
  " tests here are mostly CPU bound. we want the value above to be between 150% and " \
  " ${NUM_SERVER_CORES}00% (using all $NUM_SERVER_CORES cores allotted to server)" \
  " FYI: complex_query_high_load_large_result appears to be IO bound, " \
  " with the server at only 100% CPU (on two cores)"
echo_pretty ""

# TODO add uncompressed response body sizes here:
echo_pretty "Peak sustained throughput for our Chinook queries having different uncompressed response body sizes (server given $NUM_SERVER_CORES cores)"
# Three columns pasted side by side: query label, average requests per second
# (floored) taken from the throughput report, and the unit
throughput_labels="simple_query_high_load(600B): \n complex_query_high_load_small_result(650B): \n complex_query_high_load_large_result(33KB): \n full_introspection(190KB):"
throughput_report="$REPO_TOPLEVEL/server/benchmarks/benchmark_sets/chinook_throughput/report.json"
paste -d ' ' \
  <(echo -e "$throughput_labels") \
  <(jq '.[] .requests.average |floor' "$throughput_report") \
  <(echo -e "RPS\nRPS\nRPS\nRPS") \
  | column --table
echo_pretty ""

echo_pretty "####################### INTERPRETATION ###########################"
# Chinook's peak is the larger of the under-load and replace_metadata peaks
CHINOOK_PEAK_MEM=$((
  MEM_HIGHWATER_CHINOOK_UNDER_LOAD > MEM_HIGHWATER_CHINOOK
    ? MEM_HIGHWATER_CHINOOK_UNDER_LOAD
    : MEM_HIGHWATER_CHINOOK
))
# Peak-to-baseline ratios
CHINOOK_MEM_SCALE=$(echo "scale=1; $CHINOOK_PEAK_MEM/$MEM_BASELINE_CHINOOK" | bc -l)
HUGE_SCHEMA_MEM_SCALE=$(echo "scale=1; $MEM_HIGHWATER_HUGE_SCHEMA/$MEM_BASELINE_HUGE_SCHEMA" | bc -l)
echo_pretty "Under peak sustained throughput and with some replace_metadata, peak memory usage is typically between..."
echo_pretty " ${CHINOOK_MEM_SCALE}x and ${HUGE_SCHEMA_MEM_SCALE}x "
echo_pretty "...above the idle baseline (i.e. the schema overhead)"

# TODO automate this
followup_notes=(
  ""
  "ABOVE WAS RUN WITH SERVER ALLOCATED < $NUM_SERVER_CORES > CORES."
  ""
  "Rerun this with one, two and three cores ( this is about the limit you can do on an"
  "8 core laptop  and still get meaningful numbers). Run a linear regression for each:"
  ' $ echo "1 2266\\n2 3587\\n 3 5270" | gblreg'
  ' 7.036667e+02 1.502000e+03'
  ' A^ B^ in: PEAK_THROUGHPUT=A+B*SERVER_CORES'
)
for followup_note in "${followup_notes[@]}"; do
  echo_warn "$followup_note"
done

echo_pretty "Done. Shutting down"