graphql-engine/server/benchmarks/resource_calibration.sh

210 lines
8.0 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
set -euo pipefail
shopt -s globstar
# Allow killing background process by pgid without killing self
set -m
## This is a rough script that helps us quantify the resources required for
## hasura depending on the schema and the expected load. We expect only to need
## to run this quarterly or so.
##
## see: https://hasurahq.atlassian.net/browse/PR-56
echo_pretty() {
echo ">>> $(tput setaf 2)$1$(tput sgr0)"
}
echo_error() {
echo ">>> $(tput setaf 1)$1$(tput sgr0)"
}
echo_warn() {
echo ">>> $(tput setaf 3)$1$(tput sgr0)"
}
REPO_TOPLEVEL=$(git rev-parse --show-toplevel)
MEM_CPU_OUTFILE=$(mktemp)
ENGINE_OUT_FILE=$(mktemp)
POSTGRES_OUT_FILE=$(mktemp)
# Test between 1 and 3:
NUM_SERVER_CORES=2
echo_warn "Please make sure your computer has at least 8 cores and that you've disabled lower processor sleep states! "
echo_warn " $ sudo cpupower frequency-set -g performance && sudo cpupower idle-set -D10 # PERFORMANCE "
sleep 5
if ! command -v gblreg &> /dev/null
then
echo_error "Install gbutils for regression tool 'gblreg'"
exit 1
fi
function start_engine {
# use gnu time to get the memory high watermark
# Run with -Fd for faster memory reclamation back to baseline
command time -f "%M %P" -o "$MEM_CPU_OUTFILE" \
"$REPO_TOPLEVEL/scripts/dev.sh" graphql-engine --optimized -- +RTS -N"$NUM_SERVER_CORES" -Fd0.01 -RTS \
&> "$ENGINE_OUT_FILE" & GRAPHQL_ENGINE_PID=$!
if [ "${1-}" != "no_wait" ]; then
echo -n "Waiting for graphql-engine (for logs see: $ENGINE_OUT_FILE)"
until curl -s "http://127.0.0.1:8181/v1/query" &>/dev/null; do
echo -n '.' && sleep 0.2
# If the server stopped abort immediately
if ! kill -0 $GRAPHQL_ENGINE_PID ; then
echo_error "The server crashed or failed to start!!"
exit 42
fi
done
echo " Ok"
fi
}
function stop_engine {
PGID=$(ps -o '%r' "$GRAPHQL_ENGINE_PID" | tail -n1 | xargs)
# echo "PID/PGID: $$ $GRAPHQL_ENGINE_PID $PGID"
# Send INT to get output from GNU time!:
kill -INT "-$PGID"
# kill -- "-$PGID" # ...not this
wait "$GRAPHQL_ENGINE_PID" || true
if [ -f "$ENGINE_OUT_FILE" ]; then
echo_pretty "Productivity of engine just stopped, FYI:"
until grep '^ Productivity' "$ENGINE_OUT_FILE" ; do
sleep 1
done
fi
rm -f "$ENGINE_OUT_FILE"
}
function start_postgres {
echo "Launching postgres (see logs at $POSTGRES_OUT_FILE)"
"$REPO_TOPLEVEL/scripts/dev.sh" postgres \
&> "$POSTGRES_OUT_FILE" & POSTGRES_PID=$!
}
function stop_postgres {
PGID=$(ps -o '%r' "$POSTGRES_PID" | tail -n1 | xargs)
kill -- "-$PGID"
wait "$POSTGRES_PID" || true
rm -f "$POSTGRES_OUT_FILE"
}
function cleanup {
set +e
echo_pretty "Cleaning up"
stop_engine
stop_postgres
rm -f "$MEM_CPU_OUTFILE"
echo "Done"
}
trap cleanup EXIT
# Get a memory high water mark for replace_metadata
# Must be executed from a benchmark set directory
function init_and_replace_metadata {
echo_pretty "Initializing and doing some replace_metadata"
gunzip -c dump.sql.gz | PGPASSWORD=postgres psql -h 127.0.0.1 -p 25432 postgres -U postgres &>/dev/null
# run replace_metadata a few times (once to initialize schema, a few more to get good high water mark)
curl -X POST -H 'Content-Type: application/json' -d @replace_metadata.json http://127.0.0.1:8181/v1/query
curl -X POST -H 'Content-Type: application/json' -d @replace_metadata.json http://127.0.0.1:8181/v1/query
curl -X POST -H 'Content-Type: application/json' -d @replace_metadata.json http://127.0.0.1:8181/v1/query
echo
}
### Metadata operations and Baseline + peak memory ####################
if true; then
## huge_schema: ########
start_postgres
start_engine
cd "$REPO_TOPLEVEL/server/benchmarks/benchmark_sets/huge_schema"
init_and_replace_metadata
echo_pretty "Sleeping for 30 seconds and then checking for baseline memory usage"
sleep 30
MEM_BASELINE_HUGE_SCHEMA=$(ps -e -o pid,ppid,pgid,rss,comm | awk '$3 == '"$GRAPHQL_ENGINE_PID" | grep graphql-engine | awk '{print $4}')
stop_engine
stop_postgres
echo "sleeping..." && sleep 30 # TODO wait for all in process group
MEM_HIGHWATER_HUGE_SCHEMA=$(tail -n1 "$MEM_CPU_OUTFILE" | awk '{print $1}')
## chinook: ########
start_postgres
start_engine
cd "$REPO_TOPLEVEL/server/benchmarks/benchmark_sets/chinook"
init_and_replace_metadata
echo_pretty "Sleeping for 30 seconds and then checking for baseline memory usage"
sleep 30
MEM_BASELINE_CHINOOK=$(ps -e -o pid,ppid,pgid,rss,comm | awk '$3 == '"$GRAPHQL_ENGINE_PID" | grep graphql-engine | awk '{print $4}')
stop_engine
stop_postgres
echo "sleeping..." && sleep 30 # TODO wait for all in process group
MEM_HIGHWATER_CHINOOK=$(tail -n1 "$MEM_CPU_OUTFILE" | awk '{print $1}')
fi
### Throughput limit and Peak memory under load ####################
if true; then
cd "$REPO_TOPLEVEL/server/benchmarks"
start_engine no_wait
./bench.sh chinook_throughput
stop_engine
echo "sleeping..." && sleep 30 # TODO wait for all in process group
MEM_HIGHWATER_CHINOOK_UNDER_LOAD=$(tail -n1 "$MEM_CPU_OUTFILE" | awk '{print $1}')
CPU_CHINOOK_UNDER_LOAD=$(tail -n1 "$MEM_CPU_OUTFILE" | awk '{print $2}')
fi
set +e
echo_pretty "####################### RAW MEASUREMENTS ###########################"
echo_pretty ""
echo_pretty "Memory usage in KB:"
(echo "| SCHEMA_BASELINE REPLACE_METADATA_PEAK UNDER_LOAD_PEAK" ;\
echo "huge_schema $MEM_BASELINE_HUGE_SCHEMA $MEM_HIGHWATER_HUGE_SCHEMA N/A" ;\
echo "chinook $MEM_BASELINE_CHINOOK $MEM_HIGHWATER_CHINOOK $MEM_HIGHWATER_CHINOOK_UNDER_LOAD") |\
column --table -R1,2,3,4
echo_pretty ""
echo_pretty "Avg CPU During Chinook throughput tests: $CPU_CHINOOK_UNDER_LOAD "
echo "NOTE: The utility of the script relies on the assumption that the throughput "
echo " tests here are mostly CPU bound. we want the value above to be between 150% and "
echo " ${NUM_SERVER_CORES}00% (using all $NUM_SERVER_CORES cores allotted to server)"
echo " FYI: complex_query_high_load_large_result appears to be IO bound, "
echo " with the server at only 100% CPU (on two cores)"
echo_pretty ""
# TODO add uncompressed response body sizes here:
echo_pretty "Peak sustained throughput for our Chinook queries having different uncompressed response body sizes (server given $NUM_SERVER_CORES cores)"
paste -d ' ' <(echo -e "simple_query_high_load(600B): \n complex_query_high_load_small_result(650B): \n complex_query_high_load_large_result(33KB): \n full_introspection(190KB):") \
<(jq '.[] .requests.average |floor' "$REPO_TOPLEVEL/server/benchmarks/benchmark_sets/chinook_throughput/report.json" ) \
<(echo -e "RPS\nRPS\nRPS\nRPS") |\
column --table
echo_pretty ""
echo_pretty "####################### INTERPRETATION ###########################"
CHINOOK_PEAK_MEM=$(( MEM_HIGHWATER_CHINOOK_UNDER_LOAD > MEM_HIGHWATER_CHINOOK ? MEM_HIGHWATER_CHINOOK_UNDER_LOAD : MEM_HIGHWATER_CHINOOK ))
CHINOOK_MEM_SCALE=$(bc -l <<< "scale=1; $CHINOOK_PEAK_MEM/$MEM_BASELINE_CHINOOK")
HUGE_SCHEMA_MEM_SCALE=$(bc -l <<< "scale=1; $MEM_HIGHWATER_HUGE_SCHEMA/$MEM_BASELINE_HUGE_SCHEMA")
echo_pretty "Under peak sustained throughput and with some replace_metadata, peak memory usage is typically between..."
echo_pretty " ${CHINOOK_MEM_SCALE}x and ${HUGE_SCHEMA_MEM_SCALE}x "
echo_pretty "...above the idle baseline (i.e. the schema overhead)"
# TODO automate this
echo_warn ""
echo_warn "ABOVE WAS RUN WITH SERVER ALLOCATED < $NUM_SERVER_CORES > CORES."
echo_warn ""
echo_warn "Rerun this with one, two and three cores ( this is about the limit you can do on an"
echo_warn "8 core laptop  and still get meaningful numbers). Run a linear regression for each:"
echo_warn ' $ echo "1 2266\\n2 3587\\n 3 5270" | gblreg'
echo_warn ' 7.036667e+02 1.502000e+03'
echo_warn ' A^ B^ in: PEAK_THROUGHPUT=A+B*SERVER_CORES'
echo_pretty "Done. Shutting down"