Added prometheus metric for time to acquire connection (#21628)

ref
https://linear.app/ghost/issue/ENG-1769/improve-pool-utilization-metric

- Currently the connection pool metrics are all point in time metrics,
and with a scrape interval of 15s this doesn't tell us a whole lot about
what's happening in the pool.
- This commit adds a Summary metric to track the elapsed time each
transaction has to wait to acquire a connection from the pool, which
should be a good indication of contention in the pool.
- Also moved the call to `prometheusClient.instrumentKnex` to after `initCore` in the boot process, because the metric depends on event listeners on `knex.client.pool`, and the pool gets destroyed and recreated in `initCore`, which removes the listeners
This commit is contained in:
Chris Raible 2024-11-14 21:14:40 -08:00 committed by GitHub
parent 015b881bc1
commit 431719080e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 281 additions and 176 deletions

View File

@ -36,7 +36,7 @@
"gnetId": 14058,
"graphTooltip": 0,
"id": 1,
"iteration": 1731033697061,
"iteration": 1731634781920,
"links": [],
"liveNow": false,
"panels": [
@ -773,7 +773,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -786,7 +786,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -803,10 +803,6 @@
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
@ -824,7 +820,7 @@
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
@ -868,11 +864,11 @@
"refId": "C"
}
],
"title": "Active Connections",
"title": "Connections",
"type": "timeseries"
},
{
"description": "The number of connections in the pool in active use (i.e. to run a query).",
"description": "The number of active connections as a percentage of the maximum allowed by the pool.",
"fieldConfig": {
"defaults": {
"color": {
@ -883,7 +879,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -896,105 +892,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 23
},
"id": 23,
"interval": "1s",
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_pool_max{job=\"$job\"}",
"interval": "",
"legendFormat": "Max - {{job}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_pool_used{job=\"$job\"}",
"hide": false,
"interval": "",
"legendFormat": "Used - {{job}}",
"refId": "B"
}
],
"title": "Used Connections",
"type": "timeseries"
},
{
"description": "The percent of active or used connections out of the pool max.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -1025,7 +923,7 @@
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"x": 8,
"y": 23
},
"id": 25,
@ -1033,7 +931,7 @@
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
@ -1041,17 +939,6 @@
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_pool_used{job=~\"$job\"} / ghost_db_connection_pool_max{job=~\"$job\"}",
"interval": "",
"legendFormat": "Used - {{job}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
@ -1065,11 +952,11 @@
"refId": "B"
}
],
"title": "Utilization",
"title": "Pool Utilization",
"type": "timeseries"
},
{
"description": "The number of queries that are currently waiting for a free connection.",
"description": "The elapsed time a transaction spends waiting for an available connection in the connection pool.",
"fieldConfig": {
"defaults": {
"color": {
@ -1080,7 +967,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -1093,7 +980,118 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "dtdurations"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 23
},
"id": 32,
"interval": "1s",
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_acquire_duration_seconds{quantile=\"0.5\", job=~\"$job\"}",
"interval": "",
"legendFormat": "P50 - {{job}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_acquire_duration_seconds{quantile=\"0.9\", job=~\"$job\"}",
"hide": false,
"interval": "",
"legendFormat": "P90 - {{job}}",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_connection_acquire_duration_seconds{quantile=\"0.99\", job=~\"$job\"}",
"hide": false,
"interval": "",
"legendFormat": "P99 - {{job}}",
"refId": "C"
}
],
"title": "Time to Acquire Connection",
"type": "timeseries"
},
{
"description": "The number of transactions that are currently in the queue waiting to acquire a free connection",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -1166,7 +1164,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -1179,7 +1177,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -1217,7 +1215,7 @@
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
@ -1265,7 +1263,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -1278,7 +1276,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -1302,7 +1300,7 @@
}
]
},
"unit": "dtdurationms"
"unit": "dtdurations"
},
"overrides": []
},
@ -1317,7 +1315,7 @@
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
@ -1331,7 +1329,7 @@
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_query_duration_milliseconds{quantile=\"0.5\", job=~\"$job\"}",
"expr": "ghost_db_query_duration_seconds{quantile=\"0.5\", job=~\"$job\"}",
"interval": "",
"legendFormat": "P50 - {{job}}",
"refId": "A"
@ -1342,7 +1340,7 @@
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_query_duration_milliseconds{quantile=\"0.9\", job=~\"$job\"}",
"expr": "ghost_db_query_duration_seconds{quantile=\"0.9\", job=~\"$job\"}",
"hide": false,
"interval": "",
"legendFormat": "P90 - {{job}}",
@ -1354,7 +1352,7 @@
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "ghost_db_query_duration_milliseconds{quantile=\"0.99\", job=~\"$job\"}",
"expr": "ghost_db_query_duration_seconds{quantile=\"0.99\", job=~\"$job\"}",
"hide": false,
"interval": "",
"legendFormat": "P99 - {{job}}",
@ -1376,7 +1374,7 @@
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
@ -1389,7 +1387,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -1428,7 +1426,7 @@
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
@ -1442,7 +1440,7 @@
"uid": "PBFA97CFB590B2093"
},
"exemplar": true,
"expr": "rate(ghost_db_query_count{job=~\"$job\"}[1m])",
"expr": "rate(ghost_db_query_duration_seconds_count{job=~\"$job\"}[1m])",
"interval": "",
"legendFormat": "{{job}}",
"refId": "A"
@ -2082,7 +2080,7 @@
},
{
"current": {
"selected": false,
"selected": true,
"text": [
"ghost-chris-local"
],
@ -2115,7 +2113,7 @@
},
{
"current": {
"selected": false,
"selected": true,
"text": [
"All"
],
@ -2239,6 +2237,6 @@
"timezone": "",
"title": "Ghost Dashboard",
"uid": "yX2d7k1Gk",
"version": 12,
"version": 4,
"weekStart": ""
}

View File

@ -568,6 +568,13 @@ async function bootGhost({backend = true, frontend = true, server = true} = {})
// Step 4 - Load Ghost with all its services
debug('Begin: Load Ghost Services & Apps');
await initCore({ghostServer, config, bootLogger, frontend});
// Instrument the knex instance and connection pool if prometheus is enabled
// Needs to be after initCore because the pool is destroyed and recreated in initCore, which removes the event listeners
if (prometheusClient) {
prometheusClient.instrumentKnex(connection);
}
const {dataService} = await initServicesForFrontend({bootLogger});
if (frontend) {

View File

@ -68,10 +68,6 @@ if (!knexInstance && config.get('database') && config.get('database').client) {
const instrumentation = new ConnectionPoolInstrumentation({knex: knexInstance, logging, metrics, config});
instrumentation.instrument();
}
if (config.get('prometheus:enabled')) {
const prometheusClient = require('../../../shared/prometheus-client');
prometheusClient.instrumentKnex(knexInstance);
}
}
module.exports = knexInstance;

View File

@ -31,7 +31,8 @@ export class PrometheusClient {
public client;
public gateway: client.Pushgateway<client.RegistryContentType> | undefined; // public for testing
public queries: Map<string, Date> = new Map();
public queries: Map<string, () => void> = new Map();
public acquires: Map<number, () => void> = new Map();
private config: PrometheusClientConfig;
private prefix;
@ -211,11 +212,14 @@ export class PrometheusClient {
* @param collect - The collect function to use for the summary
* @returns The summary metric
*/
registerSummary({name, help, percentiles, collect}: {name: string, help: string, percentiles?: number[], collect?: () => void}): client.Summary {
registerSummary({name, help, percentiles, maxAgeSeconds, ageBuckets, pruneAgedBuckets, collect}: {name: string, help: string, percentiles?: number[], maxAgeSeconds?: number, ageBuckets?: number, pruneAgedBuckets?: boolean, collect?: () => void}): client.Summary {
return new this.client.Summary({
name: `${this.prefix}${name}`,
help,
percentiles: percentiles || [0.5, 0.9, 0.99],
maxAgeSeconds,
ageBuckets,
pruneAgedBuckets,
collect
});
}
@ -300,23 +304,46 @@ export class PrometheusClient {
}
});
this.registerSummary({
name: `db_query_duration_milliseconds`,
help: 'The duration of queries in milliseconds',
percentiles: [0.5, 0.9, 0.99]
const queryDurationSummary = this.registerSummary({
name: `db_query_duration_seconds`,
help: 'Summary of the duration of knex database queries in seconds',
percentiles: [0.5, 0.9, 0.99],
maxAgeSeconds: 60,
ageBuckets: 6,
pruneAgedBuckets: false
});
const acquireDurationSummary = this.registerSummary({
name: `db_connection_acquire_duration_seconds`,
help: 'Summary of the duration of acquiring a connection from the pool in seconds',
percentiles: [0.5, 0.9, 0.99],
maxAgeSeconds: 60,
ageBuckets: 6,
pruneAgedBuckets: false
});
knexInstance.on('query', (query) => {
// Add the query to the map
this.queries.set(query.__knexQueryUid, new Date());
this.queries.set(query.__knexQueryUid, queryDurationSummary.startTimer());
});
knexInstance.on('query-response', (err, query) => {
const start = this.queries.get(query.__knexQueryUid);
if (start) {
const duration = new Date().getTime() - start.getTime();
(this.getMetric(`db_query_duration_milliseconds`) as client.Summary).observe(duration);
}
this.queries.get(query.__knexQueryUid)?.();
this.queries.delete(query.__knexQueryUid);
});
knexInstance.client.pool.on('acquireRequest', (eventId: number) => {
this.acquires.set(eventId, acquireDurationSummary.startTimer());
});
knexInstance.client.pool.on('acquireSuccess', (eventId: number) => {
this.acquires.get(eventId)?.();
this.acquires.delete(eventId);
});
knexInstance.client.pool.on('acquireFail', (eventId: number) => {
this.acquires.get(eventId)?.();
this.acquires.delete(eventId);
});
}
}

View File

@ -248,13 +248,14 @@ describe('Prometheus Client', function () {
describe('instrumentKnex', function () {
let knexMock: Knex;
let eventEmitter: EventEmitterType;
let knexEventEmitter: EventEmitterType;
let poolEventEmitter: EventEmitterType;
function simulateQuery(queryUid: string, duration: number) {
const clock = sinon.useFakeTimers();
eventEmitter.emit('query', {__knexQueryUid: queryUid, sql: 'SELECT 1'});
knexEventEmitter.emit('query', {__knexQueryUid: queryUid, sql: 'SELECT 1'});
clock.tick(duration);
eventEmitter.emit('query-response', null, {__knexQueryUid: queryUid, sql: 'SELECT 1'});
knexEventEmitter.emit('query-response', null, {__knexQueryUid: queryUid, sql: 'SELECT 1'});
clock.restore();
}
@ -264,11 +265,28 @@ describe('Prometheus Client', function () {
});
}
function simulateAcquire(duration: number) {
const clock = sinon.useFakeTimers();
poolEventEmitter.emit('acquireRequest');
clock.tick(duration);
poolEventEmitter.emit('acquireSuccess');
clock.restore();
}
function simulateAcquireFail(duration: number) {
const clock = sinon.useFakeTimers();
poolEventEmitter.emit('acquireRequest');
clock.tick(duration);
poolEventEmitter.emit('acquireFail');
clock.restore();
}
beforeEach(function () {
eventEmitter = new EventEmitter();
knexEventEmitter = new EventEmitter();
poolEventEmitter = new EventEmitter();
knexMock = {
on: sinon.stub().callsFake((event, callback) => {
eventEmitter.on(event, callback);
knexEventEmitter.on(event, callback);
}),
client: {
pool: {
@ -277,7 +295,10 @@ describe('Prometheus Client', function () {
numUsed: sinon.stub().returns(0),
numFree: sinon.stub().returns(0),
numPendingAcquires: sinon.stub().returns(0),
numPendingCreates: sinon.stub().returns(0)
numPendingCreates: sinon.stub().returns(0),
on: sinon.stub().callsFake((event, callback) => {
poolEventEmitter.on(event, callback);
})
}
}
} as unknown as Knex;
@ -353,9 +374,9 @@ describe('Prometheus Client', function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
eventEmitter.emit('query', {__knexQueryUid: '1', sql: 'SELECT 1'});
const metricValues = await instance.getMetricValues('db_query_duration_milliseconds');
assert.equal(metricValues?.[0].value, 0);
simulateQuery('1', 500);
const metricValues = await instance.getMetricValues('db_query_duration_seconds');
assert.equal(metricValues?.[0].value, 0.5);
});
it('should accurately calculate the query duration of a query', async function () {
@ -364,15 +385,33 @@ describe('Prometheus Client', function () {
instance.instrumentKnex(knexMock);
const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000];
simulateQueries(durations);
const metricValues = await instance.getMetricValues('db_query_duration_milliseconds');
const metricValues = await instance.getMetricValues('db_query_duration_seconds');
assert.deepEqual(metricValues, [
{labels: {quantile: 0.5}, value: 550},
{labels: {quantile: 0.9}, value: 950},
{labels: {quantile: 0.99}, value: 1000},
{metricName: 'ghost_db_query_duration_milliseconds_sum', labels: {}, value: 5500},
{metricName: 'ghost_db_query_duration_milliseconds_count', labels: {}, value: 10}
{labels: {quantile: 0.5}, value: 0.55},
{labels: {quantile: 0.9}, value: 0.95},
{labels: {quantile: 0.99}, value: 1},
{metricName: 'ghost_db_query_duration_seconds_sum', labels: {}, value: 5.5},
{metricName: 'ghost_db_query_duration_seconds_count', labels: {}, value: 10}
]);
});
it('should collect the db connection acquire duration metric when a connection is acquired', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
simulateAcquire(500);
const metricValues = await instance.getMetricValues('db_connection_acquire_duration_seconds');
assert.equal(metricValues?.[0].value, 0.5);
});
it('should collect the db connection acquire duration metric when a connection fails to be acquired', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
simulateAcquireFail(500);
const metricValues = await instance.getMetricValues('db_connection_acquire_duration_seconds');
assert.equal(metricValues?.[0].value, 0.5);
});
});
describe('Custom Metrics', function () {
@ -544,7 +583,7 @@ describe('Prometheus Client', function () {
]);
});
it('can use the percentiles option to set the summary value', async function () {
it('respects the percentiles option', async function () {
instance = new PrometheusClient();
instance.init();
instance.registerSummary({name: 'test_summary', help: 'A test summary', percentiles: [0.1, 0.5, 0.9]});
@ -558,6 +597,44 @@ describe('Prometheus Client', function () {
]);
});
it('removes datapoints older than maxAgeSeconds from percentile metrics if maxAgeSeconds and ageBuckets are provided', async function () {
const clock = sinon.useFakeTimers();
instance = new PrometheusClient();
instance.init();
const metric = instance.registerSummary({name: 'test_summary', help: 'A test summary', maxAgeSeconds: 10, ageBuckets: 1});
metric.observe(1);
const metricValuesBefore = await instance.getMetricValues('ghost_test_summary');
assert.deepEqual(metricValuesBefore, [
{labels: {quantile: 0.5}, value: 1},
{labels: {quantile: 0.9}, value: 1},
{labels: {quantile: 0.99}, value: 1},
{metricName: 'ghost_test_summary_sum', labels: {}, value: 1},
{metricName: 'ghost_test_summary_count', labels: {}, value: 1}
]);
clock.tick(20000);
const metricValuesAfter = await instance.getMetricValues('ghost_test_summary');
assert.deepEqual(metricValuesAfter, [
{labels: {quantile: 0.5}, value: 0},
{labels: {quantile: 0.9}, value: 0},
{labels: {quantile: 0.99}, value: 0},
{metricName: 'ghost_test_summary_sum', labels: {}, value: 1},
{metricName: 'ghost_test_summary_count', labels: {}, value: 1}
]);
clock.restore();
});
it('does not export the metric if maxAgeSeconds and ageBuckets are provided and pruneAgedBuckets is true', async function () {
const clock = sinon.useFakeTimers();
instance = new PrometheusClient();
instance.init();
const metric = instance.registerSummary({name: 'test_summary', help: 'A test summary', maxAgeSeconds: 10, ageBuckets: 1, pruneAgedBuckets: true});
metric.observe(1);
clock.tick(20000);
const metricValues = await instance.getMetricValues('ghost_test_summary');
assert.deepEqual(metricValues, []);
clock.restore();
});
it('can use a timer to observe the summary value', async function () {
instance = new PrometheusClient();
instance.init();