Added connection pool metrics to prometheus client (#21576)

ref
https://linear.app/ghost/issue/ENG-1592/start-monitoring-connection-pool-utilization-in-ghost

- This commit adds Prometheus metrics to the connection pool so we can
start tracking connection pool utilization and the number of pending
acquires. It also adds some basic SQL query summary metrics, such as
queries per minute and query duration percentiles (see the configuration
sketch below).
- The connection pool has long been suspected to be one of Ghost's main
constraints, but it has been difficult to get real visibility into its
state. With this change, we should be able to directly observe, monitor,
and alert on the connection pool.
- Updated the Grafana image to 8.3.0 to pick up a query editor bug fix
that shipped in 8.3, even though this is a couple of versions ahead of
production.
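
As a rough sketch of how this might be enabled locally (not part of this diff): the pushgateway defaults below are taken from the client code in this commit, while the knex setup and import path are illustrative assumptions.

import knex from 'knex';
import {PrometheusClient} from './shared/prometheus-client'; // illustrative path

// Hypothetical wiring; only the pushgateway keys below appear in this commit
const knexInstance = knex({client: 'mysql2', connection: {/* ... */}, pool: {min: 1, max: 10}});
const prometheusClient = new PrometheusClient({
    pushgateway: {
        enabled: true,
        url: 'http://localhost:9091', // client default when unset
        interval: 5000,               // client default push interval (ms)
        jobName: 'ghost'              // client default job name
    }
});

prometheusClient.init();                       // starts the push loop when the pushgateway is enabled
prometheusClient.instrumentKnex(knexInstance); // registers the pool gauges and query metrics
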
Committed by Chris Raible on 2024-11-07 23:01:34 -08:00 via GitHub
commit 85408d10b7 (parent 08fe08a0a2)
8 changed files with 2579 additions and 1638 deletions

@@ -37,13 +37,14 @@ services:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
grafana:
profiles: [monitoring]
image: grafana/grafana:8.2.3
image: grafana/grafana:8.3.0
container_name: ghost-grafana
ports:
- "3000:3000"
restart: always
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
volumes:
- ./grafana/datasources:/etc/grafana/provisioning/datasources
- ./grafana/dashboard.yml:/etc/grafana/provisioning/dashboards/main.yaml

@@ -9,7 +9,7 @@ providers:
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: false
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true

File diff suppressed because it is too large

@@ -68,6 +68,10 @@ if (!knexInstance && config.get('database') && config.get('database').client) {
const instrumentation = new ConnectionPoolInstrumentation({knex: knexInstance, logging, metrics, config});
instrumentation.instrument();
}
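// Only wire up knex instrumentation in the shared Prometheus client when metrics are enabled in config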
if (config.get('prometheus:enabled')) {
const prometheusClient = require('../../../shared/prometheus-client');
prometheusClient.instrumentKnex(knexInstance);
}
}
module.exports = knexInstance;

@@ -25,14 +25,15 @@
"@types/express": "4.17.21",
"@types/stoppable": "1.1.0",
"c8": "10.1.2",
"knex": "2.4.2",
"mocha": "10.7.3",
"nock": "13.5.5",
"sinon": "19.0.2",
"supertest": "7.0.0",
"ts-node": "10.9.2",
"typescript": "5.6.2"
},
"dependencies": {
"@tryghost/errors": "1.3.6",
"@tryghost/logging": "2.4.19",
"express": "4.21.1",
"prom-client": "15.1.3",

@@ -1,7 +1,7 @@
import {Request, Response} from 'express';
import client from 'prom-client';
import type {Knex} from 'knex';
import logging from '@tryghost/logging';
import errors from '@tryghost/errors';
type PrometheusClientConfig = {
register?: client.Registry;
@@ -21,17 +21,22 @@ export class PrometheusClient {
* Creates a new PrometheusClient instance
* @param prometheusConfig - The configuration for the PrometheusClient
*/
constructor(prometheusConfig: PrometheusClientConfig = {}) {
constructor(prometheusConfig: PrometheusClientConfig = {}, logger: any = logging) {
this.config = prometheusConfig;
this.client = client;
this.prefix = 'ghost_';
this.logger = logger;
}
public client;
public gateway: client.Pushgateway<client.RegistryContentType> | undefined; // public for testing
public customMetrics: Map<string, client.Metric> = new Map();
public queries: Map<string, Date> = new Map();
private config: PrometheusClientConfig;
private prefix;
public gateway: client.Pushgateway<client.RegistryContentType> | undefined; // public for testing
private pushInterval: ReturnType<typeof setInterval> | undefined;
private logger: any;
/**
* Initializes the prometheus client, setting up the pushgateway if enabled
@@ -42,6 +47,7 @@ export class PrometheusClient {
const gatewayUrl = this.config.pushgateway.url || 'http://localhost:9091';
const interval = this.config.pushgateway.interval || 5000;
this.gateway = new client.Pushgateway(gatewayUrl);
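// Push once immediately, then keep pushing on a fixed interval (default 5000ms)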
this.pushMetrics();
this.pushInterval = setInterval(() => {
this.pushMetrics();
}, interval);
@@ -56,15 +62,15 @@
const jobName = this.config.pushgateway?.jobName || 'ghost';
try {
await this.gateway.pushAdd({jobName});
logging.debug('Metrics pushed to pushgateway - jobName: ', jobName);
this.logger.debug('Metrics pushed to pushgateway - jobName: ', jobName);
} catch (err) {
let error;
if (typeof err === 'object' && err !== null && 'code' in err) {
error = new errors.InternalServerError({message: 'Error pushing metrics to pushgateway: ' + err.code, code: err.code as string});
error = 'Error pushing metrics to pushgateway: ' + err.code as string;
} else {
error = new errors.InternalServerError({message: 'Error pushing metrics to pushgateway: Unknown error'});
error = 'Error pushing metrics to pushgateway: Unknown error';
}
logging.error(error);
this.logger.error(error);
}
}
}
@@ -118,4 +124,95 @@
getContentType() {
return this.client.register.contentType;
}
// Utility functions for creating custom metrics
/**
* Instruments the knex connection pool and queries
* @param knexInstance - The knex instance
*/
instrumentKnex(knexInstance: Knex) {
// Create some gauges for tracking the connection pool
this.customMetrics.set(`${this.prefix}db_connection_pool_max`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_max`,
help: 'The maximum number of connections allowed in the pool',
collect() {
this.set(knexInstance.client.pool.max);
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_min`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_min`,
help: 'The minimum number of connections allowed in the pool',
collect() {
this.set(knexInstance.client.pool.min);
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_active`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_active`,
help: 'The number of active connections to the database, which can be in use or idle',
collect() {
this.set(knexInstance.client.pool.numUsed() + knexInstance.client.pool.numFree());
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_used`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_used`,
help: 'The number of connections currently in use by the database',
collect() {
this.set(knexInstance.client.pool.numUsed());
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_idle`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_idle`,
help: 'The number of active connections currently idle in pool',
collect() {
this.set(knexInstance.client.pool.numFree());
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_pending_acquires`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_pending_acquires`,
help: 'The number of connections currently waiting to be acquired from the pool',
collect() {
this.set(knexInstance.client.pool.numPendingAcquires());
}
}));
this.customMetrics.set(`${this.prefix}db_connection_pool_pending_creates`, new this.client.Gauge({
name: `${this.prefix}db_connection_pool_pending_creates`,
help: 'The number of connections currently waiting to be created',
collect() {
this.set(knexInstance.client.pool.numPendingCreates());
}
}));
this.customMetrics.set(`${this.prefix}db_query_count`, new this.client.Counter({
name: `${this.prefix}db_query_count`,
help: 'The number of queries executed'
}));
this.customMetrics.set(`${this.prefix}db_query_duration_milliseconds`, new this.client.Summary({
name: `${this.prefix}db_query_duration_milliseconds`,
help: 'The duration of queries in milliseconds',
percentiles: [0.5, 0.9, 0.99]
}));
knexInstance.on('query', (query) => {
// Increment the query counter
(this.customMetrics.get(`${this.prefix}db_query_count`) as client.Counter).inc();
// Add the query to the map
this.queries.set(query.__knexQueryUid, new Date());
});
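// Observe the query duration once the response for a tracked query arrives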
knexInstance.on('query-response', (err, query) => {
const start = this.queries.get(query.__knexQueryUid);
if (start) {
const duration = new Date().getTime() - start.getTime();
(this.customMetrics.get(`${this.prefix}db_query_duration_milliseconds`) as client.Summary).observe(duration);
}
});
}
}
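
For reference, a minimal usage sketch based on the tests that follow; the knex configuration here is an illustrative assumption, while the client calls and metric names all appear in this diff.

import knex from 'knex';
import type {Gauge} from 'prom-client';
import {PrometheusClient} from './prometheus-client'; // illustrative path

const knexInstance = knex({client: 'mysql2', connection: {/* ... */}, pool: {min: 1, max: 10}});

const prometheus = new PrometheusClient();
prometheus.init();
prometheus.instrumentKnex(knexInstance);

// Custom metrics are registered under the 'ghost_' prefix and can be read back directly
const usedGauge = prometheus.customMetrics.get('ghost_db_connection_pool_used') as Gauge;
usedGauge.get().then(({values}) => {
    console.log('connections in use:', values[0].value); // get() runs the gauge's collect() callback
});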

@@ -2,11 +2,22 @@ import assert from 'assert/strict';
import {PrometheusClient} from '../src';
import {Request, Response} from 'express';
import * as sinon from 'sinon';
import type {Knex} from 'knex';
import nock from 'nock';
import {EventEmitter} from 'events';
import type {EventEmitter as EventEmitterType} from 'events';
import type {Gauge, Counter, Summary, Pushgateway, RegistryContentType} from 'prom-client';
describe('Prometheus Client', function () {
let instance: PrometheusClient;
let logger: any;
beforeEach(function () {
sinon.restore();
logger = {
info: sinon.stub(),
error: sinon.stub()
};
});
afterEach(function () {
@@ -14,6 +25,7 @@ describe('Prometheus Client', function () {
instance.stop();
instance.client.register.clear();
}
nock.cleanAll();
});
describe('constructor', function () {
@@ -31,19 +43,21 @@
assert.ok(collectDefaultMetricsSpy.called);
});
it('should create the pushgateway client if the pushgateway is enabled', function () {
instance = new PrometheusClient({pushgateway: {enabled: true}});
it('should create the pushgateway client if the pushgateway is enabled', async function () {
const clock = sinon.useFakeTimers();
nock('http://localhost:9091')
.persist()
.post('/metrics/job/ghost')
.reply(200);
instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20}});
const pushMetricsStub = sinon.stub(instance, 'pushMetrics').resolves();
instance.init();
assert.ok(instance.gateway);
});
it('should push metrics to the pushgateway if it is enabled', async function () {
const clock = sinon.useFakeTimers();
instance = new PrometheusClient({pushgateway: {enabled: true}});
const pushMetricsSpy = sinon.spy(instance, 'pushMetrics');
instance.init();
clock.tick(10000);
assert.ok(pushMetricsSpy.called);
assert.ok(pushMetricsStub.called, 'pushMetrics should be called immediately');
clock.tick(30);
assert.ok(pushMetricsStub.calledTwice, 'pushMetrics should be called again after the interval');
clock.restore();
});
});
@@ -56,6 +70,43 @@ });
});
});
describe('pushMetrics', function () {
it('should push metrics to the pushgateway', async function () {
const scope = nock('http://localhost:9091')
.persist()
.post('/metrics/job/ghost')
.reply(200);
instance = new PrometheusClient({pushgateway: {enabled: true}});
instance.init();
await instance.pushMetrics();
scope.done();
});
it('should log an error with error code if pushing metrics to the gateway fails', async function () {
instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
instance.init();
instance.gateway = {
pushAdd: sinon.stub().rejects({code: 'ECONNRESET'})
} as unknown as Pushgateway<RegistryContentType>;
await instance.pushMetrics();
assert.ok(logger.error.called);
const [[error]] = logger.error.args;
assert.match(error, /ECONNRESET/);
});
it('should log a generic error if the error is unknown', async function () {
instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
instance.init();
instance.gateway = {
pushAdd: sinon.stub().rejects()
} as unknown as Pushgateway<RegistryContentType>;
await instance.pushMetrics();
assert.ok(logger.error.called);
const [[error]] = logger.error.args;
assert.match(error, /Unknown error/);
});
});
describe('handleMetricsRequest', function () {
it('should return the metrics', async function () {
const setStub = sinon.stub();
@@ -111,4 +162,181 @@
assert.match(metrics, /^# HELP/);
});
});
describe('instrumentKnex', function () {
let knexMock: Knex;
let eventEmitter: EventEmitterType;
function simulateQuery(queryUid: string, duration: number) {
const clock = sinon.useFakeTimers();
eventEmitter.emit('query', {__knexQueryUid: queryUid, sql: 'SELECT 1'});
clock.tick(duration);
eventEmitter.emit('query-response', null, {__knexQueryUid: queryUid, sql: 'SELECT 1'});
clock.restore();
}
function simulateQueries(durations: number[]) {
durations.forEach((duration, index) => {
simulateQuery(`${index}`, duration);
});
}
beforeEach(function () {
eventEmitter = new EventEmitter();
knexMock = {
on: sinon.stub().callsFake((event, callback) => {
eventEmitter.on(event, callback);
}),
client: {
pool: {
max: 10,
min: 1,
numUsed: sinon.stub().returns(0),
numFree: sinon.stub().returns(0),
numPendingAcquires: sinon.stub().returns(0),
numPendingCreates: sinon.stub().returns(0)
}
}
} as unknown as Knex;
});
afterEach(function () {
sinon.restore();
});
it('should create all the custom metrics for the connection pool and queries', function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const metrics = Array.from(instance.customMetrics.keys());
assert.deepEqual(metrics, [
'ghost_db_connection_pool_max',
'ghost_db_connection_pool_min',
'ghost_db_connection_pool_active',
'ghost_db_connection_pool_used',
'ghost_db_connection_pool_idle',
'ghost_db_connection_pool_pending_acquires',
'ghost_db_connection_pool_pending_creates',
'ghost_db_query_count',
'ghost_db_query_duration_milliseconds'
]);
});
it('should collect the connection pool max metric', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolMaxGauge = instance.customMetrics.get('ghost_db_connection_pool_max') as Gauge;
const result = await connectionPoolMaxGauge.get();
assert.equal(result.values[0].value, 10);
});
it('should collect the connection pool min metric', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolMinGauge = instance.customMetrics.get('ghost_db_connection_pool_min') as Gauge;
const result = await connectionPoolMinGauge.get();
assert.equal(result.values[0].value, 1);
});
it('should collect the connection pool active metric', async function () {
knexMock.client.pool.numUsed = sinon.stub().returns(3);
knexMock.client.pool.numFree = sinon.stub().returns(7);
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolActiveGauge = instance.customMetrics.get('ghost_db_connection_pool_active') as Gauge;
const result = await connectionPoolActiveGauge.get();
assert.equal(result.values[0].value, 10);
});
it('should collect the connection pool used metric', async function () {
knexMock.client.pool.numUsed = sinon.stub().returns(3);
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolUsedGauge = instance.customMetrics.get('ghost_db_connection_pool_used') as Gauge;
const result = await connectionPoolUsedGauge.get();
assert.equal(result.values[0].value, 3);
});
it('should collect the connection pool idle metric', async function () {
knexMock.client.pool.numFree = sinon.stub().returns(7);
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolIdleGauge = instance.customMetrics.get('ghost_db_connection_pool_idle') as Gauge;
const result = await connectionPoolIdleGauge.get();
assert.equal(result.values[0].value, 7);
});
it('should collect the connection pool pending acquires metric', async function () {
knexMock.client.pool.numPendingAcquires = sinon.stub().returns(3);
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolPendingAcquiresGauge = instance.customMetrics.get('ghost_db_connection_pool_pending_acquires') as Gauge;
const result = await connectionPoolPendingAcquiresGauge.get();
assert.equal(result.values[0].value, 3);
});
it('should collect the connection pool pending creates metric', async function () {
knexMock.client.pool.numPendingCreates = sinon.stub().returns(3);
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const connectionPoolPendingCreatesGauge = instance.customMetrics.get('ghost_db_connection_pool_pending_creates') as Gauge;
const result = await connectionPoolPendingCreatesGauge.get();
assert.equal(result.values[0].value, 3);
});
it('should collect the db query count metric', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const dbQueryCountGauge = instance.customMetrics.get('ghost_db_query_count') as Counter;
const result = await dbQueryCountGauge.get();
assert.equal(result.values[0].value, 0);
});
it('should increment the db query count metric when a query is executed', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
eventEmitter.emit('query', {__knexQueryUid: '1', sql: 'SELECT 1'});
const dbQueryCountGauge = instance.customMetrics.get('ghost_db_query_count') as Counter;
const result = await dbQueryCountGauge.get();
assert.equal(result.values[0].value, 1);
assert.equal(instance.queries.size, 1);
assert.ok(instance.queries.has('1'));
});
it('should collect the db query duration metric when a query is executed', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
eventEmitter.emit('query', {__knexQueryUid: '1', sql: 'SELECT 1'});
const dbQueryDurationSummary = instance.customMetrics.get('ghost_db_query_duration_milliseconds') as Summary;
const result = await dbQueryDurationSummary.get();
assert.equal(result.values[0].value, 0);
});
it('should accurately calculate the query duration of a query', async function () {
instance = new PrometheusClient();
instance.init();
instance.instrumentKnex(knexMock);
const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000];
simulateQueries(durations);
const dbQueryDurationSummary = instance.customMetrics.get('ghost_db_query_duration_milliseconds') as Summary;
const result = await dbQueryDurationSummary.get();
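// For reference: the simulated durations are 100..1000ms in steps of 100, so count = 10
// and sum = 5500 exactly; the quantile values (550 / 950 / 1000) are what prom-client's
// summary estimation reports for this distribution.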
assert.deepEqual(result.values, [
{labels: {quantile: 0.5}, value: 550},
{labels: {quantile: 0.9}, value: 950},
{labels: {quantile: 0.99}, value: 1000},
{metricName: 'ghost_db_query_duration_milliseconds_sum', labels: {}, value: 5500},
{metricName: 'ghost_db_query_duration_milliseconds_count', labels: {}, value: 10}
]);
});
});
});

@@ -7430,7 +7430,7 @@
focus-trap "^6.7.2"
postcss-preset-env "^7.3.1"
"@tryghost/errors@1.3.1", "@tryghost/errors@1.3.5", "@tryghost/errors@1.3.6", "@tryghost/errors@^1.2.26", "@tryghost/errors@^1.2.3", "@tryghost/errors@^1.3.5", "@tryghost/errors@^1.3.6":
"@tryghost/errors@1.3.1", "@tryghost/errors@1.3.5", "@tryghost/errors@^1.2.26", "@tryghost/errors@^1.2.3", "@tryghost/errors@^1.3.5", "@tryghost/errors@^1.3.6":
version "1.3.5"
resolved "https://registry.yarnpkg.com/@tryghost/errors/-/errors-1.3.5.tgz#f4ef8e5c41a8a37456f2285271124180685827ae"
integrity sha512-iOkiHGnYFqSdFM9AVlgiL56Qcx6V9iQ3kbDKxyOAxrhMKq1OnOmOm7tr1CgGK1YDte9XYEZmR9hUZEg+ujn/jQ==