diff --git a/packages/backend/server/package.json b/packages/backend/server/package.json index d893852232..301b2599fc 100644 --- a/packages/backend/server/package.json +++ b/packages/backend/server/package.json @@ -48,6 +48,7 @@ "@opentelemetry/instrumentation-ioredis": "^0.35.3", "@opentelemetry/instrumentation-nestjs-core": "^0.33.3", "@opentelemetry/instrumentation-socket.io": "^0.34.3", + "@opentelemetry/resources": "^1.18.1", "@opentelemetry/sdk-metrics": "^1.18.1", "@opentelemetry/sdk-node": "^0.45.1", "@opentelemetry/sdk-trace-node": "^1.18.1", diff --git a/packages/backend/server/src/graphql/logger-plugin.ts b/packages/backend/server/src/graphql/logger-plugin.ts index 67c399d667..9028ce722a 100644 --- a/packages/backend/server/src/graphql/logger-plugin.ts +++ b/packages/backend/server/src/graphql/logger-plugin.ts @@ -20,7 +20,7 @@ export class GQLLoggerPlugin implements ApolloServerPlugin { const res = reqContext.contextValue.req.res as Response; const operation = reqContext.request.operationName; - metrics().gqlRequest.add(1, { operation }); + metrics.gql.counter('query_counter').add(1, { operation }); const start = Date.now(); return Promise.resolve({ @@ -30,7 +30,9 @@ export class GQLLoggerPlugin implements ApolloServerPlugin { 'Server-Timing', `gql;dur=${costInMilliseconds};desc="GraphQL"` ); - metrics().gqlTimer.record(costInMilliseconds, { operation }); + metrics.gql + .histogram('query_duration') + .record(costInMilliseconds, { operation }); return Promise.resolve(); }, didEncounterErrors: () => { @@ -39,7 +41,9 @@ export class GQLLoggerPlugin implements ApolloServerPlugin { 'Server-Timing', `gql;dur=${costInMilliseconds};desc="GraphQL ${operation}"` ); - metrics().gqlTimer.record(costInMilliseconds, { operation }); + metrics.gql + .histogram('query_duration') + .record(costInMilliseconds, { operation }); return Promise.resolve(); }, }); diff --git a/packages/backend/server/src/metrics/metrics.ts b/packages/backend/server/src/metrics/metrics.ts index b91223cf97..ccbb7be606 100644 --- a/packages/backend/server/src/metrics/metrics.ts +++ b/packages/backend/server/src/metrics/metrics.ts @@ -1,76 +1,129 @@ -import opentelemetry, { Attributes, Observable } from '@opentelemetry/api'; +import { + Attributes, + Counter, + Histogram, + Meter, + MetricOptions, +} from '@opentelemetry/api'; -interface AsyncMetric { - ob: Observable; - get value(): any; - get attrs(): Attributes | undefined; -} +import { getMeter } from './opentelemetry'; -let _metrics: ReturnType | undefined = undefined; +type MetricType = 'counter' | 'gauge' | 'histogram'; +type Metric = T extends 'counter' + ? Counter + : T extends 'gauge' + ? Histogram + : T extends 'histogram' + ? Histogram + : never; -export function getMeter(name = 'business') { - return opentelemetry.metrics.getMeter(name); -} +export type ScopedMetrics = { + [T in MetricType]: (name: string, opts?: MetricOptions) => Metric; +}; +type MetricCreators = { + [T in MetricType]: ( + meter: Meter, + name: string, + opts?: MetricOptions + ) => Metric; +}; -function createBusinessMetrics() { - const meter = getMeter(); - const asyncMetrics: AsyncMetric[] = []; +export type KnownMetricScopes = + | 'socketio' + | 'gql' + | 'jwst' + | 'auth' + | 'controllers' + | 'doc'; - function createGauge(name: string) { +const metricCreators: MetricCreators = { + counter(meter: Meter, name: string, opts?: MetricOptions) { + return meter.createCounter(name, opts); + }, + gauge(meter: Meter, name: string, opts?: MetricOptions) { let value: any; let attrs: Attributes | undefined; - const ob = meter.createObservableGauge(name); - asyncMetrics.push({ - ob, - get value() { - return value; - }, - get attrs() { - return attrs; - }, + const ob = meter.createObservableGauge(name, opts); + + ob.addCallback(result => { + result.observe(value, attrs); }); - return (newValue: any, newAttrs?: Attributes) => { - value = newValue; - attrs = newAttrs; - }; + return { + record: (newValue, newAttrs) => { + value = newValue; + attrs = newAttrs; + }, + } satisfies Histogram; + }, + histogram(meter: Meter, name: string, opts?: MetricOptions) { + return meter.createHistogram(name, opts); + }, +}; + +const scopes = new Map(); + +function make(scope: string) { + const meter = getMeter(); + const metrics = new Map(); + const prefix = scope + '/'; + + function getOrCreate( + type: T, + name: string, + opts?: MetricOptions + ): Metric { + name = prefix + name; + const metric = metrics.get(name); + if (metric) { + if (type !== metric.type) { + throw new Error( + `Metric ${name} has already been registered as ${metric.type} mode, but get as ${type} again.` + ); + } + + return metric.metric; + } else { + const metric = metricCreators[type](meter, name, opts); + metrics.set(name, { type, metric }); + return metric; + } } - const metrics = { - socketIOConnectionGauge: createGauge('socket_io_connection'), - - gqlRequest: meter.createCounter('gql_request'), - gqlError: meter.createCounter('gql_error'), - gqlTimer: meter.createHistogram('gql_timer'), - - jwstCodecMerge: meter.createCounter('jwst_codec_merge'), - jwstCodecDidnotMatch: meter.createCounter('jwst_codec_didnot_match'), - jwstCodecFail: meter.createCounter('jwst_codec_fail'), - - authCounter: meter.createCounter('auth'), - authFailCounter: meter.createCounter('auth_fail'), - - docHistoryCounter: meter.createCounter('doc_history_created'), - docRecoverCounter: meter.createCounter('doc_history_recovered'), - }; - - meter.addBatchObservableCallback( - result => { - asyncMetrics.forEach(metric => { - result.observe(metric.ob, metric.value, metric.attrs); - }); + return { + counter(name, opts) { + return getOrCreate('counter', name, opts); }, - asyncMetrics.map(({ ob }) => ob) - ); - - return metrics; + gauge(name, opts) { + return getOrCreate('gauge', name, opts); + }, + histogram(name, opts) { + return getOrCreate('histogram', name, opts); + }, + } satisfies ScopedMetrics; } -export function registerBusinessMetrics() { - if (!_metrics) { - _metrics = createBusinessMetrics(); +/** + * @example + * + * ``` + * metrics.scope.counter('example_count').add(1, { + * attr1: 'example-event' + * }) + * ``` + */ +export const metrics = new Proxy>( + // @ts-expect-error proxied + {}, + { + get(_, scopeName: string) { + let scope = scopes.get(scopeName); + if (!scope) { + scope = make(scopeName); + scopes.set(scopeName, scope); + } + + return scope; + }, } - - return _metrics; -} -export const metrics = registerBusinessMetrics; +); diff --git a/packages/backend/server/src/metrics/opentelemetry.ts b/packages/backend/server/src/metrics/opentelemetry.ts index cd5fcfa358..5b084c8cfc 100644 --- a/packages/backend/server/src/metrics/opentelemetry.ts +++ b/packages/backend/server/src/metrics/opentelemetry.ts @@ -1,5 +1,6 @@ import { MetricExporter } from '@google-cloud/opentelemetry-cloud-monitoring-exporter'; import { TraceExporter } from '@google-cloud/opentelemetry-cloud-trace-exporter'; +import { metrics } from '@opentelemetry/api'; import { CompositePropagator, W3CBaggagePropagator, @@ -16,6 +17,8 @@ import { NestInstrumentation } from '@opentelemetry/instrumentation-nestjs-core' import { SocketIoInstrumentation } from '@opentelemetry/instrumentation-socket.io'; import { ConsoleMetricExporter, + type MeterProvider, + MetricProducer, MetricReader, PeriodicExportingMetricReader, } from '@opentelemetry/sdk-metrics'; @@ -24,10 +27,11 @@ import { BatchSpanProcessor, ConsoleSpanExporter, SpanExporter, + TraceIdRatioBasedSampler, } from '@opentelemetry/sdk-trace-node'; import { PrismaInstrumentation } from '@prisma/instrumentation'; -import { registerBusinessMetrics } from './metrics'; +import { PrismaMetricProducer } from './prisma'; abstract class OpentelemetryFactor { abstract getMetricReader(): MetricReader; @@ -44,9 +48,14 @@ abstract class OpentelemetryFactor { ]; } + getMetricsProducers(): MetricProducer[] { + return [new PrismaMetricProducer()]; + } + create() { const traceExporter = this.getSpanExporter(); return new NodeSDK({ + sampler: new TraceIdRatioBasedSampler(0.1), traceExporter, metricReader: this.getMetricReader(), spanProcessor: new BatchSpanProcessor(traceExporter), @@ -67,7 +76,10 @@ class GCloudOpentelemetryFactor extends OpentelemetryFactor { return new PeriodicExportingMetricReader({ exportIntervalMillis: 30000, exportTimeoutMillis: 10000, - exporter: new MetricExporter(), + exporter: new MetricExporter({ + prefix: 'custom.googleapis.com', + }), + metricProducers: this.getMetricsProducers(), }); } @@ -78,7 +90,9 @@ class GCloudOpentelemetryFactor extends OpentelemetryFactor { class LocalOpentelemetryFactor extends OpentelemetryFactor { override getMetricReader(): MetricReader { - return new PrometheusExporter(); + return new PrometheusExporter({ + metricProducers: this.getMetricsProducers(), + }); } override getSpanExporter(): SpanExporter { @@ -90,6 +104,7 @@ class DebugOpentelemetryFactor extends OpentelemetryFactor { override getMetricReader(): MetricReader { return new PeriodicExportingMetricReader({ exporter: new ConsoleMetricExporter(), + metricProducers: this.getMetricsProducers(), }); } @@ -111,9 +126,30 @@ function createSDK() { return factor?.create(); } +let OPENTELEMETRY_STARTED = false; + +function ensureStarted() { + if (!OPENTELEMETRY_STARTED) { + OPENTELEMETRY_STARTED = true; + start(); + } +} + +function getMeterProvider() { + ensureStarted(); + return metrics.getMeterProvider(); +} + function registerCustomMetrics() { - const host = new HostMetrics({ name: 'instance-host-metrics' }); - host.start(); + const hostMetricsMonitoring = new HostMetrics({ + name: 'instance-host-metrics', + meterProvider: getMeterProvider() as MeterProvider, + }); + hostMetricsMonitoring.start(); +} + +export function getMeter(name = 'business') { + return getMeterProvider().getMeter(name); } export function start() { @@ -122,6 +158,5 @@ export function start() { if (sdk) { sdk.start(); registerCustomMetrics(); - registerBusinessMetrics(); } } diff --git a/packages/backend/server/src/metrics/prisma.ts b/packages/backend/server/src/metrics/prisma.ts new file mode 100644 index 0000000000..6c024c98ff --- /dev/null +++ b/packages/backend/server/src/metrics/prisma.ts @@ -0,0 +1,132 @@ +import { HrTime, ValueType } from '@opentelemetry/api'; +import { hrTime } from '@opentelemetry/core'; +import { Resource } from '@opentelemetry/resources'; +import { + AggregationTemporality, + CollectionResult, + DataPointType, + InstrumentType, + MetricProducer, + ScopeMetrics, +} from '@opentelemetry/sdk-metrics'; + +import { PrismaService } from '../prisma'; + +function transformPrismaKey(key: string) { + // replace first '_' to '/' as a scope prefix + // example: prisma_client_query_duration_seconds_sum -> prisma/client_query_duration_seconds_sum + return key.replace(/_/, '/'); +} + +export class PrismaMetricProducer implements MetricProducer { + private readonly startTime: HrTime = hrTime(); + + async collect(): Promise { + const result: CollectionResult = { + resourceMetrics: { + resource: Resource.EMPTY, + scopeMetrics: [], + }, + errors: [], + }; + + if (!PrismaService.INSTANCE) { + return result; + } + + const prisma = PrismaService.INSTANCE; + + const endTime = hrTime(); + + const metrics = await prisma.$metrics.json(); + const scopeMetrics: ScopeMetrics = { + scope: { + name: '', + }, + metrics: [], + }; + for (const counter of metrics.counters) { + scopeMetrics.metrics.push({ + descriptor: { + name: transformPrismaKey(counter.key), + description: counter.description, + unit: '1', + type: InstrumentType.COUNTER, + valueType: ValueType.INT, + }, + dataPointType: DataPointType.SUM, + aggregationTemporality: AggregationTemporality.CUMULATIVE, + dataPoints: [ + { + startTime: this.startTime, + endTime: endTime, + value: counter.value, + attributes: counter.labels, + }, + ], + isMonotonic: true, + }); + } + + for (const gauge of metrics.gauges) { + scopeMetrics.metrics.push({ + descriptor: { + name: transformPrismaKey(gauge.key), + description: gauge.description, + unit: '1', + type: InstrumentType.UP_DOWN_COUNTER, + valueType: ValueType.INT, + }, + dataPointType: DataPointType.GAUGE, + aggregationTemporality: AggregationTemporality.CUMULATIVE, + dataPoints: [ + { + startTime: this.startTime, + endTime: endTime, + value: gauge.value, + attributes: gauge.labels, + }, + ], + }); + } + + for (const histogram of metrics.histograms) { + const boundaries = []; + const counts = []; + for (const [boundary, count] of histogram.value.buckets) { + boundaries.push(boundary); + counts.push(count); + } + scopeMetrics.metrics.push({ + descriptor: { + name: transformPrismaKey(histogram.key), + description: histogram.description, + unit: 'ms', + type: InstrumentType.HISTOGRAM, + valueType: ValueType.DOUBLE, + }, + dataPointType: DataPointType.HISTOGRAM, + aggregationTemporality: AggregationTemporality.CUMULATIVE, + dataPoints: [ + { + startTime: this.startTime, + endTime: endTime, + value: { + buckets: { + boundaries, + counts, + }, + count: histogram.value.count, + sum: histogram.value.sum, + }, + attributes: histogram.labels, + }, + ], + }); + } + + result.resourceMetrics.scopeMetrics.push(scopeMetrics); + + return result; + } +} diff --git a/packages/backend/server/src/metrics/utils.ts b/packages/backend/server/src/metrics/utils.ts index 1c4fa313ff..e4d16a3492 100644 --- a/packages/backend/server/src/metrics/utils.ts +++ b/packages/backend/server/src/metrics/utils.ts @@ -1,8 +1,9 @@ import { Attributes } from '@opentelemetry/api'; -import { getMeter } from './metrics'; +import { KnownMetricScopes, metrics } from './metrics'; export const CallTimer = ( + scope: KnownMetricScopes, name: string, attrs?: Attributes ): MethodDecorator => { @@ -18,9 +19,11 @@ export const CallTimer = ( } desc.value = function (...args: any[]) { - const timer = getMeter().createHistogram(name, { + const timer = metrics[scope].histogram(name, { description: `function call time costs of ${name}`, + unit: 'ms', }); + const start = Date.now(); const end = () => { @@ -48,6 +51,7 @@ export const CallTimer = ( }; export const CallCounter = ( + scope: KnownMetricScopes, name: string, attrs?: Attributes ): MethodDecorator => { @@ -63,7 +67,7 @@ export const CallCounter = ( } desc.value = function (...args: any[]) { - const count = getMeter().createCounter(name, { + const count = metrics[scope].counter(name, { description: `function call counter of ${name}`, }); diff --git a/packages/backend/server/src/modules/auth/next-auth.controller.ts b/packages/backend/server/src/modules/auth/next-auth.controller.ts index f8d8faa043..9d717f0121 100644 --- a/packages/backend/server/src/modules/auth/next-auth.controller.ts +++ b/packages/backend/server/src/modules/auth/next-auth.controller.ts @@ -89,12 +89,13 @@ export class NextAuthController { res.redirect(`/signin${query}`); return; } - metrics().authCounter.add(1); const [action, providerId] = req.url // start with request url .slice(BASE_URL.length) // make relative to baseUrl .replace(/\?.*/, '') // remove query part, use only path part .split('/') as [AuthAction, string]; // as array of strings; + metrics.auth.counter('call_counter').add(1, { action, providerId }); + const credentialsSignIn = req.method === 'POST' && providerId === 'credentials'; let userId: string | undefined; @@ -126,7 +127,9 @@ export class NextAuthController { const options = this.nextAuthOptions; if (req.method === 'POST' && action === 'session') { if (typeof req.body !== 'object' || typeof req.body.data !== 'object') { - metrics().authFailCounter.add(1, { reason: 'invalid_session_data' }); + metrics.auth + .counter('call_fails_counter') + .add(1, { reason: 'invalid_session_data' }); throw new BadRequestException(`Invalid new session data`); } const user = await this.updateSession(req, req.body.data); @@ -209,9 +212,10 @@ export class NextAuthController { if (redirect?.endsWith('api/auth/error?error=AccessDenied')) { this.logger.log(`Early access redirect headers: ${req.headers}`); - metrics().authFailCounter.add(1, { - reason: 'no_early_access_permission', - }); + metrics.auth + .counter('call_fails_counter') + .add(1, { reason: 'no_early_access_permission' }); + if ( !req.headers?.referer || checkUrlOrigin(req.headers.referer, 'https://accounts.google.com') diff --git a/packages/backend/server/src/modules/doc/history.ts b/packages/backend/server/src/modules/doc/history.ts index 2a6dc47554..7502355950 100644 --- a/packages/backend/server/src/modules/doc/history.ts +++ b/packages/backend/server/src/modules/doc/history.ts @@ -68,7 +68,11 @@ export class DocHistoryManager { // safe to ignore // only happens when duplicated history record created in multi processes }); - metrics().docHistoryCounter.add(1, {}); + metrics.doc + .counter('history_created_counter', { + description: 'How many times the snapshot history created', + }) + .add(1); this.logger.log( `History created for ${snapshot.id} in workspace ${snapshot.workspaceId}.` ); @@ -182,7 +186,11 @@ export class DocHistoryManager { // which is not the solution in CRDT. // let user revert in client and update the data in sync system // `await this.db.snapshot.update();` - metrics().docRecoverCounter.add(1, {}); + metrics.doc + .counter('history_recovered_counter', { + description: 'How many times history recovered request happened', + }) + .add(1); return history.timestamp; } diff --git a/packages/backend/server/src/modules/doc/manager.ts b/packages/backend/server/src/modules/doc/manager.ts index 3362bc5ced..326902b35f 100644 --- a/packages/backend/server/src/modules/doc/manager.ts +++ b/packages/backend/server/src/modules/doc/manager.ts @@ -125,13 +125,13 @@ export class DocManager implements OnModuleInit, OnModuleDestroy { this.config.doc.manager.experimentalMergeWithJwstCodec && updates.length < 100 /* avoid overloading */ ) { - metrics().jwstCodecMerge.add(1); + metrics.jwst.counter('codec_merge_counter').add(1); const yjsResult = Buffer.from(encodeStateAsUpdate(doc)); let log = false; try { const jwstResult = jwstMergeUpdates(updates); if (!compare(yjsResult, jwstResult)) { - metrics().jwstCodecDidnotMatch.add(1); + metrics.jwst.counter('codec_not_match').add(1); this.logger.warn( `jwst codec result doesn't match yjs codec result for: ${guid}` ); @@ -142,7 +142,7 @@ export class DocManager implements OnModuleInit, OnModuleDestroy { } } } catch (e) { - metrics().jwstCodecFail.add(1); + metrics.jwst.counter('codec_fails_counter').add(1); this.logger.warn(`jwst apply update failed for ${guid}: ${e}`); log = true; } finally { diff --git a/packages/backend/server/src/modules/sync/events/events.gateway.ts b/packages/backend/server/src/modules/sync/events/events.gateway.ts index 2921411860..f847842c1c 100644 --- a/packages/backend/server/src/modules/sync/events/events.gateway.ts +++ b/packages/backend/server/src/modules/sync/events/events.gateway.ts @@ -45,6 +45,7 @@ export const GatewayErrorWrapper = (): MethodDecorator => { try { result = originalMethod.apply(this, args); } catch (e) { + metrics.socketio.counter('unhandled_errors').add(1); return { error: new InternalError(e as Error), }; @@ -52,6 +53,7 @@ export const GatewayErrorWrapper = (): MethodDecorator => { if (result instanceof Promise) { return result.catch(e => { + metrics.socketio.counter('unhandled_errors').add(1); return { error: new InternalError(e), }; @@ -68,7 +70,7 @@ export const GatewayErrorWrapper = (): MethodDecorator => { const SubscribeMessage = (event: string) => applyDecorators( GatewayErrorWrapper(), - CallTimer('socket_io_event_duration', { event }), + CallTimer('socketio', 'event_duration', { event }), RawSubscribeMessage(event) ); @@ -104,12 +106,12 @@ export class EventsGateway implements OnGatewayConnection, OnGatewayDisconnect { handleConnection() { this.connectionCount++; - metrics().socketIOConnectionGauge(this.connectionCount); + metrics.socketio.gauge('realtime_connections').record(this.connectionCount); } handleDisconnect() { this.connectionCount--; - metrics().socketIOConnectionGauge(this.connectionCount); + metrics.socketio.gauge('realtime_connections').record(this.connectionCount); } @Auth() diff --git a/packages/backend/server/src/modules/workspaces/controller.ts b/packages/backend/server/src/modules/workspaces/controller.ts index 2d20272f94..47b64cf4aa 100644 --- a/packages/backend/server/src/modules/workspaces/controller.ts +++ b/packages/backend/server/src/modules/workspaces/controller.ts @@ -34,7 +34,7 @@ export class WorkspacesController { // // NOTE: because graphql can't represent a File, so we have to use REST API to get blob @Get('/:id/blobs/:name') - @CallTimer('doc_controller', { method: 'get_blob' }) + @CallTimer('controllers', 'workspace_get_blob') async blob( @Param('id') workspaceId: string, @Param('name') name: string, @@ -59,7 +59,7 @@ export class WorkspacesController { @Get('/:id/docs/:guid') @Auth() @Publicable() - @CallTimer('doc_controller', { method: 'get_doc' }) + @CallTimer('controllers', 'workspace_get_doc') async doc( @CurrentUser() user: UserType | undefined, @Param('id') ws: string, @@ -106,7 +106,7 @@ export class WorkspacesController { @Get('/:id/docs/:guid/histories/:timestamp') @Auth() - @CallTimer('doc_controller', { method: 'get_history' }) + @CallTimer('controllers', 'workspace_get_history') async history( @CurrentUser() user: UserType, @Param('id') ws: string, diff --git a/packages/backend/server/src/prisma/service.ts b/packages/backend/server/src/prisma/service.ts index 25ac6b92f4..76fb92b392 100644 --- a/packages/backend/server/src/prisma/service.ts +++ b/packages/backend/server/src/prisma/service.ts @@ -7,6 +7,13 @@ export class PrismaService extends PrismaClient implements OnModuleInit, OnModuleDestroy { + static INSTANCE: PrismaService | null = null; + + constructor() { + super(); + PrismaService.INSTANCE = this; + } + async onModuleInit() { await this.$connect(); } diff --git a/yarn.lock b/yarn.lock index f56433d137..edfee1f4c1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -734,6 +734,7 @@ __metadata: "@opentelemetry/instrumentation-ioredis": "npm:^0.35.3" "@opentelemetry/instrumentation-nestjs-core": "npm:^0.33.3" "@opentelemetry/instrumentation-socket.io": "npm:^0.34.3" + "@opentelemetry/resources": "npm:^1.18.1" "@opentelemetry/sdk-metrics": "npm:^1.18.1" "@opentelemetry/sdk-node": "npm:^0.45.1" "@opentelemetry/sdk-trace-node": "npm:^1.18.1" @@ -9017,7 +9018,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/resources@npm:1.18.1": +"@opentelemetry/resources@npm:1.18.1, @opentelemetry/resources@npm:^1.18.1": version: 1.18.1 resolution: "@opentelemetry/resources@npm:1.18.1" dependencies: