From c1cd1713b9e2b5ca087711ffb0e9d19c816b2449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=8D=8E=E6=A1=A5?= Date: Sat, 25 Nov 2023 23:37:07 +0800 Subject: [PATCH] Revert "refactor(server): standarderlize metrics and trace with OTEL (#5054)" This reverts commit 91efca107a0f780cb141ba3381a6091a729f8d90. --- .../charts/graphql/templates/monitoring.yaml | 13 ++ .../charts/sync/templates/monitoring.yaml | 13 ++ packages/backend/server/package.json | 3 - packages/backend/server/src/app.ts | 2 + packages/backend/server/src/graphql.module.ts | 7 +- .../server/src/graphql/logger-plugin.ts | 15 ++- packages/backend/server/src/index.ts | 48 ++++++- .../backend/server/src/metrics/controller.ts | 18 +++ packages/backend/server/src/metrics/index.ts | 16 ++- .../backend/server/src/metrics/metrics.ts | 89 +++--------- .../server/src/metrics/opentelemetry.ts | 127 ------------------ packages/backend/server/src/metrics/utils.ts | 120 ++++++++++++++--- .../src/modules/auth/next-auth.controller.ts | 9 +- .../backend/server/src/modules/doc/history.ts | 9 +- .../backend/server/src/modules/doc/manager.ts | 9 +- .../src/modules/sync/events/events.gateway.ts | 12 +- .../src/modules/workspaces/controller.ts | 10 +- packages/backend/server/tests/auth.spec.ts | 2 + packages/backend/server/tests/doc.spec.ts | 2 + packages/backend/server/tests/history.spec.ts | 8 +- packages/backend/server/tests/mailer.e2e.ts | 2 + .../server/tests/prometheus-metrics.spec.ts | 78 +++++++++++ yarn.lock | 42 +----- 23 files changed, 358 insertions(+), 296 deletions(-) create mode 100644 .github/helm/affine/charts/graphql/templates/monitoring.yaml create mode 100644 .github/helm/affine/charts/sync/templates/monitoring.yaml create mode 100644 packages/backend/server/src/metrics/controller.ts delete mode 100644 packages/backend/server/src/metrics/opentelemetry.ts create mode 100644 packages/backend/server/tests/prometheus-metrics.spec.ts diff --git a/.github/helm/affine/charts/graphql/templates/monitoring.yaml b/.github/helm/affine/charts/graphql/templates/monitoring.yaml new file mode 100644 index 0000000000..5b7da3aa7c --- /dev/null +++ b/.github/helm/affine/charts/graphql/templates/monitoring.yaml @@ -0,0 +1,13 @@ +{{- if .Values.global.gke.enabled -}} +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + name: "{{ .Chart.Name }}-monitoring" +spec: + selector: + matchLabels: + app.kubernetes.io/name: "{{ include "graphql.name" . }}" + endpoints: + - port: {{ .Values.service.port }} + interval: 30s +{{- end }} diff --git a/.github/helm/affine/charts/sync/templates/monitoring.yaml b/.github/helm/affine/charts/sync/templates/monitoring.yaml new file mode 100644 index 0000000000..b7a03e6401 --- /dev/null +++ b/.github/helm/affine/charts/sync/templates/monitoring.yaml @@ -0,0 +1,13 @@ +{{- if .Values.global.gke.enabled -}} +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + name: "{{ .Chart.Name }}-monitoring" +spec: + selector: + matchLabels: + app.kubernetes.io/name: "{{ include "sync.name" . }}" + endpoints: + - port: {{ .Values.service.port }} + interval: 30s +{{- end }} diff --git a/packages/backend/server/package.json b/packages/backend/server/package.json index 17daf24fe4..48a99cdc12 100644 --- a/packages/backend/server/package.json +++ b/packages/backend/server/package.json @@ -39,9 +39,6 @@ "@node-rs/jsonwebtoken": "^0.2.3", "@opentelemetry/api": "^1.7.0", "@opentelemetry/core": "^1.18.1", - "@opentelemetry/exporter-prometheus": "^0.45.1", - "@opentelemetry/exporter-zipkin": "^1.18.1", - "@opentelemetry/host-metrics": "^0.33.2", "@opentelemetry/instrumentation": "^0.45.1", "@opentelemetry/instrumentation-graphql": "^0.36.0", "@opentelemetry/instrumentation-http": "^0.45.1", diff --git a/packages/backend/server/src/app.ts b/packages/backend/server/src/app.ts index cab18af060..8441983272 100644 --- a/packages/backend/server/src/app.ts +++ b/packages/backend/server/src/app.ts @@ -3,6 +3,7 @@ import { Module } from '@nestjs/common'; import { AppController } from './app.controller'; import { CacheModule } from './cache'; import { ConfigModule } from './config'; +import { MetricsModule } from './metrics'; import { BusinessModules } from './modules'; import { AuthModule } from './modules/auth'; import { PrismaModule } from './prisma'; @@ -15,6 +16,7 @@ const BasicModules = [ ConfigModule.forRoot(), CacheModule, StorageModule.forRoot(), + MetricsModule, SessionModule, RateLimiterModule, AuthModule, diff --git a/packages/backend/server/src/graphql.module.ts b/packages/backend/server/src/graphql.module.ts index e9c86ffa35..04bc1034dd 100644 --- a/packages/backend/server/src/graphql.module.ts +++ b/packages/backend/server/src/graphql.module.ts @@ -8,13 +8,14 @@ import { fileURLToPath } from 'url'; import { Config } from './config'; import { GQLLoggerPlugin } from './graphql/logger-plugin'; +import { Metrics } from './metrics/metrics'; @Global() @Module({ imports: [ GraphQLModule.forRootAsync({ driver: ApolloDriver, - useFactory: (config: Config) => { + useFactory: (config: Config, metrics: Metrics) => { return { ...config.graphql, path: `${config.path}/graphql`, @@ -30,10 +31,10 @@ import { GQLLoggerPlugin } from './graphql/logger-plugin'; req, res, }), - plugins: [new GQLLoggerPlugin()], + plugins: [new GQLLoggerPlugin(metrics)], }; }, - inject: [Config], + inject: [Config, Metrics], }), ], }) diff --git a/packages/backend/server/src/graphql/logger-plugin.ts b/packages/backend/server/src/graphql/logger-plugin.ts index 67c399d667..ee7c6670a3 100644 --- a/packages/backend/server/src/graphql/logger-plugin.ts +++ b/packages/backend/server/src/graphql/logger-plugin.ts @@ -7,39 +7,40 @@ import { Plugin } from '@nestjs/apollo'; import { Logger } from '@nestjs/common'; import { Response } from 'express'; -import { metrics } from '../metrics/metrics'; +import { Metrics } from '../metrics/metrics'; import { ReqContext } from '../types'; @Plugin() export class GQLLoggerPlugin implements ApolloServerPlugin { protected logger = new Logger(GQLLoggerPlugin.name); + constructor(private readonly metrics: Metrics) {} + requestDidStart( reqContext: GraphQLRequestContext ): Promise>> { const res = reqContext.contextValue.req.res as Response; const operation = reqContext.request.operationName; - metrics().gqlRequest.add(1, { operation }); - const start = Date.now(); + this.metrics.gqlRequest(1, { operation }); + const timer = this.metrics.gqlTimer({ operation }); return Promise.resolve({ willSendResponse: () => { - const costInMilliseconds = Date.now() - start; + const costInMilliseconds = timer() * 1000; res.setHeader( 'Server-Timing', `gql;dur=${costInMilliseconds};desc="GraphQL"` ); - metrics().gqlTimer.record(costInMilliseconds, { operation }); return Promise.resolve(); }, didEncounterErrors: () => { - const costInMilliseconds = Date.now() - start; + this.metrics.gqlError(1, { operation }); + const costInMilliseconds = timer() * 1000; res.setHeader( 'Server-Timing', `gql;dur=${costInMilliseconds};desc="GraphQL ${operation}"` ); - metrics().gqlTimer.record(costInMilliseconds, { operation }); return Promise.resolve(); }, }); diff --git a/packages/backend/server/src/index.ts b/packages/backend/server/src/index.ts index 1f42851a94..a124df389a 100644 --- a/packages/backend/server/src/index.ts +++ b/packages/backend/server/src/index.ts @@ -1,9 +1,22 @@ /// -import { start as startAutoMetrics } from './metrics'; -startAutoMetrics(); - +import { MetricExporter } from '@google-cloud/opentelemetry-cloud-monitoring-exporter'; +import { TraceExporter } from '@google-cloud/opentelemetry-cloud-trace-exporter'; import { NestFactory } from '@nestjs/core'; import type { NestExpressApplication } from '@nestjs/platform-express'; +import { + CompositePropagator, + W3CBaggagePropagator, + W3CTraceContextPropagator, +} from '@opentelemetry/core'; +import gql from '@opentelemetry/instrumentation-graphql'; +import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'; +import ioredis from '@opentelemetry/instrumentation-ioredis'; +import { NestInstrumentation } from '@opentelemetry/instrumentation-nestjs-core'; +import socketIO from '@opentelemetry/instrumentation-socket.io'; +import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics'; +import { NodeSDK } from '@opentelemetry/sdk-node'; +import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-node'; +import { PrismaInstrumentation } from '@prisma/instrumentation'; import cookieParser from 'cookie-parser'; import { static as staticMiddleware } from 'express'; import graphqlUploadExpress from 'graphql-upload/graphqlUploadExpress.mjs'; @@ -15,6 +28,35 @@ import { serverTimingAndCache } from './middleware/timing'; import { RedisIoAdapter } from './modules/sync/redis-adapter'; const { NODE_ENV, AFFINE_ENV } = process.env; + +if (NODE_ENV === 'production') { + const traceExporter = new TraceExporter(); + const tracing = new NodeSDK({ + traceExporter, + metricReader: new PeriodicExportingMetricReader({ + exporter: new MetricExporter(), + }), + spanProcessor: new BatchSpanProcessor(traceExporter), + textMapPropagator: new CompositePropagator({ + propagators: [ + new W3CBaggagePropagator(), + new W3CTraceContextPropagator(), + ], + }), + instrumentations: [ + new NestInstrumentation(), + new ioredis.IORedisInstrumentation(), + new socketIO.SocketIoInstrumentation({ traceReserved: true }), + new gql.GraphQLInstrumentation({ mergeItems: true }), + new HttpInstrumentation(), + new PrismaInstrumentation(), + ], + serviceName: 'affine-cloud', + }); + + tracing.start(); +} + const app = await NestFactory.create(AppModule, { cors: true, rawBody: true, diff --git a/packages/backend/server/src/metrics/controller.ts b/packages/backend/server/src/metrics/controller.ts new file mode 100644 index 0000000000..c130cf9005 --- /dev/null +++ b/packages/backend/server/src/metrics/controller.ts @@ -0,0 +1,18 @@ +import { Controller, Get, Res } from '@nestjs/common'; +import type { Response } from 'express'; +import { register } from 'prom-client'; + +import { PrismaService } from '../prisma'; + +@Controller() +export class MetricsController { + constructor(private readonly prisma: PrismaService) {} + + @Get('/metrics') + async index(@Res() res: Response): Promise { + res.header('Content-Type', register.contentType); + const prismaMetrics = await this.prisma.$metrics.prometheus(); + const appMetrics = await register.metrics(); + res.send(appMetrics + prismaMetrics); + } +} diff --git a/packages/backend/server/src/metrics/index.ts b/packages/backend/server/src/metrics/index.ts index 655c717c3e..a4f375e25d 100644 --- a/packages/backend/server/src/metrics/index.ts +++ b/packages/backend/server/src/metrics/index.ts @@ -1,3 +1,13 @@ -export * from './metrics'; -export { start } from './opentelemetry'; -export * from './utils'; +import { Global, Module } from '@nestjs/common'; + +import { MetricsController } from '../metrics/controller'; +import { Metrics } from './metrics'; + +@Global() +@Module({ + providers: [Metrics], + exports: [Metrics], + controllers: [MetricsController], +}) +export class MetricsModule {} +export { Metrics }; diff --git a/packages/backend/server/src/metrics/metrics.ts b/packages/backend/server/src/metrics/metrics.ts index b91223cf97..5df022c80e 100644 --- a/packages/backend/server/src/metrics/metrics.ts +++ b/packages/backend/server/src/metrics/metrics.ts @@ -1,76 +1,31 @@ -import opentelemetry, { Attributes, Observable } from '@opentelemetry/api'; +import { Injectable, OnModuleDestroy } from '@nestjs/common'; +import { register } from 'prom-client'; -interface AsyncMetric { - ob: Observable; - get value(): any; - get attrs(): Attributes | undefined; -} +import { metricsCreator } from './utils'; -let _metrics: ReturnType | undefined = undefined; - -export function getMeter(name = 'business') { - return opentelemetry.metrics.getMeter(name); -} - -function createBusinessMetrics() { - const meter = getMeter(); - const asyncMetrics: AsyncMetric[] = []; - - function createGauge(name: string) { - let value: any; - let attrs: Attributes | undefined; - const ob = meter.createObservableGauge(name); - asyncMetrics.push({ - ob, - get value() { - return value; - }, - get attrs() { - return attrs; - }, - }); - - return (newValue: any, newAttrs?: Attributes) => { - value = newValue; - attrs = newAttrs; - }; +@Injectable() +export class Metrics implements OnModuleDestroy { + onModuleDestroy(): void { + register.clear(); } - const metrics = { - socketIOConnectionGauge: createGauge('socket_io_connection'), - - gqlRequest: meter.createCounter('gql_request'), - gqlError: meter.createCounter('gql_error'), - gqlTimer: meter.createHistogram('gql_timer'), - - jwstCodecMerge: meter.createCounter('jwst_codec_merge'), - jwstCodecDidnotMatch: meter.createCounter('jwst_codec_didnot_match'), - jwstCodecFail: meter.createCounter('jwst_codec_fail'), - - authCounter: meter.createCounter('auth'), - authFailCounter: meter.createCounter('auth_fail'), - - docHistoryCounter: meter.createCounter('doc_history_created'), - docRecoverCounter: meter.createCounter('doc_history_recovered'), - }; - - meter.addBatchObservableCallback( - result => { - asyncMetrics.forEach(metric => { - result.observe(metric.ob, metric.value, metric.attrs); - }); - }, - asyncMetrics.map(({ ob }) => ob) + socketIOEventCounter = metricsCreator.counter('socket_io_counter', ['event']); + socketIOEventTimer = metricsCreator.timer('socket_io_timer', ['event']); + socketIOConnectionGauge = metricsCreator.gauge( + 'socket_io_connection_counter' ); - return metrics; -} + gqlRequest = metricsCreator.counter('gql_request', ['operation']); + gqlError = metricsCreator.counter('gql_error', ['operation']); + gqlTimer = metricsCreator.timer('gql_timer', ['operation']); -export function registerBusinessMetrics() { - if (!_metrics) { - _metrics = createBusinessMetrics(); - } + jwstCodecMerge = metricsCreator.counter('jwst_codec_merge'); + jwstCodecDidnotMatch = metricsCreator.counter('jwst_codec_didnot_match'); + jwstCodecFail = metricsCreator.counter('jwst_codec_fail'); - return _metrics; + authCounter = metricsCreator.counter('auth'); + authFailCounter = metricsCreator.counter('auth_fail', ['reason']); + + docHistoryCounter = metricsCreator.counter('doc_history_created'); + docRecoverCounter = metricsCreator.counter('doc_history_recovered'); } -export const metrics = registerBusinessMetrics; diff --git a/packages/backend/server/src/metrics/opentelemetry.ts b/packages/backend/server/src/metrics/opentelemetry.ts deleted file mode 100644 index 77c6d55b3c..0000000000 --- a/packages/backend/server/src/metrics/opentelemetry.ts +++ /dev/null @@ -1,127 +0,0 @@ -import { MetricExporter } from '@google-cloud/opentelemetry-cloud-monitoring-exporter'; -import { TraceExporter } from '@google-cloud/opentelemetry-cloud-trace-exporter'; -import { - CompositePropagator, - W3CBaggagePropagator, - W3CTraceContextPropagator, -} from '@opentelemetry/core'; -import { PrometheusExporter } from '@opentelemetry/exporter-prometheus'; -import { ZipkinExporter } from '@opentelemetry/exporter-zipkin'; -import { HostMetrics } from '@opentelemetry/host-metrics'; -import { Instrumentation } from '@opentelemetry/instrumentation'; -import { GraphQLInstrumentation } from '@opentelemetry/instrumentation-graphql'; -import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'; -import { IORedisInstrumentation } from '@opentelemetry/instrumentation-ioredis'; -import { NestInstrumentation } from '@opentelemetry/instrumentation-nestjs-core'; -import { SocketIoInstrumentation } from '@opentelemetry/instrumentation-socket.io'; -import { - ConsoleMetricExporter, - MetricReader, - PeriodicExportingMetricReader, -} from '@opentelemetry/sdk-metrics'; -import { NodeSDK } from '@opentelemetry/sdk-node'; -import { - BatchSpanProcessor, - ConsoleSpanExporter, - SpanExporter, -} from '@opentelemetry/sdk-trace-node'; -import { PrismaInstrumentation } from '@prisma/instrumentation'; - -import { registerBusinessMetrics } from './metrics'; - -abstract class OpentelemetryFactor { - abstract getMetricReader(): MetricReader; - abstract getSpanExporter(): SpanExporter; - - getInstractions(): Instrumentation[] { - return [ - new NestInstrumentation(), - new IORedisInstrumentation(), - new SocketIoInstrumentation({ traceReserved: true }), - new GraphQLInstrumentation({ mergeItems: true }), - new HttpInstrumentation(), - new PrismaInstrumentation(), - ]; - } - - create() { - const traceExporter = this.getSpanExporter(); - return new NodeSDK({ - traceExporter, - metricReader: this.getMetricReader(), - spanProcessor: new BatchSpanProcessor(traceExporter), - textMapPropagator: new CompositePropagator({ - propagators: [ - new W3CBaggagePropagator(), - new W3CTraceContextPropagator(), - ], - }), - instrumentations: this.getInstractions(), - serviceName: 'affine-cloud', - }); - } -} - -class GCloudOpentelemetryFactor extends OpentelemetryFactor { - override getMetricReader(): MetricReader { - return new PeriodicExportingMetricReader({ - exportIntervalMillis: 30000, - exportTimeoutMillis: 60000, - exporter: new MetricExporter(), - }); - } - - override getSpanExporter(): SpanExporter { - return new TraceExporter(); - } -} - -class LocalOpentelemetryFactor extends OpentelemetryFactor { - override getMetricReader(): MetricReader { - return new PrometheusExporter(); - } - - override getSpanExporter(): SpanExporter { - return new ZipkinExporter(); - } -} - -class DebugOpentelemetryFactor extends OpentelemetryFactor { - override getMetricReader(): MetricReader { - return new PeriodicExportingMetricReader({ - exporter: new ConsoleMetricExporter(), - }); - } - - override getSpanExporter(): SpanExporter { - return new ConsoleSpanExporter(); - } -} - -function createSDK() { - let factor: OpentelemetryFactor | null = null; - if (process.env.NODE_ENV === 'production') { - factor = new GCloudOpentelemetryFactor(); - } else if (process.env.DEBUG_METRICS) { - factor = new DebugOpentelemetryFactor(); - } else { - factor = new LocalOpentelemetryFactor(); - } - - return factor?.create(); -} - -function registerCustomMetrics() { - const host = new HostMetrics({ name: 'instance-host-metrics' }); - host.start(); -} - -export function start() { - const sdk = createSDK(); - - if (sdk) { - sdk.start(); - registerCustomMetrics(); - registerBusinessMetrics(); - } -} diff --git a/packages/backend/server/src/metrics/utils.ts b/packages/backend/server/src/metrics/utils.ts index 1c4fa313ff..a4ee1ab86a 100644 --- a/packages/backend/server/src/metrics/utils.ts +++ b/packages/backend/server/src/metrics/utils.ts @@ -1,11 +1,99 @@ -import { Attributes } from '@opentelemetry/api'; +import { Counter, Gauge, register, Summary } from 'prom-client'; -import { getMeter } from './metrics'; +function getOr(name: string, or: () => T): T { + return (register.getSingleMetric(name) as T) || or(); +} + +type LabelValues = Partial>; +type MetricsCreator = ( + value: number, + labels: LabelValues +) => void; +type TimerMetricsCreator = ( + labels: LabelValues +) => () => number; + +export const metricsCreatorGenerator = () => { + const counterCreator = ( + name: string, + labelNames?: T[] + ): MetricsCreator => { + const counter = getOr( + name, + () => + new Counter({ + name, + help: name, + ...(labelNames ? { labelNames } : {}), + }) + ); + + return (value: number, labels: LabelValues) => { + counter.inc(labels, value); + }; + }; + + const gaugeCreator = ( + name: string, + labelNames?: T[] + ): MetricsCreator => { + const gauge = getOr( + name, + () => + new Gauge({ + name, + help: name, + ...(labelNames ? { labelNames } : {}), + }) + ); + + return (value: number, labels: LabelValues) => { + gauge.set(labels, value); + }; + }; + + const timerCreator = ( + name: string, + labelNames?: T[] + ): TimerMetricsCreator => { + const summary = getOr( + name, + () => + new Summary({ + name, + help: name, + ...(labelNames ? { labelNames } : {}), + }) + ); + + return (labels: LabelValues) => { + const now = process.hrtime(); + + return () => { + const delta = process.hrtime(now); + const value = delta[0] + delta[1] / 1e9; + + summary.observe(labels, value); + return value; + }; + }; + }; + + return { + counter: counterCreator, + gauge: gaugeCreator, + timer: timerCreator, + }; +}; + +export const metricsCreator = metricsCreatorGenerator(); export const CallTimer = ( name: string, - attrs?: Attributes + labels: Record = {} ): MethodDecorator => { + const timer = metricsCreator.timer(name, Object.keys(labels)); + // @ts-expect-error allow return ( _target, @@ -18,27 +106,19 @@ export const CallTimer = ( } desc.value = function (...args: any[]) { - const timer = getMeter().createHistogram(name, { - description: `function call time costs of ${name}`, - }); - const start = Date.now(); - - const end = () => { - timer.record(Date.now() - start, attrs); - }; - + const endTimer = timer(labels); let result: any; try { result = originalMethod.apply(this, args); } catch (e) { - end(); + endTimer(); throw e; } if (result instanceof Promise) { - return result.finally(end); + return result.finally(endTimer); } else { - end(); + endTimer(); return result; } }; @@ -49,8 +129,10 @@ export const CallTimer = ( export const CallCounter = ( name: string, - attrs?: Attributes + labels: Record = {} ): MethodDecorator => { + const count = metricsCreator.counter(name, Object.keys(labels)); + // @ts-expect-error allow return ( _target, @@ -63,11 +145,7 @@ export const CallCounter = ( } desc.value = function (...args: any[]) { - const count = getMeter().createCounter(name, { - description: `function call counter of ${name}`, - }); - - count.add(1, attrs); + count(1, labels); return originalMethod.apply(this, args); }; diff --git a/packages/backend/server/src/modules/auth/next-auth.controller.ts b/packages/backend/server/src/modules/auth/next-auth.controller.ts index f8d8faa043..1df2205e5b 100644 --- a/packages/backend/server/src/modules/auth/next-auth.controller.ts +++ b/packages/backend/server/src/modules/auth/next-auth.controller.ts @@ -23,7 +23,7 @@ import type { AuthAction, CookieOption, NextAuthOptions } from 'next-auth'; import { AuthHandler } from 'next-auth/core'; import { Config } from '../../config'; -import { metrics } from '../../metrics'; +import { Metrics } from '../../metrics/metrics'; import { PrismaService } from '../../prisma/service'; import { SessionService } from '../../session'; import { AuthThrottlerGuard, Throttle } from '../../throttler'; @@ -46,6 +46,7 @@ export class NextAuthController { private readonly authService: AuthService, @Inject(NextAuthOptionsProvide) private readonly nextAuthOptions: NextAuthOptions, + private readonly metrics: Metrics, private readonly session: SessionService ) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion @@ -89,7 +90,7 @@ export class NextAuthController { res.redirect(`/signin${query}`); return; } - metrics().authCounter.add(1); + this.metrics.authCounter(1, {}); const [action, providerId] = req.url // start with request url .slice(BASE_URL.length) // make relative to baseUrl .replace(/\?.*/, '') // remove query part, use only path part @@ -126,7 +127,7 @@ export class NextAuthController { const options = this.nextAuthOptions; if (req.method === 'POST' && action === 'session') { if (typeof req.body !== 'object' || typeof req.body.data !== 'object') { - metrics().authFailCounter.add(1, { reason: 'invalid_session_data' }); + this.metrics.authFailCounter(1, { reason: 'invalid_session_data' }); throw new BadRequestException(`Invalid new session data`); } const user = await this.updateSession(req, req.body.data); @@ -209,7 +210,7 @@ export class NextAuthController { if (redirect?.endsWith('api/auth/error?error=AccessDenied')) { this.logger.log(`Early access redirect headers: ${req.headers}`); - metrics().authFailCounter.add(1, { + this.metrics.authFailCounter(1, { reason: 'no_early_access_permission', }); if ( diff --git a/packages/backend/server/src/modules/doc/history.ts b/packages/backend/server/src/modules/doc/history.ts index ddb869d18e..61ae73e55d 100644 --- a/packages/backend/server/src/modules/doc/history.ts +++ b/packages/backend/server/src/modules/doc/history.ts @@ -6,7 +6,7 @@ import { Cron, CronExpression } from '@nestjs/schedule'; import type { Snapshot } from '@prisma/client'; import { Config } from '../../config'; -import { metrics } from '../../metrics'; +import { Metrics } from '../../metrics'; import { PrismaService } from '../../prisma'; import { SubscriptionStatus } from '../payment/service'; import { Permission } from '../workspaces/types'; @@ -16,7 +16,8 @@ export class DocHistoryManager { private readonly logger = new Logger(DocHistoryManager.name); constructor( private readonly config: Config, - private readonly db: PrismaService + private readonly db: PrismaService, + private readonly metrics: Metrics ) {} @OnEvent('doc:manager:snapshot:beforeUpdate') @@ -68,7 +69,7 @@ export class DocHistoryManager { // safe to ignore // only happens when duplicated history record created in multi processes }); - metrics().docHistoryCounter.add(1, {}); + this.metrics.docHistoryCounter(1, {}); this.logger.log( `History created for ${snapshot.id} in workspace ${snapshot.workspaceId}.` ); @@ -182,7 +183,7 @@ export class DocHistoryManager { // which is not the solution in CRDT. // let user revert in client and update the data in sync system // `await this.db.snapshot.update();` - metrics().docRecoverCounter.add(1, {}); + this.metrics.docRecoverCounter(1, {}); return history.timestamp; } diff --git a/packages/backend/server/src/modules/doc/manager.ts b/packages/backend/server/src/modules/doc/manager.ts index e531f6057e..154546da33 100644 --- a/packages/backend/server/src/modules/doc/manager.ts +++ b/packages/backend/server/src/modules/doc/manager.ts @@ -19,7 +19,7 @@ import { import { Cache } from '../../cache'; import { Config } from '../../config'; -import { metrics } from '../../metrics/metrics'; +import { Metrics } from '../../metrics/metrics'; import { PrismaService } from '../../prisma'; import { mergeUpdatesInApplyWay as jwstMergeUpdates } from '../../storage'; @@ -70,6 +70,7 @@ export class DocManager implements OnModuleInit, OnModuleDestroy { private readonly automation: boolean, private readonly db: PrismaService, private readonly config: Config, + private readonly metrics: Metrics, private readonly cache: Cache, private readonly event: EventEmitter2 ) {} @@ -125,13 +126,13 @@ export class DocManager implements OnModuleInit, OnModuleDestroy { this.config.doc.manager.experimentalMergeWithJwstCodec && updates.length < 100 /* avoid overloading */ ) { - metrics().jwstCodecMerge.add(1); + this.metrics.jwstCodecMerge(1, {}); const yjsResult = Buffer.from(encodeStateAsUpdate(doc)); let log = false; try { const jwstResult = jwstMergeUpdates(updates); if (!compare(yjsResult, jwstResult)) { - metrics().jwstCodecDidnotMatch.add(1); + this.metrics.jwstCodecDidnotMatch(1, {}); this.logger.warn( `jwst codec result doesn't match yjs codec result for: ${guid}` ); @@ -142,7 +143,7 @@ export class DocManager implements OnModuleInit, OnModuleDestroy { } } } catch (e) { - metrics().jwstCodecFail.add(1); + this.metrics.jwstCodecFail(1, {}); this.logger.warn(`jwst apply update failed for ${guid}: ${e}`); log = true; } finally { diff --git a/packages/backend/server/src/modules/sync/events/events.gateway.ts b/packages/backend/server/src/modules/sync/events/events.gateway.ts index 2921411860..6471c91afe 100644 --- a/packages/backend/server/src/modules/sync/events/events.gateway.ts +++ b/packages/backend/server/src/modules/sync/events/events.gateway.ts @@ -11,8 +11,8 @@ import { import { Server, Socket } from 'socket.io'; import { encodeStateAsUpdate, encodeStateVector } from 'yjs'; -import { metrics } from '../../../metrics'; -import { CallTimer } from '../../../metrics/utils'; +import { Metrics } from '../../../metrics/metrics'; +import { CallCounter, CallTimer } from '../../../metrics/utils'; import { DocID } from '../../../utils/doc'; import { Auth, CurrentUser } from '../../auth'; import { DocManager } from '../../doc'; @@ -68,7 +68,8 @@ export const GatewayErrorWrapper = (): MethodDecorator => { const SubscribeMessage = (event: string) => applyDecorators( GatewayErrorWrapper(), - CallTimer('socket_io_event_duration', { event }), + CallCounter('socket_io_counter', { event }), + CallTimer('socket_io_timer', { event }), RawSubscribeMessage(event) ); @@ -96,6 +97,7 @@ export class EventsGateway implements OnGatewayConnection, OnGatewayDisconnect { constructor( private readonly docManager: DocManager, + private readonly metric: Metrics, private readonly permissions: PermissionService ) {} @@ -104,12 +106,12 @@ export class EventsGateway implements OnGatewayConnection, OnGatewayDisconnect { handleConnection() { this.connectionCount++; - metrics().socketIOConnectionGauge(this.connectionCount); + this.metric.socketIOConnectionGauge(this.connectionCount, {}); } handleDisconnect() { this.connectionCount--; - metrics().socketIOConnectionGauge(this.connectionCount); + this.metric.socketIOConnectionGauge(this.connectionCount, {}); } @Auth() diff --git a/packages/backend/server/src/modules/workspaces/controller.ts b/packages/backend/server/src/modules/workspaces/controller.ts index 2d20272f94..6af8867d5a 100644 --- a/packages/backend/server/src/modules/workspaces/controller.ts +++ b/packages/backend/server/src/modules/workspaces/controller.ts @@ -4,13 +4,14 @@ import { ForbiddenException, Get, Inject, + Logger, NotFoundException, Param, Res, } from '@nestjs/common'; import type { Response } from 'express'; +import format from 'pretty-time'; -import { CallTimer } from '../../metrics'; import { PrismaService } from '../../prisma'; import { StorageProvide } from '../../storage'; import { DocID } from '../../utils/doc'; @@ -22,6 +23,8 @@ import { Permission } from './types'; @Controller('/api/workspaces') export class WorkspacesController { + private readonly logger = new Logger('WorkspacesController'); + constructor( @Inject(StorageProvide) private readonly storage: Storage, private readonly permission: PermissionService, @@ -34,7 +37,6 @@ export class WorkspacesController { // // NOTE: because graphql can't represent a File, so we have to use REST API to get blob @Get('/:id/blobs/:name') - @CallTimer('doc_controller', { method: 'get_blob' }) async blob( @Param('id') workspaceId: string, @Param('name') name: string, @@ -59,13 +61,13 @@ export class WorkspacesController { @Get('/:id/docs/:guid') @Auth() @Publicable() - @CallTimer('doc_controller', { method: 'get_doc' }) async doc( @CurrentUser() user: UserType | undefined, @Param('id') ws: string, @Param('guid') guid: string, @Res() res: Response ) { + const start = process.hrtime(); const docId = new DocID(guid, ws); if ( // if a user has the permission @@ -102,11 +104,11 @@ export class WorkspacesController { res.setHeader('content-type', 'application/octet-stream'); res.send(update); + this.logger.debug(`workspaces doc api: ${format(process.hrtime(start))}`); } @Get('/:id/docs/:guid/histories/:timestamp') @Auth() - @CallTimer('doc_controller', { method: 'get_history' }) async history( @CurrentUser() user: UserType, @Param('id') ws: string, diff --git a/packages/backend/server/tests/auth.spec.ts b/packages/backend/server/tests/auth.spec.ts index c60f46d8af..fa06c852a6 100644 --- a/packages/backend/server/tests/auth.spec.ts +++ b/packages/backend/server/tests/auth.spec.ts @@ -5,6 +5,7 @@ import test from 'ava'; import { ConfigModule } from '../src/config'; import { GqlModule } from '../src/graphql.module'; +import { MetricsModule } from '../src/metrics'; import { AuthModule } from '../src/modules/auth'; import { AuthResolver } from '../src/modules/auth/resolver'; import { AuthService } from '../src/modules/auth/service'; @@ -39,6 +40,7 @@ test.beforeEach(async () => { PrismaModule, GqlModule, AuthModule, + MetricsModule, RateLimiterModule, ], }).compile(); diff --git a/packages/backend/server/tests/doc.spec.ts b/packages/backend/server/tests/doc.spec.ts index 83147b96fe..9a08909691 100644 --- a/packages/backend/server/tests/doc.spec.ts +++ b/packages/backend/server/tests/doc.spec.ts @@ -10,6 +10,7 @@ import { Doc as YDoc, encodeStateAsUpdate } from 'yjs'; import { CacheModule } from '../src/cache'; import { Config, ConfigModule } from '../src/config'; +import { MetricsModule } from '../src/metrics'; import { DocManager, DocModule } from '../src/modules/doc'; import { PrismaModule, PrismaService } from '../src/prisma'; import { flushDB } from './utils'; @@ -18,6 +19,7 @@ const createModule = () => { return Test.createTestingModule({ imports: [ PrismaModule, + MetricsModule, CacheModule, EventEmitterModule.forRoot(), ConfigModule.forRoot(), diff --git a/packages/backend/server/tests/history.spec.ts b/packages/backend/server/tests/history.spec.ts index 103a7d231a..5555206a3b 100644 --- a/packages/backend/server/tests/history.spec.ts +++ b/packages/backend/server/tests/history.spec.ts @@ -6,6 +6,7 @@ import test from 'ava'; import * as Sinon from 'sinon'; import { ConfigModule } from '../src/config'; +import { MetricsModule } from '../src/metrics'; import { DocHistoryManager } from '../src/modules/doc'; import { PrismaModule, PrismaService } from '../src/prisma'; import { flushDB } from './utils'; @@ -19,7 +20,12 @@ let db: PrismaService; test.beforeEach(async () => { await flushDB(); m = await Test.createTestingModule({ - imports: [PrismaModule, ScheduleModule.forRoot(), ConfigModule.forRoot()], + imports: [ + PrismaModule, + MetricsModule, + ScheduleModule.forRoot(), + ConfigModule.forRoot(), + ], providers: [DocHistoryManager], }).compile(); diff --git a/packages/backend/server/tests/mailer.e2e.ts b/packages/backend/server/tests/mailer.e2e.ts index 10b2193252..f069fcb21b 100644 --- a/packages/backend/server/tests/mailer.e2e.ts +++ b/packages/backend/server/tests/mailer.e2e.ts @@ -12,6 +12,7 @@ import ava, { type TestFn } from 'ava'; import { ConfigModule } from '../src/config'; import { GqlModule } from '../src/graphql.module'; +import { MetricsModule } from '../src/metrics'; import { AuthModule } from '../src/modules/auth'; import { AuthService } from '../src/modules/auth/service'; import { PrismaModule } from '../src/prisma'; @@ -43,6 +44,7 @@ test.beforeEach(async t => { PrismaModule, GqlModule, AuthModule, + MetricsModule, RateLimiterModule, ], }).compile(); diff --git a/packages/backend/server/tests/prometheus-metrics.spec.ts b/packages/backend/server/tests/prometheus-metrics.spec.ts new file mode 100644 index 0000000000..af0d6dd4a5 --- /dev/null +++ b/packages/backend/server/tests/prometheus-metrics.spec.ts @@ -0,0 +1,78 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import test from 'ava'; +import { register } from 'prom-client'; + +import { MetricsModule } from '../src/metrics'; +import { Metrics } from '../src/metrics/metrics'; +import { PrismaModule } from '../src/prisma'; + +let metrics: Metrics; +let module: TestingModule; + +test.beforeEach(async () => { + module = await Test.createTestingModule({ + imports: [MetricsModule, PrismaModule], + }).compile(); + + metrics = module.get(Metrics); +}); + +test.afterEach.always(async () => { + await module.close(); +}); + +test('should be able to increment counter', async t => { + metrics.socketIOEventCounter(1, { event: 'client-handshake' }); + const socketIOCounterMetric = register.getSingleMetric('socket_io_counter'); + t.truthy(socketIOCounterMetric); + + t.truthy( + JSON.stringify((await socketIOCounterMetric!.get()).values) === + '[{"value":1,"labels":{"event":"client-handshake"}}]' + ); + t.pass(); +}); + +test('should be able to timer', async t => { + let minimum: number; + { + const endTimer = metrics.socketIOEventTimer({ event: 'client-handshake' }); + const a = performance.now(); + await new Promise(resolve => setTimeout(resolve, 50)); + const b = performance.now(); + minimum = b - a; + endTimer(); + } + + let maximum: number; + { + const a = performance.now(); + const endTimer = metrics.socketIOEventTimer({ event: 'client-handshake' }); + await new Promise(resolve => setTimeout(resolve, 100)); + endTimer(); + const b = performance.now(); + maximum = b - a; + } + + const socketIOTimerMetric = register.getSingleMetric('socket_io_timer'); + t.truthy(socketIOTimerMetric); + + const observations = (await socketIOTimerMetric!.get()).values; + + for (const observation of observations) { + if ( + observation.labels.event === 'client-handshake' && + 'quantile' in observation.labels + ) { + t.truthy( + observation.value >= minimum / 1000, + 'observation.value should be greater than minimum' + ); + t.truthy( + observation.value <= maximum / 1000, + 'observation.value should be less than maximum' + ); + } + } + t.pass(); +}); diff --git a/yarn.lock b/yarn.lock index 7ecfc155b2..24af8189b4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -726,9 +726,6 @@ __metadata: "@node-rs/jsonwebtoken": "npm:^0.2.3" "@opentelemetry/api": "npm:^1.7.0" "@opentelemetry/core": "npm:^1.18.1" - "@opentelemetry/exporter-prometheus": "npm:^0.45.1" - "@opentelemetry/exporter-zipkin": "npm:^1.18.1" - "@opentelemetry/host-metrics": "npm:^0.33.2" "@opentelemetry/instrumentation": "npm:^0.45.1" "@opentelemetry/instrumentation-graphql": "npm:^0.36.0" "@opentelemetry/instrumentation-http": "npm:^0.45.1" @@ -8742,19 +8739,6 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/exporter-prometheus@npm:^0.45.1": - version: 0.45.1 - resolution: "@opentelemetry/exporter-prometheus@npm:0.45.1" - dependencies: - "@opentelemetry/core": "npm:1.18.1" - "@opentelemetry/resources": "npm:1.18.1" - "@opentelemetry/sdk-metrics": "npm:1.18.1" - peerDependencies: - "@opentelemetry/api": ^1.3.0 - checksum: 0d9999eca90b407de1e496e00bc9d991c5bb2ccf069c53a0de9054457170426efdce833e958c53273b22957dcbfa064a5559f8c9358114dfa48c873799d4fc27 - languageName: node - linkType: hard - "@opentelemetry/exporter-trace-otlp-grpc@npm:0.45.1": version: 0.45.1 resolution: "@opentelemetry/exporter-trace-otlp-grpc@npm:0.45.1" @@ -8802,7 +8786,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/exporter-zipkin@npm:1.18.1, @opentelemetry/exporter-zipkin@npm:^1.18.1": +"@opentelemetry/exporter-zipkin@npm:1.18.1": version: 1.18.1 resolution: "@opentelemetry/exporter-zipkin@npm:1.18.1" dependencies: @@ -8816,18 +8800,6 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/host-metrics@npm:^0.33.2": - version: 0.33.2 - resolution: "@opentelemetry/host-metrics@npm:0.33.2" - dependencies: - "@opentelemetry/sdk-metrics": "npm:^1.8.0" - systeminformation: "npm:^5.0.0" - peerDependencies: - "@opentelemetry/api": ^1.3.0 - checksum: 35140ecb2c4e97ab49873411a9e81a23238cf392112212f936f238091a6572c0802d6364ccea428cb7f0595cb28f4cbafbe6fa5c23510ebb1243cf7f398871e5 - languageName: node - linkType: hard - "@opentelemetry/instrumentation-graphql@npm:^0.36.0": version: 0.36.0 resolution: "@opentelemetry/instrumentation-graphql@npm:0.36.0" @@ -9041,7 +9013,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/sdk-metrics@npm:1.18.1, @opentelemetry/sdk-metrics@npm:^1.18.1, @opentelemetry/sdk-metrics@npm:^1.8.0": +"@opentelemetry/sdk-metrics@npm:1.18.1, @opentelemetry/sdk-metrics@npm:^1.18.1": version: 1.18.1 resolution: "@opentelemetry/sdk-metrics@npm:1.18.1" dependencies: @@ -33482,16 +33454,6 @@ __metadata: languageName: node linkType: hard -"systeminformation@npm:^5.0.0": - version: 5.21.17 - resolution: "systeminformation@npm:5.21.17" - bin: - systeminformation: lib/cli.js - checksum: bc910730829f1b232a1ca824bb35dfb211489430a1a612fe9e0d27fd3cb73c0793a00cbff7512c782cac2f2dfd2416a7e698b410f6dd5b4f66cbf5ae458c9cab - conditions: (os=darwin | os=linux | os=win32 | os=freebsd | os=openbsd | os=netbsd | os=sunos | os=android) - languageName: node - linkType: hard - "tabbable@npm:^6.0.1": version: 6.2.0 resolution: "tabbable@npm:6.2.0"