pro/server: add monitoring metrics for OTLP traces export

re. https://hasurahq.atlassian.net/browse/INFRA-832

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/10080
Co-authored-by: Toan Nguyen  <1615675+hgiasac@users.noreply.github.com>
GitOrigin-RevId: 8de1c0a1703037cc9955da01f2593c2db15dc189
This commit is contained in:
Brandon Simmons 2023-08-10 06:33:39 -04:00 committed by hasura-bot
parent 75f0629c5d
commit f6bbda77eb
2 changed files with 55 additions and 1 deletions

View File

@ -430,3 +430,30 @@ Health check status of a particular data source, corresponding to the output of
| Name | `hasura_source_health` |
| Type | Gauge |
| Labels | `source_name`: name of the database |
### OpenTelemetry OTLP Export Metrics
These metrics allow for monitoring the reliability and performance of OTLP
exports of telemetry data.
#### Hasura OTLP Sent Spans
Total number of successfully exported trace spans.
| | |
| ------ | -------------------------------------------------------------- |
| Name | `hasura_otel_sent_spans` |
| Type | Counter |
| Labels | none |
#### Hasura OTLP Dropped Spans
Total number of trace spans dropped due to either high trace volume that filled
the buffer, or errors during send (e.g. a timeout or error response from the
collector).
| | |
| ------ | -------------------------------------------------------------- |
| Name | `hasura_otel_dropped_spans` |
| Type | Counter |
| Labels | `reason`: buffer_full \| send_failed |

View File

@ -8,6 +8,7 @@ module Hasura.Server.Prometheus
GraphQLRequestMetrics (..),
EventTriggerMetrics (..),
CacheRequestMetrics (..),
OpenTelemetryMetrics (..),
makeDummyPrometheusMetrics,
ConnectionsGauge,
Connections (..),
@ -77,7 +78,8 @@ data PrometheusMetrics = PrometheusMetrics
pmSubscriptionMetrics :: SubscriptionMetrics,
pmWebsocketMsgQueueTimeSeconds :: Histogram,
pmWebsocketMsgWriteTimeSeconds :: Histogram,
pmCacheRequestMetrics :: CacheRequestMetrics
pmCacheRequestMetrics :: CacheRequestMetrics,
pmOpenTelemetryMetrics :: OpenTelemetryMetrics
}
data GraphQLRequestMetrics = GraphQLRequestMetrics
@ -133,6 +135,20 @@ data CacheRequestMetrics = CacheRequestMetrics
crmCacheMisses :: Counter
}
-- | Metrics related to OTel telemetry export; for now the volume of logs and
-- trace spans shipped ('otmSentSpans', 'otmSentLogs'), and counts of log lines
-- and spans that were dropped. Drops are counted separately by cause: the send
-- buffer being full (high volume), or an error when sending to the collector.
data OpenTelemetryMetrics = OpenTelemetryMetrics
  { otmSentSpans :: Counter,
    -- | Dropped due to the send buffer being full
    otmDroppedSpansInBuffer :: Counter,
    -- | Dropped due to some error (after retrying) when sending to collector
    otmDroppedSpansInSend :: Counter,
    -- | Log lines shipped; judging by the name, the log counterpart of
    -- 'otmSentSpans'.
    otmSentLogs :: Counter,
    -- | Presumably the log counterpart of 'otmDroppedSpansInBuffer' (inferred
    -- from the name; confirm at the call site).
    otmDroppedLogsInBuffer :: Counter,
    -- | Presumably the log counterpart of 'otmDroppedSpansInSend' (inferred
    -- from the name; confirm at the call site).
    otmDroppedLogsInSend :: Counter
  }
-- | Create dummy mutable references without associating them to a metrics
-- store.
makeDummyPrometheusMetrics :: IO PrometheusMetrics
@ -149,6 +165,7 @@ makeDummyPrometheusMetrics = do
pmWebsocketMsgQueueTimeSeconds <- Histogram.new []
pmWebsocketMsgWriteTimeSeconds <- Histogram.new []
pmCacheRequestMetrics <- makeDummyCacheRequestMetrics
pmOpenTelemetryMetrics <- makeDummyOpenTelemetryMetrics
pure PrometheusMetrics {..}
makeDummyGraphQLRequestMetrics :: IO GraphQLRequestMetrics
@ -209,6 +226,16 @@ makeDummyCacheRequestMetrics = do
crmCacheMisses <- Counter.new
pure CacheRequestMetrics {..}
-- | Build an 'OpenTelemetryMetrics' record in which every field is its own
-- freshly created counter. Used by 'makeDummyPrometheusMetrics'.
makeDummyOpenTelemetryMetrics :: IO OpenTelemetryMetrics
makeDummyOpenTelemetryMetrics = do
  -- Span counters.
  sentSpans <- Counter.new
  droppedSpansInSend <- Counter.new
  droppedSpansInBuffer <- Counter.new
  -- Log counters.
  sentLogs <- Counter.new
  droppedLogsInSend <- Counter.new
  droppedLogsInBuffer <- Counter.new
  pure
    OpenTelemetryMetrics
      { otmSentSpans = sentSpans,
        otmDroppedSpansInBuffer = droppedSpansInBuffer,
        otmDroppedSpansInSend = droppedSpansInSend,
        otmSentLogs = sentLogs,
        otmDroppedLogsInBuffer = droppedLogsInBuffer,
        otmDroppedLogsInSend = droppedLogsInSend
      }
--------------------------------------------------------------------------------
-- | A mutable reference for atomically sampling the number of websocket