server: add dynamic labels trigger_name and source_name to existing event trigger metrics

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/9265
GitOrigin-RevId: 6fb6504f1a476ea6c8b810e067770920757e8dc6
This commit is contained in:
Krushan Bauva 2023-05-24 18:52:06 +05:30 committed by hasura-bot
parent 6d27ad97ae
commit e3df24507d
3 changed files with 131 additions and 39 deletions

View File

@ -142,21 +142,21 @@ consider looking into the performance of your database.
Total number of events invoked. Represents the Event Trigger webhook HTTP requests made.
| | |
| ------ | -------------------------------- |
| Name | `hasura_event_invocations_total` |
| Type | Counter |
| Labels | `status`: success \| failed |
| | |
| ------ | ---------------------------------------------------------- |
| Name | `hasura_event_invocations_total` |
| Type | Counter |
| Labels | `status`: success \| failed, `source_name`, `trigger_name` |
### Hasura event processed total
Total number of events processed. Represents the Event Trigger egress.
| | |
| ------ | ------------------------------ |
| Name | `hasura_event_processed_total` |
| Type | Counter |
| Labels | `status`: success \| failed |
| | |
| ------ | ---------------------------------------------------------- |
| Name | `hasura_event_processed_total` |
| Type | Counter |
| Labels | `status`: success \| failed, `source_name`, `trigger_name` |
### Hasura event processing time
@ -167,7 +167,7 @@ This metric can be considered as the end-to-end processing time for an event.
| ------ | --------------------------------------------------------------------- |
| Name | `hasura_event_processing_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |
### Hasura event queue time
@ -180,7 +180,7 @@ server.
| ------ | --------------------------------------------------------------------- |
| Name | `hasura_event_queue_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |
### Hasura event trigger HTTP workers
@ -203,7 +203,7 @@ processing time indicates slow webhook, you should try to optimize the event web
| ------ | ------------------------------------------------------------ |
| Name | `hasura_event_webhook_processing_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |
### Hasura events fetched per batch

View File

@ -89,6 +89,8 @@ import Refined.Unsafe (unsafeRefine)
import System.Metrics.Distribution qualified as EKG.Distribution
import System.Metrics.Gauge qualified as EKG.Gauge
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter
import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge
import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram
import System.Timeout.Lifted (timeout)
@ -472,7 +474,13 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
eventProcessTime <- liftIO getCurrentTime
let eventQueueTime = realToFrac $ diffUTCTime eventProcessTime eventFetchedTime
_ <- liftIO $ EKG.Distribution.add (smEventQueueTime serverMetrics) eventQueueTime
liftIO $ Prometheus.Histogram.observe (eventQueueTimeSeconds eventTriggerMetrics) eventQueueTime
liftIO $
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventQueueTimeSeconds eventTriggerMetrics)
(DynamicEventTriggerLabel (tmName (eTrigger e)) sourceName)
eventQueueTime
cache <- liftIO getSchemaCache
@ -566,16 +574,39 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
-- `eventStartTime`) used here in calculation are all UTC time.
eventStartTime = fromMaybe (eCreatedAtUTC e) (eRetryAtUTC e)
eventProcessingTime' = realToFrac $ diffUTCTime eventExecutionFinishTime eventStartTime
observeHistogramWithLabel getPrometheusMetricsGranularity True (eventProcessingTime eventTriggerMetrics) (TriggerNameLabel (etiName eti)) eventProcessingTime'
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventProcessingTime eventTriggerMetrics)
(DynamicEventTriggerLabel (etiName eti) sourceName)
eventProcessingTime'
liftIO $ do
EKG.Distribution.add (smEventWebhookProcessingTime serverMetrics) eventWebhookProcessingTime'
Prometheus.Histogram.observe (eventWebhookProcessingTime eventTriggerMetrics) eventWebhookProcessingTime'
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventWebhookProcessingTime eventTriggerMetrics)
(DynamicEventTriggerLabel (etiName eti) sourceName)
eventWebhookProcessingTime'
EKG.Distribution.add (smEventProcessingTime serverMetrics) eventProcessingTime'
Prometheus.Counter.inc (eventProcessedTotalSuccess eventTriggerMetrics)
Prometheus.Counter.inc (eventInvocationTotalSuccess eventTriggerMetrics)
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventProcessedTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventSuccessLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventInvocationTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventSuccessLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
Left eventError -> do
-- TODO (paritosh): We can also add a label to the metric to indicate the type of error
liftIO $ Prometheus.Counter.inc (eventInvocationTotalFailure eventTriggerMetrics)
liftIO $
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventInvocationTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventFailedLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
case eventError of
(HTTPError reqBody err) ->
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion eventTriggerMetrics err >>= flip onLeft logQErr
@ -633,7 +664,8 @@ processSuccess sourceConfig e reqHeaders ep maintenanceModeVersion resp = do
processError ::
forall b m a.
( MonadIO m,
BackendEventTrigger b
BackendEventTrigger b,
MonadGetPolicies m
) =>
SourceConfig b ->
Event b ->
@ -661,13 +693,16 @@ processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion event
recordError @b sourceConfig e invocation retryOrError maintenanceModeVersion
retryOrSetError ::
MonadIO m =>
( MonadIO m,
MonadGetPolicies m
) =>
Event b ->
RetryConf ->
EventTriggerMetrics ->
HTTPErr a ->
m ProcessEventError
retryOrSetError e retryConf eventTriggerMetrics err = do
getPrometheusMetricsGranularity <- runGetPrometheusMetricsGranularity
let mretryHeader = getRetryAfterHeaderFromError err
tries = eTries e
mretryHeaderSeconds = mretryHeader >>= parseRetryHeader
@ -676,7 +711,12 @@ retryOrSetError e retryConf eventTriggerMetrics err = do
-- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1
if triesExhausted && noRetryHeader
then do
liftIO $ Prometheus.Counter.inc (eventProcessedTotalFailure eventTriggerMetrics)
liftIO $
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventProcessedTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventFailedLabel (Just (DynamicEventTriggerLabel (tmName (eTrigger e)) (eSource e))))
pure PESetError
else do
currentTime <- liftIO getCurrentTime
@ -732,3 +772,18 @@ getEventTriggerInfoFromEvent sc e = do
<> "' on table '"
<> table <<> "' not found"
)
-- | Increment an event trigger 'CounterVector' through 'recordMetricWithLabel'.
--
-- Two candidate increments are handed over: one using the label exactly as
-- given (status plus whatever dynamic trigger labels it carries) and one using
-- the same status with the dynamic labels replaced by 'Nothing'. Which of the
-- two actually runs is decided by 'recordMetricWithLabel' from the metric
-- state getter and the 'Bool' flag.
incEventTriggerCounterWithLabel ::
  (MonadIO m) =>
  (IO GranularPrometheusMetricsState) ->
  -- should the metric be observed without a label when granularMetricsState is OFF
  Bool ->
  CounterVector EventStatusWithTriggerLabel ->
  EventStatusWithTriggerLabel ->
  m ()
incEventTriggerCounterWithLabel getMetricState alwaysObserve counterVector (EventStatusWithTriggerLabel status tl) =
  recordMetricWithLabel getMetricState alwaysObserve incWithDynamicLabels incWithStatusOnly
  where
    -- Bump the counter for the given status combined with the supplied
    -- (possibly absent) dynamic labels.
    bump dynamicLabels =
      liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status dynamicLabels)

    incWithDynamicLabels = bump tl
    incWithStatusOnly = bump Nothing

View File

@ -19,7 +19,13 @@ module Hasura.Server.Prometheus
decWebsocketConnections,
ScheduledTriggerMetrics (..),
SubscriptionMetrics (..),
TriggerNameLabel (..),
DynamicEventTriggerLabel (..),
ResponseStatus (..),
responseStatusToLabelValue,
EventStatusLabel (..),
eventSuccessLabel,
eventFailedLabel,
EventStatusWithTriggerLabel (..),
GranularPrometheusMetricsState (..),
observeHistogramWithLabel,
SubscriptionKindLabel (..),
@ -39,12 +45,15 @@ import Data.Int (Int64)
import Hasura.GraphQL.ParameterizedQueryHash
import Hasura.GraphQL.Transport.HTTP.Protocol (OperationName (..))
import Hasura.Prelude
import Hasura.RQL.Types.Common (SourceName, sourceNameToText)
import Hasura.RQL.Types.EventTrigger (TriggerName, triggerNameToTxt)
import Hasura.Server.Types (GranularPrometheusMetricsState (..))
import Language.GraphQL.Draft.Syntax qualified as G
import System.Metrics.Prometheus (ToLabels (..))
import System.Metrics.Prometheus.Counter (Counter)
import System.Metrics.Prometheus.Counter qualified as Counter
import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge (Gauge)
import System.Metrics.Prometheus.Gauge qualified as Gauge
import System.Metrics.Prometheus.GaugeVector qualified as GaugeVector
@ -85,16 +94,14 @@ data GraphQLRequestMetrics = GraphQLRequestMetrics
-- | Handles to the Prometheus metrics recorded for event triggers (gauges,
-- histograms, counters and their label-indexed vector variants).
--
-- NOTE(review): this listing comes from a change set and shows superseded and
-- replacement fields side by side (e.g. 'eventQueueTimeSeconds' appears both
-- as a plain 'Histogram' and as a 'HistogramVector'; the four per-status
-- 'Counter' fields sit next to the two 'CounterVector' fields). Only one of
-- each pair can exist in the compiled record -- confirm against the module.
data EventTriggerMetrics = EventTriggerMetrics
{ eventTriggerHTTPWorkers :: Gauge,
eventsFetchedPerBatch :: Gauge,
eventQueueTimeSeconds :: Histogram,
eventQueueTimeSeconds :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventsFetchTimePerBatch :: Histogram,
eventWebhookProcessingTime :: Histogram,
eventProcessingTime :: HistogramVector (Maybe TriggerNameLabel),
eventWebhookProcessingTime :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventProcessingTime :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventTriggerBytesReceived :: Counter,
eventTriggerBytesSent :: Counter,
eventProcessedTotalSuccess :: Counter,
eventProcessedTotalFailure :: Counter,
eventInvocationTotalSuccess :: Counter,
eventInvocationTotalFailure :: Counter
eventProcessedTotal :: CounterVector EventStatusWithTriggerLabel,
eventInvocationTotal :: CounterVector EventStatusWithTriggerLabel
}
data ScheduledTriggerMetrics = ScheduledTriggerMetrics
@ -159,16 +166,14 @@ makeDummyEventTriggerMetrics :: IO EventTriggerMetrics
-- Creates a fresh metric object for every field of 'EventTriggerMetrics' and
-- gathers the local bindings into the record with the @{..}@ wildcard.
-- Histograms and histogram vectors are created with an empty bucket list.
--
-- NOTE(review): bindings for superseded fields and their replacements appear
-- side by side here (the same name is bound twice in places) because this
-- listing interleaves the old and new versions -- confirm against the module.
makeDummyEventTriggerMetrics = do
eventTriggerHTTPWorkers <- Gauge.new
eventsFetchedPerBatch <- Gauge.new
eventQueueTimeSeconds <- Histogram.new []
eventQueueTimeSeconds <- HistogramVector.new []
eventsFetchTimePerBatch <- Histogram.new []
eventWebhookProcessingTime <- Histogram.new []
eventWebhookProcessingTime <- HistogramVector.new []
eventProcessingTime <- HistogramVector.new []
eventTriggerBytesReceived <- Counter.new
eventTriggerBytesSent <- Counter.new
eventProcessedTotalSuccess <- Counter.new
eventProcessedTotalFailure <- Counter.new
eventInvocationTotalSuccess <- Counter.new
eventInvocationTotalFailure <- Counter.new
eventProcessedTotal <- CounterVector.new
eventInvocationTotal <- CounterVector.new
pure EventTriggerMetrics {..}
makeDummyScheduledTriggerMetrics :: IO ScheduledTriggerMetrics
@ -250,12 +255,44 @@ modifyConnectionsGauge ::
modifyConnectionsGauge f (ConnectionsGauge ref) =
atomicModifyIORef' ref $ \connections -> (f connections, ())
newtype TriggerNameLabel = TriggerNameLabel TriggerName
-- | A trigger name paired with a source name, used as the dynamic part of the
-- labels on event trigger metrics (rendered as @trigger_name@ and
-- @source_name@ by the 'ToLabels' instance for @Maybe DynamicEventTriggerLabel@).
data DynamicEventTriggerLabel = DynamicEventTriggerLabel
  { -- | Value for the @trigger_name@ label.
    _detlTriggerName :: TriggerName,
    -- | Value for the @source_name@ label.
    _detlSourceName :: SourceName
  }
  deriving stock (Eq, Ord)
instance ToLabels (Maybe TriggerNameLabel) where
instance ToLabels (Maybe DynamicEventTriggerLabel) where
toLabels Nothing = Map.empty
toLabels (Just (TriggerNameLabel triggerName)) = Map.singleton "trigger_name" (triggerNameToTxt triggerName)
toLabels (Just (DynamicEventTriggerLabel triggerName sourceName)) = Map.fromList $ [("trigger_name", triggerNameToTxt triggerName), ("source_name", sourceNameToText sourceName)]
-- | Outcome of an operation; turned into a metric label value by
-- 'responseStatusToLabelValue'.
data ResponseStatus
  = Success
  | Failed
-- TODO: Make this a method of a new typeclass of the metrics library

-- | Text used as the label value for a 'ResponseStatus':
-- @"success"@ for 'Success' and @"failed"@ for 'Failed'.
responseStatusToLabelValue :: ResponseStatus -> Text
responseStatusToLabelValue Success = "success"
responseStatusToLabelValue Failed = "failed"
-- | The status component of an event trigger metric label.
--
-- NOTE(review): 'ToLabels' is derived via 'Generic'; presumably the field name
-- @status@ becomes the label key, so it should not be renamed -- verify
-- against the metrics library.
newtype EventStatusLabel = EventStatusLabel {status :: Text}
  deriving stock (Eq, Ord, Generic)
  deriving anyclass (ToLabels)
-- | Status label carrying the label value of 'Success'.
eventSuccessLabel :: EventStatusLabel
eventSuccessLabel = EventStatusLabel (responseStatusToLabelValue Success)
-- | Status label carrying the label value of 'Failed'.
eventFailedLabel :: EventStatusLabel
eventFailedLabel = EventStatusLabel (responseStatusToLabelValue Failed)
-- | Full label set for the event trigger counters: a mandatory status plus
-- optional dynamic (trigger name / source name) labels.
data EventStatusWithTriggerLabel = EventStatusWithTriggerLabel
  { -- | The @status@ label.
    _eswtlStatus :: EventStatusLabel,
    -- | Dynamic labels; 'Nothing' when the metric is recorded without them.
    _eswtlDynamicLabels :: Maybe DynamicEventTriggerLabel
  }
  deriving stock (Eq, Ord, Generic)
-- | Labels are the @status@ entry combined with whatever the dynamic labels
-- render to (nothing extra when they are absent).
instance ToLabels EventStatusWithTriggerLabel where
  toLabels eventLabel = statusLabels <> dynamicLabels
    where
      statusLabels = HashMap.fromList [("status", status (_eswtlStatus eventLabel))]
      dynamicLabels = toLabels (_eswtlDynamicLabels eventLabel)
data SubscriptionKindLabel = SubscriptionKindLabel
{ subscription_kind :: Text