server: add new metric for event trigger observability

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/8380
Co-authored-by: Rob Dominguez <24390149+robertjdominguez@users.noreply.github.com>
GitOrigin-RevId: df7d5c53668fc84e7d70d471e29774136f5d560a
This commit is contained in:
paritosh-08 2023-03-29 22:02:14 +05:30 committed by hasura-bot
parent 677a972956
commit b7bae6dfec
3 changed files with 43 additions and 11 deletions

View File

@ -122,6 +122,20 @@ curl 'http://127.0.0.1:8080/v1/metrics' -H 'Authorization: Bearer <secret>'
<td>none</td> <td>none</td>
<td>Compare this number to the <a href="/latest/deployment/graphql-engine-flags/reference/#events-http-pool-size">HTTP pool size</a>. Consider increasing it if the metric is near the current configured value.</td> <td>Compare this number to the <a href="/latest/deployment/graphql-engine-flags/reference/#events-http-pool-size">HTTP pool size</a>. Consider increasing it if the metric is near the current configured value.</td>
</tr> </tr>
<tr>
<td><code>hasura_event_processed_total</code></td>
<td>Total number of events processed</td>
<td>Counter</td>
<td>&#8226; "status": success|failed</td>
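<td>Represents the Event Trigger egress: events that were delivered successfully, or that were marked as failed once their retries were exhausted.</td>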
</tr>
<tr>
<td><code>hasura_event_invocations_total</code></td>
<td>Total number of Event Trigger webhook invocations</td>
<td>Counter</td>
<td>&#8226; "status": success|failed</td>
<td>Represents the HTTP requests made to Event Trigger webhooks.</td>
</tr>
<tr> <tr>
<td><code>hasura_postgres_connections</code></td> <td><code>hasura_postgres_connections</code></td>
<td>Current number of active PostgreSQL connections</td> <td>Current number of active PostgreSQL connections</td>

View File

@ -554,13 +554,19 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
Prometheus.Histogram.observe (eventWebhookProcessingTime eventTriggerMetrics) eventWebhookProcessingTime' Prometheus.Histogram.observe (eventWebhookProcessingTime eventTriggerMetrics) eventWebhookProcessingTime'
EKG.Distribution.add (smEventProcessingTime serverMetrics) eventProcessingTime' EKG.Distribution.add (smEventProcessingTime serverMetrics) eventProcessingTime'
Prometheus.Histogram.observe (eventProcessingTime eventTriggerMetrics) eventProcessingTime' Prometheus.Histogram.observe (eventProcessingTime eventTriggerMetrics) eventProcessingTime'
Left (HTTPError reqBody err) -> Prometheus.Counter.inc (eventProcessedTotalSuccess eventTriggerMetrics)
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion err >>= flip onLeft logQErr Prometheus.Counter.inc (eventInvocationTotalSuccess eventTriggerMetrics)
Left (TransformationError _ err) -> do Left eventError -> do
L.unLogger logger $ L.UnstructuredLog L.LevelError (SB.fromLBS $ J.encode err) -- TODO (paritosh): We can also add a label to the metric to indicate the type of error
liftIO $ Prometheus.Counter.inc (eventInvocationTotalFailure eventTriggerMetrics)
case eventError of
(HTTPError reqBody err) ->
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion eventTriggerMetrics err >>= flip onLeft logQErr
(TransformationError _ err) -> do
L.unLogger logger $ L.UnstructuredLog L.LevelError (SB.fromLBS $ J.encode err)
-- Record an Event Error -- Record an Event Error
recordError' @b sourceConfig e Nothing PESetError maintenanceModeVersion >>= flip onLeft logQErr recordError' @b sourceConfig e Nothing PESetError maintenanceModeVersion >>= flip onLeft logQErr
-- removing an event from the _eeCtxLockedEvents after the event has been processed: -- removing an event from the _eeCtxLockedEvents after the event has been processed:
removeEventTriggerEventFromLockedEvents sourceName (eId e) leEvents removeEventTriggerEventFromLockedEvents sourceName (eId e) leEvents
@ -608,9 +614,10 @@ processError ::
[HeaderConf] -> [HeaderConf] ->
J.Value -> J.Value ->
MaintenanceMode MaintenanceModeVersion -> MaintenanceMode MaintenanceModeVersion ->
EventTriggerMetrics ->
HTTPErr a -> HTTPErr a ->
m (Either QErr ()) m (Either QErr ())
processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion err = do processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion eventTriggerMetrics err = do
let invocation = case err of let invocation = case err of
HClient httpException -> HClient httpException ->
let statusMaybe = getHTTPExceptionStatus httpException let statusMaybe = getHTTPExceptionStatus httpException
@ -623,16 +630,17 @@ processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion err =
HOther detail -> do HOther detail -> do
let errMsg = SB.fromLBS $ J.encode detail let errMsg = SB.fromLBS $ J.encode detail
mkInvocation (eId e) ep (Just 500) reqHeaders errMsg [] mkInvocation (eId e) ep (Just 500) reqHeaders errMsg []
retryOrError <- retryOrSetError e retryConf err retryOrError <- retryOrSetError e retryConf eventTriggerMetrics err
recordError @b sourceConfig e invocation retryOrError maintenanceModeVersion recordError @b sourceConfig e invocation retryOrError maintenanceModeVersion
retryOrSetError :: retryOrSetError ::
MonadIO m => MonadIO m =>
Event b -> Event b ->
RetryConf -> RetryConf ->
EventTriggerMetrics ->
HTTPErr a -> HTTPErr a ->
m ProcessEventError m ProcessEventError
retryOrSetError e retryConf err = do retryOrSetError e retryConf eventTriggerMetrics err = do
let mretryHeader = getRetryAfterHeaderFromError err let mretryHeader = getRetryAfterHeaderFromError err
tries = eTries e tries = eTries e
mretryHeaderSeconds = mretryHeader >>= parseRetryHeader mretryHeaderSeconds = mretryHeader >>= parseRetryHeader
@ -640,7 +648,9 @@ retryOrSetError e retryConf err = do
noRetryHeader = isNothing mretryHeaderSeconds noRetryHeader = isNothing mretryHeaderSeconds
-- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1 -- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1
if triesExhausted && noRetryHeader if triesExhausted && noRetryHeader
then pure PESetError then do
liftIO $ Prometheus.Counter.inc (eventProcessedTotalFailure eventTriggerMetrics)
pure PESetError
else do else do
currentTime <- liftIO getCurrentTime currentTime <- liftIO getCurrentTime
let delay = fromMaybe (rcIntervalSec retryConf) mretryHeaderSeconds let delay = fromMaybe (rcIntervalSec retryConf) mretryHeaderSeconds

View File

@ -60,7 +60,11 @@ data EventTriggerMetrics = EventTriggerMetrics
eventWebhookProcessingTime :: Histogram, eventWebhookProcessingTime :: Histogram,
eventProcessingTime :: Histogram, eventProcessingTime :: Histogram,
eventTriggerBytesReceived :: Counter, eventTriggerBytesReceived :: Counter,
eventTriggerBytesSent :: Counter eventTriggerBytesSent :: Counter,
eventProcessedTotalSuccess :: Counter,
eventProcessedTotalFailure :: Counter,
eventInvocationTotalSuccess :: Counter,
eventInvocationTotalFailure :: Counter
} }
-- | Create dummy mutable references without associating them to a metrics -- | Create dummy mutable references without associating them to a metrics
@ -99,6 +103,10 @@ makeDummyEventTriggerMetrics = do
eventProcessingTime <- Histogram.new [] eventProcessingTime <- Histogram.new []
eventTriggerBytesReceived <- Counter.new eventTriggerBytesReceived <- Counter.new
eventTriggerBytesSent <- Counter.new eventTriggerBytesSent <- Counter.new
eventProcessedTotalSuccess <- Counter.new
eventProcessedTotalFailure <- Counter.new
eventInvocationTotalSuccess <- Counter.new
eventInvocationTotalFailure <- Counter.new
pure EventTriggerMetrics {..} pure EventTriggerMetrics {..}
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------