server: add new metric for event trigger observability

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/8380
Co-authored-by: Rob Dominguez <24390149+robertjdominguez@users.noreply.github.com>
GitOrigin-RevId: df7d5c53668fc84e7d70d471e29774136f5d560a
This commit is contained in:
paritosh-08 2023-03-29 22:02:14 +05:30 committed by hasura-bot
parent 677a972956
commit b7bae6dfec
3 changed files with 43 additions and 11 deletions

View File

@ -122,6 +122,20 @@ curl 'http://127.0.0.1:8080/v1/metrics' -H 'Authorization: Bearer <secret>'
<td>none</td>
<td>Compare this number to the <a href="/latest/deployment/graphql-engine-flags/reference/#events-http-pool-size">HTTP pool size</a>. Consider increasing it if the metric is near the current configured value.</td>
</tr>
<tr>
<td><code>hasura_event_processed_total</code></td>
<td>Total number of events processed</td>
<td>Counter</td>
<td>&#8226; "status": success|failed</td>
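<td>Represents the Event Trigger egress: events that finished processing, either delivered successfully or marked as failed once their retries were exhausted.</td>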
</tr>
<tr>
<td><code>hasura_event_invocations_total</code></td>
<td>Total number of event webhook invocations</td>
<td>Counter</td>
<td>&#8226; "status": success|failed</td>
<td>Represents the Event Trigger webhook HTTP requests made.</td>
</tr>
<tr>
<td><code>hasura_postgres_connections</code></td>
<td>Current number of active PostgreSQL connections</td>

View File

@ -554,13 +554,19 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
Prometheus.Histogram.observe (eventWebhookProcessingTime eventTriggerMetrics) eventWebhookProcessingTime'
EKG.Distribution.add (smEventProcessingTime serverMetrics) eventProcessingTime'
Prometheus.Histogram.observe (eventProcessingTime eventTriggerMetrics) eventProcessingTime'
Left (HTTPError reqBody err) ->
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion err >>= flip onLeft logQErr
Left (TransformationError _ err) -> do
L.unLogger logger $ L.UnstructuredLog L.LevelError (SB.fromLBS $ J.encode err)
Prometheus.Counter.inc (eventProcessedTotalSuccess eventTriggerMetrics)
Prometheus.Counter.inc (eventInvocationTotalSuccess eventTriggerMetrics)
Left eventError -> do
-- TODO (paritosh): We can also add a label to the metric to indicate the type of error
liftIO $ Prometheus.Counter.inc (eventInvocationTotalFailure eventTriggerMetrics)
case eventError of
(HTTPError reqBody err) ->
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion eventTriggerMetrics err >>= flip onLeft logQErr
(TransformationError _ err) -> do
L.unLogger logger $ L.UnstructuredLog L.LevelError (SB.fromLBS $ J.encode err)
-- Record an Event Error
recordError' @b sourceConfig e Nothing PESetError maintenanceModeVersion >>= flip onLeft logQErr
-- Record an Event Error
recordError' @b sourceConfig e Nothing PESetError maintenanceModeVersion >>= flip onLeft logQErr
-- removing an event from the _eeCtxLockedEvents after the event has been processed:
removeEventTriggerEventFromLockedEvents sourceName (eId e) leEvents
@ -608,9 +614,10 @@ processError ::
[HeaderConf] ->
J.Value ->
MaintenanceMode MaintenanceModeVersion ->
EventTriggerMetrics ->
HTTPErr a ->
m (Either QErr ())
processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion err = do
processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion eventTriggerMetrics err = do
let invocation = case err of
HClient httpException ->
let statusMaybe = getHTTPExceptionStatus httpException
@ -623,16 +630,17 @@ processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion err =
HOther detail -> do
let errMsg = SB.fromLBS $ J.encode detail
mkInvocation (eId e) ep (Just 500) reqHeaders errMsg []
retryOrError <- retryOrSetError e retryConf err
retryOrError <- retryOrSetError e retryConf eventTriggerMetrics err
recordError @b sourceConfig e invocation retryOrError maintenanceModeVersion
retryOrSetError ::
MonadIO m =>
Event b ->
RetryConf ->
EventTriggerMetrics ->
HTTPErr a ->
m ProcessEventError
retryOrSetError e retryConf err = do
retryOrSetError e retryConf eventTriggerMetrics err = do
let mretryHeader = getRetryAfterHeaderFromError err
tries = eTries e
mretryHeaderSeconds = mretryHeader >>= parseRetryHeader
@ -640,7 +648,9 @@ retryOrSetError e retryConf err = do
noRetryHeader = isNothing mretryHeaderSeconds
-- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1
if triesExhausted && noRetryHeader
then pure PESetError
then do
liftIO $ Prometheus.Counter.inc (eventProcessedTotalFailure eventTriggerMetrics)
pure PESetError
else do
currentTime <- liftIO getCurrentTime
let delay = fromMaybe (rcIntervalSec retryConf) mretryHeaderSeconds

View File

@ -60,7 +60,11 @@ data EventTriggerMetrics = EventTriggerMetrics
eventWebhookProcessingTime :: Histogram,
eventProcessingTime :: Histogram,
eventTriggerBytesReceived :: Counter,
eventTriggerBytesSent :: Counter
eventTriggerBytesSent :: Counter,
eventProcessedTotalSuccess :: Counter,
eventProcessedTotalFailure :: Counter,
eventInvocationTotalSuccess :: Counter,
eventInvocationTotalFailure :: Counter
}
-- | Create dummy mutable references without associating them to a metrics
@ -99,6 +103,10 @@ makeDummyEventTriggerMetrics = do
eventProcessingTime <- Histogram.new []
eventTriggerBytesReceived <- Counter.new
eventTriggerBytesSent <- Counter.new
eventProcessedTotalSuccess <- Counter.new
eventProcessedTotalFailure <- Counter.new
eventInvocationTotalSuccess <- Counter.new
eventInvocationTotalFailure <- Counter.new
pure EventTriggerMetrics {..}
--------------------------------------------------------------------------------