server: replicate log-based data transfer metrics as prometheus metrics

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/7354
GitOrigin-RevId: 3f49b8ebba515b42a9d7b22e83e6f39d9d6087c6
awjchen 2022-12-27 20:47:42 -07:00 committed by hasura-bot
parent 3070470728
commit 1ec5efd5d3
12 changed files with 118 additions and 22 deletions
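
Overview note: this commit threads the existing PrometheusMetrics record through the event-trigger, scheduled-trigger, action, and websocket delivery paths, so that the request/response sizes already written to the HTTP logs are also exported as Prometheus counters. The counter API appears to follow the ekg style; a minimal, self-contained sketch of how it is used throughout the diff (signatures inferred from the call sites below, not quoted from the library):

{-# LANGUAGE ImportQualifiedPost #-}

import Data.Int (Int64)
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter

main :: IO ()
main = do
  -- one mutable counter per metric, created once at startup
  bytesSent <- Prometheus.Counter.new
  -- at each delivery, the payload size is added alongside the existing log call
  let payloadSize = 512 :: Int64 -- e.g. a webhook request body size
  Prometheus.Counter.add bytesSent payloadSize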

View File

@ -992,6 +992,7 @@ mkHGEServer setupHook env ServeOptions {..} ServeCtx {..} postPollHook serverMet
(getSchemaCache cacheRef)
(leActionEvents lockedEventsCtx)
_scHttpManager
prometheusMetrics
sleepTime
Nothing
@ -1026,6 +1027,7 @@ mkHGEServer setupHook env ServeOptions {..} ServeCtx {..} postPollHook serverMet
env
logger
_scHttpManager
prometheusMetrics
(getSchemaCache cacheRef)
lockedEventsCtx

View File

@ -86,6 +86,7 @@ import Network.HTTP.Client.Transformable qualified as HTTP
import Refined (NonNegative, Positive, Refined, refineTH, unrefine)
import System.Metrics.Distribution qualified as EKG.Distribution
import System.Metrics.Gauge qualified as EKG.Gauge
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter
import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge
import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram
@ -409,7 +410,19 @@ processEventQueue logger httpMgr getSchemaCache EventEngineCtx {..} LockedEvents
runExceptT $
mkRequest headers httpTimeout payload requestTransform (_envVarValue webhook) >>= \reqDetails -> do
let request = extractRequest reqDetails
logger' res details = logHTTPForET res extraLogCtx details (_envVarName webhook) logHeaders
logger' res details = do
logHTTPForET res extraLogCtx details (_envVarName webhook) logHeaders
liftIO $ do
case res of
Left _err -> pure ()
Right response ->
Prometheus.Counter.add
(eventTriggerBytesReceived eventTriggerMetrics)
(hrsSize response)
let RequestDetails {_rdOriginalSize, _rdTransformedSize} = details
in Prometheus.Counter.add
(eventTriggerBytesSent eventTriggerMetrics)
(fromMaybe _rdOriginalSize _rdTransformedSize)
-- Event Triggers have a configuration parameter called
-- HASURA_GRAPHQL_EVENTS_HTTP_WORKERS, which is used
-- to control the concurrency of http delivery.
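
Note on the hunk above: logger' keeps its original logHTTPForET call and additionally bumps the new event-trigger counters. The response size (hrsSize response) is added to eventTriggerBytesReceived only when the webhook call succeeded, while the request size is always added to eventTriggerBytesSent, preferring the transformed body size when a request transform produced one. A hedged, standalone sketch of that size selection (hypothetical helper; the record is a simplified stand-in keeping only the two fields the metric needs):

{-# LANGUAGE NamedFieldPuns #-}

import Data.Int (Int64)
import Data.Maybe (fromMaybe)

-- Simplified stand-in for the RequestDetails used above (an assumption for this sketch).
data RequestDetails = RequestDetails
  { _rdOriginalSize :: Int64,
    _rdTransformedSize :: Maybe Int64
  }

-- Bytes actually sent: the transformed request size when a transform ran,
-- otherwise the original request body size (same expression as in the diff).
requestBytesSent :: RequestDetails -> Int64
requestBytesSent RequestDetails {_rdOriginalSize, _rdTransformedSize} =
  fromMaybe _rdOriginalSize _rdTransformedSize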

View File

@ -147,9 +147,11 @@ import Hasura.RQL.Types.Eventing
import Hasura.RQL.Types.ScheduledTrigger
import Hasura.RQL.Types.SchemaCache
import Hasura.SQL.Types
import Hasura.Server.Prometheus (PrometheusMetrics (..))
import Hasura.Tracing qualified as Tracing
import Network.HTTP.Client.Transformable qualified as HTTP
import Refined (unrefine)
import System.Metrics.Prometheus.Counter as Prometheus.Counter
import Text.Builder qualified as TB
-- | runCronEventsGenerator makes sure that all the cron triggers
@ -220,11 +222,12 @@ processCronEvents ::
) =>
L.Logger L.Hasura ->
HTTP.Manager ->
PrometheusMetrics ->
[CronEvent] ->
IO SchemaCache ->
TVar (Set.Set CronEventId) ->
m ()
processCronEvents logger httpMgr cronEvents getSC lockedCronEvents = do
processCronEvents logger httpMgr prometheusMetrics cronEvents getSC lockedCronEvents = do
cronTriggersInfo <- scCronTriggers <$> liftIO getSC
-- save the locked cron events that have been fetched from the
-- database, the events stored here will be unlocked in case a
@ -252,6 +255,7 @@ processCronEvents logger httpMgr cronEvents getSC lockedCronEvents = do
runMetadataStorageT $
flip runReaderT (logger, httpMgr) $
processScheduledEvent
prometheusMetrics
id'
ctiHeaders
retryCtx
@ -271,6 +275,7 @@ processOneOffScheduledEvents ::
Env.Environment ->
L.Logger L.Hasura ->
HTTP.Manager ->
PrometheusMetrics ->
[OneOffScheduledEvent] ->
TVar (Set.Set OneOffScheduledEventId) ->
m ()
@ -278,6 +283,7 @@ processOneOffScheduledEvents
env
logger
httpMgr
prometheusMetrics
oneOffEvents
lockedOneOffScheduledEvents = do
-- save the locked one-off events that have been fetched from the
@ -302,7 +308,7 @@ processOneOffScheduledEvents
retryCtx = RetryContext _ooseTries _ooseRetryConf
webhookEnvRecord = EnvRecord (getTemplateFromUrl _ooseWebhookConf) webhookInfo
flip runReaderT (logger, httpMgr) $
processScheduledEvent _ooseId headerInfo retryCtx payload webhookEnvRecord OneOff
processScheduledEvent prometheusMetrics _ooseId headerInfo retryCtx payload webhookEnvRecord OneOff
removeEventFromLockedEvents _ooseId lockedOneOffScheduledEvents
where
logInternalError err = liftIO . L.unLogger logger $ ScheduledTriggerInternalErr err
@ -316,10 +322,11 @@ processScheduledTriggers ::
Env.Environment ->
L.Logger L.Hasura ->
HTTP.Manager ->
PrometheusMetrics ->
IO SchemaCache ->
LockedEventsCtx ->
m (Forever m)
processScheduledTriggers env logger httpMgr getSC LockedEventsCtx {..} = do
processScheduledTriggers env logger httpMgr prometheusMetrics getSC LockedEventsCtx {..} = do
return $
Forever () $
const $ do
@ -327,8 +334,8 @@ processScheduledTriggers env logger httpMgr getSC LockedEventsCtx {..} = do
case result of
Left e -> logInternalError e
Right (cronEvents, oneOffEvents) -> do
processCronEvents logger httpMgr cronEvents getSC leCronEvents
processOneOffScheduledEvents env logger httpMgr oneOffEvents leOneOffEvents
processCronEvents logger httpMgr prometheusMetrics cronEvents getSC leCronEvents
processOneOffScheduledEvents env logger httpMgr prometheusMetrics oneOffEvents leOneOffEvents
-- NOTE: cron events are scheduled at times with minute resolution (as on
-- unix), while one-off events can be set for arbitrary times. The sleep
-- time here determines how overdue a scheduled event (cron or one-off)
@ -345,6 +352,7 @@ processScheduledEvent ::
Tracing.HasReporter m,
MonadMetadataStorage m
) =>
PrometheusMetrics ->
ScheduledEventId ->
[EventHeaderInfo] ->
RetryContext ->
@ -352,7 +360,7 @@ processScheduledEvent ::
EnvRecord ResolvedWebhook ->
ScheduledEventType ->
m ()
processScheduledEvent eventId eventHeaders retryCtx payload webhookUrl type' =
processScheduledEvent prometheusMetrics eventId eventHeaders retryCtx payload webhookUrl type' =
Tracing.runTraceT Tracing.sampleAlways traceNote do
currentTime <- liftIO getCurrentTime
let retryConf = _rctxConf retryCtx
@ -374,7 +382,19 @@ processScheduledEvent eventId eventHeaders retryCtx payload webhookUrl type' =
runExceptT $
mkRequest headers httpTimeout webhookReqBody requestTransform (_envVarValue webhookUrl) >>= \reqDetails -> do
let request = extractRequest reqDetails
logger e d = logHTTPForST e extraLogCtx d (_envVarName webhookUrl) decodedHeaders
logger e d = do
logHTTPForST e extraLogCtx d (_envVarName webhookUrl) decodedHeaders
liftIO $ do
case e of
Left _err -> pure ()
Right response ->
Prometheus.Counter.add
(pmScheduledTriggerBytesReceived prometheusMetrics)
(hrsSize response)
let RequestDetails {_rdOriginalSize, _rdTransformedSize} = d
in Prometheus.Counter.add
(pmScheduledTriggerBytesSent prometheusMetrics)
(fromMaybe _rdOriginalSize _rdTransformedSize)
sessionVars = _rdSessionVars reqDetails
resp <- invokeRequest reqDetails responseTransform sessionVars logger
pure (request, resp)
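
The scheduled-trigger path mirrors the event-trigger change: processScheduledEvent (and its callers processCronEvents and processOneOffScheduledEvents) now receive PrometheusMetrics, and the logger callback records pmScheduledTriggerBytesSent and pmScheduledTriggerBytesReceived. The branching is the same in both callbacks and is sketched below with simplified types (hypothetical helper, not part of the diff): bytes received are counted only when the webhook replied, bytes sent on every delivery attempt.

{-# LANGUAGE ImportQualifiedPost #-}

import Data.Int (Int64)
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter

recordDelivery ::
  Prometheus.Counter.Counter -> -- bytes-received counter
  Prometheus.Counter.Counter -> -- bytes-sent counter
  Either e Int64 ->             -- Left on delivery error, Right carries the response size
  Int64 ->                      -- request size that went out
  IO ()
recordDelivery receivedCounter sentCounter result requestSize = do
  case result of
    Left _err -> pure () -- failed deliveries add nothing to "received"
    Right responseSize -> Prometheus.Counter.add receivedCounter responseSize
  Prometheus.Counter.add sentCounter requestSize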

View File

@ -318,6 +318,7 @@ getResolvedExecPlan ::
) =>
Env.Environment ->
L.Logger L.Hasura ->
PrometheusMetrics ->
UserInfo ->
SQLGenCtx ->
ReadOnlyMode ->
@ -334,6 +335,7 @@ getResolvedExecPlan ::
getResolvedExecPlan
env
logger
prometheusMetrics
userInfo
sqlGenCtx
readOnlyMode
@ -356,6 +358,7 @@ getResolvedExecPlan
EQ.convertQuerySelSet
env
logger
prometheusMetrics
gCtx
userInfo
httpManager
@ -375,6 +378,7 @@ getResolvedExecPlan
EM.convertMutationSelectionSet
env
logger
prometheusMetrics
gCtx
sqlGenCtx
userInfo

View File

@ -74,6 +74,7 @@ import Hasura.RQL.Types.Function
import Hasura.RQL.Types.SchemaCache
import Hasura.SQL.Backend
import Hasura.SQL.Types
import Hasura.Server.Prometheus (PrometheusMetrics (..))
import Hasura.Server.Utils
( mkClientHeadersForward,
mkSetCookieHeaders,
@ -83,6 +84,7 @@ import Hasura.Tracing qualified as Tracing
import Language.GraphQL.Draft.Syntax qualified as G
import Network.HTTP.Client.Transformable qualified as HTTP
import Network.Wreq qualified as Wreq
import System.Metrics.Prometheus.Counter as Prometheus.Counter
fetchActionLogResponses ::
(MonadError QErr m, MonadMetadataStorage (MetadataStorageT m), Foldable t) =>
@ -137,12 +139,13 @@ asSingleRowJsonResp query args =
resolveActionExecution ::
Env.Environment ->
L.Logger L.Hasura ->
PrometheusMetrics ->
UserInfo ->
IR.AnnActionExecution Void ->
ActionExecContext ->
Maybe GQLQueryText ->
ActionExecution
resolveActionExecution env logger _userInfo IR.AnnActionExecution {..} ActionExecContext {..} gqlQueryText =
resolveActionExecution env logger prometheusMetrics _userInfo IR.AnnActionExecution {..} ActionExecContext {..} gqlQueryText =
ActionExecution $ first (encJFromOrderedValue . makeActionResponseNoRelations _aaeFields _aaeOutputType _aaeOutputFields True) <$> runWebhook
where
handlerPayload = ActionWebhookPayload (ActionContext _aaeName) _aecSessionVariables _aaePayload gqlQueryText
@ -155,6 +158,7 @@ resolveActionExecution env logger _userInfo IR.AnnActionExecution {..} ActionExe
callWebhook
env
_aecManager
prometheusMetrics
_aaeOutputType
_aaeOutputFields
_aecHeaders
@ -433,10 +437,11 @@ asyncActionsProcessor ::
IO SchemaCache ->
STM.TVar (Set LockedActionEventId) ->
HTTP.Manager ->
PrometheusMetrics ->
Milliseconds ->
Maybe GH.GQLQueryText ->
m (Forever m)
asyncActionsProcessor env logger getSCFromRef' lockedActionEvents httpManager sleepTime gqlQueryText =
asyncActionsProcessor env logger getSCFromRef' lockedActionEvents httpManager prometheusMetrics sleepTime gqlQueryText =
return $
Forever () $
const $ do
@ -487,6 +492,7 @@ asyncActionsProcessor env logger getSCFromRef' lockedActionEvents httpManager sl
callWebhook
env
httpManager
prometheusMetrics
outputType
outputFields
reqHeaders
@ -514,6 +520,7 @@ callWebhook ::
) =>
Env.Environment ->
HTTP.Manager ->
PrometheusMetrics ->
GraphQLType ->
IR.ActionOutputFields ->
[HTTP.Header] ->
@ -528,6 +535,7 @@ callWebhook ::
callWebhook
env
manager
prometheusMetrics
outputType
outputFields
reqHeaders
@ -577,6 +585,7 @@ callWebhook
in pure (Just transformedReq, Just transformedPayloadSize, Just reqTransformCtx)
let actualReq = fromMaybe req transformedReq
actualSize = fromMaybe requestBodySize transformedReqSize
httpResponse <-
Tracing.tracedHttpRequest actualReq $ \request ->
@ -615,6 +624,13 @@ callWebhook
throw500WithDetail "Response Transformation Failed" $ J.toJSON err
-- log the request and response to/from the action handler
liftIO $ do
Prometheus.Counter.add
(pmActionBytesSent prometheusMetrics)
actualSize
Prometheus.Counter.add
(pmActionBytesReceived prometheusMetrics)
responseBodySize
logger :: (L.Logger L.Hasura) <- asks getter
L.unLogger logger $ ActionHandlerLog req transformedReq requestBodySize transformedReqSize responseBodySize actionName
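
In callWebhook, the counters are bumped immediately before the ActionHandlerLog entry is written, using the same numbers the log reports: actualSize (the transformed request size when a transform ran, otherwise the original body size) goes to pmActionBytesSent and responseBodySize to pmActionBytesReceived, so the log and the metric cannot drift apart. A hedged sketch of that bookkeeping as a standalone helper (hypothetical name; the record fields are the ones added to Hasura.Server.Prometheus later in this diff):

{-# LANGUAGE ImportQualifiedPost #-}

import Data.Int (Int64)
import Hasura.Server.Prometheus (PrometheusMetrics (..))
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter

recordActionTransfer :: PrometheusMetrics -> Int64 -> Int64 -> IO ()
recordActionTransfer prometheusMetrics bytesSent bytesReceived = do
  Prometheus.Counter.add (pmActionBytesSent prometheusMetrics) bytesSent
  Prometheus.Counter.add (pmActionBytesReceived prometheusMetrics) bytesReceived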

View File

@ -31,6 +31,7 @@ import Hasura.RQL.Types.Backend
import Hasura.RQL.Types.Common
import Hasura.RQL.Types.GraphqlSchemaIntrospection
import Hasura.SQL.AnyBackend qualified as AB
import Hasura.Server.Prometheus (PrometheusMetrics (..))
import Hasura.Server.Types (RequestId (..))
import Hasura.Session
import Hasura.Tracing qualified as Tracing
@ -45,14 +46,16 @@ convertMutationAction ::
) =>
Env.Environment ->
L.Logger L.Hasura ->
PrometheusMetrics ->
UserInfo ->
HTTP.Manager ->
HTTP.RequestHeaders ->
Maybe GH.GQLQueryText ->
ActionMutation Void ->
m ActionExecutionPlan
convertMutationAction env logger userInfo manager reqHeaders gqlQueryText = \case
AMSync s -> pure $ AEPSync $ resolveActionExecution env logger userInfo s actionExecContext gqlQueryText
convertMutationAction env logger prometheusMetrics userInfo manager reqHeaders gqlQueryText = \case
AMSync s ->
pure $ AEPSync $ resolveActionExecution env logger prometheusMetrics userInfo s actionExecContext gqlQueryText
AMAsync s ->
AEPAsyncMutation
<$> liftEitherM (runMetadataStorageT $ resolveActionMutationAsync s reqHeaders userSession)
@ -71,6 +74,7 @@ convertMutationSelectionSet ::
) =>
Env.Environment ->
L.Logger L.Hasura ->
PrometheusMetrics ->
GQLContext ->
SQLGenCtx ->
UserInfo ->
@ -88,6 +92,7 @@ convertMutationSelectionSet ::
convertMutationSelectionSet
env
logger
prometheusMetrics
gqlContext
SQLGenCtx {stringifyNum}
userInfo
@ -136,7 +141,7 @@ convertMutationSelectionSet
(actionName, _fch) <- pure $ case noRelsDBAST of
AMSync s -> (_aaeName s, _aaeForwardClientHeaders s)
AMAsync s -> (_aamaName s, _aamaForwardClientHeaders s)
plan <- convertMutationAction env logger userInfo manager reqHeaders (Just (GH._grQuery gqlUnparsed)) noRelsDBAST
plan <- convertMutationAction env logger prometheusMetrics userInfo manager reqHeaders (Just (GH._grQuery gqlUnparsed)) noRelsDBAST
pure $ ExecStepAction plan (ActionsInfo actionName _fch) remoteJoins -- `_fch` represents the `forward_client_headers` option from the action
-- definition which is currently being ignored for actions that are mutations
RFRaw customFieldVal -> flip onLeft throwError =<< executeIntrospection userInfo customFieldVal introspectionDisabledRoles

View File

@ -31,6 +31,7 @@ import Hasura.RQL.Types.Action
import Hasura.RQL.Types.Backend
import Hasura.RQL.Types.GraphqlSchemaIntrospection
import Hasura.SQL.AnyBackend qualified as AB
import Hasura.Server.Prometheus (PrometheusMetrics (..))
import Hasura.Server.Types (RequestId (..))
import Hasura.Session
import Language.GraphQL.Draft.Syntax qualified as G
@ -63,6 +64,7 @@ convertQuerySelSet ::
) =>
Env.Environment ->
L.Logger L.Hasura ->
PrometheusMetrics ->
GQLContext ->
UserInfo ->
HTTP.Manager ->
@ -79,6 +81,7 @@ convertQuerySelSet ::
convertQuerySelSet
env
logger
prometheusMetrics
gqlContext
userInfo
manager
@ -120,7 +123,7 @@ convertQuerySelSet
RFAction action -> do
let (noRelsDBAST, remoteJoins) = RJ.getRemoteJoinsActionQuery action
(actionExecution, actionName, fch) <- pure $ case noRelsDBAST of
AQQuery s -> (AEPSync $ resolveActionExecution env logger userInfo s (ActionExecContext manager reqHeaders (_uiSession userInfo)) (Just (GH._grQuery gqlUnparsed)), _aaeName s, _aaeForwardClientHeaders s)
AQQuery s -> (AEPSync $ resolveActionExecution env logger prometheusMetrics userInfo s (ActionExecContext manager reqHeaders (_uiSession userInfo)) (Just (GH._grQuery gqlUnparsed)), _aaeName s, _aaeForwardClientHeaders s)
AQAsync s -> (AEPAsyncQuery $ AsyncActionQueryExecutionPlan (_aaaqActionId s) $ resolveAsyncActionQuery userInfo s, _aaaqName s, _aaaqForwardClientHeaders s)
pure $ ExecStepAction actionExecution (ActionsInfo actionName fch) remoteJoins
RFRaw r -> flip onLeft throwError =<< executeIntrospection userInfo r introspectionDisabledRoles

View File

@ -343,6 +343,7 @@ runGQ env logger reqId userInfo ipAddress reqHeaders queryType reqUnparsed = do
E.getResolvedExecPlan
env
logger
prometheusMetrics
userInfo
sqlGenCtx
readOnlyMode

View File

@ -70,7 +70,7 @@ createWSServerApp ::
-- -- ^ aka generalized 'WS.ServerApp'
createWSServerApp env enabledLogTypes authMode serverEnv connInitTimeout = \ !ipAddress !pendingConn -> do
let getMetricsConfig = scMetricsConfig . fst <$> _wseGCtxMap serverEnv
WS.createServerApp getMetricsConfig connInitTimeout (_wseServer serverEnv) handlers ipAddress pendingConn
WS.createServerApp getMetricsConfig connInitTimeout (_wseServer serverEnv) prometheusMetrics handlers ipAddress pendingConn
where
handlers =
WS.WSHandlers

View File

@ -460,6 +460,7 @@ onStart env enabledLogTypes serverEnv wsConn shouldCaptureVariables (StartMsg op
E.getResolvedExecPlan
env
logger
prometheusMetrics
userInfo
sqlGenCtx
readOnlyMode

View File

@ -61,12 +61,16 @@ import Hasura.Logging qualified as L
import Hasura.Prelude
import Hasura.RQL.Types.Common (MetricsConfig (..))
import Hasura.Server.Init.Config (WSConnectionInitTimeout (..))
import Hasura.Server.Prometheus
( PrometheusMetrics (..),
)
import ListT qualified
import Network.Wai.Extended (IpAddress)
import Network.WebSockets qualified as WS
import Refined (unrefine)
import StmContainers.Map qualified as STMMap
import System.IO.Error qualified as E
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter
newtype WSId = WSId {unWSId :: UUID.UUID}
deriving (Show, Eq, Hashable)
@ -307,12 +311,13 @@ createServerApp ::
IO MetricsConfig ->
WSConnectionInitTimeout ->
WSServer a ->
PrometheusMetrics ->
-- | user provided handlers
WSHandlers m a ->
-- | aka WS.ServerApp
HasuraServerApp m
{-# INLINE createServerApp #-}
createServerApp getMetricsConfig wsConnInitTimeout (WSServer logger@(L.Logger writeLog) serverStatus) wsHandlers !ipAddress !pendingConn = do
createServerApp getMetricsConfig wsConnInitTimeout (WSServer logger@(L.Logger writeLog) serverStatus) prometheusMetrics wsHandlers !ipAddress !pendingConn = do
wsId <- WSId <$> liftIO UUID.nextRandom
logWSLog logger $ WSLog wsId EConnectionRequest Nothing
-- NOTE: this timer is specific to `graphql-ws`. the server has to close the connection
@ -401,18 +406,28 @@ createServerApp getMetricsConfig wsConnInitTimeout (WSServer logger@(L.Logger wr
-- Regardless this should be safe:
handleJust (guard . E.isResourceVanishedError) (\() -> throw WS.ConnectionClosed) $
WS.receiveData conn
let censoredMessage =
let messageLength = BL.length msg
censoredMessage =
MessageDetails
(SB.fromLBS (if shouldCaptureVariables then msg else "<censored>"))
(BL.length msg)
messageLength
liftIO $
Prometheus.Counter.add
(pmWebSocketBytesReceived prometheusMetrics)
messageLength
logWSLog logger $ WSLog wsId (EMessageReceived censoredMessage) Nothing
messageHandler wsConn msg subProtocol
let send = forever $ do
WSQueueResponse msg wsInfo <- liftIO $ STM.atomically $ STM.readTQueue sendQ
let message = MessageDetails (SB.fromLBS msg) (BL.length msg)
let messageLength = BL.length msg
messageDetails = MessageDetails (SB.fromLBS msg) messageLength
liftIO $ WS.sendTextData conn msg
logWSLog logger $ WSLog wsId (EMessageSent message) wsInfo
liftIO $
Prometheus.Counter.add
(pmWebSocketBytesSent prometheusMetrics)
messageLength
logWSLog logger $ WSLog wsId (EMessageSent messageDetails) wsInfo
-- withAsync lets us be very sure that if e.g. an async exception is raised while we're
-- forking that the threads we launched will be cleaned up. See also below.
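
On the websocket side, the message length is now bound once (messageLength) and reused for both the MessageDetails log entry and the new pmWebSocketBytesReceived / pmWebSocketBytesSent counters, so the logged size and the metric always agree. A simplified sketch of the send side (assumed types; the real loop also forwards wsInfo to the structured log):

{-# LANGUAGE ImportQualifiedPost #-}

import Control.Monad (forever)
import Data.ByteString.Lazy qualified as BL
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter

sendLoop ::
  Prometheus.Counter.Counter -> -- pmWebSocketBytesSent
  (BL.ByteString -> IO ()) ->   -- e.g. WS.sendTextData conn
  IO BL.ByteString ->           -- e.g. popping the send queue
  IO ()
sendLoop bytesSentCounter sendText nextMessage = forever $ do
  msg <- nextMessage
  let messageLength = BL.length msg -- computed once, shared by log and metric
  sendText msg
  Prometheus.Counter.add bytesSentCounter messageLength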

View File

@ -34,7 +34,13 @@ data PrometheusMetrics = PrometheusMetrics
{ pmConnections :: ConnectionsGauge,
pmActiveSubscriptions :: Gauge,
pmGraphQLRequestMetrics :: GraphQLRequestMetrics,
pmEventTriggerMetrics :: EventTriggerMetrics
pmEventTriggerMetrics :: EventTriggerMetrics,
pmWebSocketBytesReceived :: Counter,
pmWebSocketBytesSent :: Counter,
pmActionBytesReceived :: Counter,
pmActionBytesSent :: Counter,
pmScheduledTriggerBytesReceived :: Counter,
pmScheduledTriggerBytesSent :: Counter
}
data GraphQLRequestMetrics = GraphQLRequestMetrics
@ -52,7 +58,9 @@ data EventTriggerMetrics = EventTriggerMetrics
eventQueueTimeSeconds :: Histogram,
eventsFetchTimePerBatch :: Histogram,
eventWebhookProcessingTime :: Histogram,
eventProcessingTime :: Histogram
eventProcessingTime :: Histogram,
eventTriggerBytesReceived :: Counter,
eventTriggerBytesSent :: Counter
}
-- | Create dummy mutable references without associating them to a metrics
@ -63,6 +71,12 @@ makeDummyPrometheusMetrics = do
pmActiveSubscriptions <- Gauge.new
pmGraphQLRequestMetrics <- makeDummyGraphQLRequestMetrics
pmEventTriggerMetrics <- makeDummyEventTriggerMetrics
pmWebSocketBytesReceived <- Counter.new
pmWebSocketBytesSent <- Counter.new
pmActionBytesReceived <- Counter.new
pmActionBytesSent <- Counter.new
pmScheduledTriggerBytesReceived <- Counter.new
pmScheduledTriggerBytesSent <- Counter.new
pure PrometheusMetrics {..}
makeDummyGraphQLRequestMetrics :: IO GraphQLRequestMetrics
@ -83,6 +97,8 @@ makeDummyEventTriggerMetrics = do
eventsFetchTimePerBatch <- Histogram.new []
eventWebhookProcessingTime <- Histogram.new []
eventProcessingTime <- Histogram.new []
eventTriggerBytesReceived <- Counter.new
eventTriggerBytesSent <- Counter.new
pure EventTriggerMetrics {..}
--------------------------------------------------------------------------------
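
Finally, PrometheusMetrics gains six top-level counters and EventTriggerMetrics gains two, and makeDummyPrometheusMetrics is extended to create matching counters that are never registered with a metrics store. A hedged usage sketch (assuming both names are exported from Hasura.Server.Prometheus): code paths and tests that never serve /metrics can still satisfy the new PrometheusMetrics parameters with the dummy value.

import Hasura.Server.Prometheus (PrometheusMetrics, makeDummyPrometheusMetrics)

-- The counters inside are real mutable references, just not tied to any
-- registry, so incrementing them is harmless where metrics are not exported.
metricsForTest :: IO PrometheusMetrics
metricsForTest = makeDummyPrometheusMetrics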