From b0a3ff1d0ffa1305aa628db0c65a9fe600d61e7e Mon Sep 17 00:00:00 2001 From: paritosh-08 <85472423+paritosh-08@users.noreply.github.com> Date: Mon, 18 Mar 2024 20:12:43 +0530 Subject: [PATCH] add `operation_name` and `parameterized_query_hash` to `hasura_graphql_requests_total` metric PR-URL: https://github.com/hasura/graphql-engine-mono/pull/10725 GitOrigin-RevId: b19df2a17e94e72dde790a54e01e290b581aa560 --- .../enterprise-edition/prometheus/metrics.mdx | 10 +- .../Execute/Subscription/Poll/LiveQuery.hs | 6 +- .../Subscription/Poll/StreamingQuery.hs | 6 +- .../GraphQL/Execute/Subscription/State.hs | 10 +- .../src-lib/Hasura/GraphQL/Transport/HTTP.hs | 70 +++++----- .../Hasura/GraphQL/Transport/WebSocket.hs | 123 ++++++++++-------- .../GraphQL/Transport/WebSocket/Server.hs | 6 +- server/src-lib/Hasura/Server/Prometheus.hs | 79 +++++++---- 8 files changed, 181 insertions(+), 129 deletions(-) diff --git a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx index 1c5f156237e..7d966a57a8f 100644 --- a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx +++ b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx @@ -44,11 +44,11 @@ buckets, you should consider [tuning the performance](/deployment/performance-tu Number of GraphQL requests received, representing the GraphQL query/mutation traffic on the server. -| | | -| ------ | -------------------------------------------------------------- | -| Name | `hasura_graphql_requests_total` | -| Type | Counter | -| Labels | `operation_type`: query \| mutation \| subscription \| unknown | +| | | +| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| Name | `hasura_graphql_requests_total` | +| Type | Counter | +| Labels | `operation_type`: query \| mutation \| subscription \| unknown, `response_status`: success \| failed, `operation_name`, `parameterized_query_hash` | The `unknown` operation type will be returned for queries that fail authorization, parsing, or certain validations. The `response_status` label will be `success` for successful requests and `failed` for failed requests. diff --git a/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/LiveQuery.hs b/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/LiveQuery.hs index 18e11f8000b..9fa56532e44 100644 --- a/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/LiveQuery.hs +++ b/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/LiveQuery.hs @@ -39,7 +39,7 @@ import Hasura.RQL.Types.Common (SourceName) import Hasura.RQL.Types.Roles (RoleName) import Hasura.RQL.Types.Subscription (SubscriptionType (..)) import Hasura.Server.Logging (ModelInfo (..), ModelInfoLog (..)) -import Hasura.Server.Prometheus (PrometheusMetrics (..), SubscriptionMetrics (..), liveQuerySubscriptionLabel, recordSubcriptionMetric) +import Hasura.Server.Prometheus (PrometheusMetrics (..), SubscriptionMetrics (..), liveQuerySubscriptionLabel, recordSubscriptionMetric) import Hasura.Server.Types (GranularPrometheusMetricsState (..), ModelInfoLogState (..)) import Refined (unrefine) import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge @@ -121,7 +121,7 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol (queryExecutionTime, mxRes) <- runDBSubscription @b sourceConfig query (over (each . _2) C._csVariables cohorts) resolvedConnectionTemplate let dbExecTimeMetric = submDBExecTotalTime $ pmSubscriptionMetrics $ prometheusMetrics - recordSubcriptionMetric + recordSubscriptionMetric granularPrometheusMetricsState True operationNamesMap @@ -215,7 +215,7 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol when (modelInfoLogStatus' == ModelInfoLogOn) $ do for_ (modelInfoList) $ \(ModelInfoPart modelName modelType modelSourceName modelSourceType modelQueryType) -> do L.unLogger logger $ ModelInfoLog L.LevelInfo $ ModelInfo modelName (toTxt modelType) (toTxt <$> modelSourceName) (toTxt <$> modelSourceType) (toTxt modelQueryType) False - recordSubcriptionMetric + recordSubscriptionMetric granularPrometheusMetricsState True operationNamesMap diff --git a/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/StreamingQuery.hs b/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/StreamingQuery.hs index 3aa397f38be..ca552f1bd7a 100644 --- a/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/StreamingQuery.hs +++ b/server/src-lib/Hasura/GraphQL/Execute/Subscription/Poll/StreamingQuery.hs @@ -41,7 +41,7 @@ import Hasura.RQL.Types.Roles (RoleName) import Hasura.RQL.Types.Subscription (SubscriptionType (..)) import Hasura.SQL.Value (TxtEncodedVal (..)) import Hasura.Server.Logging (ModelInfo (..), ModelInfoLog (..)) -import Hasura.Server.Prometheus (PrometheusMetrics (..), SubscriptionMetrics (..), recordSubcriptionMetric, streamingSubscriptionLabel) +import Hasura.Server.Prometheus (PrometheusMetrics (..), SubscriptionMetrics (..), recordSubscriptionMetric, streamingSubscriptionLabel) import Hasura.Server.Types (GranularPrometheusMetricsState (..), ModelInfoLogState (..)) import Language.GraphQL.Draft.Syntax qualified as G import Refined (unrefine) @@ -289,7 +289,7 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName, (over (each . _2) C._csVariables $ fmap (fmap fst) cohorts) resolvedConnectionTemplate let dbExecTimeMetric = submDBExecTotalTime $ pmSubscriptionMetrics $ prometheusMetrics - recordSubcriptionMetric + recordSubscriptionMetric granularPrometheusMetricsState True operationNames @@ -470,7 +470,7 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName, unLogger logger $ ModelInfoLog LevelInfo $ ModelInfo modelName (toTxt modelType) (toTxt <$> modelSourceName) (toTxt <$> modelSourceType) (toTxt modelQueryType) False postPollHook pollDetails let totalTimeMetric = submTotalTime $ pmSubscriptionMetrics $ prometheusMetrics - recordSubcriptionMetric + recordSubscriptionMetric granularPrometheusMetricsState True operationNames diff --git a/server/src-lib/Hasura/GraphQL/Execute/Subscription/State.hs b/server/src-lib/Hasura/GraphQL/Execute/Subscription/State.hs index 667e2540101..3688397ef1a 100644 --- a/server/src-lib/Hasura/GraphQL/Execute/Subscription/State.hs +++ b/server/src-lib/Hasura/GraphQL/Execute/Subscription/State.hs @@ -56,7 +56,7 @@ import Hasura.RQL.Types.Common (SourceName) import Hasura.SQL.AnyBackend qualified as AB import Hasura.Server.Metrics (ServerMetrics (..)) import Hasura.Server.Prometheus - ( DynamicSubscriptionLabel (..), + ( DynamicGraphqlOperationLabel (..), PrometheusMetrics (..), SubscriptionLabel (..), SubscriptionMetrics (..), @@ -258,7 +258,7 @@ addLiveQuery liftIO $ Prometheus.Gauge.inc $ submActiveLiveQueryPollers $ pmSubscriptionMetrics $ prometheusMetrics liftIO $ EKG.Gauge.inc $ smActiveSubscriptions serverMetrics - let promMetricGranularLabel = SubscriptionLabel liveQuerySubscriptionLabel (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) operationName) + let promMetricGranularLabel = SubscriptionLabel liveQuerySubscriptionLabel (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) operationName) promMetricLabel = SubscriptionLabel liveQuerySubscriptionLabel Nothing let numSubscriptionMetric = submActiveSubscriptions $ pmSubscriptionMetrics $ prometheusMetrics recordMetricWithLabel @@ -390,7 +390,7 @@ addStreamSubscriptionQuery EKG.Gauge.inc $ smActiveSubscriptions serverMetrics EKG.Gauge.inc $ smActiveStreamingSubscriptions serverMetrics - let promMetricGranularLabel = SubscriptionLabel streamingSubscriptionLabel (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) operationName) + let promMetricGranularLabel = SubscriptionLabel streamingSubscriptionLabel (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) operationName) promMetricLabel = SubscriptionLabel streamingSubscriptionLabel Nothing numSubscriptionMetric = submActiveSubscriptions $ pmSubscriptionMetrics $ prometheusMetrics recordMetricWithLabel @@ -470,7 +470,7 @@ removeLiveQuery logger serverMetrics prometheusMetrics lqState lqId@(SubscriberD <*> TMap.null newOps when cohortIsEmpty $ TMap.delete cohortId cohortMap handlerIsEmpty <- TMap.null cohortMap - let promMetricGranularLabel = SubscriptionLabel liveQuerySubscriptionLabel (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) maybeOperationName) + let promMetricGranularLabel = SubscriptionLabel liveQuerySubscriptionLabel (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) maybeOperationName) promMetricLabel = SubscriptionLabel liveQuerySubscriptionLabel Nothing -- when there is no need for handler i.e, this happens to be the last -- operation, take the ref for the polling thread to cancel it @@ -569,7 +569,7 @@ removeStreamingQuery logger serverMetrics prometheusMetrics subscriptionState (S <*> TMap.null newOps when cohortIsEmpty $ TMap.delete currentCohortId cohortMap handlerIsEmpty <- TMap.null cohortMap - let promMetricGranularLabel = SubscriptionLabel streamingSubscriptionLabel (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) maybeOperationName) + let promMetricGranularLabel = SubscriptionLabel streamingSubscriptionLabel (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) maybeOperationName) promMetricLabel = SubscriptionLabel streamingSubscriptionLabel Nothing -- when there is no need for handler i.e, -- operation, take the ref for the polling thread to cancel it diff --git a/server/src-lib/Hasura/GraphQL/Transport/HTTP.hs b/server/src-lib/Hasura/GraphQL/Transport/HTTP.hs index 4d9073eaf4b..ceb56803fb5 100644 --- a/server/src-lib/Hasura/GraphQL/Transport/HTTP.hs +++ b/server/src-lib/Hasura/GraphQL/Transport/HTTP.hs @@ -88,8 +88,11 @@ import Hasura.Server.Limits import Hasura.Server.Logging import Hasura.Server.Logging qualified as L import Hasura.Server.Prometheus - ( GraphQLRequestMetrics (..), + ( GranularPrometheusMetricsState, + GraphQLRequestMetrics (..), PrometheusMetrics (..), + ResponseStatus (..), + recordGraphqlOperationMetric, ) import Hasura.Server.Telemetry.Counters qualified as Telem import Hasura.Server.Types (HeaderPrecedence, ModelInfoLogState (..), MonadGetPolicies (..), ReadOnlyMode (..), RemoteSchemaResponsePriority (..), RequestId (..)) @@ -100,7 +103,7 @@ import Hasura.Tracing qualified as Tracing import Language.GraphQL.Draft.Syntax qualified as G import Network.HTTP.Types qualified as HTTP import Network.Wai.Extended qualified as Wai -import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter +import System.Metrics.Prometheus.CounterVector qualified as Prometheus.CounterVector import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram -- | Encapsulates a function that stores a query response in the cache. @@ -329,12 +332,13 @@ runGQ :: ResponseInternalErrorsConfig -> m (GQLQueryOperationSuccessLog, HttpResponse (Maybe GQResponse, EncJSON)) runGQ env sqlGenCtx sc enableAL readOnlyMode remoteSchemaResponsePriority headerPrecedence prometheusMetrics logger agentLicenseKey reqId userInfo ipAddress reqHeaders queryType reqUnparsed responseErrorsConfig = do + granularPrometheusMetricsState <- runGetPrometheusMetricsGranularity getModelInfoLogStatus' <- runGetModelInfoLogStatus modelInfoLogStatus <- liftIO getModelInfoLogStatus' let gqlMetrics = pmGraphQLRequestMetrics prometheusMetrics - (totalTime, (response, parameterizedQueryHash, gqlOpType, modelInfoListForLogging, queryCachedStatus)) <- withElapsedTime $ do - (reqParsed, runLimits, queryParts) <- Tracing.newSpan "Parse GraphQL" $ observeGQLQueryError gqlMetrics Nothing $ do + (totalTime, (response, parameterizedQueryHash, gqlOpType, gqlOperationName, modelInfoListForLogging, queryCachedStatus)) <- withElapsedTime $ do + (reqParsed, runLimits, queryParts) <- Tracing.newSpan "Parse GraphQL" $ observeGQLQueryError granularPrometheusMetricsState gqlMetrics Nothing (_grOperationName reqUnparsed) Nothing $ do -- 1. Run system authorization on the 'reqUnparsed :: GQLReqUnparsed' query. reqParsed <- E.checkGQLExecution userInfo (reqHeaders, ipAddress) enableAL sc reqUnparsed reqId @@ -348,7 +352,8 @@ runGQ env sqlGenCtx sc enableAL readOnlyMode remoteSchemaResponsePriority header return (reqParsed, runLimits, queryParts) let gqlOpType = G._todType queryParts - observeGQLQueryError gqlMetrics (Just gqlOpType) $ do + let gqlOperationName = getOpNameFromParsedReq reqParsed + observeGQLQueryError granularPrometheusMetricsState gqlMetrics (Just gqlOpType) gqlOperationName Nothing $ do -- 3. Construct the remainder of the execution plan. let maybeOperationName = _unOperationName <$> getOpNameFromParsedReq reqParsed for_ maybeOperationName $ \nm -> @@ -374,13 +379,13 @@ runGQ env sqlGenCtx sc enableAL readOnlyMode remoteSchemaResponsePriority header -- 4. Execute the execution plan producing a 'AnnotatedResponse'. (response, queryCachedStatus, modelInfoFromExecution) <- executePlan reqParsed runLimits execPlan - return (response, parameterizedQueryHash, gqlOpType, ((modelInfoList <> (modelInfoFromExecution))), queryCachedStatus) + return (response, parameterizedQueryHash, gqlOpType, gqlOperationName, ((modelInfoList <> (modelInfoFromExecution))), queryCachedStatus) -- 5. Record telemetry recordTimings totalTime response -- 6. Record Prometheus metrics (query successes) - liftIO $ recordGQLQuerySuccess gqlMetrics totalTime gqlOpType + liftIO $ recordGQLQuerySuccess granularPrometheusMetricsState gqlMetrics totalTime gqlOperationName parameterizedQueryHash gqlOpType -- 7. Return the response along with logging metadata. let requestSize = LBS.length $ J.encode reqUnparsed @@ -603,42 +608,45 @@ runGQ env sqlGenCtx sc enableAL readOnlyMode remoteSchemaResponsePriority header ( MonadIO n, MonadError e n ) => + IO GranularPrometheusMetricsState -> GraphQLRequestMetrics -> Maybe G.OperationType -> + Maybe OperationName -> + Maybe ParameterizedQueryHash -> n a -> n a - observeGQLQueryError gqlMetrics mOpType action = + observeGQLQueryError granularPrometheusMetricsState gqlMetrics mOpType mOpName mQHash action = catchError (fmap Right action) (pure . Left) >>= \case Right result -> pure result Left err -> do - case mOpType of - Nothing -> - liftIO $ Prometheus.Counter.inc (gqlRequestsUnknownFailure gqlMetrics) - Just opType -> case opType of - G.OperationTypeQuery -> - liftIO $ Prometheus.Counter.inc (gqlRequestsQueryFailure gqlMetrics) - G.OperationTypeMutation -> - liftIO $ Prometheus.Counter.inc (gqlRequestsMutationFailure gqlMetrics) - G.OperationTypeSubscription -> - -- We do not collect metrics for subscriptions at the request level. - pure () + recordGraphqlOperationMetric + granularPrometheusMetricsState + mOpType + Failed + mOpName + mQHash + (Prometheus.CounterVector.inc $ gqlRequests gqlMetrics) throwError err -- Tally and record execution times for successful GraphQL requests. recordGQLQuerySuccess :: - GraphQLRequestMetrics -> DiffTime -> G.OperationType -> IO () - recordGQLQuerySuccess gqlMetrics totalTime = \case - G.OperationTypeQuery -> liftIO $ do - Prometheus.Counter.inc (gqlRequestsQuerySuccess gqlMetrics) - Prometheus.Histogram.observe (gqlExecutionTimeSecondsQuery gqlMetrics) (realToFrac totalTime) - G.OperationTypeMutation -> liftIO $ do - Prometheus.Counter.inc (gqlRequestsMutationSuccess gqlMetrics) - Prometheus.Histogram.observe (gqlExecutionTimeSecondsMutation gqlMetrics) (realToFrac totalTime) - G.OperationTypeSubscription -> - -- We do not collect metrics for subscriptions at the request level. - -- Furthermore, we do not serve GraphQL subscriptions over HTTP. - pure () + IO GranularPrometheusMetricsState -> GraphQLRequestMetrics -> DiffTime -> Maybe OperationName -> ParameterizedQueryHash -> G.OperationType -> IO () + recordGQLQuerySuccess granularPrometheusMetricsState gqlMetrics totalTime opName qHash opType = do + recordGraphqlOperationMetric + granularPrometheusMetricsState + (Just opType) + Success + opName + (Just qHash) + (Prometheus.CounterVector.inc $ gqlRequests gqlMetrics) + case opType of + G.OperationTypeQuery -> liftIO $ Prometheus.Histogram.observe (gqlExecutionTimeSecondsQuery gqlMetrics) (realToFrac totalTime) + G.OperationTypeMutation -> liftIO $ Prometheus.Histogram.observe (gqlExecutionTimeSecondsMutation gqlMetrics) (realToFrac totalTime) + G.OperationTypeSubscription -> + -- We do not collect metrics for subscriptions at the request level. + -- Furthermore, we do not serve GraphQL subscriptions over HTTP. + pure () coalescePostgresMutations :: EB.ExecutionPlan -> diff --git a/server/src-lib/Hasura/GraphQL/Transport/WebSocket.hs b/server/src-lib/Hasura/GraphQL/Transport/WebSocket.hs index c68bd268e98..c5ac0300107 100644 --- a/server/src-lib/Hasura/GraphQL/Transport/WebSocket.hs +++ b/server/src-lib/Hasura/GraphQL/Transport/WebSocket.hs @@ -101,6 +101,8 @@ import Hasura.Server.Metrics (ServerMetrics (..)) import Hasura.Server.Prometheus ( GraphQLRequestMetrics (..), PrometheusMetrics (..), + ResponseStatus (..), + recordGraphqlOperationMetric, ) import Hasura.Server.Telemetry.Counters qualified as Telem import Hasura.Server.Types (GranularPrometheusMetricsState (..), HeaderPrecedence, ModelInfoLogState (..), MonadGetPolicies (..), RemoteSchemaResponsePriority, RequestId, getRequestId) @@ -115,7 +117,7 @@ import Network.HTTP.Types qualified as HTTP import Network.WebSockets qualified as WS import Refined (unrefine) import StmContainers.Map qualified as STMMap -import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter +import System.Metrics.Prometheus.CounterVector qualified as Prometheus.CounterVector import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram -- | 'ES.SubscriberDetails' comes from 'Hasura.GraphQL.Execute.LiveQuery.State.addLiveQuery'. We use @@ -451,6 +453,7 @@ onStart :: onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables (StartMsg opId q) onMessageActions responseErrorsConfig headerPrecedence = catchAndIgnore $ do modelInfoLogStatus' <- runGetModelInfoLogStatus modelInfoLogStatus <- liftIO modelInfoLogStatus' + granularPrometheusMetricsState <- runGetPrometheusMetricsGranularity timerTot <- startTimer op <- liftIO $ STM.atomically $ STMMap.lookup opId opMap @@ -458,7 +461,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables -- we process all operations on a websocket connection serially: when (isJust op) $ withComplete - $ sendStartErr + $ sendStartErr granularPrometheusMetricsState (snd =<< op) $ "an operation already exists with this id: " <> unOperationId opId @@ -467,10 +470,10 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables CSInitialised WsClientState {..} -> return (wscsUserInfo, wscsReqHeaders, wscsIpAddress) CSInitError initErr -> do let e = "cannot start as connection_init failed with: " <> initErr - withComplete $ sendStartErr e + withComplete $ sendStartErr granularPrometheusMetricsState (_grOperationName q) e CSNotInitialised _ _ -> do let e = "start received before the connection is initialised" - withComplete $ sendStartErr e + withComplete $ sendStartErr granularPrometheusMetricsState (_grOperationName q) e (requestId, reqHdrs) <- liftIO $ getRequestId origReqHdrs sc <- liftIO $ getSchemaCacheWithVersion appStateRef @@ -488,9 +491,9 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables (reqParsed, queryParts) <- Tracing.newSpan "Parse GraphQL" $ do reqParsedE <- lift $ E.checkGQLExecution userInfo (reqHdrs, ipAddress) enableAL sc q requestId - reqParsed <- onLeft reqParsedE (withComplete . preExecErr requestId Nothing) + reqParsed <- onLeft reqParsedE (withComplete . preExecErr granularPrometheusMetricsState requestId Nothing (_grOperationName q) Nothing) queryPartsE <- runExceptT $ getSingleOperation reqParsed - queryParts <- onLeft queryPartsE (withComplete . preExecErr requestId Nothing) + queryParts <- onLeft queryPartsE (withComplete . preExecErr granularPrometheusMetricsState requestId Nothing (getOpNameFromParsedReq reqParsed) Nothing) pure (reqParsed, queryParts) let gqlOpType = G._todType queryParts @@ -519,7 +522,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables responseErrorsConfig headerPrecedence - (parameterizedQueryHash, execPlan, modelInfoList) <- onLeft execPlanE (withComplete . preExecErr requestId (Just gqlOpType)) + (parameterizedQueryHash, execPlan, modelInfoList) <- onLeft execPlanE (withComplete . preExecErr granularPrometheusMetricsState requestId (Just gqlOpType) opName Nothing) case execPlan of E.QueryExecutionPlan queryPlan asts dirMap -> do @@ -535,7 +538,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables ResponseCached cachedResponseData -> do logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindCached let reportedExecutionTime = 0 - liftIO $ recordGQLQuerySuccess reportedExecutionTime gqlOpType + liftIO $ recordGQLQuerySuccess granularPrometheusMetricsState reportedExecutionTime opName parameterizedQueryHash gqlOpType modelInfoLogging modelInfoList True modelInfoLogStatus sendSuccResp cachedResponseData opName parameterizedQueryHash $ ES.SubscriptionMetadata reportedExecutionTime ResponseUncached storeResponseM -> do @@ -582,7 +585,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables let (allResponses', allModelInfo) = unzip allResponses pure $ (AnnotatedResponsePart 0 Telem.Local (encJFromList (map arpResponse allResponses')) [], concat allModelInfo) in getResponse - sendResultFromFragments Telem.Query timerTot requestId conclusion opName parameterizedQueryHash gqlOpType modelInfoList modelInfoLogStatus + sendResultFromFragments granularPrometheusMetricsState Telem.Query timerTot requestId conclusion opName parameterizedQueryHash gqlOpType modelInfoList modelInfoLogStatus case (storeResponseM, conclusion) of (Just ResponseCacher {..}, Right results) -> do let (key, (compositeValue')) = unzip $ InsOrdHashMap.toList results @@ -608,7 +611,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables $ doQErr $ runPGMutationTransaction requestId q userInfo logger sourceConfig resolvedConnectionTemplate pgMutations -- we do not construct result fragments since we have only one result - handleResult requestId gqlOpType resp \(telemTimeIO_DT, results) -> do + handleResult granularPrometheusMetricsState requestId gqlOpType opName parameterizedQueryHash resp \(telemTimeIO_DT, results) -> do let telemQueryType = Telem.Query telemLocality = Telem.Local telemTimeIO = convertDuration telemTimeIO_DT @@ -618,7 +621,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables $ ES.SubscriptionMetadata telemTimeIO_DT -- Telemetry. NOTE: don't time network IO: Telem.recordTimingMetric Telem.RequestDimensions {..} Telem.RequestTimings {..} - liftIO $ recordGQLQuerySuccess totalTime gqlOpType + liftIO $ recordGQLQuerySuccess granularPrometheusMetricsState totalTime opName parameterizedQueryHash gqlOpType -- we are not in the transaction case; proceeding normally Nothing -> do @@ -666,7 +669,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables let (allResponses', allModelInfo) = unzip allResponses pure $ (AnnotatedResponsePart 0 Telem.Local (encJFromList (map arpResponse allResponses')) [], concat allModelInfo) in getResponse - sendResultFromFragments Telem.Query timerTot requestId conclusion opName parameterizedQueryHash gqlOpType modelInfoList modelInfoLogStatus + sendResultFromFragments granularPrometheusMetricsState Telem.Query timerTot requestId conclusion opName parameterizedQueryHash gqlOpType modelInfoList modelInfoLogStatus liftIO $ sendCompleted (Just requestId) (Just parameterizedQueryHash) E.SubscriptionExecutionPlan (subExec, modifier) -> do case subExec of @@ -718,11 +721,10 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables asyncActionQueryLive E.SEOnSourceDB (E.SSLivequery actionIds liveQueryBuilder) -> do actionLogMapE <- fmap fst <$> runExceptT (EA.fetchActionLogResponses actionIds) - actionLogMap <- onLeft actionLogMapE (withComplete . preExecErr requestId (Just gqlOpType)) - granularPrometheusMetricsState <- runGetPrometheusMetricsGranularity + actionLogMap <- onLeft actionLogMapE (withComplete . preExecErr granularPrometheusMetricsState requestId (Just gqlOpType) opName (Just parameterizedQueryHash)) modelInfoLogStatus'' <- runGetModelInfoLogStatus opMetadataE <- liftIO $ startLiveQuery opName liveQueryBuilder parameterizedQueryHash requestId actionLogMap granularPrometheusMetricsState modifier modelInfoLogStatus'' - lqId <- onLeft opMetadataE (withComplete . preExecErr requestId (Just gqlOpType)) + lqId <- onLeft opMetadataE (withComplete . preExecErr granularPrometheusMetricsState requestId (Just gqlOpType) opName (Just parameterizedQueryHash)) -- Update async action query subscription state case NE.nonEmpty (toList actionIds) of Nothing -> do @@ -747,11 +749,16 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables onUnexpectedException asyncActionQueryLive E.SEOnSourceDB (E.SSStreaming rootFieldName streamQueryBuilder) -> do - granularPrometheusMetricsState <- runGetPrometheusMetricsGranularity modelInfoLogStatus'' <- runGetModelInfoLogStatus liftIO $ startStreamingQuery rootFieldName streamQueryBuilder parameterizedQueryHash requestId granularPrometheusMetricsState modifier modelInfoLogStatus'' - liftIO $ Prometheus.Counter.inc (gqlRequestsSubscriptionSuccess gqlMetrics) + recordGraphqlOperationMetric + granularPrometheusMetricsState + (Just G.OperationTypeSubscription) + Success + opName + (Just parameterizedQueryHash) + (Prometheus.CounterVector.inc $ gqlRequests gqlMetrics) liftIO $ logOpEv ODStarted (Just requestId) (Just parameterizedQueryHash) where sendDataMsg = WS._wsaGetDataMessageType onMessageActions @@ -787,18 +794,21 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables handleResult :: forall a. + IO GranularPrometheusMetricsState -> RequestId -> G.OperationType -> + Maybe OperationName -> + ParameterizedQueryHash -> Either (Either GQExecError QErr) a -> (a -> ExceptT () m ()) -> ExceptT () m () - handleResult requestId gqlOpType r f = case r of - Left (Left err) -> postExecErr' gqlOpType err - Left (Right err) -> postExecErr requestId gqlOpType err + handleResult granularPrometheusMetricsState requestId gqlOpType mOpName pqh r f = case r of + Left (Left err) -> postExecErr' granularPrometheusMetricsState gqlOpType mOpName pqh err + Left (Right err) -> postExecErr granularPrometheusMetricsState requestId gqlOpType mOpName pqh err Right results -> f results - sendResultFromFragments telemQueryType timerTot requestId r opName pqh gqlOpType modelInfoList getModelInfoLogStatus = - handleResult requestId gqlOpType r \results -> do + sendResultFromFragments granularPrometheusMetricsState telemQueryType timerTot requestId r opName pqh gqlOpType modelInfoList getModelInfoLogStatus = + handleResult granularPrometheusMetricsState requestId gqlOpType opName pqh r \results -> do let (key, (compositeValue')) = unzip $ InsOrdHashMap.toList results (annotatedResp, model) = unzip compositeValue' results' = InsOrdHashMap.fromList $ zip key annotatedResp @@ -814,7 +824,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables -- Telemetry. NOTE: don't time network IO: Telem.recordTimingMetric Telem.RequestDimensions {..} Telem.RequestTimings {..} modelInfoLogging (modelInfoList <> modelInfoList') False getModelInfoLogStatus - liftIO $ (recordGQLQuerySuccess totalTime gqlOpType) + liftIO $ (recordGQLQuerySuccess granularPrometheusMetricsState totalTime opName pqh gqlOpType) runRemoteGQ :: RequestId -> @@ -885,7 +895,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables getErrFn ERTLegacy = encodeQErr getErrFn ERTGraphqlCompliant = encodeGQLErr - sendStartErr e = do + sendStartErr granularPrometheusMetricsState mOpName e = do let errFn = getErrFn errRespTy sendMsg wsConn $ SMErr @@ -893,7 +903,7 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables $ errFn False $ err400 StartFailed e liftIO $ logOpEv (ODProtoErr e) Nothing Nothing - liftIO $ reportGQLQueryError Nothing + liftIO $ reportGQLQueryError granularPrometheusMetricsState mOpName Nothing Nothing liftIO $ closeConnAction wsConn opId (T.unpack e) sendCompleted reqId paramQueryHash = do @@ -901,24 +911,27 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables logOpEv ODCompleted reqId paramQueryHash postExecErr :: + IO GranularPrometheusMetricsState -> RequestId -> G.OperationType -> + Maybe OperationName -> + ParameterizedQueryHash -> QErr -> ExceptT () m () - postExecErr reqId gqlOpType qErr = do + postExecErr granularPrometheusMetricsState reqId gqlOpType mOpName pqh qErr = do let errFn = getErrFn errRespTy False liftIO $ logOpEv (ODQueryErr qErr) (Just reqId) Nothing - postExecErr' gqlOpType $ GQExecError $ pure $ errFn qErr + postExecErr' granularPrometheusMetricsState gqlOpType mOpName pqh $ GQExecError $ pure $ errFn qErr - postExecErr' :: G.OperationType -> GQExecError -> ExceptT () m () - postExecErr' gqlOpType qErr = + postExecErr' :: IO GranularPrometheusMetricsState -> G.OperationType -> Maybe OperationName -> ParameterizedQueryHash -> GQExecError -> ExceptT () m () + postExecErr' granularPrometheusMetricsState gqlOpType mOpName pqh qErr = liftIO $ do - reportGQLQueryError (Just gqlOpType) + reportGQLQueryError granularPrometheusMetricsState mOpName (Just pqh) (Just gqlOpType) postExecErrAction wsConn opId qErr -- why wouldn't pre exec error use graphql response? - preExecErr reqId mGqlOpType qErr = do - liftIO $ reportGQLQueryError mGqlOpType + preExecErr granularPrometheusMetricsState reqId mGqlOpType mOpName pqh qErr = do + liftIO $ reportGQLQueryError granularPrometheusMetricsState mOpName pqh mGqlOpType liftIO $ sendError reqId qErr sendError reqId qErr = do @@ -1049,30 +1062,32 @@ onStart enabledLogTypes agentLicenseKey serverEnv wsConn shouldCaptureVariables catchAndIgnore :: ExceptT () m () -> m () catchAndIgnore m = void $ runExceptT m - reportGQLQueryError :: Maybe G.OperationType -> IO () - reportGQLQueryError = \case - Nothing -> - liftIO $ Prometheus.Counter.inc (gqlRequestsUnknownFailure gqlMetrics) - Just opType -> case opType of - G.OperationTypeQuery -> - liftIO $ Prometheus.Counter.inc (gqlRequestsQueryFailure gqlMetrics) - G.OperationTypeMutation -> - liftIO $ Prometheus.Counter.inc (gqlRequestsMutationFailure gqlMetrics) - G.OperationTypeSubscription -> - liftIO $ Prometheus.Counter.inc (gqlRequestsSubscriptionFailure gqlMetrics) + reportGQLQueryError :: IO GranularPrometheusMetricsState -> Maybe OperationName -> Maybe ParameterizedQueryHash -> Maybe G.OperationType -> IO () + reportGQLQueryError granularPrometheusMetricsState mOpName mQHash mOpType = + recordGraphqlOperationMetric + granularPrometheusMetricsState + mOpType + Failed + mOpName + mQHash + (Prometheus.CounterVector.inc $ gqlRequests gqlMetrics) -- Tally and record execution times for successful GraphQL requests. - recordGQLQuerySuccess :: DiffTime -> G.OperationType -> IO () - recordGQLQuerySuccess totalTime = \case - G.OperationTypeQuery -> liftIO $ do - Prometheus.Counter.inc (gqlRequestsQuerySuccess gqlMetrics) - Prometheus.Histogram.observe (gqlExecutionTimeSecondsQuery gqlMetrics) (realToFrac totalTime) - G.OperationTypeMutation -> liftIO $ do - Prometheus.Counter.inc (gqlRequestsMutationSuccess gqlMetrics) - Prometheus.Histogram.observe (gqlExecutionTimeSecondsMutation gqlMetrics) (realToFrac totalTime) - G.OperationTypeSubscription -> - -- We do not collect metrics for subscriptions at the request level. - pure () + recordGQLQuerySuccess :: IO GranularPrometheusMetricsState -> DiffTime -> Maybe OperationName -> ParameterizedQueryHash -> G.OperationType -> IO () + recordGQLQuerySuccess granularPrometheusMetricsState totalTime mOpName qHash opType = do + recordGraphqlOperationMetric + granularPrometheusMetricsState + (Just opType) + Success + mOpName + (Just qHash) + (Prometheus.CounterVector.inc $ gqlRequests gqlMetrics) + case opType of + G.OperationTypeQuery -> liftIO $ Prometheus.Histogram.observe (gqlExecutionTimeSecondsQuery gqlMetrics) (realToFrac totalTime) + G.OperationTypeMutation -> liftIO $ Prometheus.Histogram.observe (gqlExecutionTimeSecondsMutation gqlMetrics) (realToFrac totalTime) + G.OperationTypeSubscription -> + -- We do not collect metrics for subscriptions at the request level. + pure () onMessage :: ( MonadIO m, diff --git a/server/src-lib/Hasura/GraphQL/Transport/WebSocket/Server.hs b/server/src-lib/Hasura/GraphQL/Transport/WebSocket/Server.hs index 4e5b02409e8..f121b5bda8a 100644 --- a/server/src-lib/Hasura/GraphQL/Transport/WebSocket/Server.hs +++ b/server/src-lib/Hasura/GraphQL/Transport/WebSocket/Server.hs @@ -72,7 +72,7 @@ import Hasura.Server.Auth (AuthMode, compareAuthMode) import Hasura.Server.Cors (CorsPolicy) import Hasura.Server.Init.Config (AllowListStatus (..), WSConnectionInitTimeout (..)) import Hasura.Server.Prometheus - ( DynamicSubscriptionLabel (..), + ( DynamicGraphqlOperationLabel (..), PrometheusMetrics (..), recordMetricWithLabel, ) @@ -639,8 +639,8 @@ createServerApp getMetricsConfig wsConnInitTimeout (WSServer logger@(L.Logger wr messageDetails = MessageDetails (SB.fromLBS msg) messageLength parameterizedQueryHash = wsInfo >>= _wseiParameterizedQueryHash operationName = wsInfo >>= _wseiOperationName - promMetricGranularLabel = DynamicSubscriptionLabel parameterizedQueryHash operationName - promMetricLabel = DynamicSubscriptionLabel Nothing Nothing + promMetricGranularLabel = DynamicGraphqlOperationLabel parameterizedQueryHash operationName + promMetricLabel = DynamicGraphqlOperationLabel Nothing Nothing websocketBytesSentMetric = pmWebSocketBytesSent prometheusMetrics granularPrometheusMetricsState <- runGetPrometheusMetricsGranularity liftIO $ do diff --git a/server/src-lib/Hasura/Server/Prometheus.hs b/server/src-lib/Hasura/Server/Prometheus.hs index dabd998b6b9..967f1540e5c 100644 --- a/server/src-lib/Hasura/Server/Prometheus.hs +++ b/server/src-lib/Hasura/Server/Prometheus.hs @@ -31,11 +31,13 @@ module Hasura.Server.Prometheus observeHistogramWithLabel, SubscriptionKindLabel (..), SubscriptionLabel (..), - DynamicSubscriptionLabel (..), + DynamicGraphqlOperationLabel (..), streamingSubscriptionLabel, liveQuerySubscriptionLabel, recordMetricWithLabel, - recordSubcriptionMetric, + recordSubscriptionMetric, + GraphQLRequestsLabels, + recordGraphqlOperationMetric, ) where @@ -71,7 +73,7 @@ data PrometheusMetrics = PrometheusMetrics pmGraphQLRequestMetrics :: GraphQLRequestMetrics, pmEventTriggerMetrics :: EventTriggerMetrics, pmWebSocketBytesReceived :: Counter, - pmWebSocketBytesSent :: CounterVector DynamicSubscriptionLabel, + pmWebSocketBytesSent :: CounterVector DynamicGraphqlOperationLabel, pmActionBytesReceived :: Counter, pmActionBytesSent :: Counter, pmScheduledTriggerMetrics :: ScheduledTriggerMetrics, @@ -83,13 +85,7 @@ data PrometheusMetrics = PrometheusMetrics } data GraphQLRequestMetrics = GraphQLRequestMetrics - { gqlRequestsQuerySuccess :: Counter, - gqlRequestsQueryFailure :: Counter, - gqlRequestsMutationSuccess :: Counter, - gqlRequestsMutationFailure :: Counter, - gqlRequestsSubscriptionSuccess :: Counter, - gqlRequestsSubscriptionFailure :: Counter, - gqlRequestsUnknownFailure :: Counter, + { gqlRequests :: CounterVector GraphQLRequestsLabels, gqlExecutionTimeSecondsQuery :: Histogram, gqlExecutionTimeSecondsMutation :: Histogram } @@ -170,13 +166,7 @@ makeDummyPrometheusMetrics = do makeDummyGraphQLRequestMetrics :: IO GraphQLRequestMetrics makeDummyGraphQLRequestMetrics = do - gqlRequestsQuerySuccess <- Counter.new - gqlRequestsQueryFailure <- Counter.new - gqlRequestsMutationSuccess <- Counter.new - gqlRequestsMutationFailure <- Counter.new - gqlRequestsSubscriptionSuccess <- Counter.new - gqlRequestsSubscriptionFailure <- Counter.new - gqlRequestsUnknownFailure <- Counter.new + gqlRequests <- CounterVector.new gqlExecutionTimeSecondsQuery <- Histogram.new [] gqlExecutionTimeSecondsMutation <- Histogram.new [] pure GraphQLRequestMetrics {..} @@ -295,6 +285,7 @@ instance ToLabels (Maybe DynamicEventTriggerLabel) where toLabels (Just (DynamicEventTriggerLabel triggerName sourceName)) = Map.fromList $ [("trigger_name", triggerNameToTxt triggerName), ("source_name", sourceNameToText sourceName)] data ResponseStatus = Success | Failed + deriving stock (Generic, Ord, Eq) -- TODO: Make this a method of a new typeclass of the metrics library responseStatusToLabelValue :: ResponseStatus -> Text @@ -335,21 +326,21 @@ streamingSubscriptionLabel = SubscriptionKindLabel "streaming" liveQuerySubscriptionLabel :: SubscriptionKindLabel liveQuerySubscriptionLabel = SubscriptionKindLabel "live-query" -data DynamicSubscriptionLabel = DynamicSubscriptionLabel +data DynamicGraphqlOperationLabel = DynamicGraphqlOperationLabel { _dslParamQueryHash :: Maybe ParameterizedQueryHash, _dslOperationName :: Maybe OperationName } deriving stock (Generic, Ord, Eq) -instance ToLabels DynamicSubscriptionLabel where - toLabels (DynamicSubscriptionLabel hash opName) = +instance ToLabels DynamicGraphqlOperationLabel where + toLabels (DynamicGraphqlOperationLabel hash opName) = Map.fromList $ maybe [] (\pqh -> [("parameterized_query_hash", bsToTxt $ unParamQueryHash pqh)]) hash <> maybe [] (\op -> [("operation_name", G.unName $ _unOperationName op)]) opName data SubscriptionLabel = SubscriptionLabel { _slKind :: SubscriptionKindLabel, - _slDynamicLabels :: Maybe DynamicSubscriptionLabel + _slDynamicLabels :: Maybe DynamicGraphqlOperationLabel } deriving stock (Generic, Ord, Eq) @@ -357,6 +348,25 @@ instance ToLabels SubscriptionLabel where toLabels (SubscriptionLabel kind Nothing) = Map.fromList $ [("subscription_kind", subscription_kind kind)] toLabels (SubscriptionLabel kind (Just dl)) = (Map.fromList $ [("subscription_kind", subscription_kind kind)]) <> toLabels dl +-- TODO: Make this a method of a new typeclass of the metrics library +opTypeToLabelValue :: Maybe G.OperationType -> Text +opTypeToLabelValue = \case + (Just G.OperationTypeQuery) -> "query" + (Just G.OperationTypeMutation) -> "mutation" + (Just G.OperationTypeSubscription) -> "subscription" + Nothing -> "unknown" + +data GraphQLRequestsLabels = GraphQLRequestsLabels + { operation_type :: Maybe G.OperationType, + response_status :: ResponseStatus, + dynamic_label :: Maybe DynamicGraphqlOperationLabel + } + deriving stock (Generic, Ord, Eq) + +instance ToLabels (GraphQLRequestsLabels) where + toLabels (GraphQLRequestsLabels op_type res_status dynamic_labels) = + (HashMap.fromList $ [("operation_type", opTypeToLabelValue op_type), ("response_status", responseStatusToLabelValue res_status)]) <> (fromMaybe mempty (toLabels <$> dynamic_labels)) + -- | Record metrics with dynamic label recordMetricWithLabel :: (MonadIO m) => @@ -401,7 +411,7 @@ observeHistogramWithLabel getMetricState alwaysObserve histogramVector label val -- | Record a subscription metric for all the operation names present in the subscription. -- Use this when you want to update the same value of the metric for all the operation names. -recordSubcriptionMetric :: +recordSubscriptionMetric :: (MonadIO m) => (IO GranularPrometheusMetricsState) -> -- should the metric be observed without a label when granularMetricsState is OFF @@ -412,11 +422,11 @@ recordSubcriptionMetric :: -- the mertic action to perform (SubscriptionLabel -> IO ()) -> m () -recordSubcriptionMetric getMetricState alwaysObserve operationNamesMap parameterizedQueryHash subscriptionKind metricAction = do +recordSubscriptionMetric getMetricState alwaysObserve operationNamesMap parameterizedQueryHash subscriptionKind metricAction = do -- if no operation names are present, then emit metric with only param query hash as dynamic label if (null operationNamesMap) then do - let promMetricGranularLabel = SubscriptionLabel subscriptionKind (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) Nothing) + let promMetricGranularLabel = SubscriptionLabel subscriptionKind (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) Nothing) promMetricLabel = SubscriptionLabel subscriptionKind Nothing recordMetricWithLabel getMetricState @@ -427,10 +437,29 @@ recordSubcriptionMetric getMetricState alwaysObserve operationNamesMap parameter do let operationNames = HashMap.keys operationNamesMap for_ operationNames $ \opName -> do - let promMetricGranularLabel = SubscriptionLabel subscriptionKind (Just $ DynamicSubscriptionLabel (Just parameterizedQueryHash) opName) + let promMetricGranularLabel = SubscriptionLabel subscriptionKind (Just $ DynamicGraphqlOperationLabel (Just parameterizedQueryHash) opName) promMetricLabel = SubscriptionLabel subscriptionKind Nothing recordMetricWithLabel getMetricState alwaysObserve (metricAction promMetricGranularLabel) (metricAction promMetricLabel) + +recordGraphqlOperationMetric :: + (MonadIO m) => + (IO GranularPrometheusMetricsState) -> + Maybe G.OperationType -> + ResponseStatus -> + Maybe OperationName -> + Maybe ParameterizedQueryHash -> + (GraphQLRequestsLabels -> IO ()) -> + m () +recordGraphqlOperationMetric getMetricState operationType responseStatus operationName parameterizedQueryHash metricAction = do + let dynamicLabel = DynamicGraphqlOperationLabel parameterizedQueryHash operationName + promMetricGranularLabel = GraphQLRequestsLabels operationType responseStatus (Just dynamicLabel) + promMetricLabel = GraphQLRequestsLabels operationType responseStatus Nothing + recordMetricWithLabel + getMetricState + True + (metricAction promMetricGranularLabel) + (metricAction promMetricLabel)