server: extend livequery-poller-log to log errors

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/9708
GitOrigin-RevId: ba2075f5f62bd56805fde0a8f02803b05105d775
parent 1d8d934157
commit c68f6c7ba1
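In short, the change adds two fields to PollDetails, `_pdLogLevel` and `_pdErrors`, and emits the livequery-poller-log at error level whenever the poll collected per-batch errors. A minimal sketch of that level selection, using simplified stand-in types rather than Hasura's (`levelFor` is a hypothetical name, not a function in the codebase):

-- Stand-in for Hasura.Logging's log levels (illustrative only).
data LogLevel = LevelInfo | LevelError deriving (Show)

-- Mirrors the `case maybePollDetailsErrors of` selection in the diff below:
-- no collected errors means an info-level log, otherwise error level.
levelFor :: Maybe [e] -> LogLevel
levelFor Nothing = LevelInfo
levelFor (Just _) = LevelError

main :: IO ()
main = do
  print (levelFor (Nothing :: Maybe [String])) -- LevelInfo
  print (levelFor (Just ["query execution failed"])) -- LevelError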
@@ -8,6 +8,7 @@ module Hasura.GraphQL.Execute.Subscription.Poll.Common
     BackendPollerKey (..),
     PollerMap,
     dumpPollerMap,
+    PollDetailsError (..),
     PollDetails (..),
     BatchExecutionDetails (..),
     CohortExecutionDetails (..),
@@ -53,9 +54,11 @@ import Control.Immortal qualified as Immortal
 import Crypto.Hash qualified as CH
 import Data.Aeson qualified as J
 import Data.ByteString qualified as BS
+import Data.Text (unpack)
 import Data.Time.Clock qualified as Clock
 import Data.UUID qualified as UUID
 import Data.UUID.V4 qualified as UUID
+import Hasura.Base.Error (QErr, showQErr)
 import Hasura.GraphQL.Execute.Subscription.Options
 import Hasura.GraphQL.Execute.Subscription.Plan
 import Hasura.GraphQL.Execute.Subscription.TMap qualified as TMap
@@ -384,6 +387,22 @@ batchExecutionDetailMinimal BatchExecutionDetails {..} =
         <> batchRespSize
     )

+data PollDetailsError = PollDetailsError
+  { _pdeBatchId :: BatchId,
+    _pdeErrorDetails :: QErr
+  }
+  deriving (Eq)
+
+instance Show PollDetailsError where
+  show pde = "batch_id = " ++ show (_pdeBatchId pde) ++ ", detail = " ++ unpack (showQErr $ _pdeErrorDetails pde)
+
+instance J.ToJSON PollDetailsError where
+  toJSON PollDetailsError {..} =
+    J.object
+      $ [ "batch_id" J..= _pdeBatchId,
+          "detail" J..= _pdeErrorDetails
+        ]
+
 -- TODO consider refactoring into two types: one that is returned from pollLiveQuery and pollStreamingQuery, and a parent type containing pollerId, sourceName, and so on, which is assembled at the callsites of those two functions. Move postPollHook out of those functions to callsites
 data PollDetails = PollDetails
   { -- | the unique ID (basically a thread that run as a 'Poller') for the
@@ -405,7 +424,9 @@ data PollDetails = PollDetails
     _pdLiveQueryOptions :: SubscriptionsOptions,
     _pdSource :: SourceName,
     _pdRole :: RoleName,
-    _pdParameterizedQueryHash :: ParameterizedQueryHash
+    _pdParameterizedQueryHash :: ParameterizedQueryHash,
+    _pdLogLevel :: L.LogLevel,
+    _pdErrors :: Maybe [PollDetailsError]
   }
   deriving (Show, Eq)

@@ -423,23 +444,24 @@ they need to.
 pollDetailMinimal :: PollDetails -> J.Value
 pollDetailMinimal PollDetails {..} =
   J.object
-    [ "poller_id" J..= _pdPollerId,
-      "kind" J..= _pdKind,
-      "snapshot_time" J..= _pdSnapshotTime,
-      "batches" J..= batches, -- TODO: deprecate this field
-      "execution_batches" J..= batches,
-      "subscriber_count" J..= sum (map (length . _bedCohorts) _pdBatches),
-      "total_time" J..= _pdTotalTime,
-      "source" J..= _pdSource,
-      "generated_sql" J..= _pdGeneratedSql,
-      "role" J..= _pdRole,
-      "subscription_options" J..= _pdLiveQueryOptions
-    ]
+    $ [ "poller_id" J..= _pdPollerId,
+        "kind" J..= _pdKind,
+        "snapshot_time" J..= _pdSnapshotTime,
+        "batches" J..= batches, -- TODO: deprecate this field
+        "execution_batches" J..= batches,
+        "subscriber_count" J..= sum (map (length . _bedCohorts) _pdBatches),
+        "total_time" J..= _pdTotalTime,
+        "source" J..= _pdSource,
+        "generated_sql" J..= _pdGeneratedSql,
+        "role" J..= _pdRole,
+        "subscription_options" J..= _pdLiveQueryOptions
+      ]
+    <> maybe [] (\err -> ["errors" J..= err]) _pdErrors
   where
     batches = map batchExecutionDetailMinimal _pdBatches

 instance L.ToEngineLog PollDetails L.Hasura where
-  toEngineLog pl = (L.LevelInfo, L.ELTLivequeryPollerLog, pollDetailMinimal pl)
+  toEngineLog pl = (_pdLogLevel pl, L.ELTLivequeryPollerLog, pollDetailMinimal pl)

 type SubscriptionPostPollHook = PollDetails -> IO ()

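For reference, a self-contained sketch of the JSON shape the new PollDetailsError encoding produces. FakeQErr and its payload are hypothetical stand-ins for Hasura's QErr, and BatchId is assumed here to wrap an Int, so this sketch runs without the Hasura codebase:

{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

import Data.Aeson qualified as J
import Data.ByteString.Lazy.Char8 qualified as BL

-- Hypothetical stand-in for Hasura's QErr; the real type carries
-- path, code, and internal-error details.
newtype FakeQErr = FakeQErr String

instance J.ToJSON FakeQErr where
  toJSON (FakeQErr msg) = J.object ["error" J..= msg]

-- Mirrors the PollDetailsError added in the hunk above.
data PollDetailsError = PollDetailsError
  { _pdeBatchId :: Int,
    _pdeErrorDetails :: FakeQErr
  }

instance J.ToJSON PollDetailsError where
  toJSON PollDetailsError {..} =
    J.object
      [ "batch_id" J..= _pdeBatchId,
        "detail" J..= _pdeErrorDetails
      ]

main :: IO ()
main =
  -- prints (modulo key order): {"batch_id":1,"detail":{"error":"connection refused"}}
  BL.putStrLn $ J.encode $ PollDetailsError 1 (FakeQErr "connection refused")

When `_pdErrors` is populated, pollDetailMinimal appends these objects under an "errors" key via the `<> maybe [] ...` clause above; otherwise the key is omitted entirely.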
@@ -28,6 +28,7 @@ import Hasura.GraphQL.Execute.Subscription.Types
 import Hasura.GraphQL.ParameterizedQueryHash (ParameterizedQueryHash)
 import Hasura.GraphQL.Transport.Backend
 import Hasura.GraphQL.Transport.HTTP.Protocol
+import Hasura.Logging (LogLevel (..))
 import Hasura.Prelude
 import Hasura.RQL.Types.Backend
 import Hasura.RQL.Types.BackendTag (backendTag, reify)
@@ -97,7 +98,7 @@ pollLiveQuery ::
   IO ()
 pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) roleName parameterizedQueryHash query cohortMap postPollHook prometheusMetrics granularPrometheusMetricsState operationNamesMap' resolvedConnectionTemplate modifier = do
   operationNamesMap <- STM.atomically $ TMap.getMap operationNamesMap'
-  (totalTime, (snapshotTime, batchesDetails)) <- withElapsedTime $ do
+  (totalTime, (snapshotTime, (batchesDetails, maybeErrors))) <- withElapsedTime $ do
     -- snapshot the current cohorts and split them into batches
     (snapshotTime, cohortBatches) <- withElapsedTime $ do
       -- get a snapshot of all the cohorts
@@ -110,7 +111,7 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol
       pure $ zip (BatchId <$> [1 ..]) cohortBatches

     -- concurrently process each batch
-    batchesDetails <- A.forConcurrently cohortBatches $ \(batchId, cohorts) -> do
+    batchesDetailsWithMaybeError <- A.forConcurrently cohortBatches $ \(batchId, cohorts) -> do
       (queryExecutionTime, mxRes) <- runDBSubscription @b sourceConfig query (over (each . _2) C._csVariables cohorts) resolvedConnectionTemplate

       let dbExecTimeMetric = submDBExecTotalTime $ pmSubscriptionMetrics $ prometheusMetrics
@@ -124,15 +125,22 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol

       previousPollerResponseState <- STM.readTVarIO pollerResponseState

-      case mxRes of
-        Left _ -> do
+      maybeError <- case mxRes of
+        Left err -> do
           when (previousPollerResponseState == PRSSuccess) $ do
             Prometheus.Gauge.inc $ submActiveLiveQueryPollersInError $ pmSubscriptionMetrics prometheusMetrics
             STM.atomically $ STM.writeTVar pollerResponseState PRSError
+          let pollDetailsError =
+                PollDetailsError
+                  { _pdeBatchId = batchId,
+                    _pdeErrorDetails = err
+                  }
+          return $ Just pollDetailsError
         Right _ -> do
           when (previousPollerResponseState == PRSError) $ do
             Prometheus.Gauge.dec $ submActiveLiveQueryPollersInError $ pmSubscriptionMetrics prometheusMetrics
             STM.atomically $ STM.writeTVar pollerResponseState PRSSuccess
+          return Nothing

       let lqMeta = SubscriptionMetadata $ convertDuration queryExecutionTime
           operations = getCohortOperations cohorts mxRes
@@ -162,17 +170,17 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol
       let pgExecutionTime = case reify (backendTag @b) of
             Postgres Vanilla -> Just queryExecutionTime
             _ -> Nothing
-      pure
-        $ BatchExecutionDetails
-          pgExecutionTime
-          queryExecutionTime
-          pushTime
-          batchId
-          cohortsExecutionDetails
-          batchResponseSize
-    pure (snapshotTime, batchesDetails)
-
-  let pollDetails =
+          batchExecDetails =
+            BatchExecutionDetails
+              pgExecutionTime
+              queryExecutionTime
+              pushTime
+              batchId
+              cohortsExecutionDetails
+              batchResponseSize
+      pure $ (batchExecDetails, maybeError)
+    pure (snapshotTime, unzip batchesDetailsWithMaybeError)
+  let initPollDetails =
         PollDetails
           { _pdPollerId = pollerId,
             _pdKind = LiveQuery,
@@ -183,8 +191,18 @@ pollLiveQuery pollerId pollerResponseState lqOpts (sourceName, sourceConfig) rol
           _pdTotalTime = totalTime,
           _pdSource = sourceName,
           _pdRole = roleName,
-          _pdParameterizedQueryHash = parameterizedQueryHash
+          _pdParameterizedQueryHash = parameterizedQueryHash,
+          _pdLogLevel = LevelInfo,
+          _pdErrors = Nothing
         }
+      maybePollDetailsErrors = sequenceA maybeErrors
+      pollDetails = case maybePollDetailsErrors of
+        Nothing -> initPollDetails
+        Just pollDetailsErrors ->
+          initPollDetails
+            { _pdLogLevel = LevelError,
+              _pdErrors = Just pollDetailsErrors
+            }
   postPollHook pollDetails
   let totalTimeMetric = submTotalTime $ pmSubscriptionMetrics $ prometheusMetrics
   recordSubcriptionMetric
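Two standard combinators do the bookkeeping in the hunks above: unzip splits the list of (BatchExecutionDetails, Maybe PollDetailsError) pairs that forConcurrently returns into a pair of lists, and sequenceA collapses [Maybe a] into Maybe [a], which is Just only when every element is Just. As written, then, _pdErrors is populated (and the level raised to LevelError) only on polls where every batch reported an error. A small self-contained sketch of that behaviour, with illustrative values:

main :: IO ()
main = do
  -- shape of what forConcurrently returns: one (details, maybe-error) pair per batch
  let perBatch = [("batch-1 details", Just "boom"), ("batch-2 details", Nothing)]
      (details, maybeErrors) = unzip perBatch
  print details -- ["batch-1 details","batch-2 details"]
  print (sequenceA maybeErrors) -- Nothing, because one batch succeeded
  print (sequenceA [Just "boom", Just "crash"]) -- Just ["boom","crash"]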
@@ -30,6 +30,7 @@ import Hasura.GraphQL.Execute.Subscription.Types
 import Hasura.GraphQL.ParameterizedQueryHash (ParameterizedQueryHash)
 import Hasura.GraphQL.Transport.Backend
 import Hasura.GraphQL.Transport.HTTP.Protocol
+import Hasura.Logging (LogLevel (..))
 import Hasura.Prelude
 import Hasura.RQL.Types.Backend
 import Hasura.RQL.Types.BackendTag (backendTag, reify)
@@ -259,7 +260,7 @@ pollStreamingQuery ::
   IO ()
 pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName, sourceConfig) roleName parameterizedQueryHash query cohortMap rootFieldName postPollHook testActionMaybe prometheusMetrics granularPrometheusMetricsState operationNames' resolvedConnectionTemplate modifier = do
   operationNames <- STM.atomically $ TMap.getMap operationNames'
-  (totalTime, (snapshotTime, batchesDetailsAndProcessedCohorts)) <- withElapsedTime $ do
+  (totalTime, (snapshotTime, (batchesDetails, processedCohorts, maybeErrors))) <- withElapsedTime $ do
     -- snapshot the current cohorts and split them into batches
     -- This STM transaction is a read only transaction i.e. it doesn't mutate any state
     (snapshotTime, cohortBatches) <- withElapsedTime $ do
@@ -275,7 +276,7 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName,
     for_ testActionMaybe id -- IO action intended to run after the cohorts have been snapshotted

     -- concurrently process each batch and also get the processed cohort with the new updated cohort key
-    batchesDetailsAndProcessedCohorts <- A.forConcurrently cohortBatches $ \(batchId, cohorts) -> do
+    batchesDetailsAndProcessedCohortsWithMaybeError <- A.forConcurrently cohortBatches $ \(batchId, cohorts) -> do
       (queryExecutionTime, mxRes) <-
         runDBStreamingSubscription @b
           sourceConfig
@@ -293,15 +294,22 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName,

       previousPollerResponseState <- STM.readTVarIO pollerResponseState

-      case mxRes of
-        Left _ -> do
+      maybeError <- case mxRes of
+        Left err -> do
           when (previousPollerResponseState == PRSSuccess) $ do
             Prometheus.Gauge.inc $ submActiveStreamingPollersInError $ pmSubscriptionMetrics prometheusMetrics
             STM.atomically $ STM.writeTVar pollerResponseState PRSError
+          let pollDetailsError =
+                PollDetailsError
+                  { _pdeBatchId = batchId,
+                    _pdeErrorDetails = err
+                  }
+          return $ Just pollDetailsError
         Right _ -> do
           when (previousPollerResponseState == PRSError) $ do
             Prometheus.Gauge.dec $ submActiveStreamingPollersInError $ pmSubscriptionMetrics prometheusMetrics
             STM.atomically $ STM.writeTVar pollerResponseState PRSSuccess
+          return Nothing

       let subscriptionMeta = SubscriptionMetadata $ convertDuration queryExecutionTime
           operations = getCohortOperations cohorts mxRes
@@ -351,23 +359,33 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName,
               batchId
               (fst <$> cohortsExecutionDetails)
               batchResponseSize
-      pure $ (batchExecDetails, processedCohortBatch)
+      pure $ (batchExecDetails, processedCohortBatch, maybeError)

-    pure (snapshotTime, batchesDetailsAndProcessedCohorts)
+    pure (snapshotTime, unzip3 batchesDetailsAndProcessedCohortsWithMaybeError)

-  let pollDetails =
+  let initPollDetails =
         PollDetails
           { _pdPollerId = pollerId,
             _pdKind = Streaming,
             _pdGeneratedSql = toTxt query,
             _pdSnapshotTime = snapshotTime,
-            _pdBatches = fst <$> batchesDetailsAndProcessedCohorts,
+            _pdBatches = batchesDetails,
             _pdLiveQueryOptions = streamingQueryOpts,
             _pdTotalTime = totalTime,
             _pdSource = sourceName,
             _pdRole = roleName,
-            _pdParameterizedQueryHash = parameterizedQueryHash
+            _pdParameterizedQueryHash = parameterizedQueryHash,
+            _pdLogLevel = LevelInfo,
+            _pdErrors = Nothing
           }
+      maybePollDetailsErrors = sequenceA maybeErrors
+      pollDetails = case maybePollDetailsErrors of
+        Nothing -> initPollDetails
+        Just pollDetailsError ->
+          initPollDetails
+            { _pdLogLevel = LevelError,
+              _pdErrors = Just pollDetailsError
+            }

   STM.atomically $ do
     -- constructing a cohort map for all the cohorts that have been
@@ -375,7 +393,7 @@ pollStreamingQuery pollerId pollerResponseState streamingQueryOpts (sourceName,

     -- processed cohorts is an array of tuples of the current poll cohort variables and a tuple
     -- of the cohort and the new cohort key
-    let processedCohortsMap = HashMap.fromList $ snd =<< batchesDetailsAndProcessedCohorts
+    let processedCohortsMap = HashMap.fromList $ concat processedCohorts

     -- rebuilding the cohorts and the cohort map, see [Streaming subscription polling]
     -- and [Streaming subscriptions rebuilding cohort map]
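Finally, the last hunk swaps `snd =<< batchesDetailsAndProcessedCohorts` for `concat processedCohorts`. The two spellings flatten the same per-batch lists: for lists, (=<<) is concatMap, and after unzip3 the middle components of the triples are exactly what snd used to project out of the old pairs. A sketch with illustrative values:

main :: IO ()
main = do
  -- old shape: each batch yielded a (details, processedCohortBatch) pair,
  -- and list bind flattened the snd components
  let oldShape = [("d1", ["c1", "c2"]), ("d2", ["c3"])]
  print (snd =<< oldShape) -- ["c1","c2","c3"]

  -- new shape: each batch yields a triple; unzip3 splits it and concat
  -- performs the same flattening on the middle component
  let newShape = [("d1", ["c1", "c2"], Nothing), ("d2", ["c3"], Just "err")]
      (_details, processedCohorts, _maybeErrors) = unzip3 newShape
  print (concat processedCohorts) -- ["c1","c2","c3"]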