mirror of
https://github.com/hasura/graphql-engine.git
synced 2024-12-18 04:51:35 +03:00
194 lines
6.9 KiB
Haskell
194 lines
6.9 KiB
Haskell
{-# LANGUAGE DuplicateRecordFields #-}
|
|
{-# LANGUAGE RecordWildCards #-}
|
|
{-|
|
|
Counters used in telemetry collection. Additional counters can be added here.and
|
|
serviced in "Hasura.Server.Telemetry".
|
|
-}
|
|
|
|
module Hasura.Server.Telemetry.Counters
|
|
(
|
|
-- * Service timing and counts, by various dimensions
|
|
-- ** Local metric recording
|
|
recordTimingMetric
|
|
, RequestDimensions(..), RequestTimings(..)
|
|
-- *** Dimensions
|
|
, CacheHit(..), QueryType(..), Locality(..), Transport(..)
|
|
-- ** Metric upload
|
|
, dumpServiceTimingMetrics
|
|
, ServiceTimingMetrics(..)
|
|
, ServiceTimingMetric(..)
|
|
, RunningTimeBucket(..)
|
|
, RequestTimingsCount(..)
|
|
)
|
|
where
|
|
|
|
import qualified Data.Aeson as A
|
|
import qualified Data.Aeson.Casing as A
|
|
import qualified Data.Aeson.TH as A
|
|
import Data.Hashable
|
|
import qualified Data.HashMap.Strict as HM
|
|
import Data.IORef
|
|
import Data.Time.Clock.POSIX (POSIXTime, getPOSIXTime)
|
|
import GHC.IO.Unsafe (unsafePerformIO)
|
|
import Hasura.Prelude
|
|
|
|
-- | The properties that characterize this request. The dimensions over which
|
|
-- we collect metrics for each serviced request.
|
|
data RequestDimensions =
|
|
RequestDimensions {
|
|
telemCacheHit :: !CacheHit
|
|
, telemQueryType :: !QueryType
|
|
, telemLocality :: !Locality
|
|
, telemTransport :: !Transport
|
|
}
|
|
deriving (Show, Generic, Eq)
|
|
|
|
instance Hashable RequestDimensions
|
|
|
|
|
|
-- | Accumulated time metrics.
|
|
data RequestTimings =
|
|
RequestTimings {
|
|
telemTimeIO :: !Seconds
|
|
-- ^ Time spent waiting on PG/remote http calls
|
|
, telemTimeTot :: !Seconds
|
|
-- ^ Total service time for request (including 'telemTimeIO')
|
|
}
|
|
|
|
-- | Sum
|
|
instance Semigroup RequestTimings where
|
|
RequestTimings a b <> RequestTimings x y = RequestTimings (a+x) (b+y)
|
|
|
|
-- | 'RequestTimings' along with the count
|
|
data RequestTimingsCount =
|
|
RequestTimingsCount {
|
|
telemTimeIO :: !Seconds
|
|
, telemTimeTot :: !Seconds
|
|
, telemCount :: !Word
|
|
-- ^ The number of requests that have contributed to the accumulated timings above.
|
|
-- So e.g. @telemTimeTot / count@ would give the mean service time.
|
|
}
|
|
deriving (Show, Generic, Eq)
|
|
|
|
-- | Sum
|
|
instance Semigroup RequestTimingsCount where
|
|
RequestTimingsCount a b c <> RequestTimingsCount x y z =
|
|
RequestTimingsCount (a+x) (b+y) (c+z)
|
|
|
|
-- | Internal. Counts and durations across many 'RequestDimensions'.
|
|
--
|
|
-- NOTE: We use the global mutable variable pattern for metric collection
|
|
-- counters for convenience at collection site (don't wear hairshirts that
|
|
-- discourage useful reporting).
|
|
requestCounters :: IORef (HM.HashMap (RequestDimensions, RunningTimeBucket) RequestTimingsCount)
|
|
{-# NOINLINE requestCounters #-}
|
|
requestCounters = unsafePerformIO $ newIORef HM.empty
|
|
|
|
-- | Internal. Since these metrics are accumulated while graphql-engine is
|
|
-- running and sent periodically, we need to include a tag that is unique for
|
|
-- each start of hge. This lets us e.g. query for just the latest uploaded
|
|
-- sample for each start of hge.
|
|
--
|
|
-- We use time rather than a UUID since having this be monotonic increasing is
|
|
-- convenient.
|
|
approxStartTime :: POSIXTime
|
|
{-# NOINLINE approxStartTime #-}
|
|
approxStartTime = unsafePerformIO getPOSIXTime
|
|
|
|
-- | Did this request hit the plan cache?
|
|
data CacheHit = Hit | Miss
|
|
deriving (Enum, Show, Eq, Generic)
|
|
instance Hashable CacheHit
|
|
instance A.ToJSON CacheHit
|
|
instance A.FromJSON CacheHit
|
|
|
|
-- | Was this request a mutation (involved DB writes)?
|
|
data QueryType = Mutation | Query
|
|
deriving (Enum, Show, Eq, Generic)
|
|
instance Hashable QueryType
|
|
instance A.ToJSON QueryType
|
|
instance A.FromJSON QueryType
|
|
|
|
-- | Was this a PG local query, or did it involve remote execution?
|
|
data Locality
|
|
= Empty -- ^ No data was fetched
|
|
| Local -- ^ local DB data
|
|
| Remote -- ^ remote schema
|
|
| Heterogeneous -- ^ mixed
|
|
deriving (Enum, Show, Eq, Generic)
|
|
instance Hashable Locality
|
|
instance A.ToJSON Locality
|
|
instance A.FromJSON Locality
|
|
instance Semigroup Locality where
|
|
Empty <> x = x
|
|
x <> Empty = x
|
|
x <> y | x == y = x
|
|
_ <> _ = Heterogeneous
|
|
instance Monoid Locality where
|
|
mempty = Empty
|
|
|
|
-- | Was this a query over http or websockets?
|
|
data Transport = HTTP | WebSocket
|
|
deriving (Enum, Show, Eq, Generic)
|
|
instance Hashable Transport
|
|
instance A.ToJSON Transport
|
|
instance A.FromJSON Transport
|
|
|
|
-- | The timings and counts here were from requests with total time longer than
|
|
-- 'bucketGreaterThan' (but less than any larger bucket cutoff times).
|
|
newtype RunningTimeBucket = RunningTimeBucket { bucketGreaterThan :: Seconds }
|
|
deriving (Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
|
|
|
|
|
|
-- NOTE: an HDR histogram is a nice way to collect metrics when you don't know
|
|
-- a priori what the most useful binning is. It's not clear how we'd make use
|
|
-- of that here though. So these buckets are arbitrary, and can be adjusted as
|
|
-- needed, but we shouldn't have more than a handful to keep payload size down.
|
|
totalTimeBuckets :: [RunningTimeBucket]
|
|
totalTimeBuckets = coerce [0.000, 0.001, 0.050, 1.000, 3600.000 :: Seconds]
|
|
|
|
-- | Save a timing metric sample in our in-memory store. These will be
|
|
-- accumulated and uploaded periodically in "Hasura.Server.Telemetry".
|
|
recordTimingMetric :: MonadIO m => RequestDimensions -> RequestTimings -> m ()
|
|
recordTimingMetric reqDimensions RequestTimings{..} = liftIO $ do
|
|
let ourBucket = fromMaybe (RunningTimeBucket 0) $ -- although we expect 'head' would be safe here
|
|
listToMaybe $ dropWhile (> coerce telemTimeTot) $
|
|
reverse $ sort totalTimeBuckets
|
|
atomicModifyIORef' requestCounters $ (,()) .
|
|
HM.insertWith (<>) (reqDimensions, ourBucket) RequestTimingsCount{telemCount = 1, ..}
|
|
|
|
-- | The final shape of this part of our metrics data JSON. This should allow
|
|
-- reasonably efficient querying using GIN indexes and JSONB containment
|
|
-- operations (which treat arrays as sets).
|
|
data ServiceTimingMetrics
|
|
= ServiceTimingMetrics
|
|
{ collectionTag :: Int
|
|
-- ^ This is set to a new unique value when the counters reset (e.g. because of a restart)
|
|
, serviceTimingMetrics :: [ServiceTimingMetric]
|
|
}
|
|
deriving (Show, Generic, Eq)
|
|
data ServiceTimingMetric
|
|
= ServiceTimingMetric
|
|
{ dimensions :: RequestDimensions
|
|
, bucket :: RunningTimeBucket
|
|
, metrics :: RequestTimingsCount
|
|
}
|
|
deriving (Show, Generic, Eq)
|
|
|
|
|
|
$(A.deriveJSON (A.aesonDrop 5 A.snakeCase) ''RequestTimingsCount)
|
|
$(A.deriveJSON (A.aesonDrop 5 A.snakeCase) ''RequestDimensions)
|
|
|
|
instance A.ToJSON ServiceTimingMetric
|
|
instance A.FromJSON ServiceTimingMetric
|
|
instance A.ToJSON ServiceTimingMetrics
|
|
instance A.FromJSON ServiceTimingMetrics
|
|
|
|
dumpServiceTimingMetrics :: MonadIO m => m ServiceTimingMetrics
|
|
dumpServiceTimingMetrics = liftIO $ do
|
|
cs <- readIORef requestCounters
|
|
let serviceTimingMetrics = flip map (HM.toList cs) $
|
|
\((dimensions, bucket), metrics)-> ServiceTimingMetric{..}
|
|
collectionTag = round approxStartTime
|
|
return ServiceTimingMetrics{..}
|