{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE RecordWildCards #-}
{-|
Counters used in telemetry collection. Additional counters can be added here and
serviced in "Hasura.Server.Telemetry".
-}
module Hasura.Server.Telemetry.Counters
(
-- * Service timing and counts, by various dimensions
-- ** Local metric recording
recordTimingMetric
, RequestDimensions(..), RequestTimings(..)
-- *** Dimensions
, CacheHit(..), QueryType(..), Locality(..), Transport(..)
-- ** Metric upload
, dumpServiceTimingMetrics
, ServiceTimingMetrics(..)
, ServiceTimingMetric(..)
, RunningTimeBucket(..)
, RequestTimingsCount(..)
)
where
import qualified Data.Aeson as A
import qualified Data.Aeson.Casing as A
import qualified Data.Aeson.TH as A
import Data.Hashable
import qualified Data.HashMap.Strict as HM
import Data.IORef
import Data.Time.Clock.POSIX (POSIXTime, getPOSIXTime)
import GHC.IO.Unsafe (unsafePerformIO)
import Hasura.Prelude
-- | The properties that characterize this request. The dimensions over which
-- we collect metrics for each serviced request.
data RequestDimensions =
RequestDimensions {
telemCacheHit :: !CacheHit
, telemQueryType :: !QueryType
, telemLocality :: !Locality
, telemTransport :: !Transport
}
deriving (Show, Generic, Eq)
instance Hashable RequestDimensions
-- | Accumulated time metrics.
data RequestTimings =
RequestTimings {
telemTimeIO :: !Seconds
-- ^ Time spent waiting on PG/remote http calls
, telemTimeTot :: !Seconds
-- ^ Total service time for request (including 'telemTimeIO')
}
-- | Sum
instance Semigroup RequestTimings where
RequestTimings a b <> RequestTimings x y = RequestTimings (a+x) (b+y)
-- | 'RequestTimings' along with the count
data RequestTimingsCount =
RequestTimingsCount {
telemTimeIO :: !Seconds
, telemTimeTot :: !Seconds
, telemCount :: !Word
-- ^ The number of requests that have contributed to the accumulated timings above.
-- So e.g. @telemTimeTot / telemCount@ would give the mean total service time.
}
deriving (Show, Generic, Eq)
-- | Sum
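--
-- For example (illustrative values, not from real traffic):
--
-- > RequestTimingsCount 1 2 1 <> RequestTimingsCount 3 4 1
-- >   == RequestTimingsCount 4 6 2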
instance Semigroup RequestTimingsCount where
RequestTimingsCount a b c <> RequestTimingsCount x y z =
RequestTimingsCount (a+x) (b+y) (c+z)
-- | Internal. Counts and durations across many 'RequestDimensions'.
--
-- NOTE: We use the global mutable variable pattern for metric collection
-- counters for convenience at collection site (don't wear hairshirts that
-- discourage useful reporting).
requestCounters :: IORef (HM.HashMap (RequestDimensions, RunningTimeBucket) RequestTimingsCount)
{-# NOINLINE requestCounters #-}
requestCounters = unsafePerformIO $ newIORef HM.empty
-- | Internal. Since these metrics are accumulated while graphql-engine is
-- running and sent periodically, we need to include a tag that is unique for
-- each start of hge. This lets us e.g. query for just the latest uploaded
-- sample for each start of hge.
--
-- We use time rather than a UUID since having this be monotonic increasing is
-- convenient.
approxStartTime :: POSIXTime
{-# NOINLINE approxStartTime #-}
approxStartTime = unsafePerformIO getPOSIXTime
-- | Did this request hit the plan cache?
data CacheHit = Hit | Miss
deriving (Enum, Show, Eq, Generic)
instance Hashable CacheHit
instance A.ToJSON CacheHit
instance A.FromJSON CacheHit
-- | Was this request a mutation (involved DB writes)?
data QueryType = Mutation | Query
deriving (Enum, Show, Eq, Generic)
instance Hashable QueryType
instance A.ToJSON QueryType
instance A.FromJSON QueryType
-- | Was this a PG local query, or did it involve remote execution?
data Locality
= Empty -- ^ No data was fetched
| Local -- ^ local DB data
| Remote -- ^ remote schema
| Heterogeneous -- ^ mixed
deriving (Enum, Show, Eq, Generic)
instance Hashable Locality
instance A.ToJSON Locality
instance A.FromJSON Locality
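-- | How localities combine: 'Empty' is the identity, equal localities are
-- idempotent, and any two distinct non-empty localities collapse to
-- 'Heterogeneous'. For example:
--
-- > Local <> Remote == Heterogeneous
-- > Empty <> Local  == Local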
instance Semigroup Locality where
Empty <> x = x
x <> Empty = x
x <> y | x == y = x
_ <> _ = Heterogeneous
instance Monoid Locality where
mempty = Empty
-- | Was this a query over HTTP or WebSockets?
data Transport = HTTP | WebSocket
deriving (Enum, Show, Eq, Generic)
instance Hashable Transport
instance A.ToJSON Transport
instance A.FromJSON Transport
-- | The timings and counts here were from requests with total time longer than
-- 'bucketGreaterThan' (but less than any larger bucket cutoff times).
newtype RunningTimeBucket = RunningTimeBucket { bucketGreaterThan :: Seconds }
deriving (Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
-- NOTE: an HDR histogram is a nice way to collect metrics when you don't know
-- a priori what the most useful binning is. It's not clear how we'd make use
-- of that here though. So these buckets are arbitrary, and can be adjusted as
-- needed, but we shouldn't have more than a handful to keep payload size down.
totalTimeBuckets :: [RunningTimeBucket]
totalTimeBuckets = coerce [0.000, 0.001, 0.050, 1.000, 3600.000 :: Seconds]
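-- For example, with the buckets above a request whose total service time is
-- 0.030s is counted under @RunningTimeBucket 0.001@: it took longer than
-- 0.001s but not longer than 0.050s.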
-- | Save a timing metric sample in our in-memory store. These will be
-- accumulated and uploaded periodically in "Hasura.Server.Telemetry".
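--
-- A minimal call-site sketch (the dimension and timing values are
-- illustrative, not taken from real call sites):
--
-- > recordTimingMetric
-- >   (RequestDimensions Miss Query Local HTTP)
-- >   RequestTimings { telemTimeIO = 0.002, telemTimeTot = 0.010 }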
recordTimingMetric :: MonadIO m => RequestDimensions -> RequestTimings -> m ()
recordTimingMetric reqDimensions RequestTimings{..} = liftIO $ do
let ourBucket = fromMaybe (RunningTimeBucket 0) $ -- although we expect 'head' would be safe here
listToMaybe $ dropWhile (> coerce telemTimeTot) $
reverse $ sort totalTimeBuckets
atomicModifyIORef' requestCounters $ (,()) .
HM.insertWith (<>) (reqDimensions, ourBucket) RequestTimingsCount{telemCount = 1, ..}
-- | The final shape of this part of our metrics data JSON. This should allow
-- reasonably efficient querying using GIN indexes and JSONB containment
-- operations (which treat arrays as sets).
data ServiceTimingMetrics
= ServiceTimingMetrics
{ collectionTag :: Int
-- ^ This is set to a new unique value when the counters reset (e.g. because of a restart)
, serviceTimingMetrics :: [ServiceTimingMetric]
}
deriving (Show, Generic, Eq)
data ServiceTimingMetric
= ServiceTimingMetric
{ dimensions :: RequestDimensions
, bucket :: RunningTimeBucket
, metrics :: RequestTimingsCount
}
deriving (Show, Generic, Eq)
$(A.deriveJSON (A.aesonDrop 5 A.snakeCase) ''RequestTimingsCount)
$(A.deriveJSON (A.aesonDrop 5 A.snakeCase) ''RequestDimensions)
instance A.ToJSON ServiceTimingMetric
instance A.FromJSON ServiceTimingMetric
instance A.ToJSON ServiceTimingMetrics
instance A.FromJSON ServiceTimingMetrics
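-- | Read the counters accumulated so far into the shape we upload, tagged with
-- this process's approximate start time. A usage sketch (assuming @BL@ is
-- "Data.ByteString.Lazy.Char8", which this module does not import):
--
-- > do metrics <- dumpServiceTimingMetrics
-- >    BL.putStrLn (A.encode metrics)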
dumpServiceTimingMetrics :: MonadIO m => m ServiceTimingMetrics
dumpServiceTimingMetrics = liftIO $ do
cs <- readIORef requestCounters
let serviceTimingMetrics = flip map (HM.toList cs) $
\((dimensions, bucket), metrics)-> ServiceTimingMetric{..}
collectionTag = round approxStartTime
return ServiceTimingMetrics{..}