mirror of
https://github.com/hasura/graphql-engine.git
synced 2024-12-15 01:12:56 +03:00
Fix latency buckets for telemetry data
These must have gotten messed up during a refactor. As a consequence, almost all samples received so far fall into a single erroneous bucket spanning 0 to 1K seconds (the cutoff was presumably meant to be 1ms). I also re-thought what the numbers should be, but they are still arbitrary and might need adjusting in the future.
This commit is contained in:
parent
03305bb788
commit
2eab6a89aa
@ -126,21 +126,22 @@ instance A.FromJSON Transport
|
||||
-- | The timings and counts here were from requests with total time at least
|
||||
-- 'bucketGreaterThan' (but less than any larger bucket's cutoff time).
|
||||
newtype RunningTimeBucket = RunningTimeBucket { bucketGreaterThan :: Seconds }
|
||||
deriving (Fractional, Num, Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
|
||||
deriving (Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
|
||||
|
||||
|
||||
-- NOTE: an HDR histogram is a nice way to collect metrics when you don't know
|
||||
-- a priori what the most useful binning is. It's not clear how we'd make use
|
||||
-- of that here though.
|
||||
-- of that here though. So these buckets are arbitrary, and can be adjusted as
|
||||
-- needed, but we shouldn't have more than a handful to keep payload size down.
|
||||
totalTimeBuckets :: [RunningTimeBucket]
|
||||
totalTimeBuckets = [0, 1000, 10*1000, 100*1000]
|
||||
totalTimeBuckets = coerce [0.000, 0.001, 0.050, 1.000, 3600.000 :: Seconds]
|
||||
|
||||
-- | Save a timing metric sample in our in-memory store. These will be
|
||||
-- accumulated and uploaded periodically in "Hasura.Server.Telemetry".
|
||||
recordTimingMetric :: MonadIO m=> RequestDimensions -> RequestTimings -> m ()
|
||||
recordTimingMetric reqDimensions RequestTimings{..} = liftIO $ do
|
||||
let ourBucket = fromMaybe 0 $ -- although we expect 'head' would be safe here
|
||||
listToMaybe $ dropWhile (> realToFrac telemTimeTot) $
|
||||
let ourBucket = fromMaybe (RunningTimeBucket 0) $ -- although we expect 'head' would be safe here
|
||||
listToMaybe $ dropWhile (> coerce telemTimeTot) $
|
||||
reverse $ sort totalTimeBuckets
|
||||
atomicModifyIORef' requestCounters $ (,()) .
|
||||
HM.insertWith (<>) (reqDimensions, ourBucket) RequestTimingsCount{telemCount = 1, ..}
|
||||
|
@ -23,24 +23,27 @@ telemetryCountersTests = do
|
||||
let expected =
|
||||
-- NOTE: ordering is arbitrary here (and hence fragile)
|
||||
[ServiceTimingMetric {
|
||||
dimensions = RequestDimensions Miss Mutation Local HTTP
|
||||
, bucket = RunningTimeBucket {bucketGreaterThan = 0.050}
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 1.050, telemCount = 2}},
|
||||
ServiceTimingMetric {
|
||||
dimensions = RequestDimensions Hit Mutation Local HTTP
|
||||
, bucket = RunningTimeBucket {bucketGreaterThan = 0}
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 200, telemCount = 2}},
|
||||
ServiceTimingMetric {
|
||||
dimensions = RequestDimensions Miss Mutation Local HTTP
|
||||
, bucket = RunningTimeBucket {bucketGreaterThan = 1000}
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 2002, telemCount = 2}},
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 0.001, telemCount = 2}},
|
||||
ServiceTimingMetric {
|
||||
dimensions = RequestDimensions Hit Query Remote WebSocket
|
||||
, bucket = RunningTimeBucket {bucketGreaterThan = 100000}
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 1, telemTimeTot = 100001, telemCount = 1}}]
|
||||
, bucket = RunningTimeBucket {bucketGreaterThan = 1.000}
|
||||
, metrics = RequestTimingsCount {telemTimeIO = 1, telemTimeTot = 5.000, telemCount = 1}}]
|
||||
|
||||
it "accumulates as expected" $ do
|
||||
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 100)
|
||||
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 100)
|
||||
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 1001)
|
||||
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 1001)
|
||||
recordTimingMetric (RequestDimensions Hit Query Remote WebSocket) (RequestTimings 1 100001)
|
||||
-- bucket 0sec - 1ms:
|
||||
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 0.0001)
|
||||
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 0.0009)
|
||||
-- bucket 50ms - 1 sec:
|
||||
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 0.0510)
|
||||
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 0.9990)
|
||||
-- bucket 1 sec - 1 hour:
|
||||
recordTimingMetric (RequestDimensions Hit Query Remote WebSocket) (RequestTimings 1 5.0000)
|
||||
fmap serviceTimingMetrics dumpServiceTimingMetrics `shouldReturn` expected
|
||||
|
||||
it "serializes and deserializes properly" $ do
|
||||
|
Loading…
Reference in New Issue
Block a user