Fix latency buckets for telemetry data

These must have gotten messed up during a refactor. As a consequence,
almost all samples received so far fall into a single erroneous bucket
spanning 0 to 1K seconds (the cutoff was originally supposed to be 1ms?).

I also re-thought what the numbers should be, but these are still
arbitrary and might want adjusting in the future.
This commit is contained in:
Brandon Simmons 2020-07-20 22:28:12 -04:00
parent 03305bb788
commit 2eab6a89aa
2 changed files with 21 additions and 17 deletions

View File

@@ -126,21 +126,22 @@ instance A.FromJSON Transport
-- | The timings and counts here were from requests with total time of at least
-- 'bucketGreaterThan' (but less than any larger bucket cutoff times).
newtype RunningTimeBucket = RunningTimeBucket { bucketGreaterThan :: Seconds }
deriving (Fractional, Num, Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
deriving (Ord, Eq, Show, Generic, A.ToJSON, A.FromJSON, Hashable)
-- NOTE: an HDR histogram is a nice way to collect metrics when you don't know
-- a priori what the most useful binning is. It's not clear how we'd make use
-- of that here though.
-- of that here though. So these buckets are arbitrary, and can be adjusted as
-- needed, but we shouldn't have more than a handful to keep payload size down.
totalTimeBuckets :: [RunningTimeBucket]
totalTimeBuckets = [0, 1000, 10*1000, 100*1000]
totalTimeBuckets = coerce [0.000, 0.001, 0.050, 1.000, 3600.000 :: Seconds]
-- | Save a timing metric sample in our in-memory store. These will be
-- accumulated and uploaded periodically in "Hasura.Server.Telemetry".
recordTimingMetric :: MonadIO m=> RequestDimensions -> RequestTimings -> m ()
recordTimingMetric reqDimensions RequestTimings{..} = liftIO $ do
let ourBucket = fromMaybe 0 $ -- although we expect 'head' would be safe here
listToMaybe $ dropWhile (> realToFrac telemTimeTot) $
let ourBucket = fromMaybe (RunningTimeBucket 0) $ -- although we expect 'head' would be safe here
listToMaybe $ dropWhile (> coerce telemTimeTot) $
reverse $ sort totalTimeBuckets
atomicModifyIORef' requestCounters $ (,()) .
HM.insertWith (<>) (reqDimensions, ourBucket) RequestTimingsCount{telemCount = 1, ..}

View File

@@ -23,24 +23,27 @@ telemetryCountersTests = do
let expected =
-- NOTE: ordering is arbitrary here (and hence fragile)
[ServiceTimingMetric {
dimensions = RequestDimensions Miss Mutation Local HTTP
, bucket = RunningTimeBucket {bucketGreaterThan = 0.050}
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 1.050, telemCount = 2}},
ServiceTimingMetric {
dimensions = RequestDimensions Hit Mutation Local HTTP
, bucket = RunningTimeBucket {bucketGreaterThan = 0}
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 200, telemCount = 2}},
ServiceTimingMetric {
dimensions = RequestDimensions Miss Mutation Local HTTP
, bucket = RunningTimeBucket {bucketGreaterThan = 1000}
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 2002, telemCount = 2}},
, metrics = RequestTimingsCount {telemTimeIO = 2, telemTimeTot = 0.001, telemCount = 2}},
ServiceTimingMetric {
dimensions = RequestDimensions Hit Query Remote WebSocket
, bucket = RunningTimeBucket {bucketGreaterThan = 100000}
, metrics = RequestTimingsCount {telemTimeIO = 1, telemTimeTot = 100001, telemCount = 1}}]
, bucket = RunningTimeBucket {bucketGreaterThan = 1.000}
, metrics = RequestTimingsCount {telemTimeIO = 1, telemTimeTot = 5.000, telemCount = 1}}]
it "accumulates as expected" $ do
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 100)
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 100)
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 1001)
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 1001)
recordTimingMetric (RequestDimensions Hit Query Remote WebSocket) (RequestTimings 1 100001)
-- bucket 0sec - 1ms:
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 0.0001)
recordTimingMetric (RequestDimensions Hit Mutation Local HTTP) (RequestTimings 1 0.0009)
-- bucket 50ms - 1 sec:
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 0.0510)
recordTimingMetric (RequestDimensions Miss Mutation Local HTTP) (RequestTimings 1 0.9990)
-- bucket 1 sec - 1 hour:
recordTimingMetric (RequestDimensions Hit Query Remote WebSocket) (RequestTimings 1 5.0000)
fmap serviceTimingMetrics dumpServiceTimingMetrics `shouldReturn` expected
it "serializes and deserializes properly" $ do