graphql-engine/server/src-lib/Hasura/Server/Telemetry.hs
Brandon Simmons 58ef316118 Add request timings and count histograms to telemetry. Closes #3552
We upload a set of accumulating timers and counters to track service
time for different types of operations, across several dimensions (e.g.
did we hit the plan cache, was a remote involved, etc.)

Also...

Standardize on DiffTime as a standard duration type, and try to use it
consistently.

See discussion here:
https://github.com/hasura/graphql-engine/pull/3584#pullrequestreview-340679369

It should be possible to overwrite that module so the new threadDelay
sticks per the pattern in #3705 blocked on #3558

Rename the Control.Concurrent.Extended.threadDelay to `sleep` since a
naive use with a literal argument would be very bad!

We catch a bug in 'computeTimeDiff'.

Add convenient 'Read' instances to the time unit utility types. Make
'Second' a newtype to support this.
2020-02-03 18:50:10 -06:00

222 lines
7.5 KiB
Haskell

{-# LANGUAGE RecordWildCards #-}
{-|
Send anonymized metrics to the telemetry server regarding usage of various
features of Hasura.
-}
module Hasura.Server.Telemetry
( runTelemetry
, mkTelemetryLog
)
where
import Control.Exception (try)
import Control.Lens
import Data.List
import Data.Text.Conversions (UTF8 (..), decodeText)
import Hasura.HTTP
import Hasura.Logging
import Hasura.Prelude
import Hasura.RQL.Types
import Hasura.Server.Init
import Hasura.Server.Telemetry.Counters
import Hasura.Server.Version
import qualified CI
import qualified Control.Concurrent.Extended as C
import qualified Data.Aeson as A
import qualified Data.Aeson.Casing as A
import qualified Data.Aeson.TH as A
import qualified Data.ByteString.Lazy as BL
import qualified Data.HashMap.Strict as Map
import qualified Data.Text as T
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Types as HTTP
import qualified Network.Wreq as Wreq
data RelationshipMetric
= RelationshipMetric
{ _rmManual :: !Int
, _rmAuto :: !Int
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''RelationshipMetric)
data PermissionMetric
= PermissionMetric
{ _pmSelect :: !Int
, _pmInsert :: !Int
, _pmUpdate :: !Int
, _pmDelete :: !Int
, _pmRoles :: !Int
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''PermissionMetric)
data Metrics
= Metrics
{ _mtTables :: !Int
, _mtViews :: !Int
, _mtEnumTables :: !Int
, _mtRelationships :: !RelationshipMetric
, _mtPermissions :: !PermissionMetric
, _mtEventTriggers :: !Int
, _mtRemoteSchemas :: !Int
, _mtFunctions :: !Int
, _mtServiceTimings :: !ServiceTimingMetrics
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''Metrics)
data HasuraTelemetry
= HasuraTelemetry
{ _htDbUid :: !Text
, _htInstanceUid :: !InstanceId
, _htVersion :: !Version
, _htCi :: !(Maybe CI.CI)
, _htMetrics :: !Metrics
} deriving (Show)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''HasuraTelemetry)
data TelemetryPayload
= TelemetryPayload
{ _tpTopic :: !Text
, _tpData :: !HasuraTelemetry
} deriving (Show)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''TelemetryPayload)
telemetryUrl :: Text
telemetryUrl = "https://telemetry.hasura.io/v1/http"
mkPayload :: Text -> InstanceId -> Version -> Metrics -> IO TelemetryPayload
mkPayload dbId instanceId version metrics = do
ci <- CI.getCI
let topic = case version of
VersionDev _ -> "server_test"
VersionRelease _ -> "server"
pure $ TelemetryPayload topic $ HasuraTelemetry dbId instanceId version ci metrics
-- | An infinite loop that sends updated telemetry data ('Metrics') every 24
-- hours. The send time depends on when the server was started and will
-- naturally drift.
runTelemetry
:: (HasVersion)
=> Logger Hasura
-> HTTP.Manager
-> IO SchemaCache
-- ^ an action that always returns the latest schema cache
-> Text
-> InstanceId
-> IO ()
runTelemetry (Logger logger) manager getSchemaCache dbId instanceId = do
let options = wreqOptions manager []
forever $ do
schemaCache <- getSchemaCache
serviceTimings <- dumpServiceTimingMetrics
let metrics = computeMetrics schemaCache serviceTimings
payload <- A.encode <$> mkPayload dbId instanceId currentVersion metrics
logger $ debugLBS $ "metrics_info: " <> payload
resp <- try $ Wreq.postWith options (T.unpack telemetryUrl) payload
either logHttpEx handleHttpResp resp
C.sleep $ days 1
where
logHttpEx :: HTTP.HttpException -> IO ()
logHttpEx ex = do
let httpErr = Just $ mkHttpError telemetryUrl Nothing (Just $ HttpException ex)
logger $ mkTelemetryLog "http_exception" "http exception occurred" httpErr
handleHttpResp resp = do
let statusCode = resp ^. Wreq.responseStatus . Wreq.statusCode
logger $ debugLBS $ "http_success: " <> resp ^. Wreq.responseBody
when (statusCode /= 200) $ do
let httpErr = Just $ mkHttpError telemetryUrl (Just resp) Nothing
logger $ mkTelemetryLog "http_error" "failed to post telemetry" httpErr
computeMetrics :: SchemaCache -> ServiceTimingMetrics -> Metrics
computeMetrics sc _mtServiceTimings =
let _mtTables = countUserTables (isNothing . _tciViewInfo . _tiCoreInfo)
_mtViews = countUserTables (isJust . _tciViewInfo . _tiCoreInfo)
_mtEnumTables = countUserTables (isJust . _tciEnumValues . _tiCoreInfo)
allRels = join $ Map.elems $ Map.map (getRels . _tciFieldInfoMap . _tiCoreInfo) userTables
(manualRels, autoRels) = partition riIsManual allRels
_mtRelationships = RelationshipMetric (length manualRels) (length autoRels)
rolePerms = join $ Map.elems $ Map.map permsOfTbl userTables
_pmRoles = length $ nub $ fst <$> rolePerms
allPerms = snd <$> rolePerms
_pmInsert = calcPerms _permIns allPerms
_pmSelect = calcPerms _permSel allPerms
_pmUpdate = calcPerms _permUpd allPerms
_pmDelete = calcPerms _permDel allPerms
_mtPermissions =
PermissionMetric{..}
_mtEventTriggers = Map.size $ Map.filter (not . Map.null)
$ Map.map _tiEventTriggerInfoMap userTables
_mtRemoteSchemas = Map.size $ scRemoteSchemas sc
_mtFunctions = Map.size $ Map.filter (not . isSystemDefined . fiSystemDefined) $ scFunctions sc
in Metrics{..}
where
userTables = Map.filter (not . isSystemDefined . _tciSystemDefined . _tiCoreInfo) $ scTables sc
countUserTables predicate = length . filter predicate $ Map.elems userTables
calcPerms :: (RolePermInfo -> Maybe a) -> [RolePermInfo] -> Int
calcPerms fn perms = length $ catMaybes $ map fn perms
permsOfTbl :: TableInfo -> [(RoleName, RolePermInfo)]
permsOfTbl = Map.toList . _tiRolePermInfoMap
-- | Logging related
data TelemetryLog
= TelemetryLog
{ _tlLogLevel :: !LogLevel
, _tlType :: !Text
, _tlMessage :: !Text
, _tlHttpError :: !(Maybe TelemetryHttpError)
} deriving (Show)
data TelemetryHttpError
= TelemetryHttpError
{ tlheStatus :: !(Maybe HTTP.Status)
, tlheUrl :: !T.Text
, tlheHttpException :: !(Maybe HttpException)
, tlheResponse :: !(Maybe T.Text)
} deriving (Show)
instance A.ToJSON TelemetryLog where
toJSON tl =
A.object [ "type" A..= _tlType tl
, "message" A..= _tlMessage tl
, "http_error" A..= (A.toJSON <$> _tlHttpError tl)
]
instance A.ToJSON TelemetryHttpError where
toJSON tlhe =
A.object [ "status_code" A..= (HTTP.statusCode <$> tlheStatus tlhe)
, "url" A..= tlheUrl tlhe
, "response" A..= tlheResponse tlhe
, "http_exception" A..= (A.toJSON <$> tlheHttpException tlhe)
]
instance ToEngineLog TelemetryLog Hasura where
toEngineLog tl = (_tlLogLevel tl, ELTInternal ILTTelemetry, A.toJSON tl)
mkHttpError
:: Text
-> Maybe (Wreq.Response BL.ByteString)
-> Maybe HttpException
-> TelemetryHttpError
mkHttpError url mResp httpEx =
case mResp of
Nothing -> TelemetryHttpError Nothing url httpEx Nothing
Just resp ->
let status = resp ^. Wreq.responseStatus
body = decodeText $ UTF8 (resp ^. Wreq.responseBody)
in TelemetryHttpError (Just status) url httpEx body
mkTelemetryLog :: Text -> Text -> Maybe TelemetryHttpError -> TelemetryLog
mkTelemetryLog = TelemetryLog LevelInfo