graphql-engine/server/src-lib/Hasura/Server/Telemetry.hs
Vamshi Surabhi c52bfc540d
More robust forking, exception safety. Closes #3768 (#3860)
This is the result of a general audit of how we fork threads, with a
detour into how we're using mutable state especially in websocket
codepaths, making more robust to async exceptions and exceptions
resulting from bugs.

Some highlights:
- use a wrapper around 'immortal' so threads that die due to bugs are
  restarted, and log the error
- use 'withAsync' some places
- use bracket a few places where we might break invariants
- log some codepaths that represent bugs
- export UnstructuredLog for ad hoc logging (the alternative is we
  continue not logging useful stuff)

I had to timebox this. There are a few TODOs I didn't want to address.
And we'll wait until this is merged to attempt #3705 for
Control.Concurrent.Extended
2020-03-05 23:29:26 +05:30

222 lines
7.5 KiB
Haskell

{-# LANGUAGE RecordWildCards #-}
{-|
Send anonymized metrics to the telemetry server regarding usage of various
features of Hasura.
-}
module Hasura.Server.Telemetry
( runTelemetry
, mkTelemetryLog
)
where
import Control.Exception (try)
import Control.Lens
import Data.List
import Data.Text.Conversions (UTF8 (..), decodeText)
import Hasura.HTTP
import Hasura.Logging
import Hasura.Prelude
import Hasura.RQL.Types
import Hasura.Server.Init
import Hasura.Server.Telemetry.Counters
import Hasura.Server.Version
import qualified CI
import qualified Control.Concurrent.Extended as C
import qualified Data.Aeson as A
import qualified Data.Aeson.Casing as A
import qualified Data.Aeson.TH as A
import qualified Data.ByteString.Lazy as BL
import qualified Data.HashMap.Strict as Map
import qualified Data.Text as T
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Types as HTTP
import qualified Network.Wreq as Wreq
data RelationshipMetric
= RelationshipMetric
{ _rmManual :: !Int
, _rmAuto :: !Int
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''RelationshipMetric)
data PermissionMetric
= PermissionMetric
{ _pmSelect :: !Int
, _pmInsert :: !Int
, _pmUpdate :: !Int
, _pmDelete :: !Int
, _pmRoles :: !Int
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''PermissionMetric)
data Metrics
= Metrics
{ _mtTables :: !Int
, _mtViews :: !Int
, _mtEnumTables :: !Int
, _mtRelationships :: !RelationshipMetric
, _mtPermissions :: !PermissionMetric
, _mtEventTriggers :: !Int
, _mtRemoteSchemas :: !Int
, _mtFunctions :: !Int
, _mtServiceTimings :: !ServiceTimingMetrics
} deriving (Show, Eq)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''Metrics)
data HasuraTelemetry
= HasuraTelemetry
{ _htDbUid :: !Text
, _htInstanceUid :: !InstanceId
, _htVersion :: !Version
, _htCi :: !(Maybe CI.CI)
, _htMetrics :: !Metrics
} deriving (Show)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''HasuraTelemetry)
data TelemetryPayload
= TelemetryPayload
{ _tpTopic :: !Text
, _tpData :: !HasuraTelemetry
} deriving (Show)
$(A.deriveToJSON (A.aesonDrop 3 A.snakeCase) ''TelemetryPayload)
telemetryUrl :: Text
telemetryUrl = "https://telemetry.hasura.io/v1/http"
mkPayload :: Text -> InstanceId -> Version -> Metrics -> IO TelemetryPayload
mkPayload dbId instanceId version metrics = do
ci <- CI.getCI
let topic = case version of
VersionDev _ -> "server_test"
VersionRelease _ -> "server"
pure $ TelemetryPayload topic $ HasuraTelemetry dbId instanceId version ci metrics
-- | An infinite loop that sends updated telemetry data ('Metrics') every 24
-- hours. The send time depends on when the server was started and will
-- naturally drift.
runTelemetry
:: (HasVersion)
=> Logger Hasura
-> HTTP.Manager
-> IO SchemaCache
-- ^ an action that always returns the latest schema cache
-> Text
-> InstanceId
-> IO void
runTelemetry (Logger logger) manager getSchemaCache dbId instanceId = do
let options = wreqOptions manager []
forever $ do
schemaCache <- getSchemaCache
serviceTimings <- dumpServiceTimingMetrics
let metrics = computeMetrics schemaCache serviceTimings
payload <- A.encode <$> mkPayload dbId instanceId currentVersion metrics
logger $ debugLBS $ "metrics_info: " <> payload
resp <- try $ Wreq.postWith options (T.unpack telemetryUrl) payload
either logHttpEx handleHttpResp resp
C.sleep $ days 1
where
logHttpEx :: HTTP.HttpException -> IO ()
logHttpEx ex = do
let httpErr = Just $ mkHttpError telemetryUrl Nothing (Just $ HttpException ex)
logger $ mkTelemetryLog "http_exception" "http exception occurred" httpErr
handleHttpResp resp = do
let statusCode = resp ^. Wreq.responseStatus . Wreq.statusCode
logger $ debugLBS $ "http_success: " <> resp ^. Wreq.responseBody
when (statusCode /= 200) $ do
let httpErr = Just $ mkHttpError telemetryUrl (Just resp) Nothing
logger $ mkTelemetryLog "http_error" "failed to post telemetry" httpErr
computeMetrics :: SchemaCache -> ServiceTimingMetrics -> Metrics
computeMetrics sc _mtServiceTimings =
let _mtTables = countUserTables (isNothing . _tciViewInfo . _tiCoreInfo)
_mtViews = countUserTables (isJust . _tciViewInfo . _tiCoreInfo)
_mtEnumTables = countUserTables (isJust . _tciEnumValues . _tiCoreInfo)
allRels = join $ Map.elems $ Map.map (getRels . _tciFieldInfoMap . _tiCoreInfo) userTables
(manualRels, autoRels) = partition riIsManual allRels
_mtRelationships = RelationshipMetric (length manualRels) (length autoRels)
rolePerms = join $ Map.elems $ Map.map permsOfTbl userTables
_pmRoles = length $ nub $ fst <$> rolePerms
allPerms = snd <$> rolePerms
_pmInsert = calcPerms _permIns allPerms
_pmSelect = calcPerms _permSel allPerms
_pmUpdate = calcPerms _permUpd allPerms
_pmDelete = calcPerms _permDel allPerms
_mtPermissions =
PermissionMetric{..}
_mtEventTriggers = Map.size $ Map.filter (not . Map.null)
$ Map.map _tiEventTriggerInfoMap userTables
_mtRemoteSchemas = Map.size $ scRemoteSchemas sc
_mtFunctions = Map.size $ Map.filter (not . isSystemDefined . fiSystemDefined) $ scFunctions sc
in Metrics{..}
where
userTables = Map.filter (not . isSystemDefined . _tciSystemDefined . _tiCoreInfo) $ scTables sc
countUserTables predicate = length . filter predicate $ Map.elems userTables
calcPerms :: (RolePermInfo -> Maybe a) -> [RolePermInfo] -> Int
calcPerms fn perms = length $ catMaybes $ map fn perms
permsOfTbl :: TableInfo -> [(RoleName, RolePermInfo)]
permsOfTbl = Map.toList . _tiRolePermInfoMap
-- | Logging related
data TelemetryLog
= TelemetryLog
{ _tlLogLevel :: !LogLevel
, _tlType :: !Text
, _tlMessage :: !Text
, _tlHttpError :: !(Maybe TelemetryHttpError)
} deriving (Show)
data TelemetryHttpError
= TelemetryHttpError
{ tlheStatus :: !(Maybe HTTP.Status)
, tlheUrl :: !T.Text
, tlheHttpException :: !(Maybe HttpException)
, tlheResponse :: !(Maybe T.Text)
} deriving (Show)
instance A.ToJSON TelemetryLog where
toJSON tl =
A.object [ "type" A..= _tlType tl
, "message" A..= _tlMessage tl
, "http_error" A..= (A.toJSON <$> _tlHttpError tl)
]
instance A.ToJSON TelemetryHttpError where
toJSON tlhe =
A.object [ "status_code" A..= (HTTP.statusCode <$> tlheStatus tlhe)
, "url" A..= tlheUrl tlhe
, "response" A..= tlheResponse tlhe
, "http_exception" A..= (A.toJSON <$> tlheHttpException tlhe)
]
instance ToEngineLog TelemetryLog Hasura where
toEngineLog tl = (_tlLogLevel tl, ELTInternal ILTTelemetry, A.toJSON tl)
mkHttpError
:: Text
-> Maybe (Wreq.Response BL.ByteString)
-> Maybe HttpException
-> TelemetryHttpError
mkHttpError url mResp httpEx =
case mResp of
Nothing -> TelemetryHttpError Nothing url httpEx Nothing
Just resp ->
let status = resp ^. Wreq.responseStatus
body = decodeText $ UTF8 (resp ^. Wreq.responseBody)
in TelemetryHttpError (Just status) url httpEx body
mkTelemetryLog :: Text -> Text -> Maybe TelemetryHttpError -> TelemetryLog
mkTelemetryLog = TelemetryLog LevelInfo