graphql-engine/server/src-exec/Main.hs
Antoine Leblanc cf531b05cb Rewrite Tracing to allow for only one TraceT in the entire stack.
This PR is on top of #7789.

### Description

This PR entirely rewrites the API of the Tracing library, to make `interpTraceT` a thing of the past. Before this change, we ran traces by sticking a `TraceT` on top of whatever we were doing. This had several major drawbacks:
- we were carrying a bunch of `TraceT` across the codebase, and the entire codebase had to know about it
- we needed to carry a second class constraint around (`HasReporterM`) to be able to run all of those traces
- we kept having to do stack rewriting with `interpTraceT`, which went from inconvenient to horrible
- we had to declare several behavioral instances on `TraceT m`

This PR rewrite all of `Tracing` using a more conventional model: there is ONE `TraceT` at the bottom of the stack, and there is an associated class constraint `MonadTrace`: any part of the code that happens to satisfy `MonadTrace` is able to create new traces. We NEVER have to do stack rewriting, `interpTraceT` is gone, and `TraceT` and `Reporter` become  implementation details that 99% of the code is blissfully unaware of: code that needs to do tracing only needs to declare that the monad in which it operates implements `MonadTrace`.

In doing so, this PR revealed **several bugs in the codebase**: places where we were expecting to trace something, but due to the default instance of `HasReporterM IO` we would actually not do anything. This PR also splits the code of `Tracing` in more byte-sized modules, with the goal of potentially moving to `server/lib` down the line.

### Remaining work

This PR is a draft; what's left to do is:
- [x] make Pro compile; i haven't updated `HasuraPro/Main` yet
- [x] document Tracing by writing a note that explains how to use the library, and the meaning of "reporter", "trace" and "span", as well as the pitfalls
- [x] discuss some of the trade-offs in the implementation, which is why i'm opening this PR already despite it not fully building yet
- [x] it depends on #7789 being merged first

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/7791
GitOrigin-RevId: cadd32d039134c93ddbf364599a2f4dd988adea8
2023-03-13 17:38:39 +00:00

168 lines
7.2 KiB
Haskell

module Main
( main,
)
where
import Control.Concurrent.Extended qualified as C
import Control.Exception
import Control.Monad.Trans.Managed (ManagedT (..), lowerManagedT)
import Data.ByteString.Char8 qualified as BC
import Data.Environment qualified as Env
import Data.Int (Int64)
import Data.Kind (Type)
import Data.Text.Conversions (convertText)
import Data.Time.Clock (getCurrentTime)
import Data.Time.Clock.POSIX (getPOSIXTime)
import Database.PG.Query qualified as PG
import GHC.Debug.Stub
import GHC.TypeLits (Symbol)
import Hasura.App
import Hasura.App.State
import Hasura.Backends.Postgres.Connection.MonadTx
import Hasura.Backends.Postgres.Connection.Settings
import Hasura.GC qualified as GC
import Hasura.Logging (Hasura, LogLevel (..), defaultEnabledEngineLogTypes)
import Hasura.Prelude
import Hasura.RQL.DDL.Schema
import Hasura.Server.Init
import Hasura.Server.Metrics (ServerMetricsSpec, createServerMetrics)
import Hasura.Server.Migrate (downgradeCatalog)
import Hasura.Server.Prometheus (makeDummyPrometheusMetrics)
import Hasura.Server.Version
import Hasura.ShutdownLatch
import Hasura.Tracing (ignoreTraceT, sampleAlways)
import System.Environment (getEnvironment, lookupEnv, unsetEnv)
import System.Exit qualified as Sys
import System.Metrics qualified as EKG
import System.Posix.Signals qualified as Signals
{-# ANN main ("HLINT: ignore Use env_from_function_argument" :: String) #-}
main :: IO ()
main = maybeWithGhcDebug $ do
catch
do
env <- Env.getEnvironment
clearEnvironment
args <- parseArgs env
runApp env args
(\(ExitException _code msg) -> BC.putStrLn msg >> Sys.exitFailure)
where
-- Since the handling of environment variables works differently between the
-- Cloud version and the OSSS version we clear the process environment to
-- avoid accidentally reading directly from the operating system environment
-- variables.
clearEnvironment :: IO ()
clearEnvironment = getEnvironment >>= traverse_ \(v, _) -> unsetEnv v
runApp :: Env.Environment -> HGEOptions (ServeOptions Hasura) -> IO ()
runApp env (HGEOptions rci metadataDbUrl hgeCmd) = ignoreTraceT do
initTime <- liftIO getCurrentTime
case hgeCmd of
HCServe serveOptions -> do
globalCtx@GlobalCtx {} <- initGlobalCtx env metadataDbUrl rci
(ekgStore, serverMetrics) <- liftIO $ do
store <- EKG.newStore @AppMetricsSpec
void $ EKG.register (EKG.subset GcSubset store) EKG.registerGcMetrics
let getTimeMs :: IO Int64
getTimeMs = (round . (* 1000)) `fmap` getPOSIXTime
void $ EKG.register store $ EKG.registerCounter ServerTimestampMs () getTimeMs
serverMetrics <-
liftIO $ createServerMetrics $ EKG.subset ServerSubset store
pure (EKG.subset EKG.emptyOf store, serverMetrics)
prometheusMetrics <- lift makeDummyPrometheusMetrics
-- It'd be nice if we didn't have to call runManagedT twice here, but
-- there is a data dependency problem since the call to runPGMetadataStorageApp
-- below depends on appCtx.
runManagedT (initialiseContext env globalCtx serveOptions Nothing serverMetrics prometheusMetrics sampleAlways) $ \(appCtx, appEnv) -> do
-- Catches the SIGTERM signal and initiates a graceful shutdown.
-- Graceful shutdown for regular HTTP requests is already implemented in
-- Warp, and is triggered by invoking the 'closeSocket' callback.
-- We only catch the SIGTERM signal once, that is, if the user hits CTRL-C
-- once again, we terminate the process immediately.
liftIO $ do
void $ Signals.installHandler Signals.sigTERM (Signals.CatchOnce (shutdownGracefully $ appEnvShutdownLatch appEnv)) Nothing
void $ Signals.installHandler Signals.sigINT (Signals.CatchOnce (shutdownGracefully $ appEnvShutdownLatch appEnv)) Nothing
let Loggers _ logger _ = appEnvLoggers appEnv
_idleGCThread <-
lift $
C.forkImmortal "ourIdleGC" logger $
GC.ourIdleGC logger (seconds 0.3) (seconds 10) (seconds 60)
runPGMetadataStorageAppT (appCtx, appEnv) $
lowerManagedT $
runHGEServer (const $ pure ()) appCtx appEnv initTime Nothing ekgStore
HCExport -> do
GlobalCtx {..} <- initGlobalCtx env metadataDbUrl rci
res <- runTxWithMinimalPool _gcMetadataDbConnInfo fetchMetadataFromCatalog
either (throwErrJExit MetadataExportError) printJSON res
HCClean -> do
GlobalCtx {..} <- initGlobalCtx env metadataDbUrl rci
res <- runTxWithMinimalPool _gcMetadataDbConnInfo dropHdbCatalogSchema
let cleanSuccessMsg = "successfully cleaned graphql-engine related data"
either (throwErrJExit MetadataCleanError) (const $ liftIO $ putStrLn cleanSuccessMsg) res
HCDowngrade opts -> do
GlobalCtx {..} <- initGlobalCtx env metadataDbUrl rci
let (maybeDefaultPgConnInfo, maybeRetries) = _gcDefaultPostgresConnInfo
let defaultSourceConfig =
maybeDefaultPgConnInfo <&> \(dbUrlConf, _) ->
let pgSourceConnInfo =
PostgresSourceConnInfo
dbUrlConf
(Just setPostgresPoolSettings {_ppsRetries = maybeRetries <|> Just 1})
False
PG.ReadCommitted
Nothing
in PostgresConnConfiguration pgSourceConnInfo Nothing defaultPostgresExtensionsSchema Nothing mempty
res <- runTxWithMinimalPool _gcMetadataDbConnInfo $ downgradeCatalog defaultSourceConfig opts initTime
either (throwErrJExit DowngradeProcessError) (liftIO . print) res
HCVersion -> liftIO $ putStrLn $ "Hasura GraphQL Engine: " ++ convertText currentVersion
where
runTxWithMinimalPool connInfo tx = lowerManagedT $ do
minimalPool <- mkMinimalPool connInfo
liftIO $ runExceptT $ PG.runTx minimalPool (PG.ReadCommitted, Nothing) tx
mkMinimalPool connInfo = do
pgLogger <- _lsPgLogger <$> mkLoggers defaultEnabledEngineLogTypes LevelInfo
let connParams = PG.defaultConnParams {PG.cpConns = 1}
liftIO $ PG.initPGPool connInfo connParams pgLogger
-- | A specification of all EKG metrics tracked in `runApp`.
data
AppMetricsSpec ::
Symbol -> -- Metric name
EKG.MetricType -> -- Metric type, e.g. Counter, Gauge
Type -> -- Tag structure
Type
where
ServerSubset ::
ServerMetricsSpec name metricType tags ->
AppMetricsSpec name metricType tags
GcSubset ::
EKG.GcMetrics name metricType tags ->
AppMetricsSpec name metricType tags
ServerTimestampMs ::
AppMetricsSpec "ekg.server_timestamp_ms" 'EKG.CounterType ()
-- | 'withGhcDebug' but conditional on the environment variable
-- @HASURA_GHC_DEBUG=true@. When this is set a debug socket will be opened,
-- otherwise the server will start normally. This must only be called once and
-- it's argument should be the program's @main@
maybeWithGhcDebug :: IO a -> IO a
maybeWithGhcDebug theMain = do
lookupEnv "HASURA_GHC_DEBUG" >>= \case
Just "true" -> do
putStrLn "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
putStrLn "!!!!! Opening a ghc-debug socket !!!!!"
putStrLn "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
withGhcDebug theMain
_ -> theMain