graphql-engine/server/src-lib/Hasura/GraphQL/Transport/WebSocket.hs

1028 lines
40 KiB
Haskell
Raw Normal View History

{-# LANGUAGE CPP #-}
-- | This file contains the handlers that are used within websocket server.
--
-- This module exports three main handlers for the websocket server ('onConn',
-- 'onMessage', 'onClose'), and two helpers for sending messages to the client
-- ('sendMsg', 'sendCloseWithMsg').
--
-- NOTE!
-- The handler functions 'onClose', 'onMessage', etc. depend for correctness on two properties:
-- - they run with async exceptions masked
-- - they do not race on the same connection
module Hasura.GraphQL.Transport.WebSocket
( onConn,
onMessage,
onClose,
sendMsg,
sendCloseWithMsg,
)
where
import Control.Concurrent.Extended (sleep)
import Control.Concurrent.STM qualified as STM
import Control.Monad.Trans.Control qualified as MC
import Data.Aeson qualified as J
import Data.Aeson.Casing qualified as J
import Data.Aeson.TH qualified as J
import Data.ByteString (ByteString)
import Data.ByteString.Lazy qualified as LBS
import Data.CaseInsensitive qualified as CI
import Data.Dependent.Map qualified as DM
import Data.Environment qualified as Env
import Data.HashMap.Strict qualified as Map
import Data.HashMap.Strict.InsOrd qualified as OMap
import Data.HashSet qualified as Set
import Data.List.NonEmpty qualified as NE
import Data.String
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Time.Clock qualified as TC
import Data.Word (Word16)
import GHC.AssertNF.CPP
import Hasura.Backends.Postgres.Instances.Transport (runPGMutationTransaction)
import Hasura.Base.Error
import Hasura.EncJSON
import Hasura.GraphQL.Execute qualified as E
import Hasura.GraphQL.Execute.Action qualified as EA
import Hasura.GraphQL.Execute.Backend qualified as EB
import Hasura.GraphQL.Execute.LiveQuery.Plan qualified as LQ
import Hasura.GraphQL.Execute.LiveQuery.Poll qualified as LQ
import Hasura.GraphQL.Execute.LiveQuery.State qualified as LQ
import Hasura.GraphQL.Execute.RemoteJoin qualified as RJ
import Hasura.GraphQL.Logging
import Hasura.GraphQL.Namespace (RootFieldAlias)
import Hasura.GraphQL.ParameterizedQueryHash (ParameterizedQueryHash)
import Hasura.GraphQL.Parser.Directives (cached)
import Hasura.GraphQL.Transport.Backend
import Hasura.GraphQL.Transport.HTTP
import Hasura.GraphQL.Transport.HTTP.Protocol
import Hasura.GraphQL.Transport.Instances ()
import Hasura.GraphQL.Transport.WebSocket.Protocol
import Hasura.GraphQL.Transport.WebSocket.Server qualified as WS
import Hasura.GraphQL.Transport.WebSocket.Types
import Hasura.Logging qualified as L
import Hasura.Metadata.Class
import Hasura.Prelude
import Hasura.RQL.Types.RemoteSchema
import Hasura.RQL.Types.ResultCustomization
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
import Hasura.RQL.Types.SchemaCache (scApiLimits)
import Hasura.SQL.AnyBackend qualified as AB
import Hasura.Server.Auth
( AuthMode,
UserAuthentication,
resolveUserInfo,
)
import Hasura.Server.Cors
import Hasura.Server.Init.Config (KeepAliveDelay (..))
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
import Hasura.Server.Limits
( HasResourceLimits (..),
ResourceLimits (..),
)
import Hasura.Server.Metrics (ServerMetrics (..))
import Hasura.Server.Telemetry.Counters qualified as Telem
import Hasura.Server.Types (RequestId, getRequestId)
import Hasura.Session
import Hasura.Tracing qualified as Tracing
import Language.GraphQL.Draft.Syntax (Name (..))
import ListT qualified
import Network.HTTP.Client qualified as H
import Network.HTTP.Types qualified as H
import Network.WebSockets qualified as WS
import StmContainers.Map qualified as STMMap
-- | The state or outcome of a single operation on a websocket connection, as
-- recorded in the websocket log (it is the '_odOperationType' field of
-- 'OperationDetails', which in turn is the payload of 'EOperation').
--
-- 'ODProtoErr' and 'ODQueryErr' are the only constructors that carry a payload;
-- they are also the two cases that 'logWSEvent' logs at error level.
--
-- JSON encoding: a tagged object. The constructor name with its two-character
-- @OD@ prefix dropped and converted to snake case (e.g. 'ODProtoErr' becomes
-- @"proto_err"@) is stored under the @"type"@ key, and the payload, if any,
-- under @"detail"@.
--
-- NOTE(review): the text below was already attached to this declaration, but it
-- reads as a description of how live-query ids are tracked per connection rather
-- than of 'OpDetail' itself -- confirm whether it belongs on another definition.
--
-- 'LQ.LiveQueryId' comes from 'Hasura.GraphQL.Execute.LiveQuery.State.addLiveQuery'. We use
-- this to track a connection's operations so we can remove them from 'LiveQueryState', and
-- log.
--
-- NOTE!: This must be kept consistent with the global 'LiveQueryState', in 'onClose'
-- and 'onStart'.
data OpDetail
= ODStarted
| ODProtoErr !Text
| ODQueryErr !QErr
| ODCompleted
| ODStopped
deriving (Show, Eq)
-- The tag modifier strips the "OD" prefix before snake-casing, so renaming a
-- constructor changes the emitted log JSON.
$( J.deriveToJSON
J.defaultOptions
{ J.constructorTagModifier = J.snakeCase . drop 2,
J.sumEncoding = J.TaggedObject "type" "detail"
}
''OpDetail
)
-- | Everything that is logged about one operation on a websocket connection;
-- this is the payload of the 'EOperation' event.
--
-- Fields:
--
-- * '_odOperationId': the id of the operation.
-- * '_odRequestId': the request id, when one is available.
-- * '_odOperationName': the operation name, when one is available.
-- * '_odOperationType': despite the field name, this holds the 'OpDetail'
--   (started / error / completed / stopped) rather than a query-vs-mutation kind.
-- * '_odQuery': the unparsed GraphQL request, when it is included in the log.
-- * '_odParameterizedQueryHash': the parameterized query hash, when available.
--
-- All fields are strict. The 'J.ToJSON' instance is derived with 'hasuraJSON';
-- the resulting key names depend on those options, which are defined outside
-- this module.
data OperationDetails = OperationDetails
{ _odOperationId :: !OperationId,
_odRequestId :: !(Maybe RequestId),
_odOperationName :: !(Maybe OperationName),
_odOperationType :: !OpDetail,
_odQuery :: !(Maybe GQLReqUnparsed),
_odParameterizedQueryHash :: !(Maybe ParameterizedQueryHash)
}
deriving (Show, Eq)
$(J.deriveToJSON hasuraJSON ''OperationDetails)
-- | The kinds of event that are written to the websocket log.
--
-- * 'EAccepted': a connection request was accepted.
-- * 'ERejected': a connection request was rejected with the given error.
-- * 'EConnErr': a connection-level error message.
-- * 'EOperation': something happened to an individual operation; see
--   'OperationDetails'.
-- * 'EClosed': the connection is being closed.
--
-- 'logWSEvent' logs 'ERejected', 'EConnErr' and erroring 'EOperation's at error
-- level and everything else at info level.
--
-- JSON encoding: a tagged object. The constructor name with its leading @E@
-- dropped and converted to snake case (e.g. 'EConnErr' becomes @"conn_err"@) is
-- stored under @"type"@, and the payload, if any, under @"detail"@.
data WSEvent
= EAccepted
| ERejected !QErr
| EConnErr !ConnErrMsg
| EOperation !OperationDetails
| EClosed
deriving (Show, Eq)
-- Only the single-character "E" prefix is stripped here (compare 'OpDetail',
-- whose splice strips two characters).
$( J.deriveToJSON
J.defaultOptions
{ J.constructorTagModifier = J.snakeCase . drop 1,
J.sumEncoding = J.TaggedObject "type" "detail"
}
''WSEvent
)
-- | Connection-level information attached to every websocket log line.
--
-- * '_wsciWebsocketId': the id of the websocket connection.
-- * '_wsciTokenExpiry': the expiry time of the client's token, when known.
--   'logWSEvent' fills this in only once the connection state is initialised.
-- * '_wsciMsg': an optional free-form note. Within this module it is only set
--   by 'onConn', to explain that cookies are not read when CORS is disabled.
--
-- The 'J.ToJSON' instance is derived with 'hasuraJSON' (options defined outside
-- this module).
data WsConnInfo = WsConnInfo
{ _wsciWebsocketId :: !WS.WSId,
_wsciTokenExpiry :: !(Maybe TC.UTCTime),
_wsciMsg :: !(Maybe Text)
}
deriving (Show, Eq)
$(J.deriveToJSON hasuraJSON ''WsConnInfo)
-- | The JSON body of a websocket log line.
--
-- * '_wsliUserVars': the session variables of the connected user, when the
--   connection has been initialised ('Nothing' otherwise).
-- * '_wsliConnectionInfo': information about the connection itself.
-- * '_wsliEvent': the event being logged.
--
-- The 'J.ToJSON' instance is derived with 'hasuraJSON' (options defined outside
-- this module).
data WSLogInfo = WSLogInfo
{ _wsliUserVars :: !(Maybe SessionVariables),
_wsliConnectionInfo :: !WsConnInfo,
_wsliEvent :: !WSEvent
}
deriving (Show, Eq)
$(J.deriveToJSON hasuraJSON ''WSLogInfo)
-- | A websocket log entry: the level it should be logged at, paired with the
-- body to log.
--
-- No 'J.ToJSON' instance is derived for this type; only '_wslInfo' is
-- serialised, and '_wslLogLevel' is passed to the logger separately.
data WSLog = WSLog
{ _wslLogLevel :: !L.LogLevel,
_wslInfo :: !WSLogInfo
}
-- | Websocket log entries are emitted under the 'L.ELTWebsocketLog' log type,
-- at the level stored in the entry, with the 'WSLogInfo' as the JSON body.
instance L.ToEngineLog WSLog L.Hasura where
  toEngineLog WSLog {_wslLogLevel = level, _wslInfo = info} =
    (level, L.ELTWebsocketLog, J.toJSON info)
backend only insert permissions (rfc #4120) (#4224) * move user info related code to Hasura.User module * the RFC #4120 implementation; insert permissions with admin secret * revert back to old RoleName based schema maps An attempt made to avoid duplication of schema contexts in types if any role doesn't possess any admin secret specific schema * fix compile errors in haskell test * keep 'user_vars' for session variables in http-logs * no-op refacto * tests for admin only inserts * update docs for admin only inserts * updated CHANGELOG.md * default behaviour when admin secret is not set * fix x-hasura-role to X-Hasura-Role in pytests * introduce effective timeout in actions async tests * update docs for admin-secret not configured case * Update docs/graphql/manual/api-reference/schema-metadata-api/permission.rst Co-Authored-By: Marion Schleifer <marion@hasura.io> * Apply suggestions from code review Co-Authored-By: Marion Schleifer <marion@hasura.io> * a complete iteration backend insert permissions accessable via 'x-hasura-backend-privilege' session variable * console changes for backend-only permissions * provide tooltip id; update labels and tooltips; * requested changes * requested changes - remove className from Toggle component - use appropriate function name (capitalizeFirstChar -> capitalize) * use toggle props from definitelyTyped * fix accidental commit * Revert "introduce effective timeout in actions async tests" This reverts commit b7a59c19d643520cfde6af579889e1038038438a. 
* generate complete schema for both 'default' and 'backend' sessions * Apply suggestions from code review Co-Authored-By: Marion Schleifer <marion@hasura.io> * remove unnecessary import, export Toggle as is * update session variable in tooltip * 'x-hasura-use-backend-only-permissions' variable to switch * update help texts * update docs * update docs * update console help text * regenerate package-lock * serve no backend schema when backend_only: false and header set to true - Few type name refactor as suggested by @0x777 * update CHANGELOG.md * Update CHANGELOG.md * Update CHANGELOG.md * fix a merge bug where a certain entity didn't get removed Co-authored-by: Marion Schleifer <marion@hasura.io> Co-authored-by: Rishichandra Wawhal <rishi@hasura.io> Co-authored-by: rikinsk <rikin.kachhia@gmail.com> Co-authored-by: Tirumarai Selvan <tiru@hasura.io>
2020-04-24 12:10:53 +03:00
-- | Build an info-level websocket log entry from the user's session variables
-- (if any), the connection information and the event.
mkWsInfoLog :: Maybe SessionVariables -> WsConnInfo -> WSEvent -> WSLog
mkWsInfoLog uv ci ev = WSLog L.LevelInfo logInfo
  where
    logInfo = WSLogInfo uv ci ev
backend only insert permissions (rfc #4120) (#4224) * move user info related code to Hasura.User module * the RFC #4120 implementation; insert permissions with admin secret * revert back to old RoleName based schema maps An attempt made to avoid duplication of schema contexts in types if any role doesn't possess any admin secret specific schema * fix compile errors in haskell test * keep 'user_vars' for session variables in http-logs * no-op refacto * tests for admin only inserts * update docs for admin only inserts * updated CHANGELOG.md * default behaviour when admin secret is not set * fix x-hasura-role to X-Hasura-Role in pytests * introduce effective timeout in actions async tests * update docs for admin-secret not configured case * Update docs/graphql/manual/api-reference/schema-metadata-api/permission.rst Co-Authored-By: Marion Schleifer <marion@hasura.io> * Apply suggestions from code review Co-Authored-By: Marion Schleifer <marion@hasura.io> * a complete iteration backend insert permissions accessable via 'x-hasura-backend-privilege' session variable * console changes for backend-only permissions * provide tooltip id; update labels and tooltips; * requested changes * requested changes - remove className from Toggle component - use appropriate function name (capitalizeFirstChar -> capitalize) * use toggle props from definitelyTyped * fix accidental commit * Revert "introduce effective timeout in actions async tests" This reverts commit b7a59c19d643520cfde6af579889e1038038438a. 
* generate complete schema for both 'default' and 'backend' sessions * Apply suggestions from code review Co-Authored-By: Marion Schleifer <marion@hasura.io> * remove unnecessary import, export Toggle as is * update session variable in tooltip * 'x-hasura-use-backend-only-permissions' variable to switch * update help texts * update docs * update docs * update console help text * regenerate package-lock * serve no backend schema when backend_only: false and header set to true - Few type name refactor as suggested by @0x777 * update CHANGELOG.md * Update CHANGELOG.md * Update CHANGELOG.md * fix a merge bug where a certain entity didn't get removed Co-authored-by: Marion Schleifer <marion@hasura.io> Co-authored-by: Rishichandra Wawhal <rishi@hasura.io> Co-authored-by: rikinsk <rikin.kachhia@gmail.com> Co-authored-by: Tirumarai Selvan <tiru@hasura.io>
2020-04-24 12:10:53 +03:00
-- | Build an error-level websocket log entry from the user's session variables
-- (if any), the connection information and the event.
mkWsErrorLog :: Maybe SessionVariables -> WsConnInfo -> WSEvent -> WSLog
mkWsErrorLog uv ci ev = WSLog L.LevelError logInfo
  where
    logInfo = WSLogInfo uv ci ev
-- | Write a websocket event to the log on behalf of a connection.
--
-- The connection's current state is read to decide what user information can
-- be attached: once the connection is initialised the log line carries the
-- user's session variables and token expiry, otherwise both are omitted. The
-- connection info's message field is always left empty here.
--
-- The log level is derived from the event: rejections, connection errors and
-- operations that ended in a protocol or query error are logged at error
-- level; everything else at info level.
logWSEvent ::
  (MonadIO m) =>
  L.Logger L.Hasura ->
  WSConn ->
  WSEvent ->
  m ()
logWSEvent (L.Logger logger) wsConn wsEv = do
  connState <- liftIO $ STM.readTVarIO $ _wscUser (WS.getData wsConn)
  let (sessionVarsM, tokenExpiryM) = case connState of
        CSInitialised WsClientState {..} ->
          (Just $ _uiSession wscsUserInfo, wscsTokenExpTime)
        _ -> (Nothing, Nothing)
      connInfo = WsConnInfo (WS.getWSId wsConn) tokenExpiryM Nothing
  liftIO $ logger $ WSLog severity $ WSLogInfo sessionVarsM connInfo wsEv
  where
    severity
      | isErrorEvent = L.LevelError
      | otherwise = L.LevelInfo

    -- Whether the event represents a failure of the connection or of one of
    -- its operations.
    isErrorEvent = case wsEv of
      ERejected _ -> True
      EConnErr _ -> True
      EOperation OperationDetails {_odOperationType = opDetail} ->
        case opDetail of
          ODProtoErr _ -> True
          ODQueryErr _ -> True
          ODStarted -> False
          ODCompleted -> False
          ODStopped -> False
      EAccepted -> False
      EClosed -> False
-- | Send a server message to the client over the given connection. The
-- encoded message is handed to 'WS.sendMsg' without any event information
-- attached (compare 'sendMsgWithMetadata').
sendMsg :: (MonadIO m) => WSConn -> ServerMsg -> m ()
sendMsg wsConn msg = liftIO (WS.sendMsg wsConn response)
  where
    response = WS.WSQueueResponse (encodeServerMsg msg) Nothing
-- | Close a websocket connection with a close code and an encoded error
-- message.
--
-- If a 'ServerMsg' is supplied it is sent to the client first. An 'EClosed'
-- event is then logged and the close frame is sent on the raw connection.
--
-- The close code is the explicitly supplied @Maybe Word16@ when present;
-- otherwise it is the numeric code that corresponds to the 'ServerErrorCode'.
-- The close message is always derived from the 'ServerErrorCode'.
sendCloseWithMsg ::
  (MonadIO m) =>
  L.Logger L.Hasura ->
  WSConn ->
  ServerErrorCode ->
  Maybe ServerMsg ->
  Maybe Word16 ->
  m ()
sendCloseWithMsg logger wsConn errCode mErrServerMsg mCode = do
  maybe (pure ()) (sendMsg wsConn) mErrServerMsg
  logWSEvent logger wsConn EClosed
  liftIO $
    WS.sendCloseCode
      (WS.getRawWebSocketConnection wsConn)
      closeCode
      (encodeServerErrorMsg errCode)
  where
    closeCode = fromMaybe defaultCloseCode mCode

    -- The close code implied by the error itself, used when the caller does
    -- not override it.
    defaultCloseCode :: Word16
    defaultCloseCode = case errCode of
      ProtocolError1002 -> 1002
      GenericError4400 _ -> 4400
      Unauthorized4401 -> 4401
      Forbidden4403 -> 4403
      ConnectionInitTimeout4408 -> 4408
      NonUniqueSubscription4409 _ -> 4409
      TooManyRequests4429 -> 4429
-- | Send a server message to the client together with event information
-- describing it: the message type and operation id (only for @next@ and
-- @data@ messages), the operation name, the query execution time taken from
-- the 'LQ.LiveQueryMetadata', the size of the encoded response and the
-- parameterized query hash.
sendMsgWithMetadata ::
  (MonadIO m) =>
  WSConn ->
  ServerMsg ->
  Maybe OperationName ->
  Maybe ParameterizedQueryHash ->
  LQ.LiveQueryMetadata ->
  m ()
sendMsgWithMetadata wsConn msg opName paramQueryHash (LQ.LiveQueryMetadata execTime) =
  liftIO (WS.sendMsg wsConn (WS.WSQueueResponse payload eventInfo))
  where
    payload = encodeServerMsg msg

    -- Only data-carrying messages are tagged with a type and an operation id.
    (eventType, eventOpId) = case msg of
      SMNext (DataMsg opId _) -> (Just SMT_GQL_NEXT, Just opId)
      SMData (DataMsg opId _) -> (Just SMT_GQL_DATA, Just opId)
      _ -> (Nothing, Nothing)

    -- The record and its timing/size fields are forced with '$!', exactly as
    -- they are built, rather than left as thunks.
    eventInfo =
      Just
        $! WS.WSEventInfo
          { WS._wseiEventType = eventType,
            WS._wseiOperationId = eventOpId,
            WS._wseiOperationName = opName,
            WS._wseiQueryExecutionTime = Just $! realToFrac execTime,
            WS._wseiResponseSize = Just $! LBS.length payload,
            WS._wseiParameterizedQueryHash = paramQueryHash
          }
-- | Handler run for every incoming websocket connection request; it decides
-- whether the request is accepted or rejected.
--
-- Validation happens in two steps:
--
-- 1. The request path must be one of the supported GraphQL endpoints; the path
--    also determines the error response style and the query type stored with
--    the connection.
-- 2. If the request carries an @Origin@ header, the configured CORS policy is
--    applied to it. Depending on the policy this either leaves the headers
--    untouched, drops the @Cookie@ header, or fails the request.
--
-- On success the handshake-specific headers are stripped, fresh connection
-- state is created (not yet initialised, with an empty operation map) and the
-- request is accepted together with a keep-alive action and a token-expiry
-- handler. On failure the error is logged and the request is rejected with the
-- error's HTTP status and a JSON-encoded body.
onConn ::
  (MonadIO m, MonadReader WSServerEnv m) =>
  WS.OnConnH m WSConnData
onConn wsId requestHead ipAddress onConnHActions = do
  validated <- runExceptT $ do
    (errType, queryType) <- checkPath
    headers <- case originHeader of
      Nothing -> return allReqHdrs
      Just (_, origin) -> enforceCors origin allReqHdrs
    return (WsHeaders $ dropHandshakeHeaders headers, errType, queryType)
  case validated of
    Left qErr -> reject qErr
    Right accepted -> accept accepted
  where
    allReqHdrs = WS.requestHeaders requestHead

    originHeader = find (\(name, _) -> name == "Origin") allReqHdrs

    -- NOTE: the "Keep-Alive" delay is something that's mentioned
    -- in the Apollo spec. For 'graphql-ws', we're using the Ping
    -- messages that are part of the spec.
    keepAliveAction keepAliveDelay wsConn =
      liftIO . forever $ do
        WS._wsaKeepAliveAction onConnHActions wsConn
        sleep $ seconds (unKeepAliveDelay keepAliveDelay)

    -- Blocks (via STM retry) until the connection has been initialised with a
    -- token expiry time, then sleeps until that time is reached.
    tokenExpiryHandler wsConn = do
      expTime <- liftIO . STM.atomically $ do
        connState <- STM.readTVar $ _wscUser (WS.getData wsConn)
        case connState of
          CSInitialised clientState ->
            maybe STM.retry return (wscsTokenExpTime clientState)
          CSNotInitialised _ _ -> STM.retry
          CSInitError _ -> STM.retry
      currTime <- TC.getCurrentTime
      sleep $ convertDuration $ TC.diffUTCTime expTime currTime

    accept (hdrs, errType, queryType) = do
      (L.Logger logger) <- asks _wseLogger
      keepAliveDelay <- asks _wseKeepAliveDelay
      logger $ mkWsInfoLog Nothing (WsConnInfo wsId Nothing Nothing) EAccepted
      userState <- liftIO $ STM.newTVarIO (CSNotInitialised hdrs ipAddress)
      operations <- liftIO STMMap.newIO
      pure $
        Right $
          WS.AcceptWith
            (WSConnData userState operations errType queryType)
            (WS._wsaAcceptRequest onConnHActions)
            (keepAliveAction keepAliveDelay)
            tokenExpiryHandler

    reject qErr = do
      (L.Logger logger) <- asks _wseLogger
      logger $ mkWsErrorLog Nothing (WsConnInfo wsId Nothing Nothing) (ERejected qErr)
      let status = qeStatus qErr
          body = LBS.toStrict $ J.encode $ encodeGQLErr False qErr
      return $
        Left $
          WS.RejectRequest (H.statusCode status) (H.statusMessage status) [] body

    checkPath = case WS.requestPath requestHead of
      "/v1/graphql" -> return (ERTGraphqlCompliant, E.QueryHasura)
      "/v1alpha1/graphql" -> return (ERTLegacy, E.QueryHasura)
      "/v1beta1/relay" -> return (ERTGraphqlCompliant, E.QueryRelay)
      _ ->
        throw404 "only '/v1/graphql', '/v1alpha1/graphql' and '/v1beta1/relay' are supported on websockets"

    enforceCors origin reqHdrs = do
      (L.Logger logger) <- asks _wseLogger
      corsPolicy <- asks _wseCorsPolicy
      case cpConfig corsPolicy of
        CCAllowAll -> return reqHdrs
        CCDisabled readCookie
          | readCookie -> return reqHdrs
          | otherwise -> do
            -- Tell the operator why the cookie is being dropped.
            lift $ logger $ mkWsInfoLog Nothing (WsConnInfo wsId Nothing (Just corsNote)) EAccepted
            return $ filter (\(name, _) -> name /= "Cookie") reqHdrs
        CCAllowedOrigins ds
          -- if the origin is in our cors domains, no error
          | originTxt `elem` dmFqdns ds -> return reqHdrs
          -- if current origin is part of wildcard domain list, no error
          | inWildcardList ds originTxt -> return reqHdrs
          -- otherwise error
          | otherwise -> corsErr
      where
        originTxt = bsToTxt origin

    -- Headers that only matter for the websocket handshake are not kept with
    -- the connection.
    dropHandshakeHeaders =
      filter
        ( \(name, _) ->
            name
              `notElem` [ "sec-websocket-key",
                          "sec-websocket-version",
                          "upgrade",
                          "connection"
                        ]
        )

    corsErr =
      throw400
        AccessDenied
        "received origin header does not match configured CORS domains"

    corsNote =
      "Cookie is not read when CORS is disabled, because it is a potential "
        <> "security issue. If you're already handling CORS before Hasura and enforcing "
        <> "CORS on websocket connections, then you can use the flag --ws-read-cookie or "
        <> "HASURA_GRAPHQL_WS_READ_COOKIE to force read cookie when CORS is disabled."
onStart ::
forall m.
( MonadIO m,
E.MonadGQLExecutionCheck m,
MonadQueryLog m,
Tracing.MonadTrace m,
MonadExecuteQuery m,
MC.MonadBaseControl IO m,
MonadMetadataStorage (MetadataStorageT m),
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
EB.MonadQueryTags m,
HasResourceLimits m
) =>
Env.Environment ->
HashSet (L.EngineLogType L.Hasura) ->
WSServerEnv ->
WSConn ->
StartMsg ->
WS.WSActions WSConnData ->
m ()
onStart env enabledLogTypes serverEnv wsConn (StartMsg opId q) onMessageActions = catchAndIgnore $ do
timerTot <- startTimer
op <- liftIO $ STM.atomically $ STMMap.lookup opId opMap
let opName = _grOperationName q
-- NOTE: it should be safe to rely on this check later on in this function, since we expect that
-- we process all operations on a websocket connection serially:
when (isJust op) $
withComplete $
sendStartErr $
"an operation already exists with this id: " <> unOperationId opId
userInfoM <- liftIO $ STM.readTVarIO userInfoR
(userInfo, origReqHdrs, ipAddress) <- case userInfoM of
CSInitialised WsClientState {..} -> return (wscsUserInfo, wscsReqHeaders, wscsIpAddress)
CSInitError initErr -> do
let e = "cannot start as connection_init failed with : " <> initErr
withComplete $ sendStartErr e
CSNotInitialised _ _ -> do
let e = "start received before the connection is initialised"
withComplete $ sendStartErr e
(requestId, reqHdrs) <- getRequestId origReqHdrs
(sc, scVer) <- liftIO getSchemaCache
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
operationLimit <- askGraphqlOperationLimit
let runLimits ::
ExceptT (Either GQExecError QErr) (ExceptT () m) a ->
ExceptT (Either GQExecError QErr) (ExceptT () m) a
runLimits = withErr Right $ runResourceLimits $ operationLimit userInfo (scApiLimits sc)
reqParsedE <- lift $ E.checkGQLExecution userInfo (reqHdrs, ipAddress) enableAL sc q
reqParsed <- onLeft reqParsedE (withComplete . preExecErr requestId)
execPlanE <-
runExceptT $
E.getResolvedExecPlan
env
logger
userInfo
sqlGenCtx
sc
scVer
queryType
httpMgr
reqHdrs
(q, reqParsed)
requestId
server: remove remnants of query plan caching (fix #1795) Query plan caching was introduced by - I believe - hasura/graphql-engine#1934 in order to reduce the query response latency. During the development of PDV in hasura/graphql-engine#4111, it was found out that the new architecture (for which query plan caching wasn't implemented) performed comparably to the pre-PDV architecture with caching. Hence, it was decided to leave query plan caching until some day in the future when it was deemed necessary. Well, we're in the future now, and there still isn't a convincing argument for query plan caching. So the time has come to remove some references to query plan caching from the codebase. For the most part, any code being removed would probably not be very well suited to the post-PDV architecture of query execution, so arguably not much is lost. Apart from simplifying the code, this PR will contribute towards making the GraphQL schema generation more modular, testable, and easier to profile. I'd like to eventually work towards a situation in which it's easy to generate a GraphQL schema parser *in isolation*, without being connected to a database, and then parse a GraphQL query *in isolation*, without even listening any HTTP port. It is important that both of these operations can be examined in detail, and in isolation, since they are two major performance bottlenecks, as well as phases where many important upcoming features hook into. Implementation The following have been removed: - The entirety of `server/src-lib/Hasura/GraphQL/Execute/Plan.hs` - The core phases of query parsing and execution no longer have any references to query plan caching. Note that this is not to be confused with query *response* caching, which is not affected by this PR. This includes removal of the types: - - `Opaque`, which is replaced by a tuple. Note that the old implementation was broken and did not adequately hide the constructors. 
- - `QueryReusability` (and the `markNotReusable` method). Notably, the implementation of the `ParseT` monad now consists of two, rather than three, monad transformers. - Cache-related tests (in `server/src-test/Hasura/CacheBoundedSpec.hs`) have been removed . - References to query plan caching in the documentation. - The `planCacheOptions` in the `TenantConfig` type class was removed. However, during parsing, unrecognized fields in the YAML config get ignored, so this does not cause a breaking change. (Confirmed manually, as well as in consultation with @sordina.) - The metrics no longer send cache hit/miss messages. There are a few places in which one can still find references to query plan caching: - We still accept the `--query-plan-cache-size` command-line option for backwards compatibility. The `HASURA_QUERY_PLAN_CACHE_SIZE` environment variable is not read. https://github.com/hasura/graphql-engine-mono/pull/1815 GitOrigin-RevId: 17d92b254ec093c62a7dfeec478658ede0813eb7
2021-07-27 14:51:52 +03:00
(parameterizedQueryHash, execPlan) <- onLeft execPlanE (withComplete . preExecErr requestId)
case execPlan of
E.QueryExecutionPlan queryPlan asts dirMap -> Tracing.trace "Query" $ do
Caching, Rate Limiting, Metrics & Session Variable Improvements (#376) * server: use a leaky bucket algorithm for bytes-per-second cache rate limiting * Use evalsha properly * Adds redis cache limit parameters to PoliciesConfig * Loads Leaky Bucket Script On Server Start * Adds more redis logging and moves cache update into lua script * reverts setex in lua and adds notes * Refactors cacheStore and adds max TTL and cache size limits * Filter session vars in cache key * WIP * parens * cache-clear-hander POC implementation * cache-clear-hander POC implementation * Pro projectId used as cache key * POC working! * prefixing query-response keys in redis * Add cacheClearer to RedisScripts * Partial implementation of cacheClearer from scripts record * updating tests * [automated] stylish-haskell commit * Adds query look with up with metrics script * Adds missing module and lua script from last commit * Changes redis script module structure to match cache clearing branch * minor change to lua script * cleaning up cache clearing * generalising JsonLog * [automated] stylish-haskell commit * Draft Cache Metrics Endpoint * Adds Cache Metrics Handler * Adds hook handler module * Missed HandlerHook module in last commit * glob * Fixes redis mget bug * Removes cache totals and changes dashes to colons in metric cache keys * Adds query param to clear clear endpoint for deleting specific keys * Adds query param to clear clear endpoint for deleting specific keys * Cache Metrics on query families rather then queries * Replace Set with nub * Base16 Redis Hashes * Query Family Redis Keys With Roles * response headers for cache keys * fixing bug in family key by excluding operation name; using hash for response header instead of entire key * Adds query family to redis cache keys and cache clear endpoint * Fixes queryfamily hash bug * Moves cache endpoints to /pro * Moved cache clear to POST * Refactors cache clear function * Fixes query family format bug * Adds query cache tests and 
optional --redis-url flag to python test suite * Adds session variable cache test * Update pro changelog * adding documentation for additional caching features * more docs * clearing up units of leaky bucket params * Adds comments to leaky bucket script * removes old todo * Fixes session variable filtering to work with new query rootfield * more advanced defaulting behaviour for bucket rate and capacity. * Updates Docs * Moves Role into QueryFamily hash * Use Aeson for Cache Clear endpoint response * Moves trace to bracket the leaky bucket script * Misc review tweaks * Adds sum type for cache clear query params * Hardcodes RegisReplyLog log level * Update docs/graphql/cloud/response-caching.rst Co-authored-by: Phil Freeman <phil@hasura.io> * new prose for rate limiting docs * [automated] stylish-haskell commit * make rootToSessVarPreds total * [automated] stylish-haskell commit * Fixes out of scope error * Renamed _acRedis to _acCacheStore Co-authored-by: Solomon Bothwell <ssbothwell@gmail.com> Co-authored-by: Lyndon Maydwell <lyndon@sordina.net> Co-authored-by: David Overton <david@hasura.io> Co-authored-by: Stylish Haskell Bot <stylish-haskell@users.noreply.github.com> Co-authored-by: Lyndon Maydwell <lyndon@hasura.io> GitOrigin-RevId: dda5c1a3f902967b3d78310f950541a55fabb1b0
2021-02-13 03:05:23 +03:00
let filteredSessionVars = runSessVarPred (filterVariablesFromQuery asts) (_uiSession userInfo)
cacheKey = QueryCacheKey reqParsed (_uiRole userInfo) filteredSessionVars
remoteSchemas =
OMap.elems queryPlan >>= \case
E.ExecStepDB _remoteHeaders _ remoteJoins ->
maybe [] (map RJ._rsjRemoteSchema . RJ.getRemoteSchemaJoins) remoteJoins
_ -> []
actionsInfo =
foldl getExecStepActionWithActionInfo [] $
OMap.elems $
OMap.filter
( \case
E.ExecStepAction _ _ _remoteJoins -> True
_ -> False
)
queryPlan
cachedDirective = runIdentity <$> DM.lookup cached dirMap
-- We ignore the response headers (containing TTL information) because
-- WebSockets don't support them.
(_responseHeaders, cachedValue) <- Tracing.interpTraceT (withExceptT mempty) $ cacheLookup remoteSchemas actionsInfo cacheKey cachedDirective
case cachedValue of
Just cachedResponseData -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindCached
sendSuccResp cachedResponseData opName parameterizedQueryHash $ LQ.LiveQueryMetadata 0
Nothing -> do
conclusion <- runExceptT $
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
runLimits $
forWithKey queryPlan $ \fieldName -> \case
E.ExecStepDB _headers exists remoteJoins -> doQErr $ do
(telemTimeIO_DT, resp) <-
AB.dispatchAnyBackend @BackendTransport
exists
\(EB.DBStepInfo _ sourceConfig genSql tx :: EB.DBStepInfo b) ->
runDBQuery @b
requestId
q
fieldName
userInfo
logger
sourceConfig
tx
genSql
finalResponse <-
RJ.processRemoteJoins requestId logger env httpMgr reqHdrs userInfo resp remoteJoins q
pure $ AnnotatedResponsePart telemTimeIO_DT Telem.Local finalResponse []
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
E.ExecStepRemote rsi resultCustomizer gqlReq -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindRemoteSchema
runRemoteGQ fieldName userInfo reqHdrs rsi resultCustomizer gqlReq
E.ExecStepAction actionExecPlan _ remoteJoins -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindAction
(time, (resp, _)) <- doQErr $ do
(time, (resp, hdrs)) <- EA.runActionExecution userInfo actionExecPlan
finalResponse <-
RJ.processRemoteJoins requestId logger env httpMgr reqHdrs userInfo resp remoteJoins q
pure (time, (finalResponse, hdrs))
pure $ AnnotatedResponsePart time Telem.Empty resp []
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
E.ExecStepRaw json -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindIntrospection
buildRaw json
sendResultFromFragments Telem.Query timerTot requestId conclusion opName parameterizedQueryHash
case conclusion of
Left _ -> pure ()
Right results -> do
-- Note: The result of cacheStore is ignored here since we can't ensure that
-- the WS client will respond correctly to multiple messages.
void $
Tracing.interpTraceT (withExceptT mempty) $
cacheStore cacheKey cachedDirective $ encodeAnnotatedResponseParts results
liftIO $ sendCompleted (Just requestId) (Just parameterizedQueryHash)
E.MutationExecutionPlan mutationPlan -> do
-- See Note [Backwards-compatible transaction optimisation]
case coalescePostgresMutations mutationPlan of
-- we are in the aforementioned case; we circumvent the normal process
Just (sourceConfig, pgMutations) -> do
resp <-
runExceptT $
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
runLimits $
doQErr $
runPGMutationTransaction requestId q userInfo logger sourceConfig pgMutations
-- we do not construct result fragments since we have only one result
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
handleResult requestId resp \(telemTimeIO_DT, results) -> do
let telemQueryType = Telem.Query
telemLocality = Telem.Local
telemTimeIO = convertDuration telemTimeIO_DT
telemTimeTot <- Seconds <$> timerTot
sendSuccResp (encodeEncJSONResults results) opName parameterizedQueryHash $
LQ.LiveQueryMetadata telemTimeIO_DT
-- Telemetry. NOTE: don't time network IO:
Telem.recordTimingMetric Telem.RequestDimensions {..} Telem.RequestTimings {..}
-- we are not in the transaction case; proceeding normally
Nothing -> do
conclusion <- runExceptT $
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
runLimits $
forWithKey mutationPlan $ \fieldName -> \case
-- Ignoring response headers since we can't send them over WebSocket
E.ExecStepDB _responseHeaders exists remoteJoins -> doQErr $ do
(telemTimeIO_DT, resp) <-
AB.dispatchAnyBackend @BackendTransport
exists
\(EB.DBStepInfo _ sourceConfig genSql tx :: EB.DBStepInfo b) ->
runDBMutation @b
requestId
q
fieldName
userInfo
logger
sourceConfig
tx
genSql
finalResponse <-
RJ.processRemoteJoins requestId logger env httpMgr reqHdrs userInfo resp remoteJoins q
pure $ AnnotatedResponsePart telemTimeIO_DT Telem.Local finalResponse []
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
E.ExecStepAction actionExecPlan _ remoteJoins -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindAction
(time, (resp, hdrs)) <- doQErr $ do
(time, (resp, hdrs)) <- EA.runActionExecution userInfo actionExecPlan
finalResponse <-
RJ.processRemoteJoins requestId logger env httpMgr reqHdrs userInfo resp remoteJoins q
pure (time, (finalResponse, hdrs))
pure $ AnnotatedResponsePart time Telem.Empty resp $ fromMaybe [] hdrs
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
E.ExecStepRemote rsi resultCustomizer gqlReq -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindRemoteSchema
runRemoteGQ fieldName userInfo reqHdrs rsi resultCustomizer gqlReq
E.ExecStepRaw json -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindIntrospection
buildRaw json
sendResultFromFragments Telem.Query timerTot requestId conclusion opName parameterizedQueryHash
liftIO $ sendCompleted (Just requestId) (Just parameterizedQueryHash)
E.SubscriptionExecutionPlan subExec -> do
case subExec of
E.SEAsyncActionsWithNoRelationships actions -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindAction
liftIO do
let allActionIds = map fst $ toList actions
case NE.nonEmpty allActionIds of
Nothing -> sendCompleted (Just requestId) (Just parameterizedQueryHash)
Just actionIds -> do
let sendResponseIO actionLogMap = do
(dTime, resultsE) <- withElapsedTime $
runExceptT $
for actions $ \(actionId, resultBuilder) -> do
actionLogResponse <-
Map.lookup actionId actionLogMap
`onNothing` throw500 "unexpected: cannot lookup action_id in response map"
liftEither $ resultBuilder actionLogResponse
case resultsE of
Left err -> sendError requestId err
Right results -> do
let dataMsg =
sendDataMsg $
DataMsg opId $
pure $
encJToLBS $
encodeEncJSONResults results
sendMsgWithMetadata wsConn dataMsg opName (Just parameterizedQueryHash) $ LQ.LiveQueryMetadata dTime
asyncActionQueryLive =
LQ.LAAQNoRelationships $
LQ.LiveAsyncActionQueryWithNoRelationships sendResponseIO (sendCompleted (Just requestId) (Just parameterizedQueryHash))
LQ.addAsyncActionLiveQuery
(LQ._lqsAsyncActions lqMap)
opId
actionIds
(sendError requestId)
asyncActionQueryLive
E.SEOnSourceDB actionIds liveQueryBuilder -> do
actionLogMapE <- fmap fst <$> runExceptT (EA.fetchActionLogResponses actionIds)
actionLogMap <- onLeft actionLogMapE (withComplete . preExecErr requestId)
lqIdE <- liftIO $ startLiveQuery liveQueryBuilder parameterizedQueryHash requestId actionLogMap
lqId <- onLeft lqIdE (withComplete . preExecErr requestId)
-- Update async action query subscription state
case NE.nonEmpty (toList actionIds) of
Nothing -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindDatabase
-- No async action query fields present, do nothing.
pure ()
Just nonEmptyActionIds -> do
logQueryLog logger $ QueryLog q Nothing requestId QueryLogKindAction
liftIO $ do
let asyncActionQueryLive =
LQ.LAAQOnSourceDB $
LQ.LiveAsyncActionQueryOnSource lqId actionLogMap $
restartLiveQuery parameterizedQueryHash requestId liveQueryBuilder
onUnexpectedException err = do
sendError requestId err
stopOperation serverEnv wsConn opId (pure ()) -- Don't log in case opId don't exist
LQ.addAsyncActionLiveQuery
(LQ._lqsAsyncActions lqMap)
opId
nonEmptyActionIds
onUnexpectedException
asyncActionQueryLive
liftIO $ logOpEv ODStarted (Just requestId) (Just parameterizedQueryHash)
where
-- Callbacks projected out of @onMessageActions@ through its @WS._wsa*@ field
-- selectors, so the rest of this scope can refer to them by a short name.
-- Wraps a 'DataMsg' into the value that is handed to the message senders.
sendDataMsg = WS._wsaGetDataMessageType onMessageActions
-- Invoked with the connection, the operation id and a reason string when a
-- start request is rejected (see 'sendStartErr').
closeConnAction = WS._wsaConnectionCloseAction onMessageActions
-- Invoked with the connection, the operation id and a 'GQExecError' when
-- execution has failed (see @postExecErr'@).
postExecErrAction = WS._wsaPostExecErrMessageAction onMessageActions
-- Applied to the list of encoded errors when the connection uses
-- 'ERTGraphqlCompliant' error responses (see 'sendError').
fmtErrorMessage = WS._wsaErrorMsgFormat onMessageActions
-- Fold step: an 'E.ExecStepAction' contributes its action info to the front
-- of the accumulator; every other kind of execution step leaves it untouched.
getExecStepActionWithActionInfo acc = \case
  E.ExecStepAction _ actionInfo _remoteJoins -> actionInfo : acc
  _ -> acc
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
-- Re-tag the error channel of a computation that can only fail with a 'QErr':
-- the failure ends up in the 'Right' branch of the combined error type.
doQErr ::
  Monad n =>
  ExceptT QErr n a ->
  ExceptT (Either GQExecError QErr) n a
doQErr action = withExceptT Right action
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
-- Run @action@ underneath the transformation @f@, which works in an extra
-- 'ExceptT' layer with its own error type @e@. If that extra layer ends in a
-- failure, the error is converted with @embed@ and rethrown in the outer
-- error channel; otherwise the result is returned unchanged.
withErr ::
  forall e f n a.
  Monad n =>
  (e -> f) ->
  (ExceptT e (ExceptT f n) a -> ExceptT e (ExceptT f n) a) ->
  ExceptT f n a ->
  ExceptT f n a
withErr embed f action =
  runExceptT (f (lift action)) >>= either (throwError . embed) pure
-- 'OMap.traverseWithKey' with its two arguments swapped.
forWithKey = flip OMap.traverseWithKey
-- Transport tag used for telemetry.
-- NOTE(review): presumably consumed by name through the
-- @Telem.RequestDimensions {..}@ record wildcard used in this scope, so the
-- binding must not be renamed -- confirm against the record definition.
telemTransport = Telem.WebSocket
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
-- Dispatch on the outcome of an execution: a 'GQExecError' is forwarded via
-- @postExecErr'@, a 'QErr' is reported via 'postExecErr' (which also logs it
-- against the request id), and a success is handed to the continuation.
handleResult ::
  forall a.
  RequestId ->
  Either (Either GQExecError QErr) a ->
  (a -> ExceptT () m ()) ->
  ExceptT () m ()
handleResult requestId r f =
  either (either postExecErr' (postExecErr requestId)) f r
server: operation timeout with postgres cancelling ### Description This PR implements operation timeouts, as specced in #1232. RFC: [rfcs/operation-timeout-api-limits.md](https://github.com/hasura/graphql-engine-mono/blob/c025a90fe9779436bc0188a2bbf0ad95b5ed1f32/rfcs/operation-timeout-api-limits.md) There's still some things to be done (tests and docs most notably), but apart from that it can be reviewed. I'd still appreciate feedback on the RFC! TODO: - [x] break out the `ApiLimits` refactoring into a separate PR: #2103 - [x] finish the `pg-client-hs` PR: https://github.com/hasura/pg-client-hs/pull/39 - [x] remove configurability, after testing, prior to merging - [ ] tests: #2390 has some tests that I've run locally to confirm things work on a fundamental level - [x] changelog - [x] documentation - [x] fill in the detailed PR checklist ### Changelog - [x] `CHANGELOG.md` is updated with user-facing content relevant to this PR. If no changelog is required, then add the `no-changelog-required` label. ### Affected components - [x] Server - [ ] Console - [ ] CLI - [x] Docs - [ ] Tests ### Related Issues Product spec: #1232. ### Solution and Design Compare `rfcs/operation-timeout-api-limits.md`. ### Steps to test and verify Configure operation timeouts, e.g. by posting ``` { "type": "set_api_limits", "args": { "operation_timeout": { "global": 3 } } } ``` to `v1/metadata` to set an operation timeout of 3s. Then verify that 1. non-admin queries that take longer than 3s time out with a nice error message 2. that those queries return after ~3s (at least for postgres) 3. also that everything else still works as usual ### Limitations, known bugs & workarounds - while this will cause slow queries against any backends to fail, it's only verified to actually interrupt queries against postgres - this will only successfully short-cut (cancel) queries to postgres if the database server is responsive #### Catalog upgrade Does this PR change Hasura Catalog version? 
- [x] No #### Metadata Does this PR add a new Metadata feature? - [x] Yes - Does `run_sql` auto manages the new metadata through schema diffing? - [x] Not required - Does `run_sql` auto manages the definitions of metadata on renaming? - [x] Not required - Does `export_metadata`/`replace_metadata` supports the new metadata added? - [x] Yes #### GraphQL - [x] No new GraphQL schema is generated #### Breaking changes - [x] No Breaking changes PR-URL: https://github.com/hasura/graphql-engine-mono/pull/1593 GitOrigin-RevId: f0582d0be3ed9fadf89e0c4aaf96344d18331dc4
2021-09-29 19:20:06 +03:00
-- Send the combined response for a finished operation and record telemetry.
-- Failures are delegated to 'handleResult'. The @telem*@ bindings (and the
-- @telemQueryType@ parameter) are consumed by name through the record
-- wildcards on the last line, so they must keep these exact names.
sendResultFromFragments telemQueryType timerTot requestId r opName pqh =
  handleResult requestId r $ \results -> do
    let totalTimeIO = sum (arpTimeIO <$> results)
        telemLocality = foldMap arpLocality results
        telemTimeIO = convertDuration totalTimeIO
    -- The total timer is read before the response is written to the socket.
    telemTimeTot <- Seconds <$> timerTot
    sendSuccResp
      (encodeAnnotatedResponseParts results)
      opName
      pqh
      (LQ.LiveQueryMetadata totalTimeIO)
    -- Telemetry. NOTE: don't time network IO:
    Telem.recordTimingMetric Telem.RequestDimensions {..} Telem.RequestTimings {..}
-- Execute a request against a remote schema, pull the requested root field
-- out of the response and package it as an 'AnnotatedResponsePart' tagged
-- with 'Telem.Remote' and the reported IO time.
runRemoteGQ ::
  RootFieldAlias ->
  UserInfo ->
  [H.Header] ->
  RemoteSchemaInfo ->
  ResultCustomizer ->
  GQLReqOutgoing ->
  ExceptT (Either GQExecError QErr) (ExceptT () m) AnnotatedResponsePart
runRemoteGQ fieldName userInfo reqHdrs rsi resultCustomizer gqlReq = do
  (telemTimeIO_DT, _respHdrs, resp) <-
    doQErr (E.execRemoteGQ env httpMgr userInfo reqHdrs (rsDef rsi) gqlReq)
  let toResponsePart fieldValue =
        AnnotatedResponsePart telemTimeIO_DT Telem.Remote (encJFromOrderedValue fieldValue) []
  toResponsePart
    <$> mapExceptT lift (extractFieldFromResponse fieldName rsi resultCustomizer resp)
-- Positional destructuring of @serverEnv@. Fields matched with a bare
-- wildcard are not bound at all; the underscore-prefixed names are bound but
-- flagged as intentionally unused. Because the match is positional, the
-- order here has to follow the constructor's field order.
WSServerEnv
  logger
  lqMap
  getSchemaCache
  httpMgr
  _
  sqlGenCtx
  _
  enableAL
  _keepAliveDelay
  _connInitTime = serverEnv
-- State attached to this websocket connection, obtained via 'WS.getData'.
-- @opMap@ is the map that 'startLiveQuery' inserts into, and @errRespTy@
-- selects the error encoding (see 'getErrFn').
WSConnData userInfoR opMap errRespTy queryType = WS.getData wsConn
-- Log an operation-level websocket event for the current operation. The query
-- text is only attached when query logging is among the enabled log types.
logOpEv opTy reqId parameterizedQueryHash =
  logWSEvent logger wsConn $
    EOperation $
      OperationDetails opId reqId (_grOperationName q) opTy queryToLog parameterizedQueryHash
  where
    -- See Note [Disable query printing when query-log is disabled]
    queryToLog
      | Set.member L.ELTQueryLog enabledLogTypes = Just q
      | otherwise = Nothing
-- Choose the error encoder that matches the connection's error response type.
getErrFn errTy = case errTy of
  ERTLegacy -> encodeQErr
  ERTGraphqlCompliant -> encodeGQLErr
-- Reject a start request: send a 'StartFailed' error message for this
-- operation, log the protocol error, then run the connection-close action.
sendStartErr e = do
  let startFailure = getErrFn errRespTy False $ err400 StartFailed e
  sendMsg wsConn (SMErr (ErrorMsg opId startFailure))
  liftIO $ logOpEv (ODProtoErr e) Nothing Nothing
  liftIO $ closeConnAction wsConn opId (T.unpack e)
-- Send the completion message for this operation, then log it as completed.
sendCompleted reqId paramQueryHash =
  sendMsg wsConn (SMComplete (CompletionMsg opId))
    >> logOpEv ODCompleted reqId paramQueryHash
-- Report a 'QErr' raised during execution: log it against the request id,
-- encode it with the connection's error encoder, and forward it wrapped in a
-- 'GQExecError'.
postExecErr :: RequestId -> QErr -> ExceptT () m ()
postExecErr reqId qErr = do
  liftIO $ logOpEv (ODQueryErr qErr) (Just reqId) Nothing
  postExecErr' . GQExecError . pure $ getErrFn errRespTy False qErr
-- Hand an already-built 'GQExecError' to the post-execution error action for
-- this connection and operation.
postExecErr' :: GQExecError -> ExceptT () m ()
postExecErr' = liftIO . postExecErrAction wsConn opId
-- Report an error that occurred before execution started; this is
-- 'sendError' lifted out of 'IO'.
-- (Open question from the original author: why wouldn't a pre-exec error use
-- a GraphQL response?)
preExecErr reqId = liftIO . sendError reqId
-- Log a query error against the request id and send it to the client as an
-- error message. In the GraphQL-compliant mode the encoded error is passed,
-- as a singleton list, through 'fmtErrorMessage' first.
sendError reqId qErr = do
  logOpEv (ODQueryErr qErr) (Just reqId) Nothing
  sendMsg wsConn (SMErr $ ErrorMsg opId payload)
  where
    encodedErr = getErrFn errRespTy False qErr
    payload = case errRespTy of
      ERTLegacy -> encodedErr
      ERTGraphqlCompliant -> fmtErrorMessage [encodedErr]
-- Send a successful result for this operation as a data message, attaching
-- the operation name, the parameterized query hash and the given metadata.
sendSuccResp ::
  EncJSON ->
  Maybe OperationName ->
  ParameterizedQueryHash ->
  LQ.LiveQueryMetadata ->
  ExceptT () m ()
sendSuccResp encJson opName queryHash metadata =
  sendMsgWithMetadata wsConn dataMsg opName (Just queryHash) metadata
  where
    dataMsg = sendDataMsg $ DataMsg opId $ pure $ encJToLBS encJson
-- Run @action@, send the completion message, and then abort the surrounding
-- 'ExceptT' computation. It never returns normally, which is why the result
-- type is free.
withComplete :: ExceptT () m () -> ExceptT () m a
withComplete action =
  action
    >> liftIO (sendCompleted Nothing Nothing)
    >> throwError ()
-- Remove the live query @lqId@ and start a fresh one from the same builder
-- with the given action log map. Yields the new live query id, or 'Nothing'
-- when starting it fails (the error itself is discarded).
restartLiveQuery parameterizedQueryHash requestId liveQueryBuilder lqId actionLogMap = do
  LQ.removeLiveQuery logger (_wseServerMetrics serverEnv) lqMap lqId
  restarted <- startLiveQuery liveQueryBuilder parameterizedQueryHash requestId actionLogMap
  pure $ case restarted of
    Left _ -> Nothing
    Right newLqId -> Just newLqId
-- Build the live query plan from @actionLogMap@ and, if that succeeds,
-- register it with 'LQ.addLiveQuery' and record it in @opMap@. The result is
-- the builder's error ('Left') or the id of the new live query ('Right');
-- nothing is registered on the 'Left' path.
startLiveQuery liveQueryBuilder parameterizedQueryHash requestId actionLogMap = do
  liveQueryE <- runExceptT $ liveQueryBuilder actionLogMap
  -- 'for' over an 'Either' only runs the body for a 'Right'.
  for liveQueryE $ \(sourceName, E.LQP exists) -> do
    -- The bang forces the operation name before it is stored below.
    let !opName = _grOperationName q
        subscriberMetadata = LQ.mkSubscriberMetadata (WS.getWSId wsConn) opId opName requestId
    -- NOTE!: we mask async exceptions higher in the call stack, but it's
    -- crucial we don't lose lqId after addLiveQuery returns successfully.
    !lqId <- liftIO $ AB.dispatchAnyBackend @BackendTransport
      exists
      \(E.MultiplexedLiveQueryPlan liveQueryPlan) ->
        LQ.addLiveQuery
          logger
          (_wseServerMetrics serverEnv)
          subscriberMetadata
          lqMap
          sourceName
          parameterizedQueryHash
          opName
          requestId
          liveQueryPlan
          -- callback that pushes every change to this websocket
          (liveQOnChange opName parameterizedQueryHash $ LQ._lqpNamespace liveQueryPlan)
    liftIO $ $assertNFHere (lqId, opName) -- so we don't write thunks to mutable vars
    STM.atomically $
      -- NOTE: see crucial `lookup` check above, ensuring this doesn't clobber:
      STMMap.insert (lqId, opName) opId opMap
    pure lqId
-- Callback invoked on every change of a live query: push the new result (or
-- the failure) to the client over this websocket. Successful payloads are
-- wrapped in the source's namespace when one is given.
liveQOnChange :: Maybe OperationName -> ParameterizedQueryHash -> Maybe Name -> LQ.OnChange
liveQOnChange opName queryHash namespace = \case
  Right (LQ.LiveQueryResponse bs dTime) ->
    let payload = maybe LBS.fromStrict wrapNamespace namespace bs
        dataMsg = sendDataMsg $ DataMsg opId $ pure payload
     in sendMsgWithMetadata wsConn dataMsg opName (Just queryHash) (LQ.LiveQueryMetadata dTime)
  failure ->
    -- No metadata is attached on this path.
    sendMsg wsConn $
      sendDataMsg $
        DataMsg opId (LBS.fromStrict . LQ._lqrPayload <$> failure)
-- When the source has a namespace, the raw response from the DB has to be
-- nested under that namespace: the result is a single-entry object keyed by
-- the namespace name.
wrapNamespace :: Name -> ByteString -> LBS.ByteString
wrapNamespace namespace bs =
  encJToLBS (encJFromAssocList [(unName namespace, encJFromBS bs)])
-- Run an 'ExceptT' computation and discard its outcome, whether it finished
-- normally or was aborted with @()@.
catchAndIgnore :: ExceptT () m () -> m ()
catchAndIgnore = void . runExceptT
-- | Entry point for every message received from a client. The raw frame is
-- decoded as a client message and dispatched to the matching handler, all
-- inside a trace named @"websocket"@. A frame that fails to decode is logged
-- as a connection error and handed to the configured error-message action.
onMessage ::
  ( MonadIO m,
    UserAuthentication (Tracing.TraceT m),
    E.MonadGQLExecutionCheck m,
    MonadQueryLog m,
    Tracing.HasReporter m,
    MonadExecuteQuery m,
    MC.MonadBaseControl IO m,
    MonadMetadataStorage (MetadataStorageT m),
    EB.MonadQueryTags m,
    HasResourceLimits m
  ) =>
  Env.Environment ->
  HashSet (L.EngineLogType L.Hasura) ->
  AuthMode ->
  WSServerEnv ->
  WSConn ->
  LBS.ByteString ->
  WS.WSActions WSConnData ->
  m ()
onMessage env enabledLogTypes authMode serverEnv wsConn msgRaw onMessageActions =
  Tracing.runTraceT "websocket" $
    either rejectUnparsable dispatch (J.eitherDecode msgRaw)
  where
    logger = _wseLogger serverEnv
    onErrAction = WS._wsaOnErrorMessageAction onMessageActions
    keepAliveMessageAction = WS._wsaKeepAliveAction onMessageActions

    -- The frame could not be decoded: log it, then run the error action.
    rejectUnparsable parseErr = do
      let err = ConnErrMsg $ "parsing ClientMessage failed: " <> T.pack parseErr
      logWSEvent logger wsConn $ EConnErr err
      liftIO $ onErrAction wsConn err WS.onClientMessageParseErrorText

    dispatch = \case
      -- common to both protocols
      CMConnInit params ->
        onConnInit
          logger
          (_wseHManager serverEnv)
          wsConn
          authMode
          params
          onErrAction
          keepAliveMessageAction
      CMStart startMsg -> onStart env enabledLogTypes serverEnv wsConn startMsg onMessageActions
      CMStop stopMsg -> onStop serverEnv wsConn stopMsg
      -- specific to graphql-ws
      CMPing mPayload -> onPing wsConn mPayload
      CMPong mPayload -> onPong wsConn mPayload
      -- specific to apollo clients
      CMConnTerm -> liftIO $ WS.closeConn wsConn "GQL_CONNECTION_TERMINATE received"
-- | Answer a client ping with a pong that echoes the same optional payload.
onPing :: (MonadIO m) => WSConn -> Maybe PingPongPayload -> m ()
onPing wsConn = liftIO . sendMsg wsConn . SMPong
-- | React to a client pong by sending a ping that echoes its payload, except
-- when the payload is the keep-alive message.
onPong :: (MonadIO m) => WSConn -> Maybe PingPongPayload -> m ()
onPong wsConn mPayload =
  -- NOTE: this is done to avoid sending Ping for every "keepalive" that the server sends
  liftIO $
    when (mPayload /= Just keepAliveMessage) $
      sendMsg wsConn (SMPing mPayload)
-- | Handle a client's stop message by stopping the referenced operation.
--
-- The operation may legitimately be missing from the operation map:
--
--   1. it was a query/mutation, which is removed from the map as soon as it
--      has been executed;
--   2. the client is misbehaving;
--   3. there is a bug on our end.
--
-- In those cases only a debug-level line is logged.
onStop :: (MonadIO m) => WSServerEnv -> WSConn -> StopMsg -> m ()
onStop serverEnv wsConn (StopMsg opId) =
  liftIO $ stopOperation serverEnv wsConn opId logUnknownOperation
  where
    logUnknownOperation :: IO ()
    logUnknownOperation =
      L.unLogger (_wseLogger serverEnv) $
        L.UnstructuredLog L.LevelDebug $
          fromString noRecordMsg

    noRecordMsg :: String
    noRecordMsg =
      "Received STOP for an operation that we have no record for: "
        <> show (unOperationId opId)
        <> " (could be a query/mutation operation or a misbehaving client or a bug)"
-- | Stop the operation @opId@ on this connection. If it is present in the
-- connection's operation map, a stop event is logged and the live query is
-- removed; otherwise the supplied fallback action runs. In both cases the
-- operation id is deleted from the map afterwards.
stopOperation :: WSServerEnv -> WSConn -> OperationId -> IO () -> IO ()
stopOperation serverEnv wsConn opId logWhenOpNotExist = do
  registered <- STM.atomically $ STMMap.lookup opId opMap
  maybe logWhenOpNotExist stopLiveQuery registered
  STM.atomically $ STMMap.delete opId opMap
  where
    logger = _wseLogger serverEnv
    lqMap = _wseLiveQMap serverEnv
    opMap = _wscOpMap $ WS.getData wsConn

    stopLiveQuery (lqId, opNameM) = do
      logWSEvent logger wsConn . EOperation $
        OperationDetails opId Nothing opNameM ODStopped Nothing Nothing
      LQ.removeLiveQuery logger (_wseServerMetrics serverEnv) lqMap lqId
onConnInit ::
(MonadIO m, UserAuthentication (Tracing.TraceT m)) =>
L.Logger L.Hasura ->
H.Manager ->
WSConn ->
AuthMode ->
Maybe ConnParams ->
-- | this is the message handler for handling errors on initializing a connection from the client
WS.WSOnErrorMessageAction WSConnData ->
-- | this is the message handler for handling "keep-alive" messages to the client
WS.WSKeepAliveMessageAction WSConnData ->
Tracing.TraceT m ()
onConnInit logger manager wsConn authMode connParamsM onConnInitErrAction keepAliveMessageAction = do
Rewrite GraphQL schema generation and query parsing (close #2801) (#4111) Aka “the PDV refactor.” History is preserved on the branch 2801-graphql-schema-parser-refactor. * [skip ci] remove stale benchmark commit from commit_diff * [skip ci] Check for root field name conflicts between remotes * [skip ci] Additionally check for conflicts between remotes and DB * [skip ci] Check for conflicts in schema when tracking a table * [skip ci] Fix equality checking in GraphQL AST * server: fix mishandling of GeoJSON inputs in subscriptions (fix #3239) (#4551) * Add support for multiple top-level fields in a subscription to improve testability of subscriptions * Add an internal flag to enable multiple subscriptions * Add missing call to withConstructorFn in live queries (fix #3239) Co-authored-by: Alexis King <lexi.lambda@gmail.com> * Scheduled triggers (close #1914) (#3553) server: add scheduled triggers Co-authored-by: Alexis King <lexi.lambda@gmail.com> Co-authored-by: Marion Schleifer <marion@hasura.io> Co-authored-by: Karthikeyan Chinnakonda <karthikeyan@hasura.io> Co-authored-by: Aleksandra Sikora <ola.zxcvbnm@gmail.com> * dev.sh: bump version due to addition of croniter python dependency * server: fix an introspection query caching issue (fix #4547) (#4661) Introspection queries accept variables, but we need to make sure to also touch the variables that we ignore, so that an introspection query is marked not reusable if we are not able to build a correct query plan for it. A better solution here would be to deal with such unused variables correctly, so that more introspection queries become reusable. An even better solution would be to type-safely track *how* to reuse which variables, rather than to split the reusage marking from the planning. 
Co-authored-by: Tirumarai Selvan <tiru@hasura.io> * flush log buffer on exception in mkWaiApp ( fix #4772 ) (#4801) * flush log buffer on exception in mkWaiApp * add comment to explain the introduced change * add changelog * allow logging details of a live query polling thread (#4959) * changes for poller-log add various multiplexed query info in poller-log * minor cleanup, also fixes a bug which will return duplicate data * Live query poller stats can now be logged This also removes in-memory stats that are collected about batched query execution as the log lines when piped into an monitoring tool will give us better insights. * allow poller-log to be configurable * log minimal information in the livequery-poller-log Other information can be retrieved from /dev/subscriptions/extended * fix few review comments * avoid marshalling and unmarshalling from ByteString to EncJSON * separate out SubscriberId and SubscriberMetadata Co-authored-by: Anon Ray <rayanon004@gmail.com> * Don't compile in developer APIs by default * Tighten up handling of admin secret, more docs Store the admin secret only as a hash to prevent leaking the secret inadvertently, and to prevent timing attacks on the secret. NOTE: best practice for stored user passwords is a function with a tunable cost like bcrypt, but our threat model is quite different (even if we thought we could reasonably protect the secret from an attacker who could read arbitrary regions of memory), and bcrypt is far too slow (by design) to perform on each request. We'd have to rely on our (technically savvy) users to choose high entropy passwords in any case. 
Referencing #4736 * server/docs: add instructions to fix loss of float precision in PostgreSQL <= 11 (#5187) This adds a server flag, --pg-connection-options, that can be used to set a PostgreSQL connection parameter, extra_float_digits, that needs to be used to avoid loss of data on older versions of PostgreSQL, which have odd default behavior when returning float values. (fixes #5092) * [skip ci] Add new commits from master to the commit diff * [skip ci] serve default directives (skip & include) over introspection * [skip ci] Update non-Haskell assets with the version on master * server: refactor GQL execution check and config API (#5094) Co-authored-by: Vamshi Surabhi <vamshi@hasura.io> Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * [skip ci] fix js issues in tests by pinning dependencies version * [skip ci] bump graphql version * [skip ci] Add note about memory usage * generalize query execution logic on Postgres (#5110) * generalize PGExecCtx to support specialized functions for various operations * fix tests compilation * allow customising PGExecCtx when starting the web server * server: changes catalog initialization and logging for pro customization (#5139) * new typeclass to abstract the logic of QueryLog-ing * abstract the logic of logging websocket-server logs introduce a MonadWSLog typeclass * move catalog initialization to init step expose a helper function to migrate catalog create schema cache in initialiseCtx * expose various modules and functions for pro * [skip ci] cosmetic change * [skip ci] fix test calling a mutation that does not exist * [skip ci] minor text change * [skip ci] refactored input values * [skip ci] remove VString Origin * server: fix updating of headers behaviour in the update cron trigger API and create future events immediately (#5151) * server: fix bug to update headers in an existing cron trigger and create future events Co-authored-by: Tirumarai Selvan <tiru@hasura.io> * Lower stack chunk size in RTS to 
reduce thread STACK memory (closes #5190) This reduces memory consumption for new idle subscriptions significantly (see linked ticket). The hypothesis is: we fork a lot of threads per websocket, and some of these use slightly more than the initial 1K stack size, so the first overflow balloons to 32K, when significantly less is required. However: running with `+RTS -K1K -xc` did not seem to show evidence of any overflows! So it's a mystery why this improves things. GHC should probably also be doubling the stack buffer at each overflow or doing something even smarter; the knobs we have aren't so helpful. * [skip ci] fix todo and schema generation for aggregate fields * 5087 libpq pool leak (#5089) Shrink libpq buffers to 1MB before returning connection to pool. Closes #5087 See: https://github.com/hasura/pg-client-hs/pull/19 Also related: #3388 #4077 * bump pg-client-hs version (fixes a build issue on some environments) (#5267) * do not use prepared statements for mutations * server: unlock scheduled events on graceful shutdown (#4928) * Fix buggy parsing of new --conn-lifetime flag in 2b0e3774 * [skip ci] remove cherry-picked commit from commit_diff.txt * server: include additional fields in scheduled trigger webhook payload (#5262) * include scheduled triggers metadata in the webhook body Co-authored-by: Tirumarai Selvan <tiru@hasura.io> * server: call the webhook asynchronously in event triggers (#5352) * server: call the webhook asynchronosly in event triggers * Expose all modules in Cabal file (#5371) * [skip ci] update commit_diff.txt * [skip ci] fix cast exp parser & few TODOs * [skip ci] fix remote fields arguments * [skip ci] fix few more TODO, no-op refactor, move resolve/action.hs to execute/action.hs * Pass environment variables around as a data structure, via @sordina (#5374) * Pass environment variables around as a data structure, via @sordina * Resolving build error * Adding Environment passing note to changelog * Removing references to ILTPollerLog as 
this seems to have been reintroduced from a bad merge * removing commented-out imports * Language pragmas already set by project * Linking async thread * Apply suggestions from code review Use `runQueryTx` instead of `runLazyTx` for queries. * remove the non-user facing entry in the changelog Co-authored-by: Phil Freeman <paf31@cantab.net> Co-authored-by: Phil Freeman <phil@hasura.io> Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * [skip ci] fix: restrict remote relationship field generation for hasura queries * [skip ci] no-op refactor; move insert execution code from schema parser module * server: call the webhook asynchronously in event triggers (#5352) * server: call the webhook asynchronosly in event triggers * Expose all modules in Cabal file (#5371) * [skip ci] update commit_diff.txt * Pass environment variables around as a data structure, via @sordina (#5374) * Pass environment variables around as a data structure, via @sordina * Resolving build error * Adding Environment passing note to changelog * Removing references to ILTPollerLog as this seems to have been reintroduced from a bad merge * removing commented-out imports * Language pragmas already set by project * Linking async thread * Apply suggestions from code review Use `runQueryTx` instead of `runLazyTx` for queries. * remove the non-user facing entry in the changelog Co-authored-by: Phil Freeman <paf31@cantab.net> Co-authored-by: Phil Freeman <phil@hasura.io> Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * [skip ci] implement header checking Probably closes #14 and #3659. * server: refactor 'pollQuery' to have a hook to process 'PollDetails' (#5391) Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * update pg-client (#5421) * [skip ci] update commit_diff * Fix latency buckets for telemetry data These must have gotten messed up during a refactor. 
As a consequence almost all samples received so far fall into the single erroneous 0 to 1K seconds (originally supposed to be 1ms?) bucket. I also re-thought what the numbers should be, but these are still arbitrary and might want adjusting in the future. * [skip ci] include the latest commit compared against master in commit_diff * [skip ci] include new commits from master in commit_diff * [skip ci] improve description generation * [skip ci] sort all introspect arrays * [skip ci] allow parsers to specify error codes * [skip ci] fix integer and float parsing error code * [skip ci] scalar from json errors are now parse errors * [skip ci] fixed negative integer error message and code * [skip ci] Re-fix nullability in relationships * [skip ci] no-op refactor and removed couple of FIXMEs * [skip ci] uncomment code in 'deleteMetadataObject' * [skip ci] Fix re-fix of nullability for relationships * [skip ci] fix default arguments error code * [skip ci] updated test error message !!! WARNING !!! Since all fields accept `null`, they all are technically optional in the new schema. Meaning there's no such thing as a missing mandatory field anymore: a field that doesn't have a default value, and which therefore isn't labelled as "optional" in the schema, will be assumed to be null if it's missing, meaning it isn't possible anymore to have an error for a missing mandatory field. The only possible error is now when a optional positional argument is omitted but is not the last positional argument. * [skip ci] cleanup of int scalar parser * [skip ci] retro-compatibility of offset as string * [skip ci] Remove commit from commit_diff.txt Although strictly speaking we don't know if this will work correctly in PDV if we would implement query plan caching, the fact is that in the theoretical case that we would have the same issue in PDV, it would probably apply not just to introspection, and the fix would be written completely differently. 
So this old commit is of no value to us other than the heads-up "make sure query plan caching works correctly even in the presence of unused variables", which is already part of the test suite. * Add MonadTrace and MonadExecuteQuery abstractions (#5383) * [skip ci] Fix accumulation of input object types Just like object types, interface types, and union types, we have to avoid circularities when collecting input types from the GraphQL AST. Additionally, this fixes equality checks for input object types (whose fields are unordered, and hence should be compared as sets) and enum types (ditto). * [skip ci] fix fragment error path * [skip ci] fix node error code * [skip ci] fix paths in insert queries * [skip ci] fix path in objects * [skip ci] manually alter node id path for consistency * [skip ci] more node error fixups * [skip ci] one last relay error message fix * [skip ci] update commit_diff * Propagate the trace context to event triggers (#5409) * Propagate the trace context to event triggers * Handle missing trace and span IDs * Store trace context as one LOCAL * Add migrations * Documentation * changelog * Fix warnings * Respond to code review suggestions * Respond to code review * Undo changelog * Update CHANGELOG.md Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * server: log request/response sizes for event triggers (#5463) * server: log request/response sizes for event triggers event triggers (and scheduled triggers) now have request/response size in their logs. * add changelog entry * Tracing: Simplify HTTP traced request (#5451) Remove the Inversion of Control (SuspendRequest) and simplify the tracing of HTTP Requests. 
Co-authored-by: Phil Freeman <phil@hasura.io> * Attach request ID as tracing metadata (#5456) * Propagate the trace context to event triggers * Handle missing trace and span IDs * Store trace context as one LOCAL * Add migrations * Documentation * Include the request ID as trace metadata * changelog * Fix warnings * Respond to code review suggestions * Respond to code review * Undo changelog * Update CHANGELOG.md * Typo Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * server: add logging for action handlers (#5471) * server: add logging for action handlers * add changelog entry * change action-handler log type from internal to non-internal * fix action-handler-log name * server: pass http and websocket request to logging context (#5470) * pass request body to logging context in all cases * add message size logging on the websocket API this is required by graphql-engine-pro/#416 * message size logging on websocket API As we need to log all messages recieved/sent by the websocket server, it makes sense to log them as part of the websocket server event logs. 
Previously message recieved were logged inside the onMessage handler, and messages sent were logged only for "data" messages (as a server event log) * fix review comments Co-authored-by: Phil Freeman <phil@hasura.io> * server: stop eventing subsystem threads when shutting down (#5479) * server: stop eventing subsystem threads when shutting down * Apply suggestions from code review Co-authored-by: Karthikeyan Chinnakonda <chkarthikeyan95@gmail.com> Co-authored-by: Phil Freeman <phil@hasura.io> Co-authored-by: Phil Freeman <paf31@cantab.net> Co-authored-by: Karthikeyan Chinnakonda <chkarthikeyan95@gmail.com> * [skip ci] update commit_diff with new commits added in master * Bugfix to support 0-size HASURA_GRAPHQL_QUERY_PLAN_CACHE_SIZE Also some minor refactoring of bounded cache module: - the maxBound check in `trim` was confusing and unnecessary - consequently trim was unnecessary for lookupPure Also add some basic tests * Support only the bounded cache, with default HASURA_GRAPHQL_QUERY_PLAN_CACHE_SIZE of 4000. Closes #5363 * [skip ci] remove merge commit from commit_diff * server: Fix compiler warning caused by GHC upgrade (#5489) Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * [skip ci] update all non server code from master * [skip ci] aligned object field error message with master * [skip ci] fix remaining undefined? * [skip ci] remove unused import * [skip ci] revert to previous error message, fix tests * Move nullableType/nonNullableType to Schema.hs These are functions on Types, not on Parsers. * [skip ci] fix setup to fix backend only test the order in which permission checks are performed on the branch is slightly different than on master, resulting in a slightly different error if there are no other mutations the user has access to. By adding update permissions, we go back to the expected case. 
* [skip ci] fix insert geojson tests to reflect new paths * [skip ci] fix enum test for better error message * [skip ci] fix header test for better error message * [skip ci] fix fragment cycle test for better error message * [skip ci] fix error message for type mismatch * [skip ci] fix variable path in test * [skip ci] adjust tests after bug fix * [skip ci] more tests fixing * Add hdb_catalog.current_setting abstraction for reading Hasura settings As the comment in the function’s definition explains, this is needed to work around an awkward Postgres behavior. * [skip ci] Update CONTRIBUTING.md to mention Node setup for Python tests * [skip ci] Add missing Python tests env var to CONTRIBUTING.md * [skip ci] fix order of result when subscription is run with multiple nodes * [skip ci] no-op refactor: fix a warning in Internal/Parser.hs * [skip ci] throw error when a subscription contains remote joins * [skip ci] Enable easier profiling by hiding AssertNF behind a flag In order to compile a profiling build, run: $ cabal new-build -f profiling --enable-profiling * [skip ci] Fix two warnings We used to lookup the objects that implement a given interface by filtering all objects in the schema document. However, one of the tests expects us to generate a warning if the provided `implements` field of an introspection query specifies an object not implementing some interface. So we use that field instead. 
* [skip ci] Fix warnings by commenting out query plan caching * [skip ci] improve masking/commenting query caching related code & few warning fixes * [skip ci] Fixed compiler warnings in graphql-parser-hs * Sync non-Haskell assets with master * [skip ci] add a test inserting invalid GraphQL but valid JSON value in a jsonb column * [skip ci] Avoid converting to/from Map * [skip ci] Apply some hlint suggestions * [skip ci] remove redundant constraints from buildLiveQueryPlan and explainGQLQuery * [skip ci] add NOTEs about missing Tracing constraints in PDV from master * Remove -fdefer-typed-holes, fix warnings * Update cabal.project.freeze * Limit GHC’s heap size to 8GB in CI to avoid the OOM killer * Commit package-lock.json for Python tests’ remote schema server * restrict env variables start with HASURA_GRAPHQL_ for headers configuration in actions, event triggers & remote schemas (#5519) * restrict env variables start with HASURA_GRAPHQL_ for headers definition in actions & event triggers * update CHANGELOG.md * Apply suggestions from code review Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * add test for table_by_pk node when roles doesn't have permission to PK * [skip ci] fix introspection query if any enum column present in primary key (fix #5200) (#5522) * [skip ci] test case fix for a6450e126bc2d98bcfd3791501986e4627ce6c6f * [skip ci] add tests to agg queries when role doesn't have access to any cols * fix backend test * Simplify subscription execution * [skip ci] add test to check if required headers are present while querying * Suppose, table B is related to table A and to query B certain headers are necessary, then the test checks that we are throwing error when the header is not set when B is queried through A * fix mutations not checking for view mutability * [skip ci] add variable type checking and corresponding tests * [skip ci] add test to check if update headers are present while doing an upsert * [skip ci] add positive 
counterparts to some of the negative permission tests * fix args missing their description in introspect * [skip ci] Remove unused function; insert missing markNotReusable call * [skip ci] Add a Note about InputValue * [skip ci] Delete LegacySchema/ 🎉 * [skip ci] Delete GraphQL/{Resolve,Validate}/ 🎉 * [skip ci] Delete top-level Resolve/Validate modules; tidy .cabal file * [skip ci] Delete LegacySchema top-level module Somehow I missed this one. * fix input value to json * [skip ci] elaborate on JSON objects in GraphQL * [skip ci] add missing file * [skip ci] add a test with subscription containing remote joins * add a test with remote joins in mutation output * [skip ci] Add some comments to Schema/Mutation.hs * [skip ci] Remove no longer needed code from RemoteServer.hs * [skip ci] Use a helper function to generate conflict clause parsers * [skip ci] fix type checker error in fields with default value * capitalize the header keys in select_articles_without_required_headers * Somehow, this was the reason the tests were failing. I have no idea, why! * [skip ci] Add a long Note about optional fields and nullability * Improve comments a bit; simplify Schema/Common.hs a bit * [skip ci] full implementation of 5.8.5 type checking. 
* [skip ci] fix validation test teardown * [skip ci] fix schema stitching test * fix remote schema ignoring enum nullability * [skip ci] fix fieldOptional to not discard nullability * revert nullability of use_spheroid * fix comment * add required remote fields with arguments for tests * [skip ci] add missing docstrings * [skip ci] fixed description of remote fields * [skip ci] change docstring for consistency * fix several schema inconsistencies * revert behaviour change in function arguments parsing * fix remaining nullability issues in new schema * minor no-op refactor; use isListType from graphql-parser-hs * use nullability of remote schema node, while creating a Remote reln * fix 'ID' input coercing & action 'ID' type relationship mapping * include ASTs in MonadExecuteQuery * needed for PRO code-base * Delete code for "interfaces implementing ifaces" (draft GraphQL spec) Previously I started writing some code that adds support for a future GraphQL feature where interfaces may themselves be sub-types of other interfaces. However, this code was incomplete, and partially incorrect. So this commit deletes support for that entirely. * Ignore a remote schema test during the upgrade/downgrade test The PDV refactor does a better job at exposing a minimal set of types through introspection. In particular, not every type that is present in a remote schema is re-exposed by Hasura. The test test_schema_stitching.py::TestRemoteSchemaBasic::test_introspection assumed that all types were re-exposed, which is not required for GraphQL compatibility, in order to test some aspect of our support for remote schemas. So while this particular test has been updated on PDV, the PDV branch now does not pass the old test, which we argue to be incorrect. Hence this test is disabled while we await a release, after which we can re-enable it. This also re-enables a test that was previously disabled for similar, though unrelated, reasons. 
* add haddock documentation to the action's field parsers * Deslecting some tests in server-upgrade Some tests with current build are failing on server upgrade which it should not. The response is more accurate than what it was. Also the upgrade tests were not throwing errors when the test is expected to return an error, but succeeds. The test framework is patched to catch this case. * [skip ci] Add a long Note about interfaces and object types * send the response headers back to client after running a query * Deselect a few more tests during upgrade/downgrade test * Update commit_diff.txt * change log kind from db_migrate to catalog_migrate (#5531) * Show method and complete URI in traced HTTP calls (#5525) Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * restrict env variables start with HASURA_GRAPHQL_ for headers configuration in actions, event triggers & remote schemas (#5519) * restrict env variables start with HASURA_GRAPHQL_ for headers definition in actions & event triggers * update CHANGELOG.md * Apply suggestions from code review Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> * fix introspection query if any enum column present in primary key (fix #5200) (#5522) * Fix telemetry reporting of transport (websocket was reported as http) * add log kinds in cli-migrations image (#5529) * add log kinds in cli-migrations image * give hint to resolve timeout error * minor changes and CHANGELOG * server: set hasura.tracecontext in RQL mutations [#5542] (#5555) * server: set hasura.tracecontext in RQL mutations [#5542] * Update test suite Co-authored-by: Tirumarai Selvan <tiru@hasura.io> * Add bulldozer auto-merge and -update configuration We still need to add the github app (as of time of opening this PR) Afterwards devs should be able to allow bulldozer to automatically "update" the branch, merging in parent when it changes, as well as automatically merge when all checks pass. 
This is opt-in by adding the `auto-update-auto-merge` label to the PR. * Remove 'bulldozer' config, try 'kodiak' for auto-merge see: https://github.com/chdsbd/kodiak The main issue that bit us was not being able to auto update forked branches, also: https://github.com/palantir/bulldozer/issues/66 https://github.com/palantir/bulldozer/issues/145 * Cherry-picked all commits * [skip ci] Slightly improve formatting * Revert "fix introspection query if any enum column present in primary key (fix #5200) (#5522)" This reverts commit 0f9a5afa59a88f6824f4d63d58db246a5ba3fb03. This undoes a cherry-pick of 34288e1eb5f2c5dad9e6d1e05453dd52397dc970 that was already done previously in a6450e126bc2d98bcfd3791501986e4627ce6c6f, and subsequently fixed for PDV in 70e89dc250f8ddc6e2b7930bbe2b3eeaa6dbe1db * Do a small bit of tidying in Hasura.GraphQL.Parser.Collect * Fix cherry-picking work Some previous cherry-picks ended up modifying code that is commented out * [skip ci] clarified comment regarding insert representation * [skip ci] removed obsolete todos * cosmetic change * fix action error message * [skip ci] remove obsolete comment * [skip ci] synchronize stylish haskell extensions list * use previously defined scalar names in parsers rather than ad-hoc literals * Apply most syntax hlint hints. * Clarify comment on update mutation. * [skip ci] Clarify what fields should be specified for objects * Update "_inc" description. 
* Use record types rather than tuples fo IntrospectionResult and ParsedIntrospection * Get rid of checkFieldNamesUnique (use Data.List.Extended.duplicates) * Throw more errors when collecting query root names * [skip ci] clean column parser comment * Remove dead code inserted in ab65b39 * avoid converting to non-empty list where not needed * add note and TODO about the disabled checks in PDV * minor refactor in remoteField' function * Unify two getObject methods * Nitpicks in Remote.hs * Update CHANGELOG.md * Revert "Unify two getObject methods" This reverts commit bd6bb40355b3d189a46c0312eb52225e18be57b3. We do need two different getObject functions as the corresponding error message is different * Fix error message in Remote.hs * Update CHANGELOG.md Co-authored-by: Auke Booij <auke@tulcod.com> * Apply suggested Changelog fix. Co-authored-by: Auke Booij <auke@tulcod.com> * Fix typo in Changelog. * [skip ci] Update changelog. * reuse type names to avoid duplication * Fix Hashable instance for Definition The presence of `Maybe Unique`, and an optional description, as part of `Definition`s, means that `Definition`s that are considered `Eq`ual may get different hashes. This can happen, for instance, when one object is memoized but another is not. * [skip ci] Update commit_diff.txt * Bump parser version. * Bump freeze file after changes in parser. * [skip ci] Incorporate commits from master * Fix developer flag in server/cabal.project.freeze Co-authored-by: Auke Booij <auke@tulcod.com> * Deselect a changed ENUM test for upgrade/downgrade CI * Deselect test here as well * [skip ci] remove dead code * Disable more tests for upgrade/downgrade * Fix which test gets deselected * Revert "Add hdb_catalog.current_setting abstraction for reading Hasura settings" This reverts commit 66e85ab9fbd56cca2c28a80201f6604fbe811b85. 
* Remove circular reference in cabal.project.freeze Co-authored-by: Karthikeyan Chinnakonda <karthikeyan@hasura.io> Co-authored-by: Auke Booij <auke@hasura.io> Co-authored-by: Tirumarai Selvan <tiru@hasura.io> Co-authored-by: Marion Schleifer <marion@hasura.io> Co-authored-by: Aleksandra Sikora <ola.zxcvbnm@gmail.com> Co-authored-by: Brandon Simmons <brandon.m.simmons@gmail.com> Co-authored-by: Vamshi Surabhi <0x777@users.noreply.github.com> Co-authored-by: Anon Ray <rayanon004@gmail.com> Co-authored-by: rakeshkky <12475069+rakeshkky@users.noreply.github.com> Co-authored-by: Anon Ray <ecthiender@users.noreply.github.com> Co-authored-by: Vamshi Surabhi <vamshi@hasura.io> Co-authored-by: Antoine Leblanc <antoine@hasura.io> Co-authored-by: Brandon Simmons <brandon@hasura.io> Co-authored-by: Phil Freeman <phil@hasura.io> Co-authored-by: Lyndon Maydwell <lyndon@sordina.net> Co-authored-by: Phil Freeman <paf31@cantab.net> Co-authored-by: Naveen Naidu <naveennaidu479@gmail.com> Co-authored-by: Karthikeyan Chinnakonda <chkarthikeyan95@gmail.com> Co-authored-by: Nizar Malangadan <nizar-m@users.noreply.github.com> Co-authored-by: Antoine Leblanc <crucuny@gmail.com> Co-authored-by: Auke Booij <auke@tulcod.com>
2020-08-21 20:27:01 +03:00
-- TODO(from master): what should be the behaviour of connection_init message when a
-- connection is already initialised? Currently, we seem to be doing
-- something arbitrary which isn't correct. Ideally, we should stick to
-- this:
--
-- > Allow connection_init message only when the connection state is
-- 'not initialised'. This means that there is no reason for the
-- connection to be in `CSInitError` state.
connState <- liftIO (STM.readTVarIO (_wscUser $ WS.getData wsConn))
case getIpAddress connState of
Left err -> unexpectedInitError err
Right ipAddress -> do
let headers = mkHeaders connState
res <- resolveUserInfo logger manager headers authMode Nothing
case res of
Left e -> do
let !initErr = CSInitError $ qeError e
liftIO $ do
$assertNFHere initErr -- so we don't write thunks to mutable vars
STM.atomically $ STM.writeTVar (_wscUser $ WS.getData wsConn) initErr
let connErr = ConnErrMsg $ qeError e
logWSEvent logger wsConn $ EConnErr connErr
liftIO $ onConnInitErrAction wsConn connErr WS.onConnInitErrorText
Right (userInfo, expTimeM) -> do
let !csInit = CSInitialised $ WsClientState userInfo expTimeM paramHeaders ipAddress
liftIO $ do
$assertNFHere csInit -- so we don't write thunks to mutable vars
STM.atomically $ STM.writeTVar (_wscUser $ WS.getData wsConn) csInit
sendMsg wsConn SMConnAck
liftIO $ keepAliveMessageAction wsConn
where
unexpectedInitError e = do
let connErr = ConnErrMsg e
logWSEvent logger wsConn $ EConnErr connErr
liftIO $ onConnInitErrAction wsConn connErr WS.onConnInitErrorText
getIpAddress = \case
CSNotInitialised _ ip -> return ip
CSInitialised WsClientState {..} -> return wscsIpAddress
CSInitError e -> Left e
mkHeaders st =
paramHeaders ++ getClientHdrs st
paramHeaders =
[ (CI.mk $ TE.encodeUtf8 h, TE.encodeUtf8 v)
| (h, v) <- maybe [] Map.toList $ connParamsM >>= _cpHeaders
]
getClientHdrs st = case st of
CSNotInitialised h _ -> unWsHeaders h
_ -> []
-- | Handler invoked when a websocket connection is closed.
--
-- Logs an 'EClosed' event for the connection and then removes, from the given
-- live-query state, every live query recorded in the connection's operation
-- map.
--
-- The operation map is read in a single STM transaction and the removals
-- happen afterwards, one at a time, in 'IO'; see the module header for the
-- properties (masked async exceptions, no racing on the same connection) that
-- the handlers rely on.
onClose ::
  MonadIO m =>
  L.Logger L.Hasura ->
  ServerMetrics ->
  LQ.LiveQueriesState ->
  WSConn ->
  m ()
onClose logger serverMetrics lqMap wsConn = do
  logWSEvent logger wsConn EClosed
  liftIO $ do
    -- Snapshot the operations currently registered on this connection.
    registeredOps <- STM.atomically . ListT.toList . STMMap.listT $ connOpMap
    for_ registeredOps stopLiveQuery
  where
    connOpMap = _wscOpMap (WS.getData wsConn)
    -- Only the live query id of each entry is needed; the map key and the
    -- second component of the value are ignored.
    stopLiveQuery (_, (lqId, _)) =
      LQ.removeLiveQuery logger serverMetrics lqMap lqId