diff --git a/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json b/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json index 64b9b67893f..b04842f3762 100644 --- a/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json +++ b/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json @@ -3453,7 +3453,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Total number of incoming requests for cache lookup", + "description": "Postgres connection errors from GraphQL Engine instances", "fieldConfig": { "defaults": { "color": { @@ -3501,6 +3501,10 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 1 } ] }, @@ -3514,7 +3518,7 @@ "x": 12, "y": 81 }, - "id": 57, + "id": 66, "options": { "legend": { "calcs": [], @@ -3534,25 +3538,13 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))", - "legendFormat": "Cache Hit", + "expr": "sum by (job, role,conn_info,source_name) (increase(hasura_postgres_connection_error_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) ", + "legendFormat": "__auto", "range": true, - "refId": "Hit" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", - "hide": false, - "legendFormat": "Total", - "range": true, - "refId": "Total" + "refId": "A" } ], - "title": "Cache Request Rate", + "title": "Postgres Connectionr Errors", "type": "timeseries" }, { @@ -3748,6 +3740,113 @@ ], "title": "Postgres Pool Wait Time (P95)", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total number of incoming requests for cache lookup", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 57, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))", + "legendFormat": "Cache Hit", + "range": true, + "refId": "Hit" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "Cache Request Rate", + "type": "timeseries" } ], "refresh": "", @@ -3832,6 +3931,6 @@ "timezone": "", "title": "Hasura Overview", "uid": "Of9GFjr7z", - "version": 2, + "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx index 7979050333b..881d0bb1a81 100644 --- a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx +++ b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx @@ -485,6 +485,16 @@ The time taken to acquire a connection from the pool. | Labels | `source_name`: name of the database
`conn_info`: connection url string (password omitted) or name of the connection url environment variable
`role`: primary \| replica | | Unit | seconds | +#### Hasura Postgres Connection Errors Total + +Total number of PostgreSQL connection errors. + +| | | +| ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Name | `hasura_postgres_connection_error_total` | +| Type | Counter | +| Labels | `source_name`: name of the database
`conn_info`: connection url string (password omitted) or name of the connection url environment variable
`role`: primary \| replica | + ### Hasura source health Health check status of a particular data source, corresponding to the output of `/healthz/sources`, with possible values diff --git a/rfcs/v3/engine-plugins.md b/rfcs/v3/engine-plugins.md deleted file mode 100644 index 21a70f4aeb4..00000000000 --- a/rfcs/v3/engine-plugins.md +++ /dev/null @@ -1,346 +0,0 @@ -# Engine-plugins in Hasura V3 - -This document focuses on the implementation details for HTTP-based engine -plugins. - -## Pre-parse Hook - -For a pre-parse plugin, the request to the plugin is performed just after -receiving the request to the engine. - -### Configuration - -The pre-parse plugin can be configured using an OpenDD object of kind `LifecyclePluginHook`. It includes the following information: - -1. The engine-plugin URL -2. Request Includes (this can be used to optimize critical engine plugins): - 1. Request Headers - 2. Graphql request - 3. Variables - -Please note that the presence of `operationName` is not configurable, and -including/excluding operation name won't have much impact on the request size. - -An example of configuration JSON is: - -```json -{ - "kind": "LifecyclePluginHook", - "version": "v1", - "definition": { - "pre": "parse", - "name": "test", - "url": "http://localhost:8787", - "config": { - "request": { - "headers": { - "additional": { - "hasura-m-auth": { - "value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ" - } - } - }, - "session": {}, - "rawRequest": { - "query": {}, - "variables": {} - } - } - } - } -} -``` - -### Request - -The request to the pre-parse hook should have sufficient information to cater to -the following planned use cases: - -1. Rate limits -2. Depth limits -3. Node limits -4. Caching (get-cache) - -The request should have the following: - -1. Headers: Include information for the uniqueness of the request (origin, - session variables, etc.), cache control information, etc. -2. Hasura’s session information: Role and session variables -3. Raw request: Raw request received by graphql-engine (including variables) - -```json -{ - "session": , - "rawRequest": -} -``` - -### Response - -The response of a pre-parse hook can be of three types: - -1. Return with a response: The engine-plugin has handled the request, and the - graphql-engine should return the response provided by the engine-plugin. - (Should we check if the response is valid according to the spec?) -2. Continue with the execution: The graphql-engine should proceed with the - request handling. -3. Error response: Abort the request with the error response. - -As suggested by @SamirTalwar, we can also use HTTP status codes to decide the -type of the response, i.e. - -1. 200s HTTP status code will mean either: - 1. 200: A successful response - 2. 204: Or continued execution -2. 400 HTTP status code will mean user error -3. 500 HTTP status code will mean an internal error - -#### Success response - -HTTP code: 200 - -``` - -``` - -#### Continue with execution - -HTTP code: 204 There should be no response body for this case - -#### Error - -A pre-plugin response can be of two types: - -1. User error: This will include errors that can be propagated to the user. - - HTTP code: 400 - - ``` - - ``` - -2. Internal error: Internal errors are encountered while handling the request. - The engine-plugin can dictate the engine to either abort the execution or - continue with the request. The internal errors will not be propagated to the - users; they will only be part of the traces. 
- - HTTP code: 500 - ```json - { - "details": , - "action": - } - ``` - -## Pre-response hook - -A pre-response hook is called just before returning a response to the user. For -now, we will have asynchronous pre-response hooks only. - -An asynchronous hook will be useful for the following use cases: - -1. Caching (cache-set) -2. Custom business logic: Send mail/slack notifications for mutations - -### Configuration - -Like a pre-parse hook, a pre-response hook can also be configured using a -configuration file. The request can be configured to omit a few fields if -needed. - -An example of configuration JSON is: - -```json -{ - "kind": "LifecyclePluginHook", - "version": "v1", - "definition": { - "pre": "response", - "name": "test", - "url": "http://localhost:8787", - "config": { - "request": { - "headers": { - "additional": { - "hasura-m-auth": { - "value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ" - } - } - }, - "session": {}, - "rawRequest": { - "query": {}, - "variables": {} - }, - "response": {} - } - } - } -} -``` - -### Request - -A pre-response hook’s request can have the following fields: - -1. Raw request: The raw request for which the engine has generated the response. -2. Session: The role and session variables -3. Engine’s response: The response that we have generated after executing the - query. -4. Request headers: This can be important for caching engine plugins - -```json -{ - "session": , - "rawRequest": , - "response": -} -``` - -### Response - -For asynchronous pre-response hook, the request can be either of the two: - -1. Success -2. Error - -#### Async Success Response - -HTTP Code: 200s - -``` -There need not be any response body. -``` - -#### Async Error Response - -HTTP code 400s - -``` - -``` - -The error details will be part of the traces. - -## Multiple engine-plugins - -The engine can handle multiple engine plugins. - -### Pre-plugins - -For example, multiple pre-plugins can be thought of as a pipeline: - -``` - _____________________ ______________________ __________________ - | | | | | | - Request--->| Pre-parse Plugin 1 |---->| Pre-parse Plugin 2 |---->| Engine Execution |--... - |_____________________| |______________________| |__________________| -``` - -For plugin 2, we will do the following: - -- If plugin 1 responds successfully/error, we will NOT call plugin 2, and there - will be a short-circuit. -- Only for the continued execution case will we call plugin 2. -- The request to all the pre-plugin will be the same (the raw request and - session information are not going to change) - -### Pre-response - -Multiple pre-response engine plugins can also be handled. Since they are async -in nature, we can execute them in parallel: - -``` -Engine execution ------> To the user - | ________________ - | | Async | - |---->| Pre-response | - | | Plugin 1 | - | |________________| - | ________________ - | | Async | - |---->| Pre-response | - | | Plugin 2 | - | |________________| - ... -``` - -## How will this look in the metadata? - -Engine plugins will be part of the metadata (OpenDD). This will be more like the -`AuthConfig` and will be handled while building the artifacts. - -The engine-plugin artifacts will be similar to how we store `AuthConfig` -artifacts right now. We will have new artifacts (pre-parse and pre-response -plugin artifacts). - -Each artifact will have a list of engine plugins in the order of execution. 
For -example: - -``` - __________________ -| ______________ | -| | Pre-parse 1 | | ________________ ________________ __________________ -| |______________| | | | | | | | -| ______________ | =====> Request--->| Pre-Parse 1 |---->| Pre-Parse 2 |---->| Engine Execution |--... -| | Pre-parse 2 | | |________________| |________________| |__________________| -| |______________| | -|__________________| -``` - -For pre-response, the order doesn’t matter right now, but we will still maintain -an order (to future-proof for synchronous pre-response). - -There are a few caveats with the ordering of engine plugins for the multitenant -engine or DDN cloud: Auth plugin (once converted to an engine plugin, will -always be executed first). - -## Future plans - -### Synchronous pre-response hook - -A synchronous hook can be useful for response transformation using something -like kriti-lang. - -For synchronous pre-response hooks, the response can be similar to the pre-parse -hook. I.e., it can be one of the three: Return with a response: The engine -plugin has handled the request, and the graphql-engine should return the -response provided by the engine plugin (and ignore the response generated by the -engine). Return with engine’s response: The graphql-engine should proceed with -the engine’s response. Error response: Abort the request with the error -response. - -Synchronous pre-response engine-plugins will be daisy-chained with one another: - -``` - __________________ __________________ - | | | | -Engine execution --->| pre-response 1 |---->| pre-response 2 |----> ... - |__________________| |__________________| -``` - -For synchronous pre-response, the response will be the response from the -previous node (i.e., for response 1, the response will be generated by the -engine, but for pre-response 2, it will be dependent on pre-response 1). Here -also, in case of an error response, we will short-circuit the execution stack. - -#### Mixing synchronous and asynchronous pre-response - -In case there are multiple synchronous as well as asynchronous pre-response, the -execution stack will look like this: First, we will handle all the synchronous -pre-response. In the end, we will handle the asynchronous ones. - -``` - _________________ _________________ - | Sync | | Sync | -Engine execution --->| pre-response 1 |---->| pre-response 2 |-------> To the user - |_________________| |_________________| | _________________ - | | Async | - |---->| pre-response 1 | - | |_________________| - | _________________ - | | Async | - |---->| pre-response 2 | - | |_________________| - ... 
-```
diff --git a/server/lib/pg-client/src/Database/PG/Query/Pool.hs b/server/lib/pg-client/src/Database/PG/Query/Pool.hs
index ff11ad16dc0..1f6688acaf3 100644
--- a/server/lib/pg-client/src/Database/PG/Query/Pool.hs
+++ b/server/lib/pg-client/src/Database/PG/Query/Pool.hs
@@ -59,6 +59,8 @@ import Language.Haskell.TH.Quote (QuasiQuoter (..))
 import Language.Haskell.TH.Syntax (Exp, Q, lift, qAddDependentFile, runIO)
 import System.Metrics.Distribution (Distribution)
 import System.Metrics.Distribution qualified as EKG.Distribution
+import System.Metrics.Prometheus.Counter (Counter)
+import System.Metrics.Prometheus.Counter qualified as Counter
 import System.Metrics.Prometheus.Histogram (Histogram)
 import System.Metrics.Prometheus.Histogram qualified as Histogram
 import Prelude
@@ -92,7 +94,9 @@ data PGPoolMetrics = PGPoolMetrics
   { -- | time taken to establish and initialise a PostgreSQL connection
     _pgConnAcquireLatencyMetric :: !Histogram,
     -- | time taken to acquire a connection from the pool
-    _poolWaitTimeMetric :: !Histogram
+    _poolWaitTimeMetric :: !Histogram,
+    -- | total number of PostgreSQL connection errors
+    _pgErrorTotalMetric :: !Counter
   }
 
 getInUseConnections :: PGPool -> IO Int
@@ -129,6 +133,7 @@ initPGPoolMetrics :: IO PGPoolMetrics
 initPGPoolMetrics = do
   _pgConnAcquireLatencyMetric <- Histogram.new histogramBuckets
   _poolWaitTimeMetric <- Histogram.new histogramBuckets
+  _pgErrorTotalMetric <- Counter.new
   pure PGPoolMetrics {..}
   where
     histogramBuckets = [0.000001, 0.0001, 0.01, 0.1, 0.3, 1, 3, 10, 30, 100]
@@ -151,7 +156,7 @@ initPGPool ci context cp logger = do
     retryP = mkPGRetryPolicy $ ciRetries ci
     creator stats metrics = do
      createdAt <- getCurrentTime
-      pqConn <- initPQConn ci logger
+      pqConn <- initPQConn ci logger `Exc.onException` Counter.inc (_pgErrorTotalMetric metrics)
      connAcquiredAt <- getCurrentTime
      let connAcquiredMicroseconds = realToFrac (1000000 * diffUTCTime connAcquiredAt createdAt)
          connAcquiredSeconds = realToFrac $ diffUTCTime connAcquiredAt createdAt
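
Note (illustration only, not part of the patch): the `Exc.onException` wrapper added above increments the new counter when `initPQConn` throws and then re-throws the original exception, so the pool's existing retry and error handling is unchanged; that counter is what backs `hasura_postgres_connection_error_total` and the new dashboard panel. A minimal standalone sketch of the pattern, reusing the same `System.Metrics.Prometheus.Counter` module the patch imports (the module name, `trackConnectionErrors`, and the `connect` argument are hypothetical):

```haskell
{-# LANGUAGE ImportQualifiedPost #-}

-- Sketch only: not the engine's actual code path.
module TrackConnectionErrors (trackConnectionErrors) where

import Control.Exception (onException)
import System.Metrics.Prometheus.Counter (Counter)
import System.Metrics.Prometheus.Counter qualified as Counter

-- | Run a connection action; if it throws, bump the error counter and
-- re-throw the original exception ('onException' never swallows it),
-- so callers still observe the failure exactly as before.
trackConnectionErrors :: Counter -> IO conn -> IO conn
trackConnectionErrors errorCounter connect =
  connect `onException` Counter.inc errorCounter
```

On the Prometheus side, the dashboard panel added above sums `increase(hasura_postgres_connection_error_total{...}[$__rate_interval])` by `job`, `role`, `conn_info`, and `source_name`, so any non-zero series identifies which database source and primary/replica role is failing to connect.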