diff --git a/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json b/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json
index 64b9b67893f..b04842f3762 100644
--- a/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json
+++ b/community/boilerplates/observability/enterprise/grafana/dashboards/hasura/hasura-overview.json
@@ -3453,7 +3453,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "description": "Total number of incoming requests for cache lookup",
+ "description": "Postgres connection errors from GraphQL Engine instances",
"fieldConfig": {
"defaults": {
"color": {
@@ -3501,6 +3501,10 @@
{
"color": "green",
"value": null
+ },
+ {
+ "color": "red",
+ "value": 1
}
]
},
@@ -3514,7 +3518,7 @@
"x": 12,
"y": 81
},
- "id": 57,
+ "id": 66,
"options": {
"legend": {
"calcs": [],
@@ -3534,25 +3538,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
- "legendFormat": "Cache Hit",
+          "expr": "sum by (job, role, conn_info, source_name) (increase(hasura_postgres_connection_error_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
+ "legendFormat": "__auto",
"range": true,
- "refId": "Hit"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
- "hide": false,
- "legendFormat": "Total",
- "range": true,
- "refId": "Total"
+ "refId": "A"
}
],
- "title": "Cache Request Rate",
+      "title": "Postgres Connection Errors",
"type": "timeseries"
},
{
@@ -3748,6 +3740,113 @@
],
"title": "Postgres Pool Wait Time (P95)",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Total number of incoming requests for cache lookup",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "smooth",
+ "lineStyle": {
+ "fill": "solid"
+ },
+ "lineWidth": 1,
+ "pointSize": 1,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 95
+ },
+ "id": 57,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
+ "legendFormat": "Cache Hit",
+ "range": true,
+ "refId": "Hit"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
+ "hide": false,
+ "legendFormat": "Total",
+ "range": true,
+ "refId": "Total"
+ }
+ ],
+ "title": "Cache Request Rate",
+ "type": "timeseries"
}
],
"refresh": "",
@@ -3832,6 +3931,6 @@
"timezone": "",
"title": "Hasura Overview",
"uid": "Of9GFjr7z",
- "version": 2,
+ "version": 1,
"weekStart": ""
}
\ No newline at end of file
diff --git a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
index 7979050333b..881d0bb1a81 100644
--- a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
+++ b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
@@ -485,6 +485,16 @@ The time taken to acquire a connection from the pool.
| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
| Unit | seconds |
+#### Hasura Postgres Connection Errors Total
+
+Total number of PostgreSQL connection errors.
+
+| | |
+| ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Name | `hasura_postgres_connection_error_total` |
+| Type | Counter |
+| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
+
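+For example, the per-source rate of connection errors can be charted with a PromQL query such as the following (the 5-minute window is illustrative):
+
+```promql
+sum by (source_name, role) (rate(hasura_postgres_connection_error_total[5m]))
+```
+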
### Hasura source health
Health check status of a particular data source, corresponding to the output of `/healthz/sources`, with possible values
diff --git a/rfcs/v3/engine-plugins.md b/rfcs/v3/engine-plugins.md
deleted file mode 100644
index 21a70f4aeb4..00000000000
--- a/rfcs/v3/engine-plugins.md
+++ /dev/null
@@ -1,346 +0,0 @@
-# Engine-plugins in Hasura V3
-
-This document focuses on the implementation details for HTTP-based engine
-plugins.
-
-## Pre-parse Hook
-
-For a pre-parse plugin, the request to the plugin is performed just after
-receiving the request to the engine.
-
-### Configuration
-
-The pre-parse plugin can be configured using an OpenDD object of kind `LifecyclePluginHook`. It includes the following information:
-
-1. The engine-plugin URL
-2. Request Includes (this can be used to optimize critical engine plugins):
- 1. Request Headers
- 2. Graphql request
- 3. Variables
-
-Please note that the presence of `operationName` is not configurable, and
-including/excluding operation name won't have much impact on the request size.
-
-An example of configuration JSON is:
-
-```json
-{
- "kind": "LifecyclePluginHook",
- "version": "v1",
- "definition": {
- "pre": "parse",
- "name": "test",
- "url": "http://localhost:8787",
- "config": {
- "request": {
- "headers": {
- "additional": {
- "hasura-m-auth": {
- "value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
- }
- }
- },
- "session": {},
- "rawRequest": {
- "query": {},
- "variables": {}
- }
- }
- }
- }
-}
-```
-
-### Request
-
-The request to the pre-parse hook should have sufficient information to cater to
-the following planned use cases:
-
-1. Rate limits
-2. Depth limits
-3. Node limits
-4. Caching (get-cache)
-
-The request should have the following:
-
-1. Headers: Include information for the uniqueness of the request (origin,
- session variables, etc.), cache control information, etc.
-2. Hasura’s session information: Role and session variables
-3. Raw request: Raw request received by graphql-engine (including variables)
-
-```json
-{
- "session": ,
- "rawRequest":
-}
-```
-
-### Response
-
-The response of a pre-parse hook can be of three types:
-
-1. Return with a response: The engine-plugin has handled the request, and the
- graphql-engine should return the response provided by the engine-plugin.
- (Should we check if the response is valid according to the spec?)
-2. Continue with the execution: The graphql-engine should proceed with the
- request handling.
-3. Error response: Abort the request with the error response.
-
-As suggested by @SamirTalwar, we can also use HTTP status codes to decide the
-type of the response, i.e.
-
-1. 200s HTTP status code will mean either:
- 1. 200: A successful response
- 2. 204: Or continued execution
-2. 400 HTTP status code will mean user error
-3. 500 HTTP status code will mean an internal error
-
-#### Success response
-
-HTTP code: 200
-
-```
-
-```
-
-#### Continue with execution
-
-HTTP code: 204 There should be no response body for this case
-
-#### Error
-
-A pre-plugin response can be of two types:
-
-1. User error: This will include errors that can be propagated to the user.
-
- HTTP code: 400
-
- ```
-
- ```
-
-2. Internal error: Internal errors are encountered while handling the request.
- The engine-plugin can dictate the engine to either abort the execution or
- continue with the request. The internal errors will not be propagated to the
- users; they will only be part of the traces.
-
- HTTP code: 500
- ```json
- {
- "details": ,
- "action":
- }
- ```
-
-## Pre-response hook
-
-A pre-response hook is called just before returning a response to the user. For
-now, we will have asynchronous pre-response hooks only.
-
-An asynchronous hook will be useful for the following use cases:
-
-1. Caching (cache-set)
-2. Custom business logic: Send mail/slack notifications for mutations
-
-### Configuration
-
-Like a pre-parse hook, a pre-response hook can also be configured using a
-configuration file. The request can be configured to omit a few fields if
-needed.
-
-An example of configuration JSON is:
-
-```json
-{
- "kind": "LifecyclePluginHook",
- "version": "v1",
- "definition": {
- "pre": "response",
- "name": "test",
- "url": "http://localhost:8787",
- "config": {
- "request": {
- "headers": {
- "additional": {
- "hasura-m-auth": {
- "value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
- }
- }
- },
- "session": {},
- "rawRequest": {
- "query": {},
- "variables": {}
- },
- "response": {}
- }
- }
- }
-}
-```
-
-### Request
-
-A pre-response hook’s request can have the following fields:
-
-1. Raw request: The raw request for which the engine has generated the response.
-2. Session: The role and session variables
-3. Engine’s response: The response that we have generated after executing the
- query.
-4. Request headers: This can be important for caching engine plugins
-
-```json
-{
- "session": ,
- "rawRequest": ,
- "response":
-}
-```
-
-### Response
-
-For asynchronous pre-response hook, the request can be either of the two:
-
-1. Success
-2. Error
-
-#### Async Success Response
-
-HTTP Code: 200s
-
-```
-There need not be any response body.
-```
-
-#### Async Error Response
-
-HTTP code 400s
-
-```
-
-```
-
-The error details will be part of the traces.
-
-## Multiple engine-plugins
-
-The engine can handle multiple engine plugins.
-
-### Pre-plugins
-
-For example, multiple pre-plugins can be thought of as a pipeline:
-
-```
- _____________________ ______________________ __________________
- | | | | | |
- Request--->| Pre-parse Plugin 1 |---->| Pre-parse Plugin 2 |---->| Engine Execution |--...
- |_____________________| |______________________| |__________________|
-```
-
-For plugin 2, we will do the following:
-
-- If plugin 1 responds successfully/error, we will NOT call plugin 2, and there
- will be a short-circuit.
-- Only for the continued execution case will we call plugin 2.
-- The request to all the pre-plugin will be the same (the raw request and
- session information are not going to change)
-
-### Pre-response
-
-Multiple pre-response engine plugins can also be handled. Since they are async
-in nature, we can execute them in parallel:
-
-```
-Engine execution ------> To the user
- | ________________
- | | Async |
- |---->| Pre-response |
- | | Plugin 1 |
- | |________________|
- | ________________
- | | Async |
- |---->| Pre-response |
- | | Plugin 2 |
- | |________________|
- ...
-```
-
-## How will this look in the metadata?
-
-Engine plugins will be part of the metadata (OpenDD). This will be more like the
-`AuthConfig` and will be handled while building the artifacts.
-
-The engine-plugin artifacts will be similar to how we store `AuthConfig`
-artifacts right now. We will have new artifacts (pre-parse and pre-response
-plugin artifacts).
-
-Each artifact will have a list of engine plugins in the order of execution. For
-example:
-
-```
- __________________
-| ______________ |
-| | Pre-parse 1 | | ________________ ________________ __________________
-| |______________| | | | | | | |
-| ______________ | =====> Request--->| Pre-Parse 1 |---->| Pre-Parse 2 |---->| Engine Execution |--...
-| | Pre-parse 2 | | |________________| |________________| |__________________|
-| |______________| |
-|__________________|
-```
-
-For pre-response, the order doesn’t matter right now, but we will still maintain
-an order (to future-proof for synchronous pre-response).
-
-There are a few caveats with the ordering of engine plugins for the multitenant
-engine or DDN cloud: Auth plugin (once converted to an engine plugin, will
-always be executed first).
-
-## Future plans
-
-### Synchronous pre-response hook
-
-A synchronous hook can be useful for response transformation using something
-like kriti-lang.
-
-For synchronous pre-response hooks, the response can be similar to the pre-parse
-hook. I.e., it can be one of the three: Return with a response: The engine
-plugin has handled the request, and the graphql-engine should return the
-response provided by the engine plugin (and ignore the response generated by the
-engine). Return with engine’s response: The graphql-engine should proceed with
-the engine’s response. Error response: Abort the request with the error
-response.
-
-Synchronous pre-response engine-plugins will be daisy-chained with one another:
-
-```
- __________________ __________________
- | | | |
-Engine execution --->| pre-response 1 |---->| pre-response 2 |----> ...
- |__________________| |__________________|
-```
-
-For synchronous pre-response, the response will be the response from the
-previous node (i.e., for response 1, the response will be generated by the
-engine, but for pre-response 2, it will be dependent on pre-response 1). Here
-also, in case of an error response, we will short-circuit the execution stack.
-
-#### Mixing synchronous and asynchronous pre-response
-
-In case there are multiple synchronous as well as asynchronous pre-response, the
-execution stack will look like this: First, we will handle all the synchronous
-pre-response. In the end, we will handle the asynchronous ones.
-
-```
- _________________ _________________
- | Sync | | Sync |
-Engine execution --->| pre-response 1 |---->| pre-response 2 |-------> To the user
- |_________________| |_________________| | _________________
- | | Async |
- |---->| pre-response 1 |
- | |_________________|
- | _________________
- | | Async |
- |---->| pre-response 2 |
- | |_________________|
- ...
-```
diff --git a/server/lib/pg-client/src/Database/PG/Query/Pool.hs b/server/lib/pg-client/src/Database/PG/Query/Pool.hs
index ff11ad16dc0..1f6688acaf3 100644
--- a/server/lib/pg-client/src/Database/PG/Query/Pool.hs
+++ b/server/lib/pg-client/src/Database/PG/Query/Pool.hs
@@ -59,6 +59,8 @@ import Language.Haskell.TH.Quote (QuasiQuoter (..))
import Language.Haskell.TH.Syntax (Exp, Q, lift, qAddDependentFile, runIO)
import System.Metrics.Distribution (Distribution)
import System.Metrics.Distribution qualified as EKG.Distribution
+import System.Metrics.Prometheus.Counter (Counter)
+import System.Metrics.Prometheus.Counter qualified as Counter
import System.Metrics.Prometheus.Histogram (Histogram)
import System.Metrics.Prometheus.Histogram qualified as Histogram
import Prelude
@@ -92,7 +94,9 @@ data PGPoolMetrics = PGPoolMetrics
{ -- | time taken to establish and initialise a PostgreSQL connection
_pgConnAcquireLatencyMetric :: !Histogram,
-- | time taken to acquire a connection from the pool
- _poolWaitTimeMetric :: !Histogram
+ _poolWaitTimeMetric :: !Histogram,
+    -- | total number of PostgreSQL connection errors
+ _pgErrorTotalMetric :: !Counter
}
getInUseConnections :: PGPool -> IO Int
@@ -129,6 +133,7 @@ initPGPoolMetrics :: IO PGPoolMetrics
initPGPoolMetrics = do
_pgConnAcquireLatencyMetric <- Histogram.new histogramBuckets
_poolWaitTimeMetric <- Histogram.new histogramBuckets
+ _pgErrorTotalMetric <- Counter.new
pure PGPoolMetrics {..}
where
histogramBuckets = [0.000001, 0.0001, 0.01, 0.1, 0.3, 1, 3, 10, 30, 100]
@@ -151,7 +156,7 @@ initPGPool ci context cp logger = do
retryP = mkPGRetryPolicy $ ciRetries ci
creator stats metrics = do
createdAt <- getCurrentTime
- pqConn <- initPQConn ci logger
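+      -- if establishing the connection throws, count it towards the connection error counter before rethrowing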
+ pqConn <- (initPQConn ci logger) `Exc.onException` (Counter.inc (_pgErrorTotalMetric metrics))
connAcquiredAt <- getCurrentTime
let connAcquiredMicroseconds = realToFrac (1000000 * diffUTCTime connAcquiredAt createdAt)
connAcquiredSeconds = realToFrac $ diffUTCTime connAcquiredAt createdAt