server: add hasura_postgres_connection_error_total metric

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/11063
GitOrigin-RevId: 0e0f8b6e7759623f470893aaa6d6e68d205269b5
This commit is contained in:
Toan Nguyen 2024-10-22 18:51:40 +07:00 committed by hasura-bot
parent f68438b78e
commit 8b956bfafa
4 changed files with 135 additions and 367 deletions

View File

@ -3453,7 +3453,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Total number of incoming requests for cache lookup",
"description": "Postgres connection errors from GraphQL Engine instances",
"fieldConfig": {
"defaults": {
"color": {
@ -3501,6 +3501,10 @@
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
@ -3514,7 +3518,7 @@
"x": 12,
"y": 81
},
"id": 57,
"id": 66,
"options": {
"legend": {
"calcs": [],
@ -3534,25 +3538,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
"legendFormat": "Cache Hit",
"expr": "sum by (job, role, conn_info, source_name) (increase(hasura_postgres_connection_error_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "Hit"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
"hide": false,
"legendFormat": "Total",
"range": true,
"refId": "Total"
"refId": "A"
}
],
"title": "Cache Request Rate",
"title": "Postgres Connection Errors",
"type": "timeseries"
},
{
@ -3748,6 +3740,113 @@
],
"title": "Postgres Pool Wait Time (P95)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Total number of incoming requests for cache lookup",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 1,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 95
},
"id": 57,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
"legendFormat": "Cache Hit",
"range": true,
"refId": "Hit"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
"hide": false,
"legendFormat": "Total",
"range": true,
"refId": "Total"
}
],
"title": "Cache Request Rate",
"type": "timeseries"
}
],
"refresh": "",
@ -3832,6 +3931,6 @@
"timezone": "",
"title": "Hasura Overview",
"uid": "Of9GFjr7z",
"version": 2,
"version": 1,
"weekStart": ""
}

View File

@ -485,6 +485,16 @@ The time taken to acquire a connection from the pool.
| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
| Unit | seconds |
#### Hasura Postgres Connection Errors Total
Total number of PostgreSQL connection errors.
| | |
| ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Name | `hasura_postgres_connection_error_total` |
| Type | Counter |
| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
### Hasura source health
Health check status of a particular data source, corresponding to the output of `/healthz/sources`, with possible values

View File

@ -1,346 +0,0 @@
# Engine-plugins in Hasura V3
This document focuses on the implementation details for HTTP-based engine
plugins.
## Pre-parse Hook
For a pre-parse plugin, the request to the plugin is performed just after
receiving the request to the engine.
### Configuration
The pre-parse plugin can be configured using an OpenDD object of kind `LifecyclePluginHook`. It includes the following information:
1. The engine-plugin URL
2. Request Includes (this can be used to optimize critical engine plugins):
1. Request Headers
2. Graphql request
3. Variables
Please note that the presence of `operationName` is not configurable, and
including/excluding operation name won't have much impact on the request size.
An example of configuration JSON is:
```json
{
"kind": "LifecyclePluginHook",
"version": "v1",
"definition": {
"pre": "parse",
"name": "test",
"url": "http://localhost:8787",
"config": {
"request": {
"headers": {
"additional": {
"hasura-m-auth": {
"value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
}
}
},
"session": {},
"rawRequest": {
"query": {},
"variables": {}
}
}
}
}
}
```
### Request
The request to the pre-parse hook should have sufficient information to cater to
the following planned use cases:
1. Rate limits
2. Depth limits
3. Node limits
4. Caching (get-cache)
The request should have the following:
1. Headers: Include information for the uniqueness of the request (origin,
session variables, etc.), cache control information, etc.
2. Hasura's session information: Role and session variables
3. Raw request: Raw request received by graphql-engine (including variables)
```json
{
"session": <the hasura session object>,
"rawRequest": <raw request>
}
```
### Response
The response of a pre-parse hook can be of three types:
1. Return with a response: The engine-plugin has handled the request, and the
graphql-engine should return the response provided by the engine-plugin.
(Should we check if the response is valid according to the spec?)
2. Continue with the execution: The graphql-engine should proceed with the
request handling.
3. Error response: Abort the request with the error response.
As suggested by @SamirTalwar, we can also use HTTP status codes to decide the
type of the response, i.e.
1. 200s HTTP status code will mean either:
1. 200: A successful response
2. 204: Or continued execution
2. 400 HTTP status code will mean user error
3. 500 HTTP status code will mean an internal error
#### Success response
HTTP code: 200
```
<the response json value>
```
#### Continue with execution
HTTP code: 204. There should be no response body for this case.
#### Error
A pre-plugin response can be of two types:
1. User error: This will include errors that can be propagated to the user.
HTTP code: 400
```
<The user error json value>
```
2. Internal error: Internal errors are encountered while handling the request.
The engine-plugin can dictate the engine to either abort the execution or
continue with the request. The internal errors will not be propagated to the
users; they will only be part of the traces.
HTTP code: 500
```json
{
"details": <The internal error json value>,
"action": <abort/continue>
}
```
## Pre-response hook
A pre-response hook is called just before returning a response to the user. For
now, we will have asynchronous pre-response hooks only.
An asynchronous hook will be useful for the following use cases:
1. Caching (cache-set)
2. Custom business logic: Send mail/slack notifications for mutations
### Configuration
Like a pre-parse hook, a pre-response hook can also be configured using a
configuration file. The request can be configured to omit a few fields if
needed.
An example of configuration JSON is:
```json
{
"kind": "LifecyclePluginHook",
"version": "v1",
"definition": {
"pre": "response",
"name": "test",
"url": "http://localhost:8787",
"config": {
"request": {
"headers": {
"additional": {
"hasura-m-auth": {
"value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
}
}
},
"session": {},
"rawRequest": {
"query": {},
"variables": {}
},
"response": {}
}
}
}
}
```
### Request
A pre-response hooks request can have the following fields:
1. Raw request: The raw request for which the engine has generated the response.
2. Session: The role and session variables
3. Engine's response: The response that we have generated after executing the
query.
4. Request headers: This can be important for caching engine plugins
```json
{
"session": <the hasura session object>,
"rawRequest": <raw request>,
"response": <engine's response>
}
```
### Response
For asynchronous pre-response hook, the request can be either of the two:
1. Success
2. Error
#### Async Success Response
HTTP Code: 200s
```
There need not be any response body.
```
#### Async Error Response
HTTP code 400s
```
<optional error details as JSON>
```
The error details will be part of the traces.
## Multiple engine-plugins
The engine can handle multiple engine plugins.
### Pre-plugins
For example, multiple pre-plugins can be thought of as a pipeline:
```
_____________________ ______________________ __________________
| | | | | |
Request--->| Pre-parse Plugin 1 |---->| Pre-parse Plugin 2 |---->| Engine Execution |--...
|_____________________| |______________________| |__________________|
```
For plugin 2, we will do the following:
- If plugin 1 responds successfully/error, we will NOT call plugin 2, and there
will be a short-circuit.
- Only for the continued execution case will we call plugin 2.
- The request to all the pre-plugin will be the same (the raw request and
session information are not going to change)
### Pre-response
Multiple pre-response engine plugins can also be handled. Since they are async
in nature, we can execute them in parallel:
```
Engine execution ------> To the user
| ________________
| | Async |
|---->| Pre-response |
| | Plugin 1 |
| |________________|
| ________________
| | Async |
|---->| Pre-response |
| | Plugin 2 |
| |________________|
...
```
## How will this look in the metadata?
Engine plugins will be part of the metadata (OpenDD). This will be more like the
`AuthConfig` and will be handled while building the artifacts.
The engine-plugin artifacts will be similar to how we store `AuthConfig`
artifacts right now. We will have new artifacts (pre-parse and pre-response
plugin artifacts).
Each artifact will have a list of engine plugins in the order of execution. For
example:
```
__________________
| ______________ |
| | Pre-parse 1 | | ________________ ________________ __________________
| |______________| | | | | | | |
| ______________ | =====> Request--->| Pre-Parse 1 |---->| Pre-Parse 2 |---->| Engine Execution |--...
| | Pre-parse 2 | | |________________| |________________| |__________________|
| |______________| |
|__________________|
```
For pre-response, the order doesn't matter right now, but we will still maintain
an order (to future-proof for synchronous pre-response).
There are a few caveats with the ordering of engine plugins for the multitenant
engine or DDN cloud: Auth plugin (once converted to an engine plugin, will
always be executed first).
## Future plans
### Synchronous pre-response hook
A synchronous hook can be useful for response transformation using something
like kriti-lang.
For synchronous pre-response hooks, the response can be similar to the pre-parse
hook. I.e., it can be one of the three: Return with a response: The engine
plugin has handled the request, and the graphql-engine should return the
response provided by the engine plugin (and ignore the response generated by the
engine). Return with engine's response: The graphql-engine should proceed with
the engine's response. Error response: Abort the request with the error
response.
Synchronous pre-response engine-plugins will be daisy-chained with one another:
```
__________________ __________________
| | | |
Engine execution --->| pre-response 1 |---->| pre-response 2 |----> ...
|__________________| |__________________|
```
For synchronous pre-response, the response will be the response from the
previous node (i.e., for response 1, the response will be generated by the
engine, but for pre-response 2, it will be dependent on pre-response 1). Here
also, in case of an error response, we will short-circuit the execution stack.
#### Mixing synchronous and asynchronous pre-response
In case there are multiple synchronous as well as asynchronous pre-response, the
execution stack will look like this: First, we will handle all the synchronous
pre-response. In the end, we will handle the asynchronous ones.
```
_________________ _________________
| Sync | | Sync |
Engine execution --->| pre-response 1 |---->| pre-response 2 |-------> To the user
|_________________| |_________________| | _________________
| | Async |
|---->| pre-response 1 |
| |_________________|
| _________________
| | Async |
|---->| pre-response 2 |
| |_________________|
...
```

View File

@ -59,6 +59,8 @@ import Language.Haskell.TH.Quote (QuasiQuoter (..))
import Language.Haskell.TH.Syntax (Exp, Q, lift, qAddDependentFile, runIO)
import System.Metrics.Distribution (Distribution)
import System.Metrics.Distribution qualified as EKG.Distribution
import System.Metrics.Prometheus.Counter (Counter)
import System.Metrics.Prometheus.Counter qualified as Counter
import System.Metrics.Prometheus.Histogram (Histogram)
import System.Metrics.Prometheus.Histogram qualified as Histogram
import Prelude
@ -92,7 +94,9 @@ data PGPoolMetrics = PGPoolMetrics
{ -- | time taken to establish and initialise a PostgreSQL connection
_pgConnAcquireLatencyMetric :: !Histogram,
-- | time taken to acquire a connection from the pool
_poolWaitTimeMetric :: !Histogram
_poolWaitTimeMetric :: !Histogram,
-- | total number of PostgreSQL connection errors (increments when establishing a connection throws)
_pgErrorTotalMetric :: !Counter
}
getInUseConnections :: PGPool -> IO Int
@ -129,6 +133,7 @@ initPGPoolMetrics :: IO PGPoolMetrics
initPGPoolMetrics = do
_pgConnAcquireLatencyMetric <- Histogram.new histogramBuckets
_poolWaitTimeMetric <- Histogram.new histogramBuckets
_pgErrorTotalMetric <- Counter.new
pure PGPoolMetrics {..}
where
histogramBuckets = [0.000001, 0.0001, 0.01, 0.1, 0.3, 1, 3, 10, 30, 100]
@ -151,7 +156,7 @@ initPGPool ci context cp logger = do
retryP = mkPGRetryPolicy $ ciRetries ci
creator stats metrics = do
createdAt <- getCurrentTime
pqConn <- initPQConn ci logger
pqConn <- (initPQConn ci logger) `Exc.onException` (Counter.inc (_pgErrorTotalMetric metrics))
connAcquiredAt <- getCurrentTime
let connAcquiredMicroseconds = realToFrac (1000000 * diffUTCTime connAcquiredAt createdAt)
connAcquiredSeconds = realToFrac $ diffUTCTime connAcquiredAt createdAt