mirror of
https://github.com/hasura/graphql-engine.git
synced 2024-12-04 08:32:49 +03:00
server: add hasura_postgres_connection_error_total metric
PR-URL: https://github.com/hasura/graphql-engine-mono/pull/11063 GitOrigin-RevId: 0e0f8b6e7759623f470893aaa6d6e68d205269b5
This commit is contained in:
parent
f68438b78e
commit
8b956bfafa
@ -3453,7 +3453,7 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total number of incoming requests for cache lookup",
|
||||
"description": "Postgres connection errors from GraphQL Engine instances",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -3501,6 +3501,10 @@
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -3514,7 +3518,7 @@
|
||||
"x": 12,
|
||||
"y": 81
|
||||
},
|
||||
"id": 57,
|
||||
"id": 66,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
@ -3534,25 +3538,13 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
|
||||
"legendFormat": "Cache Hit",
|
||||
"expr": "sum by (job, role,conn_info,source_name) (increase(hasura_postgres_connection_error_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) ",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "Hit"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
|
||||
"hide": false,
|
||||
"legendFormat": "Total",
|
||||
"range": true,
|
||||
"refId": "Total"
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Cache Request Rate",
|
||||
"title": "Postgres Connection Errors",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
@ -3748,6 +3740,113 @@
|
||||
],
|
||||
"title": "Postgres Pool Wait Time (P95)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total number of incoming requests for cache lookup",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "smooth",
|
||||
"lineStyle": {
|
||||
"fill": "solid"
|
||||
},
|
||||
"lineWidth": 1,
|
||||
"pointSize": 1,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 95
|
||||
},
|
||||
"id": 57,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\",status=\"hit\"}[$__rate_interval]))",
|
||||
"legendFormat": "Cache Hit",
|
||||
"range": true,
|
||||
"refId": "Hit"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(hasura_cache_request_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
|
||||
"hide": false,
|
||||
"legendFormat": "Total",
|
||||
"range": true,
|
||||
"refId": "Total"
|
||||
}
|
||||
],
|
||||
"title": "Cache Request Rate",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "",
|
||||
@ -3832,6 +3931,6 @@
|
||||
"timezone": "",
|
||||
"title": "Hasura Overview",
|
||||
"uid": "Of9GFjr7z",
|
||||
"version": 2,
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
@ -485,6 +485,16 @@ The time taken to acquire a connection from the pool.
|
||||
| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
|
||||
| Unit | seconds |
|
||||
|
||||
#### Hasura Postgres Connection Errors Total
|
||||
|
||||
Total number of PostgreSQL connection errors.
|
||||
|
||||
| | |
|
||||
| ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Name | `hasura_postgres_connection_error_total` |
|
||||
| Type | Counter |
|
||||
| Labels | `source_name`: name of the database<br />`conn_info`: connection url string (password omitted) or name of the connection url environment variable<br />`role`: primary \| replica |
|
||||
|
||||
### Hasura source health
|
||||
|
||||
Health check status of a particular data source, corresponding to the output of `/healthz/sources`, with possible values
|
||||
|
@ -1,346 +0,0 @@
|
||||
# Engine-plugins in Hasura V3
|
||||
|
||||
This document focuses on the implementation details for HTTP-based engine
|
||||
plugins.
|
||||
|
||||
## Pre-parse Hook
|
||||
|
||||
For a pre-parse plugin, the request to the plugin is performed just after
|
||||
receiving the request to the engine.
|
||||
|
||||
### Configuration
|
||||
|
||||
The pre-parse plugin can be configured using an OpenDD object of kind `LifecyclePluginHook`. It includes the following information:
|
||||
|
||||
1. The engine-plugin URL
|
||||
2. Request Includes (this can be used to optimize critical engine plugins):
|
||||
1. Request Headers
|
||||
2. Graphql request
|
||||
3. Variables
|
||||
|
||||
Please note that the presence of `operationName` is not configurable, and
|
||||
including/excluding operation name won't have much impact on the request size.
|
||||
|
||||
An example of configuration JSON is:
|
||||
|
||||
```json
|
||||
{
|
||||
"kind": "LifecyclePluginHook",
|
||||
"version": "v1",
|
||||
"definition": {
|
||||
"pre": "parse",
|
||||
"name": "test",
|
||||
"url": "http://localhost:8787",
|
||||
"config": {
|
||||
"request": {
|
||||
"headers": {
|
||||
"additional": {
|
||||
"hasura-m-auth": {
|
||||
"value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
|
||||
}
|
||||
}
|
||||
},
|
||||
"session": {},
|
||||
"rawRequest": {
|
||||
"query": {},
|
||||
"variables": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Request
|
||||
|
||||
The request to the pre-parse hook should have sufficient information to cater to
|
||||
the following planned use cases:
|
||||
|
||||
1. Rate limits
|
||||
2. Depth limits
|
||||
3. Node limits
|
||||
4. Caching (get-cache)
|
||||
|
||||
The request should have the following:
|
||||
|
||||
1. Headers: Include information for the uniqueness of the request (origin,
|
||||
session variables, etc.), cache control information, etc.
|
||||
2. Hasura’s session information: Role and session variables
|
||||
3. Raw request: Raw request received by graphql-engine (including variables)
|
||||
|
||||
```json
|
||||
{
|
||||
"session": <the hasura session object>,
|
||||
"rawRequest": <raw request>
|
||||
}
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
The response of a pre-parse hook can be of three types:
|
||||
|
||||
1. Return with a response: The engine-plugin has handled the request, and the
|
||||
graphql-engine should return the response provided by the engine-plugin.
|
||||
(Should we check if the response is valid according to the spec?)
|
||||
2. Continue with the execution: The graphql-engine should proceed with the
|
||||
request handling.
|
||||
3. Error response: Abort the request with the error response.
|
||||
|
||||
As suggested by @SamirTalwar, we can also use HTTP status codes to decide the
|
||||
type of the response, i.e.
|
||||
|
||||
1. 200s HTTP status code will mean either:
|
||||
1. 200: A successful response
|
||||
2. 204: Or continued execution
|
||||
2. 400 HTTP status code will mean user error
|
||||
3. 500 HTTP status code will mean an internal error
|
||||
|
||||
#### Success response
|
||||
|
||||
HTTP code: 200
|
||||
|
||||
```
|
||||
<the response json value>
|
||||
```
|
||||
|
||||
#### Continue with execution
|
||||
|
||||
HTTP code: 204 There should be no response body for this case
|
||||
|
||||
#### Error
|
||||
|
||||
A pre-plugin response can be of two types:
|
||||
|
||||
1. User error: This will include errors that can be propagated to the user.
|
||||
|
||||
HTTP code: 400
|
||||
|
||||
```
|
||||
<The user error json value>
|
||||
```
|
||||
|
||||
2. Internal error: Internal errors are encountered while handling the request.
|
||||
The engine-plugin can dictate the engine to either abort the execution or
|
||||
continue with the request. The internal errors will not be propagated to the
|
||||
users; they will only be part of the traces.
|
||||
|
||||
HTTP code: 500
|
||||
```json
|
||||
{
|
||||
"details": <The internal error json value>,
|
||||
"action": <abort/continue>
|
||||
}
|
||||
```
|
||||
|
||||
## Pre-response hook
|
||||
|
||||
A pre-response hook is called just before returning a response to the user. For
|
||||
now, we will have asynchronous pre-response hooks only.
|
||||
|
||||
An asynchronous hook will be useful for the following use cases:
|
||||
|
||||
1. Caching (cache-set)
|
||||
2. Custom business logic: Send mail/slack notifications for mutations
|
||||
|
||||
### Configuration
|
||||
|
||||
Like a pre-parse hook, a pre-response hook can also be configured using a
|
||||
configuration file. The request can be configured to omit a few fields if
|
||||
needed.
|
||||
|
||||
An example of configuration JSON is:
|
||||
|
||||
```json
|
||||
{
|
||||
"kind": "LifecyclePluginHook",
|
||||
"version": "v1",
|
||||
"definition": {
|
||||
"pre": "response",
|
||||
"name": "test",
|
||||
"url": "http://localhost:8787",
|
||||
"config": {
|
||||
"request": {
|
||||
"headers": {
|
||||
"additional": {
|
||||
"hasura-m-auth": {
|
||||
"value": "zZkhKqFjqXR4g5MZCsJUZCnhCcoPyZ"
|
||||
}
|
||||
}
|
||||
},
|
||||
"session": {},
|
||||
"rawRequest": {
|
||||
"query": {},
|
||||
"variables": {}
|
||||
},
|
||||
"response": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Request
|
||||
|
||||
A pre-response hook’s request can have the following fields:
|
||||
|
||||
1. Raw request: The raw request for which the engine has generated the response.
|
||||
2. Session: The role and session variables
|
||||
3. Engine’s response: The response that we have generated after executing the
|
||||
query.
|
||||
4. Request headers: This can be important for caching engine plugins
|
||||
|
||||
```json
|
||||
{
|
||||
"session": <the hasura session object>,
|
||||
"rawRequest": <raw request>,
|
||||
"response": <engine's response>
|
||||
}
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
For asynchronous pre-response hook, the request can be either of the two:
|
||||
|
||||
1. Success
|
||||
2. Error
|
||||
|
||||
#### Async Success Response
|
||||
|
||||
HTTP Code: 200s
|
||||
|
||||
```
|
||||
There need not be any response body.
|
||||
```
|
||||
|
||||
#### Async Error Response
|
||||
|
||||
HTTP code 400s
|
||||
|
||||
```
|
||||
<optional error details as JSON>
|
||||
```
|
||||
|
||||
The error details will be part of the traces.
|
||||
|
||||
## Multiple engine-plugins
|
||||
|
||||
The engine can handle multiple engine plugins.
|
||||
|
||||
### Pre-plugins
|
||||
|
||||
For example, multiple pre-plugins can be thought of as a pipeline:
|
||||
|
||||
```
|
||||
_____________________ ______________________ __________________
|
||||
| | | | | |
|
||||
Request--->| Pre-parse Plugin 1 |---->| Pre-parse Plugin 2 |---->| Engine Execution |--...
|
||||
|_____________________| |______________________| |__________________|
|
||||
```
|
||||
|
||||
For plugin 2, we will do the following:
|
||||
|
||||
- If plugin 1 responds successfully/error, we will NOT call plugin 2, and there
|
||||
will be a short-circuit.
|
||||
- Only for the continued execution case will we call plugin 2.
|
||||
- The request to all the pre-plugin will be the same (the raw request and
|
||||
session information are not going to change)
|
||||
|
||||
### Pre-response
|
||||
|
||||
Multiple pre-response engine plugins can also be handled. Since they are async
|
||||
in nature, we can execute them in parallel:
|
||||
|
||||
```
|
||||
Engine execution ------> To the user
|
||||
| ________________
|
||||
| | Async |
|
||||
|---->| Pre-response |
|
||||
| | Plugin 1 |
|
||||
| |________________|
|
||||
| ________________
|
||||
| | Async |
|
||||
|---->| Pre-response |
|
||||
| | Plugin 2 |
|
||||
| |________________|
|
||||
...
|
||||
```
|
||||
|
||||
## How will this look in the metadata?
|
||||
|
||||
Engine plugins will be part of the metadata (OpenDD). This will be more like the
|
||||
`AuthConfig` and will be handled while building the artifacts.
|
||||
|
||||
The engine-plugin artifacts will be similar to how we store `AuthConfig`
|
||||
artifacts right now. We will have new artifacts (pre-parse and pre-response
|
||||
plugin artifacts).
|
||||
|
||||
Each artifact will have a list of engine plugins in the order of execution. For
|
||||
example:
|
||||
|
||||
```
|
||||
__________________
|
||||
| ______________ |
|
||||
| | Pre-parse 1 | | ________________ ________________ __________________
|
||||
| |______________| | | | | | | |
|
||||
| ______________ | =====> Request--->| Pre-Parse 1 |---->| Pre-Parse 2 |---->| Engine Execution |--...
|
||||
| | Pre-parse 2 | | |________________| |________________| |__________________|
|
||||
| |______________| |
|
||||
|__________________|
|
||||
```
|
||||
|
||||
For pre-response, the order doesn’t matter right now, but we will still maintain
|
||||
an order (to future-proof for synchronous pre-response).
|
||||
|
||||
There are a few caveats with the ordering of engine plugins for the multitenant
|
||||
engine or DDN cloud: Auth plugin (once converted to an engine plugin, will
|
||||
always be executed first).
|
||||
|
||||
## Future plans
|
||||
|
||||
### Synchronous pre-response hook
|
||||
|
||||
A synchronous hook can be useful for response transformation using something
|
||||
like kriti-lang.
|
||||
|
||||
For synchronous pre-response hooks, the response can be similar to the pre-parse
|
||||
hook. I.e., it can be one of the three: Return with a response: The engine
|
||||
plugin has handled the request, and the graphql-engine should return the
|
||||
response provided by the engine plugin (and ignore the response generated by the
|
||||
engine). Return with engine’s response: The graphql-engine should proceed with
|
||||
the engine’s response. Error response: Abort the request with the error
|
||||
response.
|
||||
|
||||
Synchronous pre-response engine-plugins will be daisy-chained with one another:
|
||||
|
||||
```
|
||||
__________________ __________________
|
||||
| | | |
|
||||
Engine execution --->| pre-response 1 |---->| pre-response 2 |----> ...
|
||||
|__________________| |__________________|
|
||||
```
|
||||
|
||||
For synchronous pre-response, the response will be the response from the
|
||||
previous node (i.e., for response 1, the response will be generated by the
|
||||
engine, but for pre-response 2, it will be dependent on pre-response 1). Here
|
||||
also, in case of an error response, we will short-circuit the execution stack.
|
||||
|
||||
#### Mixing synchronous and asynchronous pre-response
|
||||
|
||||
In case there are multiple synchronous as well as asynchronous pre-response, the
|
||||
execution stack will look like this: First, we will handle all the synchronous
|
||||
pre-response. In the end, we will handle the asynchronous ones.
|
||||
|
||||
```
|
||||
_________________ _________________
|
||||
| Sync | | Sync |
|
||||
Engine execution --->| pre-response 1 |---->| pre-response 2 |-------> To the user
|
||||
|_________________| |_________________| | _________________
|
||||
| | Async |
|
||||
|---->| pre-response 1 |
|
||||
| |_________________|
|
||||
| _________________
|
||||
| | Async |
|
||||
|---->| pre-response 2 |
|
||||
| |_________________|
|
||||
...
|
||||
```
|
@ -59,6 +59,8 @@ import Language.Haskell.TH.Quote (QuasiQuoter (..))
|
||||
import Language.Haskell.TH.Syntax (Exp, Q, lift, qAddDependentFile, runIO)
|
||||
import System.Metrics.Distribution (Distribution)
|
||||
import System.Metrics.Distribution qualified as EKG.Distribution
|
||||
import System.Metrics.Prometheus.Counter (Counter)
|
||||
import System.Metrics.Prometheus.Counter qualified as Counter
|
||||
import System.Metrics.Prometheus.Histogram (Histogram)
|
||||
import System.Metrics.Prometheus.Histogram qualified as Histogram
|
||||
import Prelude
|
||||
@ -92,7 +94,9 @@ data PGPoolMetrics = PGPoolMetrics
|
||||
{ -- | time taken to establish and initialise a PostgreSQL connection
|
||||
_pgConnAcquireLatencyMetric :: !Histogram,
|
||||
-- | time taken to acquire a connection from the pool
|
||||
_poolWaitTimeMetric :: !Histogram
|
||||
_poolWaitTimeMetric :: !Histogram,
|
||||
-- | total number of PostgreSQL connection errors (backs hasura_postgres_connection_error_total)
|
||||
_pgErrorTotalMetric :: !Counter
|
||||
}
|
||||
|
||||
getInUseConnections :: PGPool -> IO Int
|
||||
@ -129,6 +133,7 @@ initPGPoolMetrics :: IO PGPoolMetrics
|
||||
initPGPoolMetrics = do
|
||||
_pgConnAcquireLatencyMetric <- Histogram.new histogramBuckets
|
||||
_poolWaitTimeMetric <- Histogram.new histogramBuckets
|
||||
_pgErrorTotalMetric <- Counter.new
|
||||
pure PGPoolMetrics {..}
|
||||
where
|
||||
histogramBuckets = [0.000001, 0.0001, 0.01, 0.1, 0.3, 1, 3, 10, 30, 100]
|
||||
@ -151,7 +156,7 @@ initPGPool ci context cp logger = do
|
||||
retryP = mkPGRetryPolicy $ ciRetries ci
|
||||
creator stats metrics = do
|
||||
createdAt <- getCurrentTime
|
||||
pqConn <- initPQConn ci logger
|
||||
pqConn <- (initPQConn ci logger) `Exc.onException` (Counter.inc (_pgErrorTotalMetric metrics))
|
||||
connAcquiredAt <- getCurrentTime
|
||||
let connAcquiredMicroseconds = realToFrac (1000000 * diffUTCTime connAcquiredAt createdAt)
|
||||
connAcquiredSeconds = realToFrac $ diffUTCTime connAcquiredAt createdAt
|
||||
|
Loading…
Reference in New Issue
Block a user