Merge pull request #297 from nix-community/prometheus-rules

prometheus: various fixes
Jörg Thalheim 2023-11-08 21:14:43 +01:00 committed by GitHub
commit 8963b35a9e

@@ -2,12 +2,12 @@
srvos.prometheus.ruleGroups.srvosAlerts = {
alertRules = {
MonitoringTooManyRestarts = {
expr = "changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager|telegraf\"}[15m]) > 2";
expr = ''changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager|telegraf"}[15m]) > 2'';
annotations.description = "Service has restarted more than twice in the last 15 minutes. It might be crashlooping";
};
AlertManagerConfigNotSynced = {
expr = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
expr = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
annotations.description = "Configurations of AlertManager cluster instances are out of sync";
};
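
The recurring change in this diff is Nix string syntax only: the PromQL expressions move from ordinary double-quoted strings, where every embedded double quote has to be written as \", to indented strings (''...''), in which double quotes stay literal. A minimal sketch of the two equivalent forms, with illustrative attribute names that do not appear in the module:

  escapedForm  = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
  indentedForm = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';

Inside an indented string only two sequences need escaping: a literal '' is written as ''' and ${ as ''${. Neither occurs in these alert expressions, so converting the quoting style does not change the resulting strings.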
@@ -28,9 +28,9 @@
};
PromtailRequestsErrors = {
expr = "100 * sum(rate(promtail_request_duration_seconds_count{status_code=~\"5..|failed\"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10";
expr = ''100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'';
for = "15m";
annotations.description = "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors";
annotations.description = ''{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors'';
};
PromtailFileLagging = {
@@ -75,7 +75,7 @@
};
SwapUsing30Percent = {
expr = ''mem_swap_total{host!="eva"} - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
expr = ''mem_swap_total - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
for = "30m";
annotations.description = "{{$labels.host}} is using 30% of its swap space for at least 30 minutes";
};
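
As a worked example of the swap threshold (the numbers are illustrative, not taken from the repository): used swap is computed as mem_swap_total minus mem_swap_cached minus mem_swap_free and compared against 30% of mem_swap_total. On a host with 8 GiB of swap that threshold is about 2.4 GiB, and the alert fires only after usage has stayed above it for the full 30 minute for-window.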
@@ -115,7 +115,7 @@
};
TelegrafDown = {
expr = "min(up{job=~\"telegraf\",type!='mobile'}) by (source, job, instance, org) == 0";
expr = ''min(up{job=~"telegraf",type!='mobile'}) by (source, job, instance, org) == 0'';
for = "3m";
annotations.description = "{{$labels.instance}}: telegraf exporter from {{$labels.instance}} is down";
};
@@ -158,7 +158,7 @@
# https://healthchecks.io/
Healthchecks = {
expr = "hc_check_up == 0";
annotations.description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails";
annotations.description = "{{$labels.instance}}: healthcheck {{$labels.job}} fails";
};
CertExpiry = {
@@ -222,7 +222,7 @@
};
AlertmanagerSilencesChanged = {
expr = "abs(delta(alertmanager_silences{state=\"active\"}[1h])) >= 1";
expr = ''abs(delta(alertmanager_silences{state="active"}[1h])) >= 1'';
annotations.description = "alertmanager: number of active silences has changed: {{$value}}";
};
};