From f1d59d1f21377bd9893546ea280cd9d2824feebf Mon Sep 17 00:00:00 2001 From: zowoq <59103226+zowoq@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:54:42 +1000 Subject: [PATCH 1/3] prometheus: remove escaping --- nixos/roles/prometheus/default-alerts.nix | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nixos/roles/prometheus/default-alerts.nix b/nixos/roles/prometheus/default-alerts.nix index 902dc7d..9ab5d8e 100644 --- a/nixos/roles/prometheus/default-alerts.nix +++ b/nixos/roles/prometheus/default-alerts.nix @@ -2,12 +2,12 @@ srvos.prometheus.ruleGroups.srvosAlerts = { alertRules = { MonitoringTooManyRestarts = { - expr = "changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager|telegraf\"}[15m]) > 2"; + expr = ''changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager|telegraf"}[15m]) > 2''; annotations.description = "Service has restarted more than twice in the last 15 minutes. It might be crashlooping"; }; AlertManagerConfigNotSynced = { - expr = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1"; + expr = ''count(count_values("config_hash", alertmanager_config_hash)) > 1''; annotations.description = "Configurations of AlertManager cluster instances are out of sync"; }; @@ -28,9 +28,9 @@ }; PromtailRequestsErrors = { - expr = "100 * sum(rate(promtail_request_duration_seconds_count{status_code=~\"5..|failed\"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10"; + expr = ''100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10''; for = "15m"; - annotations.description = "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors"; + annotations.description = ''{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors''; }; PromtailFileLagging = { @@ -115,7 +115,7 @@ }; TelegrafDown = { - expr = "min(up{job=~\"telegraf\",type!='mobile'}) by (source, job, instance, org) == 0"; + expr = ''min(up{job=~"telegraf",type!='mobile'}) by (source, job, instance, org) == 0''; for = "3m"; annotations.description = "{{$labels.instance}}: telegraf exporter from {{$labels.instance}} is down"; }; @@ -222,7 +222,7 @@ }; AlertmanagerSilencesChanged = { - expr = "abs(delta(alertmanager_silences{state=\"active\"}[1h])) >= 1"; + expr = ''abs(delta(alertmanager_silences{state="active"}[1h])) >= 1''; annotations.description = "alertmanager: number of active silences has changed: {{$value}}"; }; }; From cb0666de51d8ee198de94f32c045f85301e05809 Mon Sep 17 00:00:00 2001 From: zowoq <59103226+zowoq@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:57:27 +1000 Subject: [PATCH 2/3] prometheus: remove filters --- nixos/roles/prometheus/default-alerts.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nixos/roles/prometheus/default-alerts.nix b/nixos/roles/prometheus/default-alerts.nix index 9ab5d8e..099aae2 100644 --- a/nixos/roles/prometheus/default-alerts.nix +++ b/nixos/roles/prometheus/default-alerts.nix @@ -75,7 +75,7 @@ }; SwapUsing30Percent = { - expr = ''mem_swap_total{host!="eva"} - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3''; + expr = ''mem_swap_total - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3''; for = "30m"; annotations.description = "{{$labels.host}} is using 30% of its swap space for at least 30 minutes"; }; From 27f80f3b13e05efd6f3b2d5e5534f746f03ff03d Mon Sep 17 00:00:00 2001 From: zowoq <59103226+zowoq@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:58:41 +1000 Subject: [PATCH 3/3] prometheus: fix typo --- nixos/roles/prometheus/default-alerts.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nixos/roles/prometheus/default-alerts.nix b/nixos/roles/prometheus/default-alerts.nix index 099aae2..839fa6a 100644 --- a/nixos/roles/prometheus/default-alerts.nix +++ b/nixos/roles/prometheus/default-alerts.nix @@ -158,7 +158,7 @@ # https://healthchecks.io/ Healthchecks = { expr = "hc_check_up == 0"; - annotations.description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails"; + annotations.description = "{{$labels.instance}}: healthcheck {{$labels.job}} fails"; }; CertExpiry = {