Merge pull request #297 from nix-community/prometheus-rules

prometheus: various fixes
Jörg Thalheim 2023-11-08 21:14:43 +01:00 committed by GitHub
commit 8963b35a9e

@@ -2,12 +2,12 @@
srvos.prometheus.ruleGroups.srvosAlerts = {
alertRules = {
MonitoringTooManyRestarts = {
expr = "changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager|telegraf\"}[15m]) > 2";
expr = ''changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager|telegraf"}[15m]) > 2'';
annotations.description = "Service has restarted more than twice in the last 15 minutes. It might be crashlooping";
};
AlertManagerConfigNotSynced = {
expr = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
expr = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
annotations.description = "Configurations of AlertManager cluster instances are out of sync";
};
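
The recurring change in this diff is Nix string syntax only: the PromQL expressions move from ordinary double-quoted strings, where every embedded double quote has to be written as \", to indented strings (''...''), in which double quotes stay literal. A minimal sketch of the two equivalent forms, with illustrative attribute names that do not appear in the module:

  escapedForm  = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
  indentedForm = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';

Inside an indented string only two sequences need escaping: a literal '' is written as ''' and ${ as ''${. Neither occurs in these alert expressions, so converting the quoting style does not change the resulting strings.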
@@ -28,9 +28,9 @@
};
PromtailRequestsErrors = {
expr = "100 * sum(rate(promtail_request_duration_seconds_count{status_code=~\"5..|failed\"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10";
expr = ''100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'';
for = "15m";
annotations.description = "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors";
annotations.description = ''{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors'';
};
PromtailFileLagging = {
@@ -75,7 +75,7 @@
};
SwapUsing30Percent = {
expr = ''mem_swap_total{host!="eva"} - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
expr = ''mem_swap_total - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
for = "30m";
annotations.description = "{{$labels.host}} is using 30% of its swap space for at least 30 minutes";
};
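
As a worked example of the swap threshold (the numbers are illustrative, not taken from the repository): used swap is computed as mem_swap_total minus mem_swap_cached minus mem_swap_free and compared against 30% of mem_swap_total. On a host with 8 GiB of swap that threshold is about 2.4 GiB, and the alert fires only after usage has stayed above it for the full 30 minute for-window.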
@@ -115,7 +115,7 @@
};
TelegrafDown = {
expr = "min(up{job=~\"telegraf\",type!='mobile'}) by (source, job, instance, org) == 0";
expr = ''min(up{job=~"telegraf",type!='mobile'}) by (source, job, instance, org) == 0'';
for = "3m";
annotations.description = "{{$labels.instance}}: telegraf exporter from {{$labels.instance}} is down";
};
@@ -158,7 +158,7 @@
# https://healthchecks.io/
Healthchecks = {
expr = "hc_check_up == 0";
annotations.description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails";
annotations.description = "{{$labels.instance}}: healthcheck {{$labels.job}} fails";
};
CertExpiry = {
@@ -222,7 +222,7 @@
};
AlertmanagerSilencesChanged = {
expr = "abs(delta(alertmanager_silences{state=\"active\"}[1h])) >= 1";
expr = ''abs(delta(alertmanager_silences{state="active"}[1h])) >= 1'';
annotations.description = "alertmanager: number of active silences has changed: {{$value}}";
};
};