Make the global event message available per event

nicolargo 2024-03-16 17:54:50 +01:00
parent 6508acaba7
commit 684d1d7b94
2 changed files with 171 additions and 156 deletions


@ -13,6 +13,151 @@ import time
from datetime import datetime
from glances.processes import glances_processes, sort_stats
from glances.thresholds import glances_thresholds
# Static decision tree for the global alert message
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree = [
{'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
{'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
{'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
{'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
{
'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
'thresholds': ['cpu_steal'],
'thresholds_min': 2,
},
{'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
{'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
{'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
{'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
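# Illustration only (not part of this commit): 'thresholds' may list several
# stats, since build_global_message() sums their current values before
# comparing the total against 'thresholds_min'. A hypothetical multi-stat
# entry would look like this:
example_entry = {
    'msg': 'CPU and memory both under pressure',
    'thresholds': ['cpu_user', 'mem'],
    'thresholds_min': 4,
}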
# TODO: change the algo to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# _yes means threshold >= 2
# _no means threshold < 2
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
'cpu_iowait': {
'_yes': {
'memswap': {
'_yes': {
'mem': {
'_yes': {
# Once you've identified the offenders, the resolution will again
# depend on whether their memory usage seems business-as-usual or not.
# For example, a memory leak can be satisfactorily addressed by a one-time
# or periodic restart of the process.
# - if memory usage seems anomalous: kill the offending processes.
# - if memory usage seems business-as-usual: add RAM to the server,
# or split high-memory services onto other servers.
'_msg': "Memory issue"
},
'_no': {
# ???
'_msg': "Swap issue"
},
}
},
'_no': {
# Low swap means you have a "real" I/O wait problem. The next step is to see what's hogging your I/O.
# iotop is an awesome tool for identifying I/O offenders. One thing to note:
# unless you've already installed iotop, it's probably not on your system.
# Recommendation: install it before you need it -- it's no fun trying to install a troubleshooting
# tool on an overloaded machine (iotop requires Linux kernel 2.6.20 or above)
'_msg': "I/O issue"
},
}
},
'_no': {
'cpu_total': {
'_yes': {
'cpu_user': {
'_yes': {
# We expect the user-time percentage to be high.
# There's most likely a program or service you've configured on your server
# that's hogging CPU. Checking the % user time just confirms this.
# Once you've confirmed that the % user time is high, check the process list
# (also provided by top) to see which executable is monopolizing the CPU.
# By default, top sorts the process list by % CPU, so you can just look at
# the top process or processes.
# If there's a single process hogging the CPU in a way that seems abnormal,
# it's an anomalous situation that a service restart can fix. If there are
# multiple processes taking up CPU resources, or if there's one process that
# takes lots of resources while otherwise functioning normally, then your
# setup may just be underpowered. You'll need to upgrade your server
# (add more cores), or split services out onto other boxes.
# In either case, you have a resolution:
# - if the situation seems anomalous: kill the offending processes.
# - if the situation seems typical given history: upgrade the server or add more servers.
'_msg': "CPU issue with user process(es)"
},
'_no': {
'cpu_steal': {
'_yes': {
'_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
},
'_no': {'_msg': "CPU issue with system process(es)"},
}
},
}
},
'_no': {
'_yes': {
# ???
'_msg': "Memory issue"
},
'_no': {
# Your slowness isn't due to CPU or I/O problems, so it's likely an application-specific issue.
# It's also possible that the slowness is being caused by another server in your cluster, or
# by an external service you rely on.
# Start by checking important applications for uncharacteristic slowness (the DB is a good
# place to start), and think through which parts of your infrastructure could be slowed
# down externally. For example, do you use an externally hosted email service that could
# slow down critical parts of your application?
# If you suspect another server in your cluster, strace and lsof can provide information on
# what the process is doing or waiting on. strace will show you which file descriptors are
# being read or written to (or attempted to be read from) and lsof can give you a
# mapping of those file descriptors to network connections.
},
},
}
},
}
}
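# To make the TODO above concrete, here is a minimal sketch of a tree_new
# evaluator (hypothetical helper, not part of this commit). It expects a plain
# dict mapping stat names to threshold levels (0..3) and assumes every
# non-leaf node carries exactly one stat key -- which the '_no' subtree under
# 'cpu_total' currently lacks (see the '???' markers above).
def walk_tree(node, thresholds):
    """Follow the _yes/_no branches until a leaf '_msg' is reached."""
    if '_msg' in node:
        return node['_msg']
    # The single non-underscore key names the stat to evaluate
    stat = next(k for k in node if not k.startswith('_'))
    branch = '_yes' if thresholds.get(stat, 0) >= 2 else '_no'
    return walk_tree(node[stat][branch], thresholds)

# Example: high I/O wait with low swap usage reaches the "I/O issue" leaf.
# walk_tree(tree_new, {'cpu_iowait': 3, 'memswap': 0})  # -> "I/O issue"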
def build_global_message():
"""Parse the decision tree and return the message.
Note: the message corresponds to the current threshold values.
"""
# Compute the weight for each item in the tree
current_thresholds = glances_thresholds.get()
for i in tree:
i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
themax = max(tree, key=lambda d: d['weight'])
if themax['weight'] >= themax['thresholds_min']:
# Check if the weight is >= the minimal threshold value
return themax['msg']
else:
return tree[0]['msg']
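# For reference, the same weighting logic with a plain dict standing in for
# the glances_thresholds singleton (hypothetical sketch; the real code above
# calls .value() on threshold objects instead of using raw ints).
def pick_message(tree, thresholds):
    # Weight each entry by summing the current values of its stats
    for item in tree:
        item['weight'] = sum(thresholds.get(t, 0) for t in item['thresholds'])
    best = max(tree, key=lambda d: d['weight'])
    # Fall back to the default message if the best entry misses its minimum
    return best['msg'] if best['weight'] >= best['thresholds_min'] else tree[0]['msg']

# pick_message(tree, {'cpu_user': 3, 'mem': 1})  # -> 'High CPU user mode'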
class GlancesEvents(object):
@ -37,7 +182,8 @@ class GlancesEvents(object):
"count": COUNT,
"top": [top 3 process name],
"desc": "Processes description",
"sort": "top sort key"
"sort": "top sort key",
"global": "global alert message"
}
"""
@ -125,26 +271,31 @@ class GlancesEvents(object):
event_value = value
proc_list = list of processes
proc_desc = processes description
global_message = global alert message
If 'event' is a new one, add it at the beginning of the list.
If 'event' is not a new one, update the existing entry.
When finished, if the event duration is < peak_time, the alert is not set.
"""
event_time = time.mktime(datetime.now().timetuple())
global_message = build_global_message()
proc_list = proc_list or glances_processes.get_list()
# Add or update the log
event_index = self.__event_exist(event_time, event_type)
if event_index < 0:
# Event did not exist, add it
self._create_event(event_time, event_state, event_type, event_value, proc_desc)
self._create_event(event_time, event_state, event_type, event_value,
proc_desc, global_message)
else:
# Event exist, update it
self._update_event(event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc)
self._update_event(event_time, event_index, event_state, event_type, event_value,
proc_list, proc_desc, global_message)
return self.len()
def _create_event(self, event_time, event_state, event_type, event_value, proc_desc):
def _create_event(self, event_time, event_state, event_type, event_value,
proc_desc, global_message):
"""Add a new item in the log list.
Item is added only if the criticality (event_state) is WARNING or CRITICAL.
@ -169,6 +320,7 @@ class GlancesEvents(object):
"top": [],
"desc": proc_desc,
"sort": glances_processes.sort_key,
"global": global_message,
}
# Add the item to the list
@ -181,7 +333,8 @@ class GlancesEvents(object):
else:
return False
def _update_event(self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc):
def _update_event(self, event_time, event_index, event_state, event_type, event_value,
proc_list, proc_desc, global_message):
"""Update an event in the list"""
if event_state == "OK" or event_state == "CAREFUL":
# Reset the automatic process sort key
@ -198,7 +351,7 @@ class GlancesEvents(object):
else:
# Update the item
# It's an ogoing event, update the end time
# It's an ongoing event, update the end time
self.events_list[event_index]['end'] = -1
# Set process sort key
@ -226,6 +379,9 @@ class GlancesEvents(object):
# MONITORED PROCESSES DESC
self.events_list[event_index]['desc'] = proc_desc
# Global message:
self.events_list[event_index]['global'] = global_message
return True
def clean(self, critical=False):


@ -14,7 +14,6 @@ from time import tzname
import pytz
from glances.events import glances_events
from glances.thresholds import glances_thresholds
# from glances.logger import logger
from glances.plugins.plugin.model import GlancesPluginModel
@ -32,6 +31,7 @@ from glances.plugins.plugin.model import GlancesPluginModel
# "top": [top3 process list],
# "desc": "Processes description",
# "sort": "top sort key"
# "global": "global alert message"
# }
# Fields description
# description: human readable description
@ -88,153 +88,13 @@ fields_description = {
'description': 'Sort key of the top processes',
'unit': 'string',
},
}
# Static decision tree for the global alert message
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree = [
{'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
{'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
{'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
{'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
{
'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
'thresholds': ['cpu_steal'],
'thresholds_min': 2,
},
{'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
{'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
{'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
{'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
# TODO: change the algo to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# _yes means threshold >= 2
# _no means threshold < 2
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
'cpu_iowait': {
'_yes': {
'memswap': {
'_yes': {
'mem': {
'_yes': {
# Once you've identified the offenders, the resolution will again
# depend on whether their memory usage seems business-as-usual or not.
# For example, a memory leak can be satisfactorily addressed by a one-time
# or periodic restart of the process.
# - if memory usage seems anomalous: kill the offending processes.
# - if memory usage seems business-as-usual: add RAM to the server,
# or split high-memory services onto other servers.
'_msg': "Memory issue"
},
'_no': {
# ???
'_msg': "Swap issue"
},
}
},
'_no': {
# Low swap means you have a "real" I/O wait problem. The next step is to see what's hogging your I/O.
# iotop is an awesome tool for identifying I/O offenders. One thing to note:
# unless you've already installed iotop, it's probably not on your system.
# Recommendation: install it before you need it -- it's no fun trying to install a troubleshooting
# tool on an overloaded machine (iotop requires Linux kernel 2.6.20 or above)
'_msg': "I/O issue"
},
}
},
'_no': {
'cpu_total': {
'_yes': {
'cpu_user': {
'_yes': {
# We expect the user-time percentage to be high.
# There's most likely a program or service you've configured on your server
# that's hogging CPU. Checking the % user time just confirms this.
# Once you've confirmed that the % user time is high, check the process list
# (also provided by top) to see which executable is monopolizing the CPU.
# By default, top sorts the process list by % CPU, so you can just look at
# the top process or processes.
# If there's a single process hogging the CPU in a way that seems abnormal,
# it's an anomalous situation that a service restart can fix. If there are
# multiple processes taking up CPU resources, or if there's one process that
# takes lots of resources while otherwise functioning normally, then your
# setup may just be underpowered. You'll need to upgrade your server
# (add more cores), or split services out onto other boxes.
# In either case, you have a resolution:
# - if the situation seems anomalous: kill the offending processes.
# - if the situation seems typical given history: upgrade the server or add more servers.
'_msg': "CPU issue with user process(es)"
},
'_no': {
'cpu_steal': {
'_yes': {
'_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
},
'_no': {'_msg': "CPU issue with system process(es)"},
}
},
}
},
'_no': {
'_yes': {
# ???
'_msg': "Memory issue"
},
'_no': {
# Your slowness isn't due to CPU or I/O problems, so it's likely an application-specific issue.
# It's also possible that the slowness is being caused by another server in your cluster, or
# by an external service you rely on.
# Start by checking important applications for uncharacteristic slowness (the DB is a good
# place to start), and think through which parts of your infrastructure could be slowed
# down externally. For example, do you use an externally hosted email service that could
# slow down critical parts of your application?
# If you suspect another server in your cluster, strace and lsof can provide information on
# what the process is doing or waiting on. strace will show you which file descriptors are
# being read or written to (or attempted to be read from) and lsof can give you a
# mapping of those file descriptors to network connections.
},
},
}
},
'global': {
'description': 'Global alert message',
'unit': 'string',
}
}
def global_message():
"""Parse the decision tree and return the message.
Note: the message corresponds to the current threshold values.
"""
# Compute the weight for each item in the tree
current_thresholds = glances_thresholds.get()
for i in tree:
i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
themax = max(tree, key=lambda d: d['weight'])
if themax['weight'] >= themax['thresholds_min']:
# Check if the weight is >= the minimal threshold value
return themax['msg']
else:
return tree[0]['msg']
class PluginModel(GlancesPluginModel):
"""Glances alert plugin.
@ -265,10 +125,6 @@ class PluginModel(GlancesPluginModel):
"""Nothing to do here. Just return the global glances_log."""
# Set the stats to the glances_events
self.stats = glances_events.get()
# Define the global message thanks to the current thresholds
# and the decision tree
# !!! Call directly in the msg_curse function
# global_message()
def msg_curse(self, args=None, max_width=None):
"""Return the dict to display in the curse interface."""
@ -280,8 +136,11 @@ class PluginModel(GlancesPluginModel):
return ret
# Build the string message
# Header
ret.append(self.curse_add_line(global_message(), "TITLE"))
# Header with the global message
if len(self.stats) > 0 and self.stats[0]['end'] < 0 and 'global' in self.stats[0]:
ret.append(self.curse_add_line(self.stats[0]['global'], "TITLE"))
else:
ret.append(self.curse_add_line("ALERTS", "TITLE"))
# Loop over alerts
for alert in self.stats:
# New line