diff --git a/.python-version b/.python-version index e9b674c..6e1919c 100644 --- a/.python-version +++ b/.python-version @@ -1,3 +1,3 @@ -3.6.12 -3.7.9 -3.9.0 +3.6.13 +3.7.10 +3.9.2 diff --git a/MANIFEST.in b/MANIFEST.in index 691de57..629384a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ include LICENSE prune test -prune systemd_source_code \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5d15c72 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +build: + rm -rf dist + # No bdist_wheel: problems with versioneer. + python3 setup.py sdist + +upload: + pip3 install twine + twine upload --skip-existing dist/* + +readme: + tox -e py38 -- test/test_doc.py + +test: + pyenv update + pyenv install --skip-existing 3.6.13 + pyenv install --skip-existing 3.7.10 + pyenv install --skip-existing 3.9.2 + pyenv local 3.6.13 3.7.10 3.9.2 + pip3 install tox + tox + +doc: + tox -e docs + xdg-open .tox/docs/tmp/html/index.html + +.PHONY: build upload test readme doc diff --git a/README-template.md b/README-template.md index 02cc1ec..1fa3991 100644 --- a/README-template.md +++ b/README-template.md @@ -1,78 +1,88 @@ [![pypi.org](http://img.shields.io/pypi/v/check_systemd.svg)](https://pypi.python.org/pypi/check_systemd) [![Build Status](https://travis-ci.org/Josef-Friedrich/check_systemd.svg?branch=master)](https://travis-ci.org/Josef-Friedrich/check_systemd) # check_systemd -`check_systemd` is a -[Nagios](https://www.nagios.org) / [Icinga](https://icinga.com) -monitoring plugin to check [systemd](https://systemd.io) for failed -units. - -This Python script will report a degraded system to your monitoring solution. -It requires only the -[nagiosplugin](https://nagiosplugin.readthedocs.io/en/stable) library. - -You can also test a single service with the `-u, --unit` parameter. +`check_systemd` is a [Nagios](https://www.nagios.org) / +[Icinga](https://icinga.com) monitoring plugin to check +[systemd](https://systemd.io). 
This Python script will report a degraded +system to your monitoring solution. It can also be used to monitor +individual systemd services (with the `-u, --unit` parameter) and timer +units (with the `-t, --dead-timers` parameter). The only dependency the +plugin needs is the Python library +[nagiosplugin](https://nagiosplugin.readthedocs.io/en/stable). ## Installation ``` pip3 install check_systemd ``` +## Packages + +* Debian ([package](https://ftp-master.debian.org/new/monitoring-plugins-systemd_2.3.0+ds-1.html), [source code](https://salsa.debian.org/python-team/packages/monitoring-plugins-systemd/-/tree/debian/master/debian)): work in progress +* NixOS ([package](https://search.nixos.org/packages?channel=unstable&query=check_systemd), [source code](https://github.com/NixOS/nixpkgs/blob/nixos-unstable/pkgs/servers/monitoring/nagios/plugins/check_systemd.nix)): `nix-env -iA nixos.check_systemd` + ## Command line interface ``` {{ argparse }} ``` ## Project pages -* https://github.com/Josef-Friedrich/check_systemd -* https://exchange.icinga.com/joseffriedrich/check_systemd -* https://exchange.nagios.org/directory/Plugins/System-Metrics/Processes/check_systemd/details +* on [github.com](https://github.com/Josef-Friedrich/check_systemd) +* on [icinga.com](https://exchange.icinga.com/joseffriedrich/check_systemd) +* on [nagios.org](https://exchange.nagios.org/directory/Plugins/System-Metrics/Processes/check_systemd/details) ## Behind the scenes To detect failed units this monitoring script runs: ```sh systemctl list-units --all ``` To get the startup time it executes: ```sh systemd-analyze ``` To check a specific unit (`-u, --unit`) this command is executed: ```sh systemctl is-active <unit> ``` To find dead timers this plugin launches: ```sh systemctl list-timers --all ``` +To learn how `systemd` produces the text output on the command line, it +is worthwhile to take a look at `systemd`’s source +code. 
Files relevant for text output are: +[basic/time-util.c](https://github.com/systemd/systemd/blob/main/src/basic/time-util.c), +[analyze/analyze.c](https://github.com/systemd/systemd/blob/main/src/analyze/analyze.c). + ## Testing ``` pyenv install 3.6.12 pyenv install 3.7.9 pyenv local 3.6.12 3.7.9 pip3 install tox tox ``` ## Deploying -Edit version number in check_systemd.py (without `v`) +Edit the version number in check_systemd.py (without `v`). Use the `-s` +option to sign the tag (required for the Debian package). ``` -git tag v2.0.11 +git tag -s v2.0.11 git push --tags ``` diff --git a/README.md b/README.md index 66ae487..e2377a9 100644 --- a/README.md +++ b/README.md @@ -1,139 +1,149 @@ [![pypi.org](http://img.shields.io/pypi/v/check_systemd.svg)](https://pypi.python.org/pypi/check_systemd) [![Build Status](https://travis-ci.org/Josef-Friedrich/check_systemd.svg?branch=master)](https://travis-ci.org/Josef-Friedrich/check_systemd) # check_systemd -`check_systemd` is a -[Nagios](https://www.nagios.org) / [Icinga](https://icinga.com) -monitoring plugin to check [systemd](https://systemd.io) for failed -units. - -This Python script will report a degraded system to your monitoring solution. -It requires only the -[nagiosplugin](https://nagiosplugin.readthedocs.io/en/stable) library. - -You can also test a single service with the `-u, --unit` parameter. +`check_systemd` is a [Nagios](https://www.nagios.org) / +[Icinga](https://icinga.com) monitoring plugin to check +[systemd](https://systemd.io). This Python script will report a degraded +system to your monitoring solution. It can also be used to monitor +individual systemd services (with the `-u, --unit` parameter) and timer +units (with the `-t, --dead-timers` parameter). The only dependency the +plugin needs is the Python library +[nagiosplugin](https://nagiosplugin.readthedocs.io/en/stable). 
## Installation ``` pip3 install check_systemd ``` +## Packages + +* Debian ([package](https://ftp-master.debian.org/new/monitoring-plugins-systemd_2.3.0+ds-1.html), [source code](https://salsa.debian.org/python-team/packages/monitoring-plugins-systemd/-/tree/debian/master/debian)): work in progress +* NixOS ([package](https://search.nixos.org/packages?channel=unstable&query=check_systemd), [source code](https://github.com/NixOS/nixpkgs/blob/nixos-unstable/pkgs/servers/monitoring/nagios/plugins/check_systemd.nix)): `nix-env -iA nixos.check_systemd` + ## Command line interface ``` usage: check_systemd [-h] [-u UNIT | -e UNIT] [-n] [-w SECONDS] [-c SECONDS] [-t] [-W SECONDS] [-C SECONDS] [-i] [-v] [-V] -Copyright (c) 2014-18 Andrea Briganti a.k.a 'Kbyte' +Copyright (c) 2014-18 Andrea Briganti Copyright (c) 2019-21 Josef Friedrich Nagios / Icinga monitoring plugin to check systemd. optional arguments: -h, --help show this help message and exit -u UNIT, --unit UNIT Name of the systemd unit that is being tested. -e UNIT, --exclude UNIT Exclude a systemd unit from the checks. This option can be applied multiple times, for example: -e mnt- data.mount -e task.service. Regular expressions can be used to exclude multiple units at once, for example: -e 'user@\d+\.service'. For more informations see the Python documentation about regular expressions (https://docs.python.org/3/library/re.html). -n, --no-startup-time Don’t check the startup time. Using this option the options '-w, --warning' and '-c, --critical' have no effect. Performance data about the startup time is collected, but no critical, warning etc. states are triggered. -w SECONDS, --warning SECONDS Startup time in seconds to result in a warning status. Thedefault is 60 seconds. -c SECONDS, --critical SECONDS Startup time in seconds to result in a critical status. Thedefault is 120 seconds. -t, --dead-timers Detect dead / inactive timers. 
See the corresponding options '-W, --dead-timer-warning' and '-C, --dead- timers-critical'. Dead timers are detected by parsing the output of 'systemctl list-timers'. Dead timer rows displaying 'n/a' in the NEXT and LEFTcolumns and the time span in the column PASSED exceeds the values specified with the options '-W, --dead-timer-warning' and '-C, --dead-timers-critical'. -W SECONDS, --dead-timers-warning SECONDS Time ago in seconds for dead / inactive timers to trigger a warning state (by default 6 days). -C SECONDS, --dead-timers-critical SECONDS Time ago in seconds for dead / inactive timers to trigger a critical state (by default 7 days). -i, --ignore-inactive-state Ignore an inactive state on a specific unit. Oneshot services for example are only active while running and not enabled. The rest of the time they are inactive. This option has only an affect if it is used with the option -u. -v, --verbose Increase output verbosity (use up to 3 times). -V, --version show program's version number and exit Performance data: - count_units - startup_time - units_activating - units_active - units_failed - units_inactive ``` ## Project pages -* https://github.com/Josef-Friedrich/check_systemd -* https://exchange.icinga.com/joseffriedrich/check_systemd -* https://exchange.nagios.org/directory/Plugins/System-Metrics/Processes/check_systemd/details +* on [github.com](https://github.com/Josef-Friedrich/check_systemd) +* on [icinga.com](https://exchange.icinga.com/joseffriedrich/check_systemd) +* on [nagios.org](https://exchange.nagios.org/directory/Plugins/System-Metrics/Processes/check_systemd/details) ## Behind the scenes To detect failed units this monitoring script runs: ```sh systemctl list-units --all ``` To get the startup time it executes: ```sh systemd-analyze ``` To check a specific unit (`-u, --unit`) this command is executed: ```sh systemctl is-active ``` To find dead timers this plugin launches: ```sh systemctl list-timers --all ``` +To learn how `systemd` 
produces the text output on the command line, it +is worthwhile to take a look at `systemd`’s source +code. Files relevant for text output are: +[basic/time-util.c](https://github.com/systemd/systemd/blob/main/src/basic/time-util.c), +[analyze/analyze.c](https://github.com/systemd/systemd/blob/main/src/analyze/analyze.c). + ## Testing ``` pyenv install 3.6.12 pyenv install 3.7.9 pyenv local 3.6.12 3.7.9 pip3 install tox tox ``` ## Deploying -Edit version number in check_systemd.py (without `v`) +Edit the version number in check_systemd.py (without `v`). Use the `-s` +option to sign the tag (required for the Debian package). ``` -git tag v2.0.11 +git tag -s v2.0.11 git push --tags ``` diff --git a/check_systemd.py b/check_systemd.py index d75085a..a513280 100755 --- a/check_systemd.py +++ b/check_systemd.py @@ -1,698 +1,724 @@ #!/usr/bin/env python3 """ -``check_system`` is a Nagios / Icinga monitoring plugin to check systemd. +``check_systemd`` is a `Nagios <https://www.nagios.org>`_ / `Icinga +<https://icinga.com>`_ monitoring plugin to check systemd. This Python script +will report a degraded system to your monitoring solution. It can also be used +to monitor individual systemd services (with the ``-u, --unit`` parameter) and +timer units (with the ``-t, --dead-timers`` parameter). + +To learn more about the project, please visit the repository on `GitHub +<https://github.com/Josef-Friedrich/check_systemd>`_. This plugin is based on a Python package named `nagiosplugin <https://nagiosplugin.readthedocs.io/en/stable>`_. ``nagiosplugin`` has a fine-grained -class model to separate concerns. A Nagios / Icinga plugin need to perform +class model to separate concerns. A Nagios / Icinga plugin must perform these three steps: data `acquisition`, `evaluation` and `presentation`. ``nagiosplugin`` provides for this three steps three classes: ``Resource``, ``Context``, ``Summary``. 
``check_systemd`` extends this three model classes in the following subclasses: Acquisition (``Resource``) ========================== * :class:`SystemctlIsActiveResource` * :class:`SystemctlListTimersResource` * :class:`SystemctlListUnitsResource` * :class:`SystemdAnalyseResource` Evaluation (``Context``) ======================== * :class:`DeadTimersContext` * :class:`PerformanceDataContext` * :class:`UnitContext` Presentation (``Summary``) ========================== * :class:`SystemdSummary` """ import io import subprocess import argparse import re import nagiosplugin from nagiosplugin import Metric -__version__ = '2.3.0' +__version__ = '2.3.1' class SystemctlListUnitsResource(nagiosplugin.Resource): """ Resource that calls ``systemctl list-units --all`` on the command line to get informations about all systemd units. :param list excludes: A list of systemd unit names. """ name = 'SYSTEMD' def __init__(self, excludes=[]): self.excludes = excludes def re_match(self, unit): for exclude in self.excludes: if re.match(exclude, unit): return(True) return(False) def probe(self): """Query system state and return metrics. :return: generator that emits :class:`~nagiosplugin.metric.Metric` objects """ # We don’t use `systemctl --failed --no-legend`, because we want to # collect performance data of all units. try: # p = subprocess.Popen(['./test/bin/ok/systemctl', 'list-units', '--all'], # noqa: E501 p = subprocess.Popen(['systemctl', 'list-units', '--all'], stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE) stdout, stderr = p.communicate() except OSError as e: raise nagiosplugin.CheckError(e) if stderr: raise nagiosplugin.CheckError(stderr) # Dictionary to store all units according their active state. units = { 'failed': [], 'active': [], 'activating': [], 'inactive': [], } if stdout: lines = stdout.decode('utf-8').splitlines() table_heading = lines[0] # Remove the first line because it is the header. 
# Remove the last seven lines: # empty line # LOAD = Reflects whether the unit definition... # ACTIVE = The high-level unit activation state... # SUB = The low-level unit activation state... # empty line # xxx loaded units listed. Pass --all to see ... # To show all installed unit files use... table_body = lines[1:-7] table_parser = TableParser(table_heading) # Output of `systemctl list-units --all: # UNIT LOAD ACTIVE SUB JOB DESCRIPTION # foobar.service loaded active waiting Description text count_units = 0 for line in table_body: # foobar.service unit = table_parser.get_column_text(line, 'UNIT') # failed active = table_parser.get_column_text(line, 'ACTIVE') # Only count not excluded units. if not self.re_match(unit): # Quick fix: # Issue on Arch: “not-found” in column ACTIVE # maybe cli table output changed on newer versions of # systemd? # maybe .split() is not working correctly? if active not in units: units[active] = [] units[active].append(unit) count_units += 1 for unit in units['failed']: if not self.re_match(unit): yield Metric(name=unit, value='failed', context='unit') for active, unit_names in units.items(): yield Metric(name='units_{}'.format(active), value=len(units[active]), context='performance_data') yield Metric(name='count_units', value=count_units, context='performance_data') if len(units['failed']) == 0: yield Metric(name='all', value=None, context='unit') def format_timespan_to_seconds(fmt_timespan): """Convert a timespan format string into secondes. 
https://github.com/systemd/systemd/blob/master/src/basic/time-util.c#L357 :param str fmt_timespan: for example `2.345s` or `3min 45.234s` or `34min left` or `2 months 8 days` :return: The seconds :rtype: float """ for replacement in [ ['years', 'y'], ['months', 'month'], ['weeks', 'w'], ['days', 'd'], ]: fmt_timespan = fmt_timespan.replace( ' ' + replacement[0], replacement[1] ) seconds = { 'y': 31536000, # 365 * 24 * 60 * 60 'month': 2592000, # 30 * 24 * 60 * 60 'w': 604800, # 7 * 24 * 60 * 60 'd': 86400, # 24 * 60 * 60 'h': 3600, # 60 * 60 'min': 60, 's': 1, 'ms': 0.001, } result = 0 for span in fmt_timespan.split(): match = re.search(r'([\d\.]+)([a-z]+)', span) if match: value = match.group(1) unit = match.group(2) result += float(value) * seconds[unit] return round(float(result), 3) class SystemdAnalyseResource(nagiosplugin.Resource): """Resource that calls ``systemd-analyze`` on the command line to get informations about the startup time..""" name = 'SYSTEMD' def probe(self): """Query system state and return metrics. :return: generator that emits :class:`~nagiosplugin.metric.Metric` objects """ try: p = subprocess.Popen(['systemd-analyze'], stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE) stdout, stderr = p.communicate() except OSError as e: raise nagiosplugin.CheckError(e) if stderr: raise nagiosplugin.CheckError(stderr) if stdout: stdout = stdout.decode('utf-8') # First line: # Startup finished in 1.672s (kernel) + 21.378s (userspace) = # 23.050s # On raspian no second line # Second line: # graphical.target reached after 1min 2.154s in userspace match = re.search(r'reached after (.+) in userspace', stdout) if not match: match = re.search(r' = (.+)\n', stdout) # Output when boot process is not finished: # Bootup is not yet finished. Please try again later. 
if match: yield Metric(name='startup_time', value=format_timespan_to_seconds(match.group(1)), context='startup_time') class TableParser: """A parser for the various table outputs of different systemd commands.""" def __init__(self, heading_row): """ :param str heading_row: A row with column titles. """ self.heading_row = heading_row def detect_column_boundaries(self, column_title): """ :param str column_title: The title of the column, for example UNIT, ACTIVE. The column title must be included in the heading row. """ match = re.search(re.compile(column_title + r'\s*'), self.heading_row) return [match.start(), match.end()] def get_column_text(self, row, column_title): """Get the text of a certain column, that is specified by the column title. Leading and trailing whitespaces are removed. :param str row: The current row of the table to extract a certain column. :param str column_title: The title of the column, for example UNIT, ACTIVE. The column title must be included in the heading row. """ boundaries = self.detect_column_boundaries(column_title) column = row[boundaries[0]:boundaries[1]] return column.strip() class SystemctlListTimersResource(nagiosplugin.Resource): """ Resource that calls ``systemctl list-timers --all`` on the command line to get informations about dead / inactive timers. There is one type of systemd “degradation” which is normally not detected: dead / inactive timers. :param list excludes: A list of systemd unit names to exclude from the checks. 
""" def __init__(self, excludes=[], *args, **kwargs): self.excludes = excludes self.warning = kwargs.pop('warning') self.critical = kwargs.pop('critical') super().__init__(*args, **kwargs) name = 'SYSTEMD' column_names = [ 'NEXT', 'LEFT', 'LAST', 'PASSED', 'UNIT', 'ACTIVATES' ] column_boundaries = None def re_match(self, unit): for exclude in self.excludes: if re.match(exclude, unit): return(True) return(False) def detect_column_boundaries(self, heading): boundaries = [] previous_column_start = 0 for column_title in self.column_names[1:]: next_column_start = heading.index(column_title) boundaries.append([previous_column_start, next_column_start]) previous_column_start = next_column_start return boundaries def get_column_text(self, row, column_name): boundaries = self.column_boundaries[ self.column_names.index(column_name) ] return row[boundaries[0]:boundaries[1]].strip() def probe(self): """ :return: generator that emits :class:`~nagiosplugin.metric.Metric` objects """ try: p = subprocess.Popen(['systemctl', 'list-timers', '--all'], stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE) stdout, stderr = p.communicate() except OSError as e: raise nagiosplugin.CheckError(e) if stderr: raise nagiosplugin.CheckError(stderr) # NEXT LEFT # Sat 2020-05-16 15:11:15 CEST 34min left # LAST PASSED # Sat 2020-05-16 14:31:56 CEST 4min 20s ago # UNIT ACTIVATES # apt-daily.timer apt-daily.service if stdout: lines = stdout.decode('utf-8').splitlines() table_heading = lines[0] self.column_boundaries = self.detect_column_boundaries( table_heading ) # Remove the first line because it is the header. # Remove the two last lines: empty line + "XX timers listed." 
table_body = lines[1:-2] state = nagiosplugin.Ok # ok for row in table_body: unit = self.get_column_text(row, 'UNIT') if self.re_match(unit): continue next_date_time = self.get_column_text(row, 'NEXT') if next_date_time == 'n/a': passed_text = self.get_column_text(row, 'PASSED') if passed_text == 'n/a': state = nagiosplugin.Critical else: passed = format_timespan_to_seconds( passed_text ) if passed_text == 'n/a' or passed >= self.critical: state = nagiosplugin.Critical elif passed >= self.warning: state = nagiosplugin.Warn yield Metric( name=unit, value=state, context='dead_timers' ) class SystemctlIsActiveResource(nagiosplugin.Resource): """Resource that calls ``systemctl is-active `` on the command line to get informations about one specifiy systemd unit.""" name = 'SYSTEMD' def __init__(self, *args, **kwargs): self.unit = kwargs.pop('unit') super().__init__(*args, **kwargs) def probe(self): """Query system state and return metrics. :return: generator that emits :class:`~nagiosplugin.metric.Metric` objects """ # Execute `systemctl is-active ` and get output # - active # - inactive (by unkown unit file) # - failed try: p = subprocess.Popen(['systemctl', 'is-active', self.unit], stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE) stdout, stderr = p.communicate() except OSError as e: raise nagiosplugin.CheckError(e) if stderr: raise nagiosplugin.CheckError(stderr) if stdout: for line in io.StringIO(stdout.decode('utf-8')): active = line.strip() yield Metric(name=self.unit, value=active, context='unit') class UnitContext(nagiosplugin.Context): def __init__(self, args): self.args = args super(UnitContext, self).__init__('unit') def evaluate(self, metric, resource): """Determines state of a given metric. 
:param metric: associated metric that is to be evaluated :param resource: resource that produced the associated metric (may optionally be consulted) + :returns: :class:`~.result.Result` """ if metric.value: hint = '{}: {}'.format(metric.name, metric.value) else: hint = metric.name # The option -u is not specifed if not metric.value: return self.result_cls(nagiosplugin.Ok, metric=metric, hint=hint) if self.args.ignore_inactive_state and metric.value == 'failed': return self.result_cls(nagiosplugin.Critical, metric=metric, hint=hint) elif not self.args.ignore_inactive_state and metric.value != 'active': return self.result_cls(nagiosplugin.Critical, metric=metric, hint=hint) else: return self.result_cls(nagiosplugin.Ok, metric=metric, hint=hint) class DeadTimersContext(nagiosplugin.Context): def __init__(self): super(DeadTimersContext, self).__init__('dead_timers') def evaluate(self, metric, resource): """Determines state of a given metric. :param metric: associated metric that is to be evaluated :param resource: resource that produced the associated metric (may optionally be consulted) + :returns: :class:`~.result.Result` """ return self.result_cls(metric.value, metric=metric, hint=metric.name) class PerformanceDataContext(nagiosplugin.Context): def __init__(self): super(PerformanceDataContext, self).__init__('performance_data') def performance(self, metric, resource): """Derives performance data from a given metric. :param metric: associated metric from which performance data are derived :param resource: resource that produced the associated metric (may optionally be consulted) :returns: :class:`Perfdata` object """ return nagiosplugin.Performance(label=metric.name, value=metric.value) class SystemdSummary(nagiosplugin.Summary): + """Format the different status lines. A subclass of `nagiosplugin.Summary + `_. + """ def ok(self, results): """Formats status line when overall state is ok. 
:param results: :class:`~nagiosplugin.result.Results` container :returns: status line """ for result in results.most_significant: if isinstance(result.context, UnitContext): return '{0}'.format(result) return 'all' def problem(self, results): """Formats status line when overall state is not ok. :param results: :class:`~.result.Results` container + :returns: status line """ summary = [] for result in results.most_significant: if result.context.name in ['startup_time', 'unit', 'dead_timers']: summary.append(result) return ', '.join(['{0}'.format(result) for result in summary]) def verbose(self, results): """Provides extra lines if verbose plugin execution is requested. :param results: :class:`~.result.Results` container + :returns: list of strings """ summary = [] for result in results.most_significant: if result.context.name in ['startup_time', 'unit', 'dead_timers']: summary.append('{0}: {1}'.format(result.state, result)) return summary def get_argparser(): parser = argparse.ArgumentParser( prog='check_systemd', # To get the right command name in the README. formatter_class=lambda prog: argparse.RawDescriptionHelpFormatter(prog, width=80), # noqa: E501 description= # noqa: E251 - 'Copyright (c) 2014-18 Andrea Briganti a.k.a \'Kbyte\' \n' # noqa: E501 + 'Copyright (c) 2014-18 Andrea Briganti \n' 'Copyright (c) 2019-21 Josef Friedrich \n' '\n' 'Nagios / Icinga monitoring plugin to check systemd.\n', # noqa: E501 epilog= # noqa: E251 'Performance data:\n' ' - count_units\n' ' - startup_time\n' ' - units_activating\n' ' - units_active\n' ' - units_failed\n' ' - units_inactive\n', ) exclusive_group = parser.add_mutually_exclusive_group() exclusive_group.add_argument( '-u', '--unit', type=str, dest='unit', help='Name of the systemd unit that is being tested.', ) exclusive_group.add_argument( '-e', '--exclude', metavar='UNIT', action='append', default=[], help='Exclude a systemd unit from the checks. 
This option can be ' 'applied multiple times, for example: -e mnt-data.mount -e ' 'task.service. Regular expressions can be used to exclude ' 'multiple units at once, for example: ' '-e \'user@\\d+\\.service\'. ' 'For more informations see the Python documentation about ' 'regular expressions ' '(https://docs.python.org/3/library/re.html).', ) parser.add_argument( '-n', '--no-startup-time', action='store_true', default=False, help='Don’t check the startup time. Using this option the options ' '\'-w, --warning\' and \'-c, --critical\' have no effect. ' 'Performance data about the startup time is collected, but ' 'no critical, warning etc. states are triggered.', ) parser.add_argument( '-w', '--warning', default=60, metavar='SECONDS', help='Startup time in seconds to result in a warning status. The' 'default is 60 seconds.', ) parser.add_argument( '-c', '--critical', metavar='SECONDS', default=120, help='Startup time in seconds to result in a critical status. The' 'default is 120 seconds.', ) parser.add_argument( '-t', '--dead-timers', action='store_true', help='Detect dead / inactive timers. See the corresponding options ' '\'-W, --dead-timer-warning\' and ' '\'-C, --dead-timers-critical\'. ' 'Dead timers are detected by parsing the output of ' '\'systemctl list-timers\'. ' 'Dead timer rows displaying \'n/a\' in the NEXT and LEFT' 'columns and the time span in the column PASSED exceeds the ' 'values specified with the options \'-W, --dead-timer-warning\' ' 'and \'-C, --dead-timers-critical\'.' ) parser.add_argument( '-W', '--dead-timers-warning', metavar='SECONDS', type=float, default=60 * 60 * 24 * 6, help='Time ago in seconds for dead / inactive timers to trigger a ' 'warning state (by default 6 days).' ) parser.add_argument( '-C', '--dead-timers-critical', metavar='SECONDS', type=float, default=60 * 60 * 24 * 7, help='Time ago in seconds for dead / inactive timers to trigger a ' 'critical state (by default 7 days).' 
) parser.add_argument( '-i', '--ignore-inactive-state', action='store_true', help='Ignore an inactive state on a specific unit. Oneshot services ' 'for example are only active while running and not enabled. ' 'The rest of the time they are inactive. This option has only ' 'an affect if it is used with the option -u.' ) parser.add_argument( '-v', '--verbose', action='count', default=0, help='Increase output verbosity (use up to 3 times).' ) parser.add_argument( '-V', '--version', action='version', version='%(prog)s {}'.format(__version__), ) return parser def main(): - """The main function""" + """The main entry point of the monitoring plugin. First the command line + arguments are read into the variable ``args``. The configuration of this + ``args`` object decides which instances of the `Resource + `_, + `Context + `_ + and `Summary + `_ + subclasses are assembled in a list called ``tasks``. This list is passed + the main class of the ``nagiosplugin`` library: the `Check + `_ + class. + """ args = get_argparser().parse_args() - objects = [] + tasks = [] if args.dead_timers: - objects += [ + tasks += [ SystemctlListTimersResource( excludes=args.exclude, warning=args.dead_timers_warning, critical=args.dead_timers_critical, ), DeadTimersContext() ] if args.unit: - objects.append(SystemctlIsActiveResource(unit=args.unit)) + tasks.append(SystemctlIsActiveResource(unit=args.unit)) else: - objects += [ + tasks += [ SystemctlListUnitsResource(excludes=args.exclude), PerformanceDataContext(), ] analyse = subprocess.run( ['systemd-analyze'], stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE ) # systemd-analyze: boot not finshed exits with 1 if analyse.returncode == 0: - objects.append(SystemdAnalyseResource()) + tasks.append(SystemdAnalyseResource()) - objects += [ + tasks += [ UnitContext(args), SystemdSummary() ] if not args.no_startup_time: - objects.append( + tasks.append( nagiosplugin.ScalarContext( name='startup_time', warning=args.warning, 
critical=args.critical, ) ) else: - objects.append( + tasks.append( nagiosplugin.ScalarContext( name='startup_time' ) ) - check = nagiosplugin.Check(*objects) + check = nagiosplugin.Check(*tasks) check.main(args.verbose) if __name__ == '__main__': main() diff --git a/test/check_systemd_dbus.py b/test/check_systemd_dbus.py new file mode 100755 index 0000000..045a27e --- /dev/null +++ b/test/check_systemd_dbus.py @@ -0,0 +1,256 @@ +#! /usr/bin/python3 + +""" +Work in progress! Do not use this script at the moment! Use the file +`check_systemd.py` instead. It is a rewrite of the plugin using D-Bus instead +of parsing the CLI output. +""" + +import argparse + +import nagiosplugin +from nagiosplugin import Metric + +__version__ = '2.3.0' + + +data_source = 'dbus' +"""This variable indicates which data source should be used for the acquisition +of monitoring informations. It accepts the values ``dbus`` or ``cli``. It +preferes the D-Bus source. """ + + +try: + # Look for gi https://pygobject.readthedocs.io/en/latest/index.html + from gi.repository.Gio import DBusProxy, BusType +except ImportError: + try: + # Fallback to pgi Pure Python GObject Introspection Bindings + # https://github.com/pygobject/pgi + from pgi.repository.Gio import DBusProxy, BusType + except ImportError: + # Fallback to the command line interface source. + data_source = 'cli' + + +class DbusManager: + """ + This class holds the main entry point object of the D-Bus systemd API. See + the section `The Manager Object + `_ + in the systemd D-Bus API. + """ + + def __init__(self): + self.__manager = DBusProxy.new_for_bus_sync( + BusType.SYSTEM, 0, None, 'org.freedesktop.systemd1', + '/org/freedesktop/systemd1', 'org.freedesktop.systemd1.Manager', + None) + + def load_unit(self, unit_name): + """ + Load a systemd D-Bus unit object by it’s name. + + :param str unit_name: A systemd unit name like ``tor.service``, + ``mnt-nextcloud.automount`` or ``update-motd.timer``. 
+ """ + try: + return self.__manager.LoadUnit('(s)', unit_name) + except Exception as e: + raise e + + +dbus_manager = DbusManager() +""" +The systemd D-Bus API main entry point object, the so called “manager”. +""" + + +class SystemdUnitState: + """ + Class that provides easy access to the three state properties + ``ActiveState``, ``SubState`` and ``LoadState`` of the Dbus systemd API. + """ + + def __init__(self, unit_name): + """ + :param str unit_name: A systemd unit name like ``tor.service``, + ``mnt-nextcloud.automount`` or ``update-motd.timer``. + """ + try: + loaded_unit = dbus_manager.load_unit(unit_name) + except Exception as e: + raise e + + self.__dbus_unit = DBusProxy.new_for_bus_sync( + BusType.SYSTEM, 0, None, 'org.freedesktop.systemd1', + loaded_unit, 'org.freedesktop.systemd1.Unit', None) + """ + The systemd D-Bus unit object is fetched by the method + `Gio.DBusProxy.new_for_bus_sync + `_. + """ + + def __get_dbus_property(self, property_name): + """ + Get the property of a systemd D-Bus unit object. This method uses the + methods `Gio.DBusProxy.get_cached_property + `_ + and + `GLib.Variant.unpack + `_ + for the lookup. + """ + return self.__dbus_unit.get_cached_property(property_name).unpack() + + @property + def active_state(self): + """From the `D-Bus interface of systemd documentation + `_: + + ``ActiveState`` contains a state value that reflects whether the unit + is currently active or not. The following states are currently defined: + + * ``active``, + * ``reloading``, + * ``inactive``, + * ``failed``, + * ``activating``, and ``deactivating``. + + ``active`` indicates that unit is active (obviously...). + + ``reloading`` indicates that the unit is active and currently reloading + its configuration. + + ``inactive`` indicates that it is inactive and the previous run was + successful or no previous run has taken place yet. 
+ + ``failed`` indicates that it is inactive and the previous run was not + successful (more information about the reason for this is available on + the unit type specific interfaces, for example for services in the + Result property, see below). + + ``activating`` indicates that the unit has previously been inactive but + is currently in the process of entering an active state. + + Conversely ``deactivating`` indicates that the unit is currently in the + process of deactivation. + """ + return self.__get_dbus_property('ActiveState') + + @property + def sub_state(self): + """From the `D-Bus interface of systemd documentation + `_: + + ``SubState`` encodes states of the same state machine that + ``ActiveState`` covers, but knows more fine-grained states that are + unit-type-specific. Where ``ActiveState`` only covers six high-level + states, ``SubState`` covers possibly many more low-level + unit-type-specific states that are mapped to the six high-level states. + Note that multiple low-level states might map to the same high-level + state, but not vice versa. Not all high-level states have low-level + counterparts on all unit types. + """ + return self.__get_dbus_property('SubState') + + @property + def load_state(self): + """From the `D-Bus interface of systemd documentation + `_: + + ``LoadState`` contains a state value that reflects whether the + configuration file of this unit has been loaded. The following states + are currently defined: + + * ``loaded``, + * ``error`` and + * ``masked``. + + ``loaded`` indicates that the configuration was successfully loaded. + + ``error`` indicates that the configuration failed to load, the + ``LoadError`` field contains information about the cause of this + failure. + + ``masked`` indicates that the unit is currently masked out (i.e. + symlinked to /dev/null or suchlike). 
+ + Note that the ``LoadState`` is fully orthogonal to the ``ActiveState`` + (see below) as units without valid loaded configuration might be active + (because configuration might have been reloaded at a time where a unit + was already active). + """ + return self.__get_dbus_property('LoadState') + + +class UnitResource(nagiosplugin.Resource): + """Get information about one specific systemd unit.""" + + name = 'SYSTEMD' + + def __init__(self, *args, **kwargs): + self.unit = kwargs.pop('unit') + super().__init__(*args, **kwargs) + + def probe(self): + return Metric(name=self.unit, value=SystemdUnitState(self.unit), + context='unit') + + +class UnitContext(nagiosplugin.Context): + + def __init__(self): + super(UnitContext, self).__init__('unit') + + def evaluate(self, metric, resource): + print(metric.value.active_state) + return self.result_cls(nagiosplugin.Ok, metric=metric) + + +def get_argparser(): + parser = argparse.ArgumentParser( + prog='check_systemd', # To get the right command name in the README. 
+ formatter_class=lambda prog: argparse.RawDescriptionHelpFormatter(prog, width=80), # noqa: E501 + description= # noqa: E251 + 'Copyright (c) 2014-18 Andrea Briganti \n' + 'Copyright (c) 2019-21 Josef Friedrich \n' + '\n' + 'Nagios / Icinga monitoring plugin to check systemd.\n', # noqa: E501 + epilog= # noqa: E251 + 'Performance data:\n' + ' - count_units\n' + ' - startup_time\n' + ' - units_activating\n' + ' - units_active\n' + ' - units_failed\n' + ' - units_inactive\n', + ) + + exclusive_group = parser.add_mutually_exclusive_group() + + exclusive_group.add_argument( + '-u', '--unit', + type=str, + dest='unit', + help='Name of the systemd unit that is being tested.', + ) + + return parser + + +def main(): + """The main function""" + args = get_argparser().parse_args() + + objects = [UnitContext()] + + if args.unit: + objects.append(UnitResource(unit=args.unit)) + + check = nagiosplugin.Check(*objects) + check.main() + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini index b20d675..5958283 100644 --- a/tox.ini +++ b/tox.ini @@ -1,30 +1,31 @@ [tox] envlist = py36, py37, py38, py39, flake8, docs [testenv] deps = nose commands = nosetests --nocapture {posargs} [testenv:py36] basepython = python3.6 [testenv:py37] basepython = python3.7 [testenv:py38] basepython = python3.8 [testenv:py39] basepython = python3.9 [testenv:flake8] basepython = python3.8 deps = flake8 commands = flake8 check_systemd.py test setup.py [testenv:docs] basepython = python3.8 deps = + pgi # PGI - Pure Python GObject Introspection Bindings API compatible with PyGObject. sphinx sphinx_rtd_theme commands = sphinx-build -q -b html doc/source {envtmpdir}/html