diff --git a/files/icinga2/plugins/check_journal b/files/icinga2/plugins/check_journal index 467c92e..af2cc8a 100644 --- a/files/icinga2/plugins/check_journal +++ b/files/icinga2/plugins/check_journal @@ -1,161 +1,161 @@ #!/usr/bin/python3 # # Check that a systemd journal cursor is within a given lag of the systemd # journal head # # Copyright (c) 2017 The Software Heritage Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import argparse import logging import nagiosplugin from nagiosplugin import ScalarContext import systemd.journal _log = logging.getLogger('nagiosplugin') # The BooleanContext class is (c) 2014 Raphael Michel # Retrieved from https://github.com/raphaelm/monitoring/blob/master/mail/check_mail_twoway # Published under the MIT license class BooleanContext(nagiosplugin.Context): """This context only cares about boolean values. You can specify using the ``critical``-parameter whether a False result should cause a warning or a critical error. """ def __init__(self, name, critical=True, fmt_metric='{name} is {value}', result_cls=nagiosplugin.result.Result): self.critical = critical super().__init__(name, fmt_metric, result_cls) def evaluate(self, metric, resource): if not metric.value and self.critical: return self.result_cls(nagiosplugin.state.Critical, "NOT OK", metric) elif not metric.value and not self.critical: return self.result_cls(nagiosplugin.state.Warn, "NOT OK", metric) else: return self.result_cls(nagiosplugin.state.Ok, "OK", metric) class JournalLag(nagiosplugin.Resource): """Check journal lag""" def __init__(self, cursorfile): self.cursorfile = cursorfile def parse_cursor(self, cursor): """Parse a journald cursor entry""" entries = cursor.strip().split(';') ret = {} for entry in entries: key, value = entry.split('=') ret[key] = value for key in ('i', 'm', 't'): # Those cursor keys are hexadecimal if key in ret: ret[key] = int(ret[key], 16) return ret def get_file_journal_cursor(self): _log.info("querying the journal cursor cache file %s" % self.cursorfile) try: with open(self.cursorfile, 'r') as f: ret = f.read().strip() except OSError as e: raise nagiosplugin.CheckError("failed to read journal cursor file: %s" % e) else: _log.debug("current journal cursor: %s" % ret) return ret def get_system_journal_cursor(self): _log.info("querying the system journal for the current cursor") reader = systemd.journal.Reader() reader.seek_tail() ret = reader.get_previous()['__CURSOR'] _log.debug("current journald cursor: %s" % ret) return ret def probe(self): file_cursor = self.parse_cursor(self.get_file_journal_cursor()) _log.debug("parsed journal cursor: %s" % file_cursor) system_cursor = self.parse_cursor(self.get_system_journal_cursor()) _log.debug("parsed system cursor: %s" % system_cursor) sameboot = system_cursor['b'] == file_cursor['b'] seqnum_lag = system_cursor['i'] - file_cursor['i'] if sameboot else 0 monotonic_lag = system_cursor['m'] - file_cursor['m'] if sameboot else 0 - realtime_lag = system_cursor['t'] - file_cursor['t'] if sameboot else 0 + realtime_lag = system_cursor['t'] - file_cursor['t'] return [ nagiosplugin.Metric('sameboot', sameboot, context='sameboot'), nagiosplugin.Metric('entries_lag', seqnum_lag, context='lag_entries'), nagiosplugin.Metric('monotonic_lag', monotonic_lag / 1000000, uom='s', context='lag_time'), nagiosplugin.Metric('realtime_lag', realtime_lag / 1000000, uom='s', context='lag_time'), ] class JournalSummary(nagiosplugin.Summary): def ok(self, results): return ', '.join([ self.temp_lag_display(results), self.items_lag_display(results), ]) def items_lag_display(self, results): entries = results['entries_lag'].metric.value return '%s %s behind' % (entries, 'entries' if entries != 1 else 'entry') def temp_lag_display(self, results): return '%s behind' % str(results['monotonic_lag'].metric) @nagiosplugin.guarded def main(): argp = argparse.ArgumentParser(description=__doc__) argp.add_argument('-v', '--verbose', action='count', default=0, help='increase output verbosity (use up to 3 times)') argp.add_argument('-f', '--file', metavar='FILE', default='/var/lib/journalbeat/cursor-state', help='read journald cursor state from this file') argp.add_argument('-w', '--warning', metavar='RANGE', default='1200', help='return warning if temporal lag is outside RANGE') argp.add_argument('-c', '--critical', metavar='RANGE', default='3600', help='return critical if temporal lag is outside RANGE') argp.add_argument('-wn', '--warning-entries', metavar='RANGE', default='', help='return warning if entries lag is outside RANGE') argp.add_argument('-cn', '--critical-entries', metavar='RANGE', default='', help='return critical if entries lag is outside RANGE') args = argp.parse_args() check = nagiosplugin.Check( JournalLag(args.file), BooleanContext('sameboot'), ScalarContext('lag_time', args.warning, args.critical), ScalarContext('lag_entries', args.warning_entries, args.critical_entries), JournalSummary(), ) check.name = 'JOURNAL LAG' check.main(verbose=args.verbose) if __name__ == '__main__': main()