diff --git a/files/icinga2/plugins/check_journal b/files/icinga2/plugins/check_journal
new file mode 100644
index 0000000..467c92e
--- /dev/null
+++ b/files/icinga2/plugins/check_journal
@@ -0,0 +1,206 @@
+#!/usr/bin/python3
+#
+# Check that a systemd journal cursor is within a given lag of the systemd
+# journal head
+#
+# Copyright (c) 2017 The Software Heritage Developers
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Check that a systemd journal cursor is within a given lag of the journal head.
+
+The cursor is read from a state file (/var/lib/journalbeat/cursor-state by
+default) and compared with the cursor of the newest system journal entry.
+"""
+
+import argparse
+import logging
+
+import nagiosplugin
+from nagiosplugin import ScalarContext
+import systemd.journal
+
+_log = logging.getLogger('nagiosplugin')
+
+
+# The BooleanContext class is (c) 2014 Raphael Michel
+# Retrieved from https://github.com/raphaelm/monitoring/blob/master/mail/check_mail_twoway
+# Published under the MIT license
+
+class BooleanContext(nagiosplugin.Context):
+    """This context only cares about boolean values.
+    You can specify using the ``critical``-parameter whether
+    a False result should cause a warning or a critical error.
+    """
+
+    def __init__(self, name, critical=True,
+                 fmt_metric='{name} is {value}',
+                 result_cls=nagiosplugin.result.Result):
+        self.critical = critical
+        super().__init__(name, fmt_metric, result_cls)
+
+    def evaluate(self, metric, resource):
+        if not metric.value and self.critical:
+            return self.result_cls(nagiosplugin.state.Critical, "NOT OK", metric)
+        elif not metric.value and not self.critical:
+            return self.result_cls(nagiosplugin.state.Warn, "NOT OK", metric)
+        else:
+            return self.result_cls(nagiosplugin.state.Ok, "OK", metric)
+
+
+class JournalLag(nagiosplugin.Resource):
+    """Measure how far a saved journal cursor lags behind the journal head"""
+
+    # Cursor fields whose values are hexadecimal integers
+    HEX_KEYS = ('i', 'm', 't')
+    # Cursor fields probe() cannot work without
+    REQUIRED_KEYS = ('b', 'i', 'm', 't')
+
+    def __init__(self, cursorfile):
+        self.cursorfile = cursorfile
+
+    def parse_cursor(self, cursor):
+        """Parse a journald cursor entry
+
+        A cursor is a ';'-separated list of key=value fields. Returns a dict
+        of these fields, with the hexadecimal ones converted to int. Raises
+        nagiosplugin.CheckError if the cursor is malformed or incomplete.
+        """
+        ret = {}
+        for entry in cursor.strip().split(';'):
+            key, sep, value = entry.partition('=')
+            if not sep:
+                raise nagiosplugin.CheckError(
+                    "malformed journal cursor: %r" % cursor)
+            ret[key] = value
+
+        missing = [key for key in self.REQUIRED_KEYS if key not in ret]
+        if missing:
+            raise nagiosplugin.CheckError(
+                "journal cursor %r lacks field(s): %s"
+                % (cursor, ', '.join(missing)))
+
+        for key in self.HEX_KEYS:
+            try:
+                ret[key] = int(ret[key], 16)
+            except ValueError:
+                raise nagiosplugin.CheckError(
+                    "journal cursor field %s is not hexadecimal: %r"
+                    % (key, ret[key]))
+
+        return ret
+
+    def get_file_journal_cursor(self):
+        """Return the stripped contents of the cursor state file"""
+        _log.info("querying the journal cursor cache file %s", self.cursorfile)
+        try:
+            with open(self.cursorfile, 'r') as f:
+                ret = f.read().strip()
+        except OSError as e:
+            raise nagiosplugin.CheckError("failed to read journal cursor file: %s" % e)
+        else:
+            _log.debug("current journal cursor: %s", ret)
+            return ret
+
+    def get_system_journal_cursor(self):
+        """Return the cursor of the newest entry in the system journal"""
+        _log.info("querying the system journal for the current cursor")
+        # Use the reader as a context manager so that it gets closed
+        with systemd.journal.Reader() as reader:
+            reader.seek_tail()
+            entry = reader.get_previous()
+        # get_previous() returns an empty dict when there is no entry
+        if '__CURSOR' not in entry:
+            raise nagiosplugin.CheckError("the system journal has no entries")
+        ret = entry['__CURSOR']
+        _log.debug("current journald cursor: %s", ret)
+        return ret
+
+    def probe(self):
+        """Return the sameboot, entries lag and time lag metrics"""
+        file_cursor = self.parse_cursor(self.get_file_journal_cursor())
+        _log.debug("parsed journal cursor: %s", file_cursor)
+
+        system_cursor = self.parse_cursor(self.get_system_journal_cursor())
+        _log.debug("parsed system cursor: %s", system_cursor)
+
+        # Lags are only computed when both cursors have the same boot id;
+        # otherwise they are reported as 0 and sameboot flags the mismatch.
+        sameboot = system_cursor['b'] == file_cursor['b']
+        seqnum_lag = system_cursor['i'] - file_cursor['i'] if sameboot else 0
+        monotonic_lag = system_cursor['m'] - file_cursor['m'] if sameboot else 0
+        realtime_lag = system_cursor['t'] - file_cursor['t'] if sameboot else 0
+
+        return [
+            nagiosplugin.Metric('sameboot', sameboot, context='sameboot'),
+            nagiosplugin.Metric('entries_lag', seqnum_lag, context='lag_entries'),
+            nagiosplugin.Metric('monotonic_lag', monotonic_lag / 1000000, uom='s', context='lag_time'),
+            nagiosplugin.Metric('realtime_lag', realtime_lag / 1000000, uom='s', context='lag_time'),
+        ]
+
+
+class JournalSummary(nagiosplugin.Summary):
+    """Summarize the journal lag as elapsed time and number of entries"""
+
+    def ok(self, results):
+        return ', '.join([
+            self.temp_lag_display(results),
+            self.items_lag_display(results),
+        ])
+
+    def items_lag_display(self, results):
+        entries = results['entries_lag'].metric.value
+        return '%s %s behind' % (entries, 'entries' if entries != 1 else 'entry')
+
+    def temp_lag_display(self, results):
+        return '%s behind' % str(results['monotonic_lag'].metric)
+
+
+@nagiosplugin.guarded
+def main():
+    """Parse the command line and run the journal lag check"""
+    argp = argparse.ArgumentParser(description=__doc__)
+    argp.add_argument('-v', '--verbose', action='count', default=0,
+                      help='increase output verbosity (use up to 3 times)')
+    argp.add_argument('-f', '--file', metavar='FILE', default='/var/lib/journalbeat/cursor-state',
+                      help='read journald cursor state from this file')
+    argp.add_argument('-w', '--warning', metavar='RANGE', default='1200',
+                      help='return warning if temporal lag is outside RANGE')
+    argp.add_argument('-c', '--critical', metavar='RANGE', default='3600',
+                      help='return critical if temporal lag is outside RANGE')
+    argp.add_argument('-wn', '--warning-entries', metavar='RANGE', default='',
+                      help='return warning if entries lag is outside RANGE')
+    argp.add_argument('-cn', '--critical-entries', metavar='RANGE', default='',
+                      help='return critical if entries lag is outside RANGE')
+
+    args = argp.parse_args()
+
+    check = nagiosplugin.Check(
+        JournalLag(args.file),
+        BooleanContext('sameboot'),
+        ScalarContext('lag_time', args.warning, args.critical),
+        ScalarContext('lag_entries', args.warning_entries, args.critical_entries),
+        JournalSummary(),
+    )
+    check.name = 'JOURNAL LAG'
+    check.main(verbose=args.verbose)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/manifests/icinga2/agent.pp b/manifests/icinga2/agent.pp
index 32ab293..c26bd13 100644
--- a/manifests/icinga2/agent.pp
+++ b/manifests/icinga2/agent.pp
@@ -1,60 +1,62 @@
 # Icinga2 agent configuration
 class profile::icinga2::agent {
   $features = hiera('icinga2::features')
   $icinga2_network = hiera('icinga2::network')
   $hiera_host_vars = hiera_hash('icinga2::host::vars')
   $parent_zone = hiera('icinga2::parent_zone')
   $parent_endpoints = hiera('icinga2::parent_endpoints')

   $local_host_vars = {
     disks => hash(flatten(
       $::mounts.map |$mount| {
         ["disk ${mount}", {disk_partitions => $mount}]
       },
     )),
   }

   class {'::icinga2':
     confd    => false,
     features => $features,
   }

   class { '::icinga2::feature::api':
     accept_config   => true,
     accept_commands => true,
     zones           => {
       'ZoneName' => {
         endpoints => ['NodeName'],
         parent    => $parent_zone,
       },
     },
   }

   create_resources('::icinga2::object::endpoint', $parent_endpoints)

   ::icinga2::object::zone {$parent_zone:
     endpoints => keys($parent_endpoints),
   }

   @@::icinga2::object::endpoint {$::fqdn:
     target => "/etc/icinga2/zones.d/${parent_zone}/${::fqdn}.conf",
   }

   @@::icinga2::object::zone {$::fqdn:
     endpoints => [$::fqdn],
     parent    => $parent_zone,
     target    => "/etc/icinga2/zones.d/${parent_zone}/${::fqdn}.conf",
   }

   @@::icinga2::object::host {$::fqdn:
     address       => ip_for_network($icinga2_network),
     display_name  => $::fqdn,
     check_command => 'hostalive',
     vars          => deep_merge($local_host_vars, $hiera_host_vars),
     target        => "/etc/icinga2/zones.d/${parent_zone}/${::fqdn}.conf",
   }

   icinga2::object::zone { 'global-templates':
     global => true,
   }
+
+  include profile::icinga2::objects::agent_checks
 }
diff --git a/manifests/icinga2/objects.pp b/manifests/icinga2/objects.pp
index 3eb0208..fe62740 100644
--- a/manifests/icinga2/objects.pp
+++ b/manifests/icinga2/objects.pp
@@ -1,11 +1,12 @@
 # Icinga2 object definitions
 class profile::icinga2::objects {
   include profile::icinga2::objects::templates
   include profile::icinga2::objects::commands
   include profile::icinga2::objects::users
   include profile::icinga2::objects::notifications
   include profile::icinga2::objects::timeperiods

   include profile::icinga2::objects::common_checks
   include profile::icinga2::objects::static_checks
+  include profile::icinga2::objects::agent_checks
 }
diff --git a/manifests/icinga2/objects/agent_checks.pp b/manifests/icinga2/objects/agent_checks.pp
new file mode 100644
index 0000000..55aed5f
--- /dev/null
+++ b/manifests/icinga2/objects/agent_checks.pp
@@ -0,0 +1,62 @@
+# Checks that need to be supported on icinga2 agents
+class profile::icinga2::objects::agent_checks {
+  $plugins = {
+    'check_journal' => {
+      arguments => {
+        # No set_if here: icinga2 only accepts boolean/numeric set_if
+        # values, and already skips an argument whose value macro is unset.
+        '-f'  => '$journal_cursor_file$',
+        '-w'  => '$journal_lag_warn$',
+        '-c'  => '$journal_lag_crit$',
+        '-wn' => '$journal_lag_entries_warn$',
+        '-cn' => '$journal_lag_entries_crit$',
+      },
+      vars => {
+        'journal_lag_warn' => 1200,
+        'journal_lag_crit' => 3600,
+      }
+    },
+  }
+
+  $swh_plugin_dir = '/usr/lib/nagios/plugins/swh'
+  $swh_plugin_configfile = '/etc/icinga2/conf.d/swh-plugins.conf'
+
+  $packages = [
+    'python3-nagiosplugin',
+    'python3-systemd',
+    'monitoring-plugins-basic',
+  ]
+  package {$packages:
+    ensure => present,
+  }
+
+  file {$swh_plugin_dir:
+    ensure  => 'directory',
+    owner   => 'root',
+    group   => 'root',
+    mode    => '0755',
+    recurse => true,
+    purge   => true,
+    require => Package[$packages],
+  }
+
+  $plugins.each |$command, $plugin| {
+    $command_path = "${swh_plugin_dir}/${command}"
+    file {$command_path:
+      ensure  => present,
+      owner   => 'root',
+      group   => 'root',
+      mode    => '0755',
+      source  => "puppet:///modules/profile/icinga2/plugins/${command}",
+      require => Package[$packages],
+    }
+
+    ::icinga2::object::checkcommand {$command:
+      import    => ['plugin-check-command'],
+      command   => [$command_path],
+      arguments => $plugin['arguments'],
+      vars      => $plugin['vars'],
+      target    => $swh_plugin_configfile,
+    }
+  }
+}