icinga2: replace the belvedere replication-lag wrapper with the generic check

What this patch does:

  * Deletes plugins/check_belvedere_replication_lag.sh. Per its own header
    comment, the wrapper only existed to hard-code a prometheus query that
    was incorrectly parsed when passed through
    vars.check_prometheus_metric_query.
  * Adds profile::icinga2::literal_var(), which backslash-escapes every `"`
    and `\` in its argument and returns the result wrapped as -:"...".
    (Presumably the `-:` prefix tells the icinga2 puppet module to emit the
    string without parsing it; confirm against the module documentation.)
  * Removes the wrapper's checkcommand and gives check_prometheus_metric.sh
    an optional `-t` argument. The wrapper always called the plugin with
    `-t vector`; without this argument the migrated service would no longer
    pass it. The argument is only emitted when
    check_prometheus_metric_type is set, using the same set_if/len() guard
    already used for check_journal's `-f`.
  * Points the "Postgresql replication lag" service at
    check_prometheus_metric.sh. The query is stored in
    check_prometheus_metric_query, which is the variable the checkcommand's
    `-q` argument reads (an earlier draft used check_prometheus_query, so
    the query never reached the plugin).

NOTE(review): this file was rebuilt from a copy whose newlines had been
collapsed. Indentation, arrow alignment and blank-line placement in context
lines are reconstructed, and the index hashes of the two modified manifests
predate the fixes above. Verify against the tree before applying.

diff --git a/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh b/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
deleted file mode 100644
index 53c14abf..00000000
--- a/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-#
-# File managed by puppet. All modifications will be lost.
-
-# Wrapper calling check_prometheus_metric.sh with the harcoded prometheus query
-# incorrectly parsed when passed to vars.check_prometheus_metric_query
-
-PROGPATH=$(dirname $0)
-
-while getopts ':H:n:c:w:' OPT "$@"
-do
-    case ${OPT} in
-        H) PROMETHEUS_SERVER="$OPTARG" ;;
-        n) METRIC_NAME="$OPTARG" ;;
-        c) CRITICAL_THRESHOLD=${OPTARG}
-            ;;
-        w) WARNING_THRESHOLD=${OPTARG}
-            ;;
-        *) echo "Invalid option ${OPT}"
-            exit 1
-            ;;
-    esac
-done
-
-QUERY='sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})'
-
-${PROGPATH}/check_prometheus_metric.sh -H ${PROMETHEUS_SERVER} -q "${QUERY}" -w ${WARNING_THRESHOLD} -c ${CRITICAL_THRESHOLD} -n "${METRIC_NAME}" -t vector
diff --git a/site-modules/profile/functions/icinga2/literal_var.pp b/site-modules/profile/functions/icinga2/literal_var.pp
new file mode 100644
index 00000000..abdd25ad
--- /dev/null
+++ b/site-modules/profile/functions/icinga2/literal_var.pp
@@ -0,0 +1,5 @@
+function profile::icinga2::literal_var(String $argument) >> String {
+  # " => \"; \ => \\.
+  $escaped_argument = regsubst($argument, '(["\\\\])', '\\\\\\1', 'G')
+  "-:\"${escaped_argument}\""
+}
diff --git a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
index 7b437a4c..5d33d3fc 100644
--- a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
@@ -1,170 +1,163 @@
 # Checks that need to be supported on icinga2 agents
 class profile::icinga2::objects::agent_checks {
   $prometheus_host = lookup('prometheus::server::fqdn')
   $prometheus_port = lookup('prometheus::server::listen_port')
   $prometheus_url = "http://${prometheus_host}:${prometheus_port}"

   $plugins = {
     'check_journal' => {
       arguments => {
         '-f' => {
           'value'  => '$journal_cursor_file$',
           'set_if' => '{{ var filename = macro("$journal_cursor_file$"); return len(filename) > 0 }}',
         },
         '-w' => '$journal_lag_warn$',
         '-c' => '$journal_lag_crit$',
         '-wn' => {
           'value'  => '$journal_lag_entries_warn$',
           'set_if' => '$journal_lag_entries_warn$',
         },
         '-cn' => {
           'value'  => '$journal_lag_entries_crit$',
           'set_if' => '$journal_lag_entries_crit$',
         },
       },
       vars => {
         'journal_lag_warn' => 1200,
         'journal_lag_crit' => 3600,
       },
       sudo => true,
       sudo_user => 'root',
     },
     'check_newest_file_age' => {
       arguments => {
         '-d' => '$check_directory$',
         '-w' => '$check_directory_warn_age$',
         '-c' => '$check_directory_crit_age$',
         '-W' => {
           'set_if' => '$check_directory_missing_warn$',
         },
         '-C' => {
           'set_if' => '$check_directory_missing_crit$',
         },
       },
       vars => {
         'check_directory_warn_age' => 26,
         'check_directory_crit_age' => 52,
         'check_directory_missing_warn' => false,
         'check_directory_missing_crit' => true,
       },
       sudo => true,
       sudo_user => 'root',
     },
     'check_prometheus_metric.sh' => {
       arguments => {
         '-H' => '$check_prometheus_metric_url$',
         '-q' => '$check_prometheus_metric_query$',
         '-w' => '$check_prometheus_metric_warning$',
         '-c' => '$check_prometheus_metric_critical$',
         '-n' => '$check_prometheus_metric_name$',
+        '-t' => {
+          'value'  => '$check_prometheus_metric_type$',
+          'set_if' => '{{ var query_type = macro("$check_prometheus_metric_type$"); return len(query_type) > 0 }}',
+        },
       },
       vars => {
         'check_prometheus_metric_url' => $prometheus_url,
       }
     },
-    'check_belvedere_replication_lag.sh' => {
-      arguments => {
-        '-H' => '$check_prometheus_metric_url$',
-        '-w' => '$check_prometheus_metric_warning$',
-        '-c' => '$check_prometheus_metric_critical$',
-        '-n' => '$check_prometheus_metric_name$',
-      },
-      vars => {
-        'check_prometheus_metric_url' => $prometheus_url,
-      }
-    }
   }

   $plugin_dir = '/usr/lib/nagios/plugins'
   $swh_plugin_dir = "${plugin_dir}/swh"
   $swh_plugin_configfile = '/etc/icinga2/conf.d/swh-plugins.conf'

   $packages = [
     'python3-nagiosplugin',
     'python3-systemd',
     'monitoring-plugins-basic',
     'monitoring-plugins-systemd', # in swh repository
   ]

   package {$packages:
     ensure => present,
   }

   file {$swh_plugin_dir:
     ensure  => 'directory',
     owner   => 'root',
     group   => 'root',
     mode    => '0755',
     recurse => true,
     purge   => true,
     require => Package[$packages],
   }

   $plugins.each |$command, $plugin| {
     $command_path = "${swh_plugin_dir}/${command}"
     file {$command_path:
       ensure  => present,
       owner   => 'root',
       group   => 'root',
       mode    => '0755',
       source  => "puppet:///modules/profile/icinga2/plugins/${command}",
       require => Package[$packages],
     }

     if $plugin['sudo'] {
       $sudo_user = $plugin['sudo_user']
       $icinga_command = ['sudo', '-u', $sudo_user, $command_path]
       ::sudo::conf { "icinga-${command}":
         ensure   => present,
         content  => "nagios ALL=(${sudo_user}) NOPASSWD: ${command_path}",
         priority => 50,
       }
     } else {
       $icinga_command = [$command_path]
       ::sudo::conf { "icinga-${command}":
         ensure => absent,
       }
     }

     ::icinga2::object::checkcommand {$command:
       import    => ['plugin-check-command'],
       command   => $icinga_command,
       arguments => $plugin['arguments'],
       vars      => $plugin['vars'],
       target    => $swh_plugin_configfile,
     }
   }

   $check_command = "check_systemd"
   $check_command_path = "${plugin_dir}/${check_command}"

   ::icinga2::object::checkcommand {$check_command:
     import    => ['plugin-check-command'],
     command   => [
       $check_command_path
     ],
     arguments => {
       '--unit' => {
         value       => '$systemd_units$',
         description => 'Name of the systemd units that are being tested.',
         repeat_key  => true,
       },
       '--exclude' => {
         value       => '$systemd_excludes$',
         description => '-:"Name of the systemd units to exclude from checks can be a regular expression)."',
         repeat_key  => true,
       },
       '--no-startup-time' => {
         set_if      => '{{ !macro("$systemd_check_startup_time$") }}',
         description => 'Whether to check the system startup time'
       },
     },
     vars      => {
       systemd_units              => [],
       systemd_excludes           => [],
       systemd_check_startup_time => false,
     },
     target    => $swh_plugin_configfile,
     require   => Package[$packages],
   }
 }
diff --git a/site-modules/profile/manifests/icinga2/objects/static_checks.pp b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
index 04884282..2e320048 100644
--- a/site-modules/profile/manifests/icinga2/objects/static_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
@@ -1,139 +1,142 @@
 # Static checks on the icinga master
 class profile::icinga2::objects::static_checks {

   $checks_file = '/etc/icinga2/conf.d/static-checks.conf'

   ::icinga2::object::host {'www.softwareheritage.org':
     import        => ['generic-host'],
     check_command => 'dummy',
     address       => 'www.softwareheritage.org',
     target        => $checks_file,
     vars          => {
       dummy_state => 0, # up
       dummy_text  => "HTTP-only host",
     },
   }

   ::icinga2::object::host {'softwareheritage.org':
     import        => ['generic-host'],
     check_command => 'dummy',
     address       => 'softwareheritage.org',
     target        => $checks_file,
     vars          => {
       dummy_state => 0, # up
       dummy_text  => "HTTP-only host",
     },
   }

   ::icinga2::object::host {'graphql.staging.swh.network':
     import        => ['generic-host'],
     check_command => 'dummy',
     address       => 'graphql.staging.swh.network',
     target        => $checks_file,
     vars          => {
       dummy_state => 0, # up
       dummy_text  => "HTTP-only host",
     },
   }

   ::icinga2::object::service {'Software Heritage Homepage':
     import        => ['generic-service'],
     host_name     => 'www.softwareheritage.org',
     check_command => 'http',
     target        => $checks_file,
     vars          => {
       http_vhost  => 'www.softwareheritage.org',
       http_uri    => '/',
       http_ssl    => true,
       http_sni    => true,
       http_string => 'Software Heritage',
     },
   }

   ::icinga2::object::service {'Software Heritage Homepage (redirect to www)':
     import        => ['generic-service'],
     host_name     => 'softwareheritage.org',
     check_command => 'http',
     target        => $checks_file,
     vars          => {
       http_vhost => 'softwareheritage.org',
       http_uri   => '/',
       http_ssl   => true,
       http_sni   => true,
     },
   }

   ::icinga2::object::host {'swh-logging-prod':
     check_command => 'dummy',
     address       => '127.0.0.1',
     target        => $checks_file,
     vars          => {
       dummy_state => 0, # up
       dummy_text  => "virtual host for clustered checks",
     },
   }

   ::icinga2::object::service {'swh-logging-prod cluster':
     host_name     => 'swh-logging-prod',
     check_command => 'check_escluster',
     target        => $checks_file,
   }

   ::icinga2::object::checkcommand {'check_escluster':
     import  => ['plugin-check-command'],
     command => '/usr/lib/nagios/plugins/icinga_check_elasticsearch.sh',
     target  => $checks_file,
   }

   ::icinga2::object::host {'DNS resolvers':
     check_command => 'dummy',
     address       => '127.0.0.1',
     target        => $checks_file,
     vars          => {
       dummy_state => 0, # up
       dummy_text  => "virtual host for clustered checks",
     },
   }

   ::icinga2::object::service {'SOA':
     host_name     => 'DNS resolvers',
     check_command => 'check_resolvers',
     target        => $checks_file,
   }

   ::icinga2::object::checkcommand {'check_resolvers':
     import  => ['plugin-check-command'],
     command => [
       '/usr/lib/nagios/plugins/dsa-nagios-checks_checks_dsa-check-soas.txt',
       'internal.softwareheritage.org',
     ],
     target  => $checks_file,
   }

-  $prometheus_host = lookup('prometheus::server::certname')

   ::icinga2::object::service {'Postgresql replication lag (belvedere -> somerset)':
-    check_command => 'check_belvedere_replication_lag.sh',
+    check_command => 'check_prometheus_metric.sh',
     target        => $checks_file,
     host_name     => 'belvedere.internal.softwareheritage.org',
     vars          => {
       check_prometheus_metric_name     => 'pg replication_lag belvedere somerset',
+      check_prometheus_metric_query    => profile::icinga2::literal_var(
+        'sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})'
+      ),
+      check_prometheus_metric_type     => 'vector',
       check_prometheus_metric_warning  => '1073741824', # 1GiB 1*1024*1024*1024
       check_prometheus_metric_critical => '2147483648', # 2GiB 2*1024*1024*1024
     },
   }

   ::icinga2::object::service {'Software Heritage Staging Graphql Instance':
     import        => ['generic-service'],
     host_name     => 'graphql.staging.swh.network',
     check_command => 'http',
     target        => $checks_file,
     vars          => {
       http_vhost  => 'graphql.staging.swh.network',
       http_uri    => '/',
       http_ssl    => true,
       http_sni    => true,
       http_string => 'GraphQL Playground',
     },
   }
 }