diff --git a/data/common/common.yaml b/data/common/common.yaml
--- a/data/common/common.yaml
+++ b/data/common/common.yaml
@@ -3164,6 +3164,7 @@
 prometheus::server::listen_network: "%{lookup('internal_network')}"
 prometheus::server::listen_port: 9090
 prometheus::server::certname: pergamon.softwareheritage.org
+prometheus::server::fqdn: pergamon.internal.softwareheritage.org
 
 swh::deploy::environment: production
 
diff --git a/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh b/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
deleted file mode 100644
--- a/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-#
-# File managed by puppet. All modifications will be lost.
-
-# Wrapper calling check_prometheus_metric.sh with the harcoded prometheus query
-# incorrectly parsed when passed to vars.check_prometheus_metric_query
-
-PROGPATH=$(dirname $0)
-
-while getopts ':H:n:c:w:' OPT "$@"
-do
-  case ${OPT} in
-    H) PROMETHEUS_SERVER="$OPTARG" ;;
-    n) METRIC_NAME="$OPTARG" ;;
-    c) CRITICAL_THRESHOLD=${OPTARG}
-      ;;
-    w) WARNING_THRESHOLD=${OPTARG}
-      ;;
-    *) echo "Invalid option ${OPT}"
-      exit 1
-      ;;
-  esac
-done
-
-QUERY='sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})'
-
-${PROGPATH}/check_prometheus_metric.sh -H ${PROMETHEUS_SERVER} -q "${QUERY}" -w ${WARNING_THRESHOLD} -c ${CRITICAL_THRESHOLD} -n "${METRIC_NAME}" -t vector
diff --git a/site-modules/profile/files/icinga2/plugins/check_prometheus_metric.sh b/site-modules/profile/files/icinga2/plugins/check_prometheus_metric
rename from site-modules/profile/files/icinga2/plugins/check_prometheus_metric.sh
rename to site-modules/profile/files/icinga2/plugins/check_prometheus_metric
diff --git a/site-modules/profile/functions/icinga2/literal_var.pp b/site-modules/profile/functions/icinga2/literal_var.pp
new file mode 100644
--- /dev/null
+++ b/site-modules/profile/functions/icinga2/literal_var.pp
@@ -0,0 +1,11 @@
+# Quote a string so it can be handed to the icinga2 module as a literal value
+# (e.g. a PromQL query full of quotes and braces) instead of being parsed.
+#
+# @param argument The raw string to pass through.
+# @return The argument with every `"` and `\` backslash-escaped, wrapped in
+#   `-:"..."`. NOTE(review): `-:` is the icinga2 module's "do not parse" prefix.
+function profile::icinga2::literal_var(String $argument) >> String {
+  # " => \"; \ => \\.
+  $escaped_argument = regsubst($argument, '(["\\\\])', '\\\\\\1', 'G')
+  "-:\"${escaped_argument}\""
+}
diff --git a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
--- a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
@@ -1,8 +1,9 @@
 # Checks that need to be supported on icinga2 agents
 class profile::icinga2::objects::agent_checks {
 
+  $prometheus_host = lookup('prometheus::server::fqdn')
   $prometheus_port = lookup('prometheus::server::listen_port')
-  $prometheus_url = "pergamon.internal.softwareheritage.org:${prometheus_port}"
+  $prometheus_url = "http://${prometheus_host}:${prometheus_port}"
 
   $plugins = {
     'check_journal' => {
@@ -50,29 +51,32 @@
       sudo      => true,
       sudo_user => 'root',
     },
-    'check_prometheus_metric.sh' => {
+    'check_prometheus_metric' => {
       arguments => {
         '-H' => '$check_prometheus_metric_url$',
         '-q' => '$check_prometheus_metric_query$',
         '-w' => '$check_prometheus_metric_warning$',
         '-c' => '$check_prometheus_metric_critical$',
         '-n' => '$check_prometheus_metric_name$',
+        '-m' => '$check_prometheus_comparison_method$',
+        '-t' => '$check_prometheus_query_type$',
+        # Value-less switches: emitted only when the referenced var is true.
+        '-O' => {
+          'set_if' => '$check_prometheus_nan_ok$',
+        },
+        '-P' => {
+          'set_if' => '$check_prometheus_perfdata$',
+        },
       },
       vars => {
-        'check_prometheus_metric_url' => $prometheus_url,
+        # Command-level defaults; var names must match the macros used above.
+        'check_prometheus_metric_url'        => $prometheus_url,
+        'check_prometheus_comparison_method' => 'ge',
+        'check_prometheus_query_type'        => 'scalar',
+        'check_prometheus_nan_ok'            => false,
+        'check_prometheus_perfdata'          => true,
       }
     },
-    'check_belvedere_replication_lag.sh' => {
-      arguments => {
-        '-H' => '$check_prometheus_metric_url$',
-        '-w' => '$check_prometheus_metric_warning$',
-        '-c' => '$check_prometheus_metric_critical$',
-        '-n' => '$check_prometheus_metric_name$',
-      },
-      vars => {
-        'check_prometheus_metric_url' => $prometheus_url,
-      }
-    }
   }
 
   $plugin_dir = '/usr/lib/nagios/plugins'
diff --git a/site-modules/profile/manifests/icinga2/objects/static_checks.pp b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
--- a/site-modules/profile/manifests/icinga2/objects/static_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
@@ -110,13 +110,19 @@
     target => $checks_file,
   }
 
-  $prometheus_host = lookup('prometheus::server::certname')
+  $prometheus_host = lookup('prometheus::server::fqdn')
   ::icinga2::object::service {'Postgresql replication lag (belvedere -> somerset)':
-    check_command => 'check_belvedere_replication_lag.sh',
+    check_command => 'check_prometheus_metric',
     target        => $checks_file,
     host_name     => 'belvedere.internal.softwareheritage.org',
     vars          => {
       check_prometheus_metric_name     => 'pg replication_lag belvedere somerset',
+      # Must be named like the command's '-q' macro; passed as a literal so the
+      # quotes and braces of the query survive.
+      check_prometheus_metric_query    => profile::icinga2::literal_var(
+        'sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})'
+      ),
+      check_prometheus_query_type      => 'vector', # the removed wrapper passed '-t vector'
       check_prometheus_metric_warning  => '1073741824', # 1GiB 1*1024*1024*1024
       check_prometheus_metric_critical => '2147483648', # 2GiB 2*1024*1024*1024
     },