Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9348226
D6050.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D6050.diff
View Options
diff --git a/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh b/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
new file mode 100644
--- /dev/null
+++ b/site-modules/profile/files/icinga2/plugins/check_belvedere_replication_lag.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#
+# File managed by puppet. All modifications will be lost.
+
+# Wrapper calling check_prometheus_metric.sh with a hardcoded prometheus query,
+# because the query is incorrectly parsed when passed through vars.check_prometheus_metric_query
+
+# Directory holding this plugin; quoted so a path containing spaces survives.
+PROGPATH=$(dirname "$0")
+
+# Only -H, -n, -c and -w are accepted; the query and its type are fixed below.
+while getopts ':H:n:c:w:' OPT "$@"
+do
+  case ${OPT} in
+    H) PROMETHEUS_SERVER="$OPTARG" ;;
+    n) METRIC_NAME="$OPTARG" ;;
+    c) CRITICAL_THRESHOLD="$OPTARG" ;;
+    w) WARNING_THRESHOLD="$OPTARG" ;;
+    # The leading ':' in the optstring makes getopts store the offending option in OPTARG.
+    *) echo "UNKNOWN - invalid or incomplete option -${OPTARG}"
+       exit 3 ;; # 3 is the Nagios UNKNOWN status; 1 would be reported as WARNING
+  esac
+done
+
+QUERY='sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})'
+
+"${PROGPATH}/check_prometheus_metric.sh" -H "${PROMETHEUS_SERVER}" -q "${QUERY}" -w "${WARNING_THRESHOLD}" -c "${CRITICAL_THRESHOLD}" -n "${METRIC_NAME}" -t vector
diff --git a/site-modules/profile/files/icinga2/plugins/check_prometheus_metric.sh b/site-modules/profile/files/icinga2/plugins/check_prometheus_metric.sh
new file mode 100644
--- /dev/null
+++ b/site-modules/profile/files/icinga2/plugins/check_prometheus_metric.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+
+#
+# File managed by puppet. All modifications will be lost.
+
+#
+# check_prometheus_metric.sh - Nagios plugin wrapper for checking Prometheus
+# metrics. Requires curl and jq to be in $PATH.
+
+####
+# SWH
+# Script copied from https://github.com/prometheus/nagios_plugins
+# https://archive.softwareheritage.org/swh:1:cnt:88897d9be48bd7635866a0baa2ca950669d277fb;origin=https://github.com/prometheus/nagios_plugins;visit=swh:1:snp:336a21610d7ebbcdff88532112fb503ace89488b;anchor=swh:1:rev:767cf2f31506827570895674faa5c395ba482406;path=/check_prometheus_metric.sh
+
+# Avoid locale complications:
+export LC_ALL=C
+
+# Default configuration:
+CURL_OPTS=()
+COMPARISON_METHOD=ge
+NAN_OK="false"
+NAGIOS_INFO="false"
+PERFDATA="false"
+PROMETHEUS_QUERY_TYPE="scalar"
+
+# Nagios status codes:
+OK=0
+WARNING=1
+CRITICAL=2
+UNKNOWN=3
+
+if ! type curl >/dev/null 2>&1
+then
+ echo 'ERROR: Missing "curl" command'
+ exit ${UNKNOWN}
+fi
+
+if ! type jq >/dev/null 2>&1
+then
+ echo 'ERROR: Missing "jq" command'
+ exit ${UNKNOWN}
+fi
+
+function usage {
+  # Print the help text on stdout; callers capture it into NAGIOS_LONG_TEXT.
+  cat <<'EoL'
+
+ check_prometheus_metric.sh - Nagios plugin wrapper for checking Prometheus
+ metrics. Requires curl and jq to be in $PATH.
+
+ Usage:
+ check_prometheus_metric.sh -H HOST -q QUERY -w INT -c INT -n NAME [-m METHOD] [-C CURL_OPTS] [-O] [-i] [-p] [-t QUERY_TYPE]
+
+ options:
+ -H HOST URL of Prometheus host to query.
+ -q QUERY Prometheus query, in single quotes, that returns by default a float or int (see -t).
+ -w INT Warning level value (must be zero or positive).
+ -c INT Critical level value (must be zero or positive).
+ -n NAME A name for the metric being checked.
+ -m METHOD Comparison method, one of gt, ge, lt, le, eq, ne.
+ (Defaults to ge unless otherwise specified.)
+ -C CURL_OPTS Additional flags to pass to curl.
+ Can be passed multiple times. Options and option values must be passed separately.
+ e.g. -C --connect-timeout -C 10 -C --cacert -C /path/to/ca.crt
+ -O Accept NaN as an "OK" result.
+ -i Print the extra metric information into the Nagios message.
+ -p Add perfdata to check output.
+ -t QUERY_TYPE Prometheus query return type: scalar (default) or vector.
+ The first element of the vector is used for the check.
+
+EoL
+}
+
+
+function process_command_line {
+  # Parse "$@" into the global settings; on bad input set the Nagios texts and exit.
+  while getopts ':H:q:w:c:m:n:C:Oipt:' OPT "$@"
+  do
+    case ${OPT} in
+      H) PROMETHEUS_SERVER="$OPTARG" ;;
+      q) PROMETHEUS_QUERY="$OPTARG" ;;
+      n) METRIC_NAME="$OPTARG" ;;
+
+      m) if [[ ${OPTARG} =~ ^([lg][et]|eq|ne)$ ]]
+         then
+           COMPARISON_METHOD=${OPTARG}
+         else
+           NAGIOS_SHORT_TEXT="invalid comparison method: ${OPTARG}"
+           NAGIOS_LONG_TEXT="$(usage)"
+           exit
+         fi
+         ;;
+      # Both thresholds are restricted to non-negative integers.
+      c) if [[ ${OPTARG} =~ ^[0-9]+$ ]]
+         then
+           CRITICAL_LEVEL=${OPTARG}
+         else
+           NAGIOS_SHORT_TEXT='-c CRITICAL_LEVEL requires an integer'
+           NAGIOS_LONG_TEXT="$(usage)"
+           exit
+         fi
+         ;;
+
+      w) if [[ ${OPTARG} =~ ^[0-9]+$ ]]
+         then
+           WARNING_LEVEL=${OPTARG}
+         else
+           NAGIOS_SHORT_TEXT='-w WARNING_LEVEL requires an integer'
+           NAGIOS_LONG_TEXT="$(usage)"
+           exit
+         fi
+         ;;
+
+      C) CURL_OPTS+=("${OPTARG}")
+         ;;
+      O) NAN_OK="true"
+         ;;
+
+      i) NAGIOS_INFO="true"
+         ;;
+
+      p) PERFDATA="true"
+         ;;
+
+      t) if [[ ${OPTARG} =~ ^(scalar|vector)$ ]]
+         then
+           PROMETHEUS_QUERY_TYPE=${OPTARG}
+         else
+           NAGIOS_SHORT_TEXT="invalid query type: ${OPTARG}"
+           NAGIOS_LONG_TEXT="$(usage)"
+           exit
+         fi
+         ;;
+      # Silent getopts mode (leading ':'): '?' = unknown option, OPTARG holds its letter.
+      \?) NAGIOS_SHORT_TEXT="invalid option: -$OPTARG"
+          NAGIOS_LONG_TEXT="$(usage)"
+          exit
+          ;;
+      # ':' = the option named in OPTARG was given without its required argument.
+      \:) NAGIOS_SHORT_TEXT="-$OPTARG requires an argument"
+          NAGIOS_LONG_TEXT="$(usage)"
+          exit
+          ;;
+    esac
+  done
+
+  # check for missing parameters
+  if [[ -z ${PROMETHEUS_SERVER} ]] ||
+     [[ -z ${PROMETHEUS_QUERY} ]] ||
+     [[ -z ${PROMETHEUS_QUERY_TYPE} ]] ||
+     [[ -z ${METRIC_NAME} ]] ||
+     [[ -z ${WARNING_LEVEL} ]] ||
+     [[ -z ${CRITICAL_LEVEL} ]]
+  then
+    NAGIOS_SHORT_TEXT='missing required option'
+    NAGIOS_LONG_TEXT="$(usage)"
+    exit
+  fi
+}
+
+function on_exit {
+  # Trap handler: print the Nagios status line, then exit with the matching code.
+  trap - EXIT TERM # the exit below must not re-enter this handler
+
+  # Anything that did not set a status or message is reported as UNKNOWN.
+  : "${NAGIOS_STATUS:=UNKNOWN}"
+  : "${NAGIOS_SHORT_TEXT:=an unknown error occurred}"
+
+  # Status line: "<STATUS> - <short text>".
+  printf '%s - %s\n' "${NAGIOS_STATUS}" "${NAGIOS_SHORT_TEXT}"
+
+  # Optional extra lines (e.g. the usage text).
+  if [[ -n ${NAGIOS_LONG_TEXT} ]]
+  then
+    printf '%s\n' "${NAGIOS_LONG_TEXT}"
+  fi
+
+  # NAGIOS_STATUS holds the *name* (OK, WARNING, CRITICAL, UNKNOWN) of the
+  # variable carrying the numeric code, hence the indirect expansion.
+  exit "${!NAGIOS_STATUS}"
+}
+
+
+function get_prometheus_raw_result {
+  # Send PROMETHEUS_QUERY to the server's /api/v1/query endpoint and print the
+  # ".data.result" member of the JSON reply, without a trailing newline.
+  printf '%s' "$(
+    curl -sgG "${CURL_OPTS[@]}" --data-urlencode "query=${PROMETHEUS_QUERY}" "${PROMETHEUS_SERVER}/api/v1/query" |
+      jq -r '.data.result'
+  )"
+}
+
+function get_prometheus_scalar_result {
+  # $1: JSON array whose element at index 1 is the sample value.
+  # Prints the value rounded to an integer, or the raw string if it is not numeric.
+  local _RESULT
+  # Quote $1: unquoted, the JSON would be word-split and glob-expanded by the shell.
+  _RESULT=$(printf '%s\n' "$1" | jq -r '.[1]')
+
+  if [[ ${_RESULT} =~ ^-?[0-9]+\.?[0-9]*$ ]]
+  then
+    printf '%.0F' "${_RESULT}" # return an int if result is a number
+  else
+    case "${_RESULT}" in
+      +Inf) printf '%.0F' $(( ${WARNING_LEVEL} + ${CRITICAL_LEVEL} )) # something greater than either level
+            ;;
+      -Inf) printf -- '-1' # something smaller than any level
+            ;;
+      *) printf '%s' "${_RESULT}" # otherwise return as a string
+         ;;
+    esac
+  fi
+}
+
+function get_prometheus_vector_value {
+  # $1: JSON array (vector result); prints the "value" member of element 0.
+  local _RESULT
+
+  # Quote $1 so the JSON is not word-split or glob-expanded by the shell.
+  _RESULT=$(printf '%s\n' "$1" | jq -r '.[0].value?')
+  printf '%s' "${_RESULT}"
+
+}
+
+function get_prometheus_vector_metric {
+  # $1: JSON array (vector result); prints the "metric" member of element 0.
+  local _RESULT
+
+  # Quote $1 so label text is not word-split or glob-expanded; xargs then
+  # joins jq's multi-line output into a single line.
+  _RESULT=$(printf '%s\n' "$1" | jq -r '.[0].metric?' | xargs)
+  printf '%s' "${_RESULT}"
+}
+
+# set up exit function
+trap on_exit EXIT TERM
+
+# process the cli options
+process_command_line "$@"
+
+# get the raw query from prometheus
+PROMETHEUS_RAW_RESULT="$( get_prometheus_raw_result )"
+
+# extract the metric value from the raw prometheus result
+if [[ "${PROMETHEUS_QUERY_TYPE}" = "scalar" ]]
+then
+ PROMETHEUS_RESULT=$( get_prometheus_scalar_result "$PROMETHEUS_RAW_RESULT" )
+ PROMETHEUS_METRIC=UNKNOWN
+else
+ PROMETHEUS_VALUE=$( get_prometheus_vector_value "$PROMETHEUS_RAW_RESULT" )
+ PROMETHEUS_RESULT=$( get_prometheus_scalar_result "$PROMETHEUS_VALUE" )
+ PROMETHEUS_METRIC=$( get_prometheus_vector_metric "$PROMETHEUS_RAW_RESULT" )
+fi
+
+# check the value
+if [[ ${PROMETHEUS_RESULT} =~ ^-?[0-9]+$ ]]
+then
+ if eval [[ ${PROMETHEUS_RESULT} -${COMPARISON_METHOD} ${CRITICAL_LEVEL} ]]
+ then
+ NAGIOS_STATUS=CRITICAL
+ NAGIOS_SHORT_TEXT="${METRIC_NAME} is ${PROMETHEUS_RESULT}"
+ elif eval [[ ${PROMETHEUS_RESULT} -${COMPARISON_METHOD} $WARNING_LEVEL ]]
+ then
+ NAGIOS_STATUS=WARNING
+ NAGIOS_SHORT_TEXT="${METRIC_NAME} is ${PROMETHEUS_RESULT}"
+ else
+ NAGIOS_STATUS=OK
+ NAGIOS_SHORT_TEXT="${METRIC_NAME} is ${PROMETHEUS_RESULT}"
+ fi
+else
+ if [[ "${NAN_OK}" = "true" && "${PROMETHEUS_RESULT}" = "NaN" ]]
+ then
+ NAGIOS_STATUS=OK
+ NAGIOS_SHORT_TEXT="${METRIC_NAME} is ${PROMETHEUS_RESULT}"
+ else
+ NAGIOS_SHORT_TEXT="unable to parse prometheus response"
+ NAGIOS_LONG_TEXT="${METRIC_NAME} is ${PROMETHEUS_RESULT}"
+ fi
+fi
+if [[ "${NAGIOS_INFO}" = "true" ]]
+then
+ NAGIOS_SHORT_TEXT="${NAGIOS_SHORT_TEXT}: ${PROMETHEUS_METRIC}"
+fi
+if [[ "${PERFDATA}" = "true" ]]
+then
+ NAGIOS_SHORT_TEXT="${NAGIOS_SHORT_TEXT} | query_result=${PROMETHEUS_RESULT}"
+fi
+
+exit
diff --git a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
--- a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp
@@ -1,5 +1,9 @@
# Checks that need to be supported on icinga2 agents
class profile::icinga2::objects::agent_checks {
+
+ $prometheus_port = lookup('prometheus::server::listen_port')
+ $prometheus_url = "pergamon.internal.softwareheritage.org:${prometheus_port}"
+
$plugins = {
'check_journal' => {
arguments => {
@@ -46,6 +50,29 @@
sudo => true,
sudo_user => 'root',
},
+ 'check_prometheus_metric.sh' => {
+ arguments => {
+ '-H' => '$check_prometheus_metric_url$',
+ '-q' => '$check_prometheus_metric_query$',
+ '-w' => '$check_prometheus_metric_warning$',
+ '-c' => '$check_prometheus_metric_critical$',
+ '-n' => '$check_prometheus_metric_name$',
+ },
+ vars => {
+ 'check_prometheus_metric_url' => $prometheus_url,
+ }
+ },
+ 'check_belvedere_replication_lag.sh' => {
+ arguments => {
+ '-H' => '$check_prometheus_metric_url$',
+ '-w' => '$check_prometheus_metric_warning$',
+ '-c' => '$check_prometheus_metric_critical$',
+ '-n' => '$check_prometheus_metric_name$',
+ },
+ vars => {
+ 'check_prometheus_metric_url' => $prometheus_url,
+ }
+ }
}
$swh_plugin_dir = '/usr/lib/nagios/plugins/swh'
diff --git a/site-modules/profile/manifests/icinga2/objects/static_checks.pp b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
--- a/site-modules/profile/manifests/icinga2/objects/static_checks.pp
+++ b/site-modules/profile/manifests/icinga2/objects/static_checks.pp
@@ -98,4 +98,17 @@
],
target => $checks_file,
}
+
+ $prometheus_host = lookup('prometheus::server::certname')
+ ::icinga2::object::service {'Postgresql replication lag (belvedere -> somerset)':
+ check_command => 'check_belvedere_replication_lag.sh',
+ target => $checks_file,
+ host_name => 'belvedere.internal.softwareheritage.org',
+ vars => {
+ check_prometheus_metric_name => 'pg replication_lag belvedere somerset',
+ check_prometheus_metric_warning => '107374182400', # 100GiB 100*1024*1024*1024
+ check_prometheus_metric_critical => '214748364800', # 200GiB 200*1024*1024*1024
+ },
+ }
+
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Jul 3 2025, 6:18 PM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219298
Attached To
D6050: monitor postgresql replication lag through prometheus data
Event Timeline
Log In to Comment