diff --git a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp index 1b458ab4..c29070b5 100644 --- a/site-modules/profile/manifests/icinga2/objects/agent_checks.pp +++ b/site-modules/profile/manifests/icinga2/objects/agent_checks.pp @@ -1,171 +1,171 @@ # Checks that need to be supported on icinga2 agents class profile::icinga2::objects::agent_checks { $prometheus_host = lookup('prometheus::server::fqdn') $prometheus_port = lookup('prometheus::server::listen_port') $prometheus_url = "http://${prometheus_host}:${prometheus_port}" $plugins = { 'check_journal' => { arguments => { '-f' => { 'value' => '$journal_cursor_file$', 'set_if' => '{{ var filename = macro("$journal_cursor_file$"); return len(filename) > 0 }}', }, '-w' => '$journal_lag_warn$', '-c' => '$journal_lag_crit$', '-wn' => { 'value' => '$journal_lag_entries_warn$', 'set_if' => '$journal_lag_entries_warn$', }, '-cn' => { 'value' => '$journal_lag_entries_crit$', 'set_if' => '$journal_lag_entries_crit$', }, }, vars => { 'journal_lag_warn' => 1200, 'journal_lag_crit' => 3600, }, sudo => true, sudo_user => 'root', }, 'check_newest_file_age' => { arguments => { '-d' => '$check_directory$', '-w' => '$check_directory_warn_age$', '-c' => '$check_directory_crit_age$', '-W' => { 'set_if' => '$check_directory_missing_warn$', }, '-C' => { 'set_if' => '$check_directory_missing_crit$', }, }, vars => { 'check_directory_warn_age' => 26, 'check_directory_crit_age' => 52, 'check_directory_missing_warn' => false, 'check_directory_missing_crit' => true, }, sudo => true, sudo_user => 'root', }, 'check_prometheus_metric' => { arguments => { - '-H' => '$check_prometheus_metric_url$', - '-q' => '$check_prometheus_metric_query$', - '-w' => '$check_prometheus_metric_warning$', - '-c' => '$check_prometheus_metric_critical$', - '-n' => '$check_prometheus_metric_name$', - '-m' => '$check_prometheus_comparison_method$', - '-t' => '$check_prometheus_query_type$', + '-H' => '$prometheus_url$', + '-q' => '$prometheus_query$', + '-w' => '$prometheus_metric_warning$', + '-c' => '$prometheus_metric_critical$', + '-n' => '$prometheus_metric_name$', + '-m' => '$prometheus_metric_comparison_method$', + '-t' => '$prometheus_query_type$', '-O' => { - 'set_if' => '$check_prometheus_nan_ok$', + 'set_if' => '$prometheus_metric_nan_ok$', }, '-P' => { - 'set_if' => '$check_prometheus_perfdata$', + 'set_if' => '$prometheus_perfdata$', }, }, vars => { - 'check_prometheus_metric_url' => $prometheus_url, - 'check_prometheus_comparison_method' => 'ge', - 'check_prometheus_query_type' => 'scalar', - 'check_prometheus_nan_ok' => false, - 'check_prometheus_perfdat' => true, + 'prometheus_url' => $prometheus_url, + 'prometheus_metric_comparison_method' => 'ge', + 'prometheus_query_type' => 'scalar', + 'prometheus_metric_nan_ok' => false, + 'prometheus_perfdata' => true, } }, } $plugin_dir = '/usr/lib/nagios/plugins' $swh_plugin_dir = "${plugin_dir}/swh" $swh_plugin_configfile = '/etc/icinga2/conf.d/swh-plugins.conf' $packages = [ 'python3-nagiosplugin', 'python3-systemd', 'monitoring-plugins-basic', 'monitoring-plugins-systemd', # in swh repository ] package {$packages: ensure => present, } file {$swh_plugin_dir: ensure => 'directory', owner => 'root', group => 'root', mode => '0755', recurse => true, purge => true, require => Package[$packages], } $plugins.each |$command, $plugin| { $command_path = "${swh_plugin_dir}/${command}" file {$command_path: ensure => present, owner => 'root', group => 'root', mode => '0755', source => "puppet:///modules/profile/icinga2/plugins/${command}", require => Package[$packages], } if $plugin['sudo'] { $sudo_user = $plugin['sudo_user'] $icinga_command = ['sudo', '-u', $sudo_user, $command_path] ::sudo::conf { "icinga-${command}": ensure => present, content => "nagios ALL=(${sudo_user}) NOPASSWD: ${command_path}", priority => 50, } } else { $icinga_command = [$command_path] ::sudo::conf { "icinga-${command}": ensure => absent, } } ::icinga2::object::checkcommand {$command: import => ['plugin-check-command'], command => $icinga_command, arguments => $plugin['arguments'], vars => $plugin['vars'], target => $swh_plugin_configfile, } } $check_command = "check_systemd" $check_command_path = "${plugin_dir}/${check_command}" ::icinga2::object::checkcommand {$check_command: import => ['plugin-check-command'], command => [ $check_command_path ], arguments => { '--unit' => { value => '$systemd_units$', description => 'Name of the systemd units that are being tested.', repeat_key => true, }, '--exclude' => { value => '$systemd_excludes$', description => '-:"Name of the systemd units to exclude from checks can be a regular expression)."', repeat_key => true, }, '--no-startup-time' => { set_if => '{{ !macro("$systemd_check_startup_time$") }}', description => 'Whether to check the system startup time' }, }, vars => { systemd_units => [], systemd_excludes => [], systemd_check_startup_time => false, }, target => $swh_plugin_configfile, require => Package[$packages], } } diff --git a/site-modules/profile/manifests/icinga2/objects/static_checks.pp b/site-modules/profile/manifests/icinga2/objects/static_checks.pp index b455f430..28a1f295 100644 --- a/site-modules/profile/manifests/icinga2/objects/static_checks.pp +++ b/site-modules/profile/manifests/icinga2/objects/static_checks.pp @@ -1,142 +1,143 @@ # Static checks on the icinga master class profile::icinga2::objects::static_checks { $checks_file = '/etc/icinga2/conf.d/static-checks.conf' ::icinga2::object::host {'www.softwareheritage.org': import => ['generic-host'], check_command => 'dummy', address => 'www.softwareheritage.org', target => $checks_file, vars => { dummy_state => 0, # up dummy_text => "HTTP-only host", }, } ::icinga2::object::host {'softwareheritage.org': import => ['generic-host'], check_command => 'dummy', address => 'softwareheritage.org', target => $checks_file, vars => { dummy_state => 0, # up dummy_text => "HTTP-only host", }, } ::icinga2::object::host {'graphql.staging.swh.network': import => ['generic-host'], check_command => 'dummy', address => 'graphql.staging.swh.network', target => $checks_file, vars => { dummy_state => 0, # up dummy_text => "HTTP-only host", }, } ::icinga2::object::service {'Software Heritage Homepage': import => ['generic-service'], host_name => 'www.softwareheritage.org', check_command => 'http', target => $checks_file, vars => { http_vhost => 'www.softwareheritage.org', http_uri => '/', http_ssl => true, http_sni => true, http_string => 'Software Heritage', }, } ::icinga2::object::service {'Software Heritage Homepage (redirect to www)': import => ['generic-service'], host_name => 'softwareheritage.org', check_command => 'http', target => $checks_file, vars => { http_vhost => 'softwareheritage.org', http_uri => '/', http_ssl => true, http_sni => true, }, } ::icinga2::object::host {'swh-logging-prod': check_command => 'dummy', address => '127.0.0.1', target => $checks_file, vars => { dummy_state => 0, # up dummy_text => "virtual host for clustered checks", }, } ::icinga2::object::service {'swh-logging-prod cluster': host_name => 'swh-logging-prod', check_command => 'check_escluster', target => $checks_file, } ::icinga2::object::checkcommand {'check_escluster': import => ['plugin-check-command'], command => '/usr/lib/nagios/plugins/icinga_check_elasticsearch.sh', target => $checks_file, } ::icinga2::object::host {'DNS resolvers': check_command => 'dummy', address => '127.0.0.1', target => $checks_file, vars => { dummy_state => 0, # up dummy_text => "virtual host for clustered checks", }, } ::icinga2::object::service {'SOA': host_name => 'DNS resolvers', check_command => 'check_resolvers', target => $checks_file, } ::icinga2::object::checkcommand {'check_resolvers': import => ['plugin-check-command'], command => [ '/usr/lib/nagios/plugins/dsa-nagios-checks_checks_dsa-check-soas.txt', 'internal.softwareheritage.org', ], target => $checks_file, } $prometheus_host = lookup('prometheus::server::fqdn') ::icinga2::object::service {'Postgresql replication lag (belvedere -> somerset)': check_command => 'check_prometheus_metric', target => $checks_file, host_name => 'belvedere.internal.softwareheritage.org', vars => { - check_prometheus_metric_name => 'pg replication_lag belvedere somerset', - check_prometheus_metric_query => profile::icinga2::literal_var( + prometheus_metric_name => 'pg replication_lag belvedere somerset', + prometheus_query => profile::icinga2::literal_var( 'sum(sql_pg_stat_replication{instance="belvedere.internal.softwareheritage.org", host=":5433", application_name="softwareheritage_replica"})' ), - check_prometheus_metric_warning => '1073741824', # 1GiB 1*1024*1024*1024 - check_prometheus_metric_critical => '2147483648', # 2GiB 2*1024*1024*1024 + prometheus_query_type => 'vector', + prometheus_metric_warning => '1073741824', # 1GiB 1*1024*1024*1024 + prometheus_metric_critical => '2147483648', # 2GiB 2*1024*1024*1024 }, } ::icinga2::object::service {'Software Heritage Staging Graphql Instance': import => ['generic-service'], host_name => 'graphql.staging.swh.network', check_command => 'http', target => $checks_file, vars => { http_vhost => 'graphql.staging.swh.network', http_uri => '/', http_ssl => true, http_sni => true, http_string => 'GraphQL Playground', }, } } diff --git a/site-modules/profile/manifests/thanos/prometheus_sidecar.pp b/site-modules/profile/manifests/thanos/prometheus_sidecar.pp index 68251a8c..1a0127e9 100644 --- a/site-modules/profile/manifests/thanos/prometheus_sidecar.pp +++ b/site-modules/profile/manifests/thanos/prometheus_sidecar.pp @@ -1,109 +1,110 @@ # Thanos prometheus sidecar class profile::thanos::prometheus_sidecar { include profile::thanos::base include profile::thanos::tls_certificate $service_name = 'thanos-sidecar' $unit_name = "${service_name}.service" $objstore_config = lookup('thanos::objstore::config') $objstore_config_file = "${::profile::thanos::base::config_dir}/objstore.yml" $port_http = lookup('thanos::sidecar::port_http') $port_grpc = lookup('thanos::sidecar::port_grpc') $internal_ip = ip_for_network(lookup('internal_network')) $grpc_address = "${internal_ip}:${port_grpc}" $grpc_target = "${swh_hostname['internal_fqdn']}:${port_grpc}" $cert_paths = $::profile::thanos::tls_certificate::cert_paths $sidecar_arguments = { tsdb => { path => '/var/lib/prometheus/metrics2', }, prometheus => { # use the listen address for the prometheus server url => "http://${::profile::prometheus::server::target}/", }, objstore => { 'config-file' => $objstore_config_file, }, shipper => { 'upload-compacted' => true, }, 'grpc-server-tls-cert' => $cert_paths['fullchain'], 'grpc-server-tls-key' => $cert_paths['privkey'], 'http-address' => "${internal_ip}:${port_http}", 'grpc-address' => $grpc_address, } file {$objstore_config_file: ensure => present, owner => 'root', group => 'prometheus', mode => '0640', content => inline_yaml($objstore_config), require => File[$::profile::thanos::base::config_dir], } # Template uses: # $sidecar_arguments systemd::unit_file {$unit_name: ensure => present, content => template('profile/thanos/thanos-sidecar.service.erb'), require => Class['profile::thanos::base'], notify => Service[$service_name] } service {$service_name: ensure => 'running', enable => true, require => [ Service['prometheus'], File[$cert_paths['fullchain']], File[$cert_paths['privkey']], ], tag => 'thanos', } # Ensure prometheus is configured properly before starting the sidecar Exec['restart-prometheus'] -> Service[$service_name] # Ensure service is restarted when the certs are renewed File[$cert_paths['fullchain']] ~> Service[$service_name] File[$cert_paths['privkey']] ~> Service[$service_name] ::profile::thanos::export_query_endpoint {"thanos-sidecar-${::fqdn}": grpc_address => $grpc_target } $http_target = "${swh_hostname['internal_fqdn']}:${port_http}" ::profile::prometheus::export_scrape_config {"thanos-sidecar-${::fqdn}": target => $http_target, job => 'thanos_sidecar', } $icinga_checks_file = lookup('icinga2::exported_checks::filename') @@::icinga2::object::service {"thanos sidecar on ${::fqdn}": service_name => 'thanos sidecar', import => ['generic-service'], host_name => $::fqdn, check_command => 'check_prometheus_metric', vars => { - 'check_prometheus_metric_query' => profile::icinga2::literal_var( + 'prometheus_query' => profile::icinga2::literal_var( join([ 'time() - thanos_objstore_bucket_last_successful_upload_time{job="thanos_sidecar", instance="', $swh_hostname['internal_fqdn'], '"}', ]) ), - 'check_prometheus_metric_name' => 'thanos_sidecar_upload_lag', + 'prometheus_query_type' => 'vector', + 'prometheus_metric_name' => 'thanos_sidecar_upload_lag', # We expect an upload every 2 hours - 'check_prometheus_metric_warning' => 3 * 3600, - 'check_prometheus_metric_critical' => 24 * 3600, + 'prometheus_metric_warning' => 3 * 3600, + 'prometheus_metric_critical' => 24 * 3600, }, target => $icinga_checks_file, tag => 'icinga2::exported', } }