That service is critical for runners to correctly schedule origins to visit, so raise an
alert if something is wrong with its startup.
Related to T3502
Differential D6156
Ensure icinga alerts are raised if the scheduler journal client is down
Authored by ardumont on Aug 27 2021, 5:01 PM. Tags: None. Subscribers: None.
Details
That service is critical for runners to correctly schedule origins to visit. So raise an alert if something is wrong with its startup.
Related to T3502
octo-diff: bin/octocatalog-diff --octocatalog-diff-args --no-truncate-details --to staging saatchi
...
+ Concat::Fragment[icinga2::object::Service::check_scheduler_journal_client] =>
parameters =>
"order": 60
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
"content": >>>
apply Service "check_scheduler_journal_client" {
import "generic-service"
check_command = "check_systemd"
command_endpoint = host.name
vars.check_systemd_unit = "swh-scheduler-journal-client.service"
assign where host.vars.os == "Linux"
ignore where host.vars.noagent
}
<<<
*******************************************
+ Concat[/etc/icinga2/zones.d/global-templates/services.conf] =>
parameters =>
"backup": "puppet"
"ensure": "present"
"ensure_newline": false
"force": false
"format": "plain"
"group": "nagios"
"mode": "0640"
"notify": ["Class[Icinga2::Service]"]
"order": "alpha"
"owner": "root"
"path": "/etc/icinga2/zones.d/global-templates/services.conf"
"replace": true
"show_diff": true
"tag": "icinga2::config::file"
"warn": true
*******************************************
+ Concat_file[/etc/icinga2/zones.d/global-templates/services.conf] =>
parameters =>
"backup": "puppet"
"ensure_newline": false
"force": false
"format": "plain"
"group": "nagios"
"mode": "0640"
"order": "alpha"
"owner": "root"
"replace": true
"show_diff": true
"tag": "_etc_icinga2_zones.d_global-templates_services.conf"
*******************************************
+ Concat_fragment[/etc/icinga2/zones.d/global-templates/services.conf_header] =>
parameters =>
"order": "0"
"tag": "_etc_icinga2_zones.d_global-templates_services.conf"
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
"content": >>>
# This file is managed by Puppet. DO NOT EDIT.
<<<
*******************************************
+ Concat_fragment[icinga2::object::Service::check_scheduler_journal_client] =>
parameters =>
"order": 60
"tag": "_etc_icinga2_zones.d_global-templates_services.conf"
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
"content": >>>
apply Service "check_scheduler_journal_client" {
import "generic-service"
check_command = "check_systemd"
command_endpoint = host.name
vars.check_systemd_unit = "swh-scheduler-journal-client.service"
assign where host.vars.os == "Linux"
ignore where host.vars.noagent
}
<<<
*******************************************
+ Icinga2::Object::Service[check_scheduler_journal_client] =>
parameters =>
"apply": true
"assign": ["host.vars.os == Linux"]
"check_command": "check_systemd"
"command_endpoint": "host.name"
"ensure": "present"
"ignore": ["host.vars.noagent"]
"import": ["generic-service"]
"name": "Check swh scheduler journal client service"
"order": 60
"prefix": false
"service_name": "check_scheduler_journal_client"
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
"template": false
"vars": {"check_systemd_unit"=>"swh-scheduler-journal-client.service"}
*******************************************
+ Icinga2::Object[icinga2::object::Service::check_scheduler_journal_client] =>
parameters =>
"apply": true
"assign": ["host.vars.os == Linux"]
"attrs": {"check_command"=>"check_systemd", "command_endpoint"=>"host.name", "vars"=>{"check_systemd_unit"=>"swh-scheduler-journal-client.service"}}
"attrs_list": ["display_name", "host_name", "check_command", "check_timeout", "check_interval", "check_period", "retry_interval", "max_check_attempts", "groups", "enable_notifications", "enable_active_checks", "enable_passive_checks", "enable_event_handler", "enable_flapping", "enable_perfdata", "event_command", "flapping_threshold_low", "flapping_threshold_high", "volatile", "zone", "command_endpoint", "notes", "notes_url", "action_url", "icon_image", "icon_image_alt", "vars", "Acknowledgement", "ApiBindHost", "ApiBindPort", "ApiEnvironment", "ApplicationType", "AttachDebugger", "BuildCompilerName", "BuildCompilerVersion", "BuildHostName", "Concurrency", "Critical", "Custom", "Deprecated", "Down", "DowntimeEnd", "DowntimeRemoved", "DowntimeStart", "Environment", "FlappingEnd", "FlappingStart", "HostDown", "HostUp", "IncludeConfDir", "Internal", "Json", "LocalStateDir", "LogCritical", "LogDebug", "LogInformation", "LogNotice", "LogWarning", "Math", "MaxConcurrentChecks", "ModAttrPath", "NodeName", "OK", "ObjectsPath", "PidPath", "PkgDataDir", "PlatformArchitecture", "PlatformKernel", "PlatformKernelVersion", "PlatformName", "PlatformVersion", "PrefixDir", "Problem", "Recovery", "RunAsGroup", "RunAsUser", "RunDir", "ServiceCritical", "ServiceOK", "ServiceUnknown", "ServiceWarning", "StatePath", "SysconfDir", "System", "Types", "Unknown", "Up", "UseVfork", "VarsPath", "Warning", "ZonesDir", "NodeName", "ZoneName", "TicketSalt", "PluginDir", "PluginContribDir", "ManubulonPluginDir", "name", "NodeName", "ZoneName", "TicketSalt", "PluginDir", "PluginContribDir", "ManubulonPluginDir", "name"]
"ensure": "present"
"ignore": ["host.vars.noagent"]
"import": ["generic-service"]
"object_name": "check_scheduler_journal_client"
"object_type": "Service"
"order": 60
"prefix": false
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
"template": false
*******************************************
*** End octocatalog-diff on saatchi.internal.softwareheritage.org
Diff Detail
Event Timeline
Comment Actions
For information, the generic check_systemd does not detect when the service is down (restarting); only the check targeting the unit with --unit does:
root@scheduler0:~# /usr/lib/nagios/plugins/check_systemd
SYSTEMD CRITICAL - startup_time is 176.4 (outside range 0:120) | count_units=223 startup_time=176.353;60;120 units_activating=0 units_active=142 units_failed=0 units_inactive=81 # <------ current state
root@scheduler0:~# /usr/lib/nagios/plugins/check_systemd --unit swh-scheduler-journal-client.service
SYSTEMD OK - swh-scheduler-journal-client.service: active
root@scheduler0:~# systemctl status swh-scheduler-journal-client.service
● swh-scheduler-journal-client.service - Software Heritage Scheduler Journal Client
Loaded: loaded (/etc/systemd/system/swh-scheduler-journal-client.service; enabled; vendor preset: enabled)
Active: active (running) since Mon 2021-08-30 08:56:28 UTC; 32min ago
Main PID: 1890467 (swh)
Tasks: 5 (limit: 9537)
Memory: 36.7M
CPU: 12.903s
CGroup: /system.slice/swh-scheduler-journal-client.service
└─1890467 /usr/bin/python3 /usr/bin/swh scheduler --config-file /etc/softwareheritage/scheduler/journal-client.yml journal-client
Aug 30 08:56:28 scheduler0 systemd[1]: Started Software Heritage Scheduler Journal Client.
root@scheduler0:~# kill -9 1890467
root@scheduler0:~# systemctl status swh-scheduler-journal-client.service
● swh-scheduler-journal-client.service - Software Heritage Scheduler Journal Client
Loaded: loaded (/etc/systemd/system/swh-scheduler-journal-client.service; enabled; vendor preset: enabled)
Active: activating (auto-restart) (Result: signal) since Mon 2021-08-30 09:29:31 UTC; 992ms ago
Process: 1890467 ExecStart=/usr/bin/swh scheduler --config-file /etc/softwareheritage/scheduler/journal-client.yml journal-client (code=killed, signal=KILL)
Main PID: 1890467 (code=killed, signal=KILL)
CPU: 12.941s
Aug 30 09:29:31 scheduler0 systemd[1]: swh-scheduler-journal-client.service: Failed with result 'signal'.
Aug 30 09:29:31 scheduler0 systemd[1]: swh-scheduler-journal-client.service: Consumed 12.941s CPU time.
root@scheduler0:~# /usr/lib/nagios/plugins/check_systemd
SYSTEMD CRITICAL - startup_time is 176.4 (outside range 0:120) | count_units=223 startup_time=176.353;60;120 units_activating=1 units_active=141 units_failed=0 units_inactive=81 # <---- undetected
root@scheduler0:~# /usr/lib/nagios/plugins/check_systemd --unit swh-scheduler-journal-client.service
SYSTEMD CRITICAL - swh-scheduler-journal-client.service: activating # <---- detected
Comment Actions
Debugging this to make it work properly... I've been able to filter on the host.name (and nothing else seems suitable to install it properly):
apply Service "check_scheduler_journal_client" {
import "generic-service"
check_command = "check_systemd"
command_endpoint = host.name
vars.check_systemd_unit = "swh-scheduler-journal-client.service"
assign where host.vars.os == "Linux" && host.name in ["scheduler0.internal.staging.swh.network", "saatchi.internal.softwareheritage.org", "pergamon.softwareheritage.org"] # <--- pergamon for testing purposes in vagrant vms
ignore where host.vars.noagent
}
I'd like to try again to install collected resources, but as far as I recall my previous attempts […]
Comment Actions
D6163 confirms this ^
Comment Actions
Using the proper instruction that was previously missing, it now works.