diff --git a/site-modules/profile/files/swh/deploy/worker/swh-worker-ping-restart b/site-modules/profile/files/swh/deploy/worker/swh-worker-ping-restart
new file mode 100755
index 00000000..c025a199
--- /dev/null
+++ b/site-modules/profile/files/swh/deploy/worker/swh-worker-ping-restart
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# Ping a Celery worker through `swh scheduler celery-monitor` and, when it
+# does not answer, kill and restart its swh-worker@ systemd unit.
+#
+# Usage: swh-worker-ping-restart CELERY_WORKER_NAME WORKER_INSTANCE
+#
+# Exit status:
+#   0  the worker answered a ping, or the service is not enabled
+#   1  the service was inactive (started) or unresponsive (killed, restarted)
+#   2  wrong number of arguments
+
+set -e
+
+# Succeed only when systemd reports the unit as exactly "enabled".
+is_service_enabled () {
+    local unit=$1
+
+    [[ $(systemctl is-enabled "$unit") = "enabled" ]]
+}
+
+# Succeed only when systemd reports the unit as exactly "active".
+is_service_active () {
+    local unit=$1
+
+    [[ $(systemctl is-active "$unit") = "active" ]]
+}
+
+# Forcefully restart a unit: SIGKILL all of its processes, then restart it.
+restart_service_with_prejudice () {
+    local unit=$1
+
+    # Best effort: under `set -e` a failing kill would otherwise abort the
+    # script before the restart below is attempted.
+    systemctl kill --kill-who all --signal 9 "$unit" || true
+    systemctl restart "$unit"
+}
+
+
+if [ $# -ne 2 ]; then
+    echo "$0 CELERY_WORKER_NAME WORKER_INSTANCE"
+    echo
+    echo "e.g. $0 celery@loader_git.$(hostname) loader_git"
+    exit 2
+fi
+
+celery_name="$1"
+worker_instance="$2"
+service_name="swh-worker@${worker_instance}.service"
+
+SWH_CONFIG_FILENAME="/etc/softwareheritage/${worker_instance}.yml"
+
+# A missing configuration file is only reported, not fatal; the variable is
+# exported for the swh command below only when the file exists.
+if ! [ -e "$SWH_CONFIG_FILENAME" ]; then
+    echo "Missing configuration $SWH_CONFIG_FILENAME" >&2
+else
+    export SWH_CONFIG_FILENAME
+fi
+
+# A service that is not enabled is left alone.
+if ! is_service_enabled "${service_name}"; then
+    echo "Service ${service_name} not enabled; exiting" >&2
+    exit 0
+fi
+
+# An inactive service is started without pinging it first.
+if ! is_service_active "${service_name}"; then
+    echo "Service ${service_name} inactive; starting" >&2
+    systemctl start "${service_name}"
+    exit 1
+fi
+
+ping_count=0
+max_ping_attempts=5
+
+# Ping up to max_ping_attempts times; the first successful ping ends the run.
+while [ $ping_count -lt $max_ping_attempts ]; do
+    ping_count=$((ping_count + 1))
+    echo "Ping attempt number $ping_count..." >&2
+    if swh scheduler celery-monitor --pattern "$celery_name" ping-workers >&2; then
+        echo "Got an answer from $celery_name at attempt $ping_count; exiting" >&2
+        exit 0
+    fi
+done
+
+
+echo "$ping_count ping attempts failed; killing and restarting $service_name" >&2
+
+restart_service_with_prejudice "$service_name"
+exit 1
diff --git a/site-modules/profile/manifests/swh/deploy/worker/base.pp b/site-modules/profile/manifests/swh/deploy/worker/base.pp
index 553234a0..8c3b801a 100644
--- a/site-modules/profile/manifests/swh/deploy/worker/base.pp
+++ b/site-modules/profile/manifests/swh/deploy/worker/base.pp
@@ -1,36 +1,44 @@
 # Base worker profile
 class profile::swh::deploy::worker::base {
   $systemd_template_unit_name = 'swh-worker@.service'
   $systemd_unit_name = 'swh-worker.service'
   $systemd_slice_name = 'system-swh\x2dworker.slice'
 
   package {'python3-swh.scheduler':
     ensure => installed,
   }
 
   ::systemd::unit_file {$systemd_template_unit_name:
     ensure => 'present',
     source => "puppet:///modules/profile/swh/deploy/worker/${systemd_template_unit_name}",
   }
 
   ::systemd::unit_file {$systemd_unit_name:
     ensure => 'present',
     source => "puppet:///modules/profile/swh/deploy/worker/${systemd_unit_name}",
   }
   ~> service {'swh-worker':
     ensure => running,
     enable => true,
   }
 
   ::systemd::unit_file {$systemd_slice_name:
     ensure => 'present',
     source => "puppet:///modules/profile/swh/deploy/worker/${systemd_slice_name}",
   }
 
   profile::cron::d {'cleanup-workers-tmp':
     command => 'find /tmp -depth -mindepth 3 -maxdepth 3 -type d -ctime +2 -exec rm -rf {} \+',
     target  => 'swh-worker',
     minute  => 'fqdn_rand',
     hour    => 'fqdn_rand/2',
   }
+
+  # Install the worker ping/restart script; source goes through the "modules" mount.
+  file {'/usr/local/sbin/swh-worker-ping-restart':
+    source => 'puppet:///modules/profile/swh/deploy/worker/swh-worker-ping-restart',
+    owner  => 'root',
+    group  => 'root',
+    mode   => '0755',
+  }
 }