Page MenuHomeSoftware Heritage

Add a monitoring alert when logstash is failing to send logs to ES
ClosedPublic

Authored by vsellier on May 7 2021, 12:09 PM.

Details

Summary

Monitor the error counters (with_errors and non_retryable_errors), which
count errors detected when messages should have been ingested by ES.
Ignore the failures counter, because it is also incremented when an ES node
is not responding, which can be a normal situation during maintenance phases.

Related to T3222

Test Plan
  • pergamon
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_logstash] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_logstash\" {\n  import \"plugin-ch...
     "order": 15,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::logstash_errors] =>
   parameters =>
     "content": "\napply Service \"logstash_errors\" {\n  import \"generic-servic...
     "order": 60,
     "target": "/etc/icinga2/zones.d/global-templates/services.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_logstash] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_logstash\" {\n  import \"plugin-ch...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::logstash_errors] =>
   parameters =>
     "content": "\napply Service \"logstash_errors\" {\n  import \"generic-servic...
     "order": 60,
     "tag": "_etc_icinga2_zones.d_global-templates_services.conf",
     "target": "/etc/icinga2/zones.d/global-templates/services.conf"
*******************************************
+ Icinga2::Object::Checkcommand[check_logstash] =>
   parameters =>
     "checkcommand_name": "check_logstash",
     "command": "/usr/lib/nagios/plugins/icinga_check_logstash.sh",
     "ensure": "present",
     "import": [
       "plugin-check-command"
     ],
     "order": 15,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object::Service[logstash_errors] =>
   parameters =>
     "apply": true,
     "assign": [
       "\"check_logstash_errors.sh\" in host.vars.plugins"
     ],
     "check_command": "check_logstash_errors.sh",
     "command_endpoint": "host.name",
     "ensure": "present",
     "ignore": [
       "host.vars.noagent"
     ],
     "import": [
       "generic-service"
     ],
     "order": 60,
     "prefix": false,
     "service_name": "logstash_errors",
     "target": "/etc/icinga2/zones.d/global-templates/services.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_logstash] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "attrs": {
       "command": "/usr/lib/nagios/plugins/icinga_check_logstash.sh"
     },
     "attrs_list": [
       "command",
       "env",
       "timeout",
       "arguments",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
    
     ],
     "import": [
       "plugin-check-command"
     ],
     "object_name": "check_logstash",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::logstash_errors] =>
   parameters =>
     "apply": true,
     "assign": [
       "\"check_logstash_errors.sh\" in host.vars.plugins"
     ],
     "attrs": {
       "check_command": "check_logstash_errors.sh",
       "command_endpoint": "host.name"
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
       "host.vars.noagent"
     ],
     "import": [
       "generic-service"
     ],
     "object_name": "logstash_errors",
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/zones.d/global-templates/services.conf",
     "template": false
*******************************************
*** End octocatalog-diff on pergamon.softwareheritage.org
  • logstash0:
I, [2021-05-07T12:06:49.195099 #19784]  INFO -- : Diffs computed for logstash0.internal.softwareheritage.org
diff origin/production/logstash0.internal.softwareheritage.org current/logstash0.internal.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_logstash_errors.sh\" {\n  import \...
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_logstash_errors.sh\" {\n  import \...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_swh-plugins.conf",
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_logstash_errors.sh] =>
   parameters =>
     "content": "#!/bin/bash\n\nset pipefail\n\nCODE_CRITICAL=2\nCODE_OK=0\n\nSTA...
     "ensure": "present",
     "group": "root",
     "mode": "0755",
     "owner": "root"
*******************************************
+ Icinga2::Object::Checkcommand[check_logstash_errors.sh] =>
   parameters =>
     "checkcommand_name": "check_logstash_errors.sh",
     "command": "/usr/lib/nagios/plugins/swh/check_logstash_errors.sh",
     "ensure": "present",
     "import": [
       "plugin-check-command"
     ],
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "attrs": {
       "command": "/usr/lib/nagios/plugins/swh/check_logstash_errors.sh"
     },
     "attrs_list": [
       "command",
       "env",
       "timeout",
       "arguments",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
    
     ],
     "import": [
       "plugin-check-command"
     ],
     "object_name": "check_logstash_errors.sh",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
*******************************************
*** End octocatalog-diff on logstash0.internal.softwareheritage.org
  • No changes on other nodes

Diff Detail

Repository
rSPSITE puppet-swh-site
Lint
Automatic diff as part of commit; lint not applicable.
Unit
Automatic diff as part of commit; unit tests not applicable.

Event Timeline

vsellier created this revision.
ardumont added a subscriber: ardumont.

lgtm

couple of questions inline.

site-modules/profile/files/icinga2/plugins/check_logstash_errors.sh
3

Picture me curious, what does that do?

site-modules/profile/manifests/icinga2/objects/common_checks.pp
114

Isn't it better to fully qualify the path to the script?

or is it just a command name? (i think that's it).

This revision is now accepted and ready to land. May 7 2021, 1:57 PM
site-modules/profile/files/icinga2/plugins/check_logstash_errors.sh
3

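Actually, it does nothing, as I forgot to specify the -o argument, but the goal is to change how the return code of a pipeline of commands is determined: with pipefail enabled, the pipeline returns the exit status of the last command that failed, instead of always returning the status of the final command.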

$ false | true
$ echo $?
0
$ set pipefail
$ false | true
$ echo $?
0
$ set -o pipefail
$ false | true
$ echo $?
1
site-modules/profile/manifests/icinga2/objects/common_checks.pp
114

it's the command name as declared in static_checks.pp... so there is an inconsistency :)

  • fix inconsistency in check command naming
  • remove the unnecessary set option from the check script