Page MenuHomeSoftware Heritage

monitor postgresql replication lag through prometheus data
ClosedPublic

Authored by vsellier on Aug 3 2021, 9:03 AM.

Details

Summary
  • Use a Prometheus plugin for Icinga (from the Prometheus project)
  • Create a wrapper script to handle the query, because when the probe is configured the usual Puppet way, the query is not parsed correctly
  • Raise the warning at 100GiB and the critical alert at 200GiB; these thresholds match what we observed over the last month and will be adjusted later if needed

Related to T3452

Test Plan
  • pergamon:
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_belvedere_replication_lag.sh\" {\n...
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_prometheus_metric.sh\" {\n  import...
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_belvedere_replication_lag.sh\" {\n...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_swh-plugins.conf",
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
   parameters =>
     "content": "\nobject CheckCommand \"check_prometheus_metric.sh\" {\n  import...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_swh-plugins.conf",
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
   parameters =>
     "command": "visudo -c || ",
     "path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     "refreshonly": true
*******************************************
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
   parameters =>
     "command": "visudo -c || ",
     "path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     "refreshonly": true
*******************************************
+ File[/etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
   parameters =>
     "ensure": "absent",
     "group": "root",
     "mode": "0440",
     "owner": "root"
*******************************************
+ File[/etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
   parameters =>
     "ensure": "absent",
     "group": "root",
     "mode": "0440",
     "owner": "root"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh] =>
   parameters =>
     "content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
     "ensure": "present",
     "group": "root",
     "mode": "0755",
     "owner": "root"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh] =>
   parameters =>
     "content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
     "ensure": "present",
     "group": "root",
     "mode": "0755",
     "owner": "root"
*******************************************
+ Icinga2::Object::Checkcommand[check_belvedere_replication_lag.sh] =>
   parameters =>
     "arguments": {
       "-H": "$check_prometheus_metric_url$",
       "-w": "$check_prometheus_metric_warning$",
       "-c": "$check_prometheus_metric_critical$",
       "-n": "$check_prometheus_metric_name$"
     },
     "checkcommand_name": "check_belvedere_replication_lag.sh",
     "command": [
       "/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh"
     ],
     "ensure": "present",
     "import": [
       "plugin-check-command"
     ],
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:909...
     }
*******************************************
+ Icinga2::Object::Checkcommand[check_prometheus_metric.sh] =>
   parameters =>
     "arguments": {
       "-H": "$check_prometheus_metric_url$",
       "-q": "$check_prometheus_metric_query$",
       "-w": "$check_prometheus_metric_warning$",
       "-c": "$check_prometheus_metric_critical$",
       "-n": "$check_prometheus_metric_name$"
     },
     "checkcommand_name": "check_prometheus_metric.sh",
     "command": [
       "/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh"
     ],
     "ensure": "present",
     "import": [
       "plugin-check-command"
     ],
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:909...
     }
*******************************************
+ Icinga2::Object::Service[Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "check_command": "check_belvedere_replication_lag.sh",
     "ensure": "present",
     "host_name": "belvedere.internal.softwareheritage.org",
     "ignore": [
    
     ],
     "import": [
    
     ],
     "order": 60,
     "prefix": false,
     "service_name": "Postgresql replication lag (belvedere -> somerset)",
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_name": "pg replication_lag belvedere somerset",
       "check_prometheus_metric_warning": "107374182400",
       "check_prometheus_metric_critical": "214748364800"
     }
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "attrs": {
       "command": [
         "/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh"
       ],
       "arguments": {
         "-H": "$check_prometheus_metric_url$",
         "-w": "$check_prometheus_metric_warning$",
         "-c": "$check_prometheus_metric_critical$",
         "-n": "$check_prometheus_metric_name$"
       },
       "vars": {
         "check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:9...
       }
     },
     "attrs_list": [
       "command",
       "env",
       "timeout",
       "arguments",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
    
     ],
     "import": [
       "plugin-check-command"
     ],
     "object_name": "check_belvedere_replication_lag.sh",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_prometheus_metric.sh] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "attrs": {
       "command": [
         "/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh"
       ],
       "arguments": {
         "-H": "$check_prometheus_metric_url$",
         "-q": "$check_prometheus_metric_query$",
         "-w": "$check_prometheus_metric_warning$",
         "-c": "$check_prometheus_metric_critical$",
         "-n": "$check_prometheus_metric_name$"
       },
       "vars": {
         "check_prometheus_metric_url": "pergamon.internal.softwareheritage.org:9...
       }
     },
     "attrs_list": [
       "command",
       "env",
       "timeout",
       "arguments",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
    
     ],
     "import": [
       "plugin-check-command"
     ],
     "object_name": "check_prometheus_metric.sh",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "apply": false,
     "assign": [
    
     ],
     "attrs": {
       "host_name": "belvedere.internal.softwareheritage.org",
       "check_command": "check_belvedere_replication_lag.sh",
       "vars": {
         "check_prometheus_metric_name": "pg replication_lag belvedere somerset",...
         "check_prometheus_metric_warning": "107374182400",
         "check_prometheus_metric_critical": "214748364800"
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars",
       "Acknowledgement",
       "ApiBindHost",
       "ApiBindPort",
       "ApiEnvironment",
       "ApplicationType",
       "AttachDebugger",
       "BuildCompilerName",
       "BuildCompilerVersion",
       "BuildHostName",
       "Concurrency",
       "Critical",
       "Custom",
       "Deprecated",
       "Down",
       "DowntimeEnd",
       "DowntimeRemoved",
       "DowntimeStart",
       "Environment",
       "FlappingEnd",
       "FlappingStart",
       "HostDown",
       "HostUp",
       "IncludeConfDir",
       "Internal",
       "Json",
       "LocalStateDir",
       "LogCritical",
       "LogDebug",
       "LogInformation",
       "LogNotice",
       "LogWarning",
       "Math",
       "MaxConcurrentChecks",
       "ModAttrPath",
       "NodeName",
       "OK",
       "ObjectsPath",
       "PidPath",
       "PkgDataDir",
       "PlatformArchitecture",
       "PlatformKernel",
       "PlatformKernelVersion",
       "PlatformName",
       "PlatformVersion",
       "PrefixDir",
       "Problem",
       "Recovery",
       "RunAsGroup",
       "RunAsUser",
       "RunDir",
       "ServiceCritical",
       "ServiceOK",
       "ServiceUnknown",
       "ServiceWarning",
       "StatePath",
       "SysconfDir",
       "System",
       "Types",
       "Unknown",
       "Up",
       "UseVfork",
       "VarsPath",
       "Warning",
       "ZonesDir",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name",
       "NodeName",
       "ZoneName",
       "TicketSalt",
       "PluginDir",
       "PluginContribDir",
       "ManubulonPluginDir",
       "name"
     ],
     "ensure": "present",
     "ignore": [
    
     ],
     "import": [
    
     ],
     "object_name": "Postgresql replication lag (belvedere -> somerset)",
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Sudo::Conf[icinga-check_belvedere_replication_lag.sh] =>
   parameters =>
     "ensure": "absent",
     "priority": 10,
     "sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
*******************************************
+ Sudo::Conf[icinga-check_prometheus_metric.sh] =>
   parameters =>
     "ensure": "absent",
     "priority": 10,
     "sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
*******************************************
*** End octocatalog-diff on pergamon.softwareheritage.org

Diff Detail

Repository
rSPSITE puppet-swh-site
Lint
Automatic diff as part of commit; lint not applicable.
Unit
Automatic diff as part of commit; unit tests not applicable.