Add a monitoring alert when logstash is failing to send logs to ES
Monitor the error counters (with_errors and non_retryable_errors), which count the errors detected when messages should be ingested by ES. The failures counter is ignored because it also increases when an ES node is not responding, which can be a normal situation during maintenance phases.
Related to T3222
Test Plan
- pergamon
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_logstash] =>
parameters =>
"content": "\nobject CheckCommand \"check_logstash\" {\n import \"plugin-ch...
"order": 15,
"target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::logstash_errors] =>
parameters =>
"content": "\napply Service \"logstash_errors\" {\n import \"generic-servic...
"order": 60,
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_logstash] =>
parameters =>
"content": "\nobject CheckCommand \"check_logstash\" {\n import \"plugin-ch...
"order": 15,
"tag": "_etc_icinga2_conf.d_static-checks.conf",
"target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::logstash_errors] =>
parameters =>
"content": "\napply Service \"logstash_errors\" {\n import \"generic-servic...
"order": 60,
"tag": "_etc_icinga2_zones.d_global-templates_services.conf",
"target": "/etc/icinga2/zones.d/global-templates/services.conf"
*******************************************
+ Icinga2::Object::Checkcommand[check_logstash] =>
parameters =>
"checkcommand_name": "check_logstash",
"command": "/usr/lib/nagios/plugins/icinga_check_logstash.sh",
"ensure": "present",
"import": [
"plugin-check-command"
],
"order": 15,
"target": "/etc/icinga2/conf.d/static-checks.conf",
"template": false
*******************************************
+ Icinga2::Object::Service[logstash_errors] =>
parameters =>
"apply": true,
"assign": [
"\"check_logstash_errors.sh\" in host.vars.plugins"
],
"check_command": "check_logstash_errors.sh",
"command_endpoint": "host.name",
"ensure": "present",
"ignore": [
"host.vars.noagent"
],
"import": [
"generic-service"
],
"order": 60,
"prefix": false,
"service_name": "logstash_errors",
"target": "/etc/icinga2/zones.d/global-templates/services.conf",
"template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_logstash] =>
parameters =>
"apply": false,
"assign": [
],
"attrs": {
"command": "/usr/lib/nagios/plugins/icinga_check_logstash.sh"
},
"attrs_list": [
"command",
"env",
"timeout",
"arguments",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
],
"import": [
"plugin-check-command"
],
"object_name": "check_logstash",
"object_type": "CheckCommand",
"order": 15,
"prefix": false,
"target": "/etc/icinga2/conf.d/static-checks.conf",
"template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::logstash_errors] =>
parameters =>
"apply": true,
"assign": [
"\"check_logstash_errors.sh\" in host.vars.plugins"
],
"attrs": {
"check_command": "check_logstash_errors.sh",
"command_endpoint": "host.name"
},
"attrs_list": [
"display_name",
"host_name",
"check_command",
"check_timeout",
"check_interval",
"check_period",
"retry_interval",
"max_check_attempts",
"groups",
"enable_notifications",
"enable_active_checks",
"enable_passive_checks",
"enable_event_handler",
"enable_flapping",
"enable_perfdata",
"event_command",
"flapping_threshold_low",
"flapping_threshold_high",
"volatile",
"zone",
"command_endpoint",
"notes",
"notes_url",
"action_url",
"icon_image",
"icon_image_alt",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
"host.vars.noagent"
],
"import": [
"generic-service"
],
"object_name": "logstash_errors",
"object_type": "Service",
"order": 60,
"prefix": false,
"target": "/etc/icinga2/zones.d/global-templates/services.conf",
"template": false
*******************************************
*** End octocatalog-diff on pergamon.softwareheritage.org
- logstash0:
I, [2021-05-07T12:06:49.195099 #19784] INFO -- : Diffs computed for logstash0.internal.softwareheritage.org
diff origin/production/logstash0.internal.softwareheritage.org current/logstash0.internal.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_logstash_errors.sh\" {\n import \...
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
parameters =>
"content": "\nobject CheckCommand \"check_logstash_errors.sh\" {\n import \...
"order": 15,
"tag": "_etc_icinga2_conf.d_swh-plugins.conf",
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_logstash_errors.sh] =>
parameters =>
"content": "#!/bin/bash\n\nset pipefail\n\nCODE_CRITICAL=2\nCODE_OK=0\n\nSTA...
"ensure": "present",
"group": "root",
"mode": "0755",
"owner": "root"
*******************************************
+ Icinga2::Object::Checkcommand[check_logstash_errors.sh] =>
parameters =>
"checkcommand_name": "check_logstash_errors.sh",
"command": "/usr/lib/nagios/plugins/swh/check_logstash_errors.sh",
"ensure": "present",
"import": [
"plugin-check-command"
],
"order": 15,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_logstash_errors.sh] =>
parameters =>
"apply": false,
"assign": [
],
"attrs": {
"command": "/usr/lib/nagios/plugins/swh/check_logstash_errors.sh"
},
"attrs_list": [
"command",
"env",
"timeout",
"arguments",
"vars",
"Acknowledgement",
"ApiBindHost",
"ApiBindPort",
"ApiEnvironment",
"ApplicationType",
"AttachDebugger",
"BuildCompilerName",
"BuildCompilerVersion",
"BuildHostName",
"Concurrency",
"Critical",
"Custom",
"Deprecated",
"Down",
"DowntimeEnd",
"DowntimeRemoved",
"DowntimeStart",
"Environment",
"FlappingEnd",
"FlappingStart",
"HostDown",
"HostUp",
"IncludeConfDir",
"Internal",
"Json",
"LocalStateDir",
"LogCritical",
"LogDebug",
"LogInformation",
"LogNotice",
"LogWarning",
"Math",
"MaxConcurrentChecks",
"ModAttrPath",
"NodeName",
"OK",
"ObjectsPath",
"PidPath",
"PkgDataDir",
"PlatformArchitecture",
"PlatformKernel",
"PlatformKernelVersion",
"PlatformName",
"PlatformVersion",
"PrefixDir",
"Problem",
"Recovery",
"RunAsGroup",
"RunAsUser",
"RunDir",
"ServiceCritical",
"ServiceOK",
"ServiceUnknown",
"ServiceWarning",
"StatePath",
"SysconfDir",
"System",
"Types",
"Unknown",
"Up",
"UseVfork",
"VarsPath",
"Warning",
"ZonesDir",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name",
"NodeName",
"ZoneName",
"TicketSalt",
"PluginDir",
"PluginContribDir",
"ManubulonPluginDir",
"name"
],
"ensure": "present",
"ignore": [
],
"import": [
"plugin-check-command"
],
"object_name": "check_logstash_errors.sh",
"object_type": "CheckCommand",
"order": 15,
"prefix": false,
"target": "/etc/icinga2/conf.d/swh-plugins.conf",
"template": false
*******************************************
*** End octocatalog-diff on logstash0.internal.softwareheritage.org
- No changes on other nodes
Migrated from D5709 (view on Phabricator)