Improve icinga2 prometheus metric checks
- Don't hardcode FQDN of prometheus server
- Use the generic check_prometheus_metric icinga check for the belvedere replication lag, replacing the dedicated check_belvedere_replication_lag.sh check
- Rename check_prometheus_metric.sh to check_prometheus_metric (drop the shell suffix)
- Declare all arguments of the check_prometheus_metric plugin in its icinga CheckCommand definition
Test Plan
octocatalog-diff for pergamon looks sane enough:
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
- Concat::Fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh]
*******************************************
- Concat::Fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh]
*******************************************
+ Concat::Fragment[icinga2::object::CheckCommand::check_prometheus_metric] =>
parameters =>
"order": 15
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
"content": >>>
object CheckCommand "check_prometheus_metric" {
import "plugin-check-command"
command = [ "/usr/lib/nagios/plugins/swh/check_prometheus_metric", ]
arguments = {
"-H" = "$check_prometheus_metric_url$"
"-q" = "$check_prometheus_metric_query$"
"-w" = "$check_prometheus_metric_warning$"
"-c" = "$check_prometheus_metric_critical$"
"-n" = "$check_prometheus_metric_name$"
"-m" = "$check_prometheus_comparison_method$"
"-t" = "$check_prometheus_query_type$"
"-O" = {
set_if = "$check_prometheus_nan_ok$"
}
"-P" = {
set_if = "$check_prometheus_perfdata$"
}
}
vars.check_prometheus_metric_url = "http://pergamon.internal.softwareheritage.org:9090"
vars.check_prometheus_comparison_method = "ge"
vars.check_prometheus_query_type = "scalar"
vars.check_prometheus_nan_ok = false
vars.check_prometheus_perfdat = true
}
<<<
*******************************************
Concat::Fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
content =>
@@ -2,6 +2,7 @@
object Service "Postgresql replication lag (belvedere -> somerset)" {
host_name = "belvedere.internal.softwareheritage.org"
- check_command = "check_belvedere_replication_lag.sh"
+ check_command = "check_prometheus_metric"
vars.check_prometheus_metric_name = "pg replication_lag belvedere somerset"
+ vars.check_prometheus_query = "sum(sql_pg_stat_replication{instance=\"belvedere.internal.softwareheritage.org\", host=\":5433\", application_name=\"softwareheritage_replica\"})"
vars.check_prometheus_metric_warning = 1073741824
vars.check_prometheus_metric_critical = 2147483648
*******************************************
- Concat_fragment[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh]
*******************************************
- Concat_fragment[icinga2::object::CheckCommand::check_prometheus_metric.sh]
*******************************************
+ Concat_fragment[icinga2::object::CheckCommand::check_prometheus_metric] =>
parameters =>
"order": 15
"tag": "_etc_icinga2_conf.d_swh-plugins.conf"
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
"content": >>>
object CheckCommand "check_prometheus_metric" {
import "plugin-check-command"
command = [ "/usr/lib/nagios/plugins/swh/check_prometheus_metric", ]
arguments = {
"-H" = "$check_prometheus_metric_url$"
"-q" = "$check_prometheus_metric_query$"
"-w" = "$check_prometheus_metric_warning$"
"-c" = "$check_prometheus_metric_critical$"
"-n" = "$check_prometheus_metric_name$"
"-m" = "$check_prometheus_comparison_method$"
"-t" = "$check_prometheus_query_type$"
"-O" = {
set_if = "$check_prometheus_nan_ok$"
}
"-P" = {
set_if = "$check_prometheus_perfdata$"
}
}
vars.check_prometheus_metric_url = "http://pergamon.internal.softwareheritage.org:9090"
vars.check_prometheus_comparison_method = "ge"
vars.check_prometheus_query_type = "scalar"
vars.check_prometheus_nan_ok = false
vars.check_prometheus_perfdat = true
}
<<<
*******************************************
Concat_fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
content =>
@@ -2,6 +2,7 @@
object Service "Postgresql replication lag (belvedere -> somerset)" {
host_name = "belvedere.internal.softwareheritage.org"
- check_command = "check_belvedere_replication_lag.sh"
+ check_command = "check_prometheus_metric"
vars.check_prometheus_metric_name = "pg replication_lag belvedere somerset"
+ vars.check_prometheus_query = "sum(sql_pg_stat_replication{instance=\"belvedere.internal.softwareheritage.org\", host=\":5433\", application_name=\"softwareheritage_replica\"})"
vars.check_prometheus_metric_warning = 1073741824
vars.check_prometheus_metric_critical = 2147483648
*******************************************
- Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh]
*******************************************
- Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_prometheus_metric-sh]
*******************************************
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_prometheus_metric] =>
parameters =>
"command": "visudo -c || "
"path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
"refreshonly": true
*******************************************
- File[/etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh]
*******************************************
- File[/etc/sudoers.d/10_icinga-check_prometheus_metric-sh]
*******************************************
+ File[/etc/sudoers.d/10_icinga-check_prometheus_metric] =>
parameters =>
"ensure": "absent"
"group": "root"
"mode": "0440"
"owner": "root"
*******************************************
- File[/usr/lib/nagios/plugins/swh/check_belvedere_replication_lag.sh]
*******************************************
- File[/usr/lib/nagios/plugins/swh/check_prometheus_metric.sh]
*******************************************
+ File[/usr/lib/nagios/plugins/swh/check_prometheus_metric] =>
[...]
<<<
*******************************************
- Icinga2::Object::Checkcommand[check_belvedere_replication_lag.sh]
*******************************************
- Icinga2::Object::Checkcommand[check_prometheus_metric.sh]
*******************************************
+ Icinga2::Object::Checkcommand[check_prometheus_metric] =>
parameters =>
"arguments": {"-H"=>"$check_prometheus_metric_url$", "-q"=>"$check_prometheus_metric_query$", "-w"=>"$check_prometheus_metric_warning$", "-c"=>"$check_prometheus_metric_critical$", "-n"=>"$check_prometheus_metric_name$", "-m"=>"$check_prometheus_comparison_method$", "-t"=>"$check_prometheus_query_type$", "-O"=>{"set_if"=>"$check_prometheus_nan_ok$"}, "-P"=>{"set_if"=>"$check_prometheus_perfdata$"}}
"checkcommand_name": "check_prometheus_metric"
"command": ["/usr/lib/nagios/plugins/swh/check_prometheus_metric"]
"ensure": "present"
"import": ["plugin-check-command"]
"order": 15
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
"template": false
"vars": {"check_prometheus_metric_url"=>"http://pergamon.internal.softwareheritage.org:9090", "check_prometheus_comparison_method"=>"ge", "check_prometheus_query_type"=>"scalar", "check_prometheus_nan_ok"=>false, "check_prometheus_perfdat"=>true}
*******************************************
Icinga2::Object::Service[Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
check_command =>
- check_belvedere_replication_lag.sh
+ check_prometheus_metric
vars =>
check_prometheus_query =>
+ -:"sum(sql_pg_stat_replication{instance=\"belvedere.internal.softwareheritage.org\", host=\":5433\", application_name=\"softwareheritage_replica\"})"
*******************************************
- Icinga2::Object[icinga2::object::CheckCommand::check_belvedere_replication_lag.sh]
*******************************************
- Icinga2::Object[icinga2::object::CheckCommand::check_prometheus_metric.sh]
*******************************************
+ Icinga2::Object[icinga2::object::CheckCommand::check_prometheus_metric] =>
parameters =>
"apply": false
"assign": []
"attrs": {"command"=>["/usr/lib/nagios/plugins/swh/check_prometheus_metric"], "arguments"=>{"-H"=>"$check_prometheus_metric_url$", "-q"=>"$check_prometheus_metric_query$", "-w"=>"$check_prometheus_metric_warning$", "-c"=>"$check_prometheus_metric_critical$", "-n"=>"$check_prometheus_metric_name$", "-m"=>"$check_prometheus_comparison_method$", "-t"=>"$check_prometheus_query_type$", "-O"=>{"set_if"=>"$check_prometheus_nan_ok$"}, "-P"=>{"set_if"=>"$check_prometheus_perfdata$"}}, "vars"=>{"check_prometheus_metric_url"=>"http://pergamon.internal.softwareheritage.org:9090", "check_prometheus_comparison_method"=>"ge", "check_prometheus_query_type"=>"scalar", "check_prometheus_nan_ok"=>false, "check_prometheus_perfdat"=>true}}
"attrs_list": ["command", "env", "timeout", "arguments", "vars", "Acknowledgement", "ApiBindHost", "ApiBindPort", "ApiEnvironment", "ApplicationType", "Array", "AttachDebugger", "BuildCompilerName", "BuildCompilerVersion", "BuildHostName", "Checkable", "Command", "Concurrency", "ConfigObject", "Configuration", "Critical", "Custom", "CustomVarObject", "DateTime", "Deprecated", "Dictionary", "Down", "DowntimeEnd", "DowntimeRemoved", "DowntimeStart", "Environment", "FlappingEnd", "FlappingStart", "Function", "HostDown", "HostUp", "IncludeConfDir", "Internal", "Json", "LocalStateDir", "LogCritical", "LogDebug", "Logger", "LogInformation", "LogNotice", "LogWarning", "Math", "MaxConcurrentChecks", "ModAttrPath", "Namespace", "NodeName", "OK", "ObjectsPath", "PerfdataValue", "PidPath", "PkgDataDir", "PlatformArchitecture", "PlatformKernel", "PlatformKernelVersion", "PlatformName", "PlatformVersion", "PrefixDir", "Problem", "Recovery", "Reference", "RunAsGroup", "RunAsUser", "RunDir", "ServiceCritical", "ServiceOK", "ServiceUnknown", "ServiceWarning", "StatePath", "StreamLogger", "SysconfDir", "System", "Type", "Types", "Unknown", "Up", "UseVfork", "VarsPath", "Warning", "ZonesDir", "NodeName", "ZoneName", "TicketSalt", "PluginDir", "PluginContribDir", "ManubulonPluginDir", "name", "NodeName", "ZoneName", "TicketSalt", "PluginDir", "PluginContribDir", "ManubulonPluginDir", "name"]
"ensure": "present"
"ignore": []
"import": ["plugin-check-command"]
"object_name": "check_prometheus_metric"
"object_type": "CheckCommand"
"order": 15
"prefix": false
"target": "/etc/icinga2/conf.d/swh-plugins.conf"
"template": false
*******************************************
Icinga2::Object[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
parameters =>
attrs =>
check_command =>
- check_belvedere_replication_lag.sh
+ check_prometheus_metric
vars =>
check_prometheus_query =>
+ -:"sum(sql_pg_stat_replication{instance=\"belvedere.internal.softwareheritage.org\", host=\":5433\", application_name=\"softwareheritage_replica\"})"
*******************************************
- Sudo::Conf[icinga-check_belvedere_replication_lag.sh]
*******************************************
- Sudo::Conf[icinga-check_prometheus_metric.sh]
*******************************************
+ Sudo::Conf[icinga-check_prometheus_metric] =>
parameters =>
"ensure": "absent"
"priority": 10
"sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
*******************************************
Migrated from D8470 (view on Phabricator)