Skip to content

monitor postgresql replication lag through prometheus data

  • Use a prometheus plugin for grafana (from prometheus)
  • Create a wrapper to handle the query because, when the probe is configured the Puppet way, the parsing is not done correctly
  • Raise the warning at 100GiB and the critical alert at 200GiB; this matches what we observed over the last month, but will be adapted later if needed

Related to T3452

Test Plan

  • pergamon:
diff origin/production/ current/
+ Concat::Fragment[] =>
   parameters =>
     "content": "\nobject CheckCommand \"\" {\n...
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
+ Concat::Fragment[] =>
   parameters =>
     "content": "\nobject CheckCommand \"\" {\n  import...
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
+ Concat::Fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
+ Concat_fragment[] =>
   parameters =>
     "content": "\nobject CheckCommand \"\" {\n...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_swh-plugins.conf",
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
+ Concat_fragment[] =>
   parameters =>
     "content": "\nobject CheckCommand \"\" {\n  import...
     "order": 15,
     "tag": "_etc_icinga2_conf.d_swh-plugins.conf",
     "target": "/etc/icinga2/conf.d/swh-plugins.conf"
+ Concat_fragment[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "content": "\nobject Service \"Postgresql replication lag (belvedere -> some...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
   parameters =>
     "command": "visudo -c || ",
     "path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     "refreshonly": true
+ Exec[sudo-syntax-check for file /etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
   parameters =>
     "command": "visudo -c || ",
     "path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     "refreshonly": true
+ File[/etc/sudoers.d/10_icinga-check_belvedere_replication_lag-sh] =>
   parameters =>
     "ensure": "absent",
     "group": "root",
     "mode": "0440",
     "owner": "root"
+ File[/etc/sudoers.d/10_icinga-check_prometheus_metric-sh] =>
   parameters =>
     "ensure": "absent",
     "group": "root",
     "mode": "0440",
     "owner": "root"
+ File[/usr/lib/nagios/plugins/swh/] =>
   parameters =>
     "content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
     "ensure": "present",
     "group": "root",
     "mode": "0755",
     "owner": "root"
+ File[/usr/lib/nagios/plugins/swh/] =>
   parameters =>
     "content": "#!/bin/bash\n\n#\n# File managed by puppet. All modifications wi...
     "ensure": "present",
     "group": "root",
     "mode": "0755",
     "owner": "root"
+ Icinga2::Object::Checkcommand[] =>
   parameters =>
     "arguments": {
       "-H": "$check_prometheus_metric_url$",
       "-w": "$check_prometheus_metric_warning$",
       "-c": "$check_prometheus_metric_critical$",
       "-n": "$check_prometheus_metric_name$"
     "checkcommand_name": "",
     "command": [
     "ensure": "present",
     "import": [
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_url": "
+ Icinga2::Object::Checkcommand[] =>
   parameters =>
     "arguments": {
       "-H": "$check_prometheus_metric_url$",
       "-q": "$check_prometheus_metric_query$",
       "-w": "$check_prometheus_metric_warning$",
       "-c": "$check_prometheus_metric_critical$",
       "-n": "$check_prometheus_metric_name$"
     "checkcommand_name": "",
     "command": [
     "ensure": "present",
     "import": [
     "order": 15,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_url": "
+ Icinga2::Object::Service[Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "apply": false,
     "assign": [
     "check_command": "",
     "ensure": "present",
     "host_name": "",
     "ignore": [
     "import": [
     "order": 60,
     "prefix": false,
     "service_name": "Postgresql replication lag (belvedere -> somerset)",
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "check_prometheus_metric_name": "pg replication_lag belvedere somerset",
       "check_prometheus_metric_warning": "107374182400",
       "check_prometheus_metric_critical": "214748364800"
+ Icinga2::Object[] =>
   parameters =>
     "apply": false,
     "assign": [
     "attrs": {
       "command": [
       "arguments": {
         "-H": "$check_prometheus_metric_url$",
         "-w": "$check_prometheus_metric_warning$",
         "-c": "$check_prometheus_metric_critical$",
         "-n": "$check_prometheus_metric_name$"
       "vars": {
         "check_prometheus_metric_url": "
     "attrs_list": [
     "ensure": "present",
     "ignore": [
     "import": [
     "object_name": "",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
+ Icinga2::Object[] =>
   parameters =>
     "apply": false,
     "assign": [
     "attrs": {
       "command": [
       "arguments": {
         "-H": "$check_prometheus_metric_url$",
         "-q": "$check_prometheus_metric_query$",
         "-w": "$check_prometheus_metric_warning$",
         "-c": "$check_prometheus_metric_critical$",
         "-n": "$check_prometheus_metric_name$"
       "vars": {
         "check_prometheus_metric_url": "
     "attrs_list": [
     "ensure": "present",
     "ignore": [
     "import": [
     "object_name": "",
     "object_type": "CheckCommand",
     "order": 15,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/swh-plugins.conf",
     "template": false
+ Icinga2::Object[icinga2::object::Service::Postgresql replication lag (belvedere -> somerset)] =>
   parameters =>
     "apply": false,
     "assign": [
     "attrs": {
       "host_name": "",
       "check_command": "",
       "vars": {
         "check_prometheus_metric_name": "pg replication_lag belvedere somerset",...
         "check_prometheus_metric_warning": "107374182400",
         "check_prometheus_metric_critical": "214748364800"
     "attrs_list": [
     "ensure": "present",
     "ignore": [
     "import": [
     "object_name": "Postgresql replication lag (belvedere -> somerset)",
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
+ Sudo::Conf[] =>
   parameters =>
     "ensure": "absent",
     "priority": 10,
     "sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
+ Sudo::Conf[] =>
   parameters =>
     "ensure": "absent",
     "priority": 10,
     "sudo_syntax_path": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin...
*** End octocatalog-diff on

Migrated from D6050 (view on Phabricator)

Merge request reports
