Skip to content

Add alerts on the scheduler's stale recurrent lister tasks

Antoine R. Dumont requested to merge add-alert-on-stale-lister-tasks into production

Recurring lister tasks on production come in two kinds, spanning 3 recurrence periods:

  • incremental: 1 day
  • full: 7 days or 90 days

This is the equivalent of what was attempted in swh-charts [1]. That approach could not be made functional without too much work on the scraping done in the cluster. That work will be done eventually, but not today.

[1] swh/infra/ci-cd/swh-charts!341 (closed)

octo-diff pergamon
diff origin/production/pergamon.softwareheritage.org current/pergamon.softwareheritage.org
*******************************************
+ Concat::Fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat::Fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "content": "\nobject Service \"production: Scheduler recurrent lister stale ...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
+ Concat_fragment[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "content": "\nobject Service \"staging: Scheduler recurrent lister stale tas...
     "order": 60,
     "tag": "_etc_icinga2_conf.d_static-checks.conf",
     "target": "/etc/icinga2/conf.d/static-checks.conf"
*******************************************
  File[/etc/bind/keys/local-update] =>
   parameters =>
     content =>
      @@ -2,4 +2,4 @@
       key local-update {
        algorithm hmac-sha256;
      - secret "UBnxzvjkp8jYHlPA1MUYv2xRnPQ7JRdWEm3jt+4orACsBzQPX2UFw92rN5JxeYt19x0o+fE8sujqU8fPHn04EA==";
      + secret "2V8hs2cGlOaZp35bCCLa9u7OnM434pI7QltDj3YYIGKWz5c5v2R/KVIJRfJJRvLXPiu+csCtun5fBder/y+UDw==";
       };
*******************************************
  File[/etc/bind/rndc.key] =>
   parameters =>
     content =>
      @@ -2,4 +2,4 @@
       key rndc-key {
        algorithm hmac-md5;
      - secret "TO+MWC8mJvDTYW3qgtrs1rArWjDMTdDjXxf/a2+VoXvdPqnx0HLHfrrwIAFp4xZyKyM9F5f7Ak6G2JbAra/Imw==";
      + secret "Tp6y4jWOeylH+YxZBr+zqgXc33QdM0QMUwPmDHoPlSiT5ICjmG7StnsEB8e9rg3AaHxpvs7mrRU2bX/cNbjFaw==";
       };
*******************************************
+ Icinga2::Object::Service[production: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "albertina.internal.softwareheritage.org",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "production: Scheduler recurrent lister stale tasks (period:...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "production: Scheduler recurrent lister stale ta...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 172800,
       "prometheus_metric_critical": 259200
     }
*******************************************
+ Icinga2::Object::Service[production: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "albertina.internal.softwareheritage.org",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "production: Scheduler recurrent lister stale tasks (period:...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "production: Scheduler recurrent lister stale ta...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 691200,
       "prometheus_metric_critical": 777600
     }
*******************************************
+ Icinga2::Object::Service[production: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "albertina.internal.softwareheritage.org",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "production: Scheduler recurrent lister stale tasks (period:...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "production: Scheduler recurrent lister stale ta...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 7862400,
       "prometheus_metric_critical": 7948800
     }
*******************************************
+ Icinga2::Object::Service[staging: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "db1.internal.staging.swh.network",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "staging: Scheduler recurrent lister stale tasks (period: 1 ...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "staging: Scheduler recurrent lister stale tasks...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 172800,
       "prometheus_metric_critical": 259200
     }
*******************************************
+ Icinga2::Object::Service[staging: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "db1.internal.staging.swh.network",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "staging: Scheduler recurrent lister stale tasks (period: 7 ...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "staging: Scheduler recurrent lister stale tasks...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 691200,
       "prometheus_metric_critical": 777600
     }
*******************************************
+ Icinga2::Object::Service[staging: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "check_command": "check_prometheus_metric",
     "ensure": "present",
     "export_to": [

     ],
     "host_name": "db1.internal.staging.swh.network",
     "ignore": [

     ],
     "import": [

     ],
     "order": 60,
     "prefix": false,
     "service_name": "staging: Scheduler recurrent lister stale tasks (period: 90...
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false,
     "vars": {
       "prometheus_metric_name": "staging: Scheduler recurrent lister stale tasks...
       "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_del...
       "prometheus_query_type": "vector",
       "prometheus_metric_warning": 7862400,
       "prometheus_metric_critical": 7948800
     }
*******************************************
+ Icinga2::Object[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "albertina.internal.softwareheritage.org",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "production: Scheduler recurrent lister stale ...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 172800,
         "prometheus_metric_critical": 259200
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "production: Scheduler recurrent lister stale tasks (period: ...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "albertina.internal.softwareheritage.org",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "production: Scheduler recurrent lister stale ...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 691200,
         "prometheus_metric_critical": 777600
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "production: Scheduler recurrent lister stale tasks (period: ...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::production: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "albertina.internal.softwareheritage.org",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "production: Scheduler recurrent lister stale ...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 7862400,
         "prometheus_metric_critical": 7948800
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "production: Scheduler recurrent lister stale tasks (period: ...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 1 day)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "db1.internal.staging.swh.network",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "staging: Scheduler recurrent lister stale tas...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 172800,
         "prometheus_metric_critical": 259200
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "staging: Scheduler recurrent lister stale tasks (period: 1 d...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 7 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "db1.internal.staging.swh.network",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "staging: Scheduler recurrent lister stale tas...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 691200,
         "prometheus_metric_critical": 777600
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "staging: Scheduler recurrent lister stale tasks (period: 7 d...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
+ Icinga2::Object[icinga2::object::Service::staging: Scheduler recurrent lister stale tasks (period: 90 days)] =>
   parameters =>
     "apply": false,
     "assign": [

     ],
     "attrs": {
       "host_name": "db1.internal.staging.swh.network",
       "check_command": "check_prometheus_metric",
       "vars": {
         "prometheus_metric_name": "staging: Scheduler recurrent lister stale tas...
         "prometheus_query": "-:\"histogram_quantile(0.1, sum(sql_swh_scheduler_d...
         "prometheus_query_type": "vector",
         "prometheus_metric_warning": 7862400,
         "prometheus_metric_critical": 7948800
       }
     },
     "attrs_list": [
       "display_name",
       "host_name",
       "check_command",
       "check_timeout",
       "check_interval",
       "check_period",
       "retry_interval",
       "max_check_attempts",
       "groups",
       "enable_notifications",
       "enable_active_checks",
       "enable_passive_checks",
       "enable_event_handler",
       "enable_flapping",
       "enable_perfdata",
       "event_command",
       "flapping_threshold_low",
       "flapping_threshold_high",
       "volatile",
       "zone",
       "command_endpoint",
       "notes",
       "notes_url",
       "action_url",
       "icon_image",
       "icon_image_alt",
       "vars"
     ],
     "ensure": "present",
     "ignore": [

     ],
     "import": [

     ],
     "object_name": "staging: Scheduler recurrent lister stale tasks (period: 90 ...
     "object_type": "Service",
     "order": 60,
     "prefix": false,
     "target": "/etc/icinga2/conf.d/static-checks.conf",
     "template": false
*******************************************
*** End octocatalog-diff on pergamon.softwareheritage.org

Refs. swh/infra/sysadm-environment#5213 (closed)

Edited by Antoine R. Dumont

Merge request reports