From df94f805f3dfbada70951aae314cb132d7fc5a44 Mon Sep 17 00:00:00 2001 From: Vincent SELLIER <vincent.sellier@softwareheritage.org> Date: Thu, 7 Mar 2024 14:38:31 +0100 Subject: [PATCH 1/2] cluster-component: refactor the alertmanager irc relay configuration - Respect the tacit rule of not deploying anything if it's not explicitly asked - Improve the configuration of the AlertManagerConfig and irc deployment by adding more configuration points Related to swh/infra/sysadm-environment#5281 --- .gitignore | 3 +-- cluster-components/templates/NOTES.txt | 11 +++++--- .../templates/alertmanager-config/config.yaml | 16 ++++++----- cluster-components/values.yaml | 27 +++++++++---------- cluster-components/values/admin-rke2.yaml | 2 +- .../values/archive-production-rke2.yaml | 3 +++ .../values/archive-staging-rke2.yaml | 3 +++ .../values/test-staging-rke2.yaml | 2 +- 8 files changed, 37 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 26df1d9a9..3ffe516d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ __snapshot__ -/cluster-components/charts/ /swh/Chart.lock -/swh/charts/ +charts diff --git a/cluster-components/templates/NOTES.txt b/cluster-components/templates/NOTES.txt index 90f96eae9..862d81adf 100644 --- a/cluster-components/templates/NOTES.txt +++ b/cluster-components/templates/NOTES.txt @@ -6,12 +6,15 @@ This installed/upgraded: http://alertmanager-irc-relay:{{ .Values.alertmanagerIrcRelay.http_port }}/{{ .Values.alertmanagerIrcRelay.room }} {{ end }} {{- if and .Values.alertmanagerIrcRelay.enabled .Values.alertmanagerIrcRelay.ingress.enabled }} -- An alert manager irc relay ingress exposed at {{ .Values.alertmanager.ircrelay.host - }}. 
Any alerts sent from a cluster with access to this ingress will be propagated to +- An alert manager irc relay ingress exposed at : +{{- range $host := .Values.alertmanagerIrcRelay.ingress.hosts }} + - {{ $host }} +{{ end }} + Any alerts sent from a cluster with access to this ingress will be propagated to irc #{{ .Values.alertmanagerIrcRelay.room }} room {{ end }} -{{- if and .Values.alertmanager.enabled .Values.alertmanager.ircrelay.enabled }} -- Allows to relay alertmanager's alerts to the relay exposed at {{ .Values.alertmanager.ircrelay.host }} +{{- if .Values.alertmanagerConfig.enabled }} +- Allows to relay alertmanager's alerts to the relay exposed at {{ .Values.alertmanagerConfig.ircRelayHost }} {{ end }} {{- if .Values.blackboxExporter.enabled }} - blackbox exporter. This installs probes to expand monitoring to http(s), DNS, ... diff --git a/cluster-components/templates/alertmanager-config/config.yaml b/cluster-components/templates/alertmanager-config/config.yaml index 14fb5ca35..e326426d6 100644 --- a/cluster-components/templates/alertmanager-config/config.yaml +++ b/cluster-components/templates/alertmanager-config/config.yaml @@ -1,10 +1,10 @@ -{{- if and .Values.alertmanagerConfig.enabled }} +{{- if .Values.alertmanagerConfig.enabled }} --- apiVersion: monitoring.coreos.com/v1alpha1 kind: AlertmanagerConfig metadata: name: irc-relay-config - namespace: cattle-monitoring-system + namespace: {{ .Values.alertmanagerConfig.namespace }} spec: route: groupBy: ['...'] @@ -25,13 +25,15 @@ spec: - name: ircrelay webhookConfigs: - sendResolved: true - url: {{ .Values.alertmanager.ircrelay.host }} + url: {{ .Values.alertmanagerConfig.ircRelayHost }} + {{ if .Values.alertmanagerConfig.authentication.enabled -}} httpConfig: basicAuth: username: - key: user - name: alertmanager-irc-relay-config + key: {{ .Values.alertmanagerConfig.authentication.userKeyRef }} + name: {{ .Values.alertmanagerConfig.authentication.secretRef }} password: - key: password - name: 
alertmanager-irc-relay-config + key: {{ .Values.alertmanagerConfig.authentication.passwordKeyRef }} + name: {{ .Values.alertmanagerConfig.authentication.secretRef }} + {{- end -}} {{ end }} diff --git a/cluster-components/values.yaml b/cluster-components/values.yaml index 618054917..f88f42134 100644 --- a/cluster-components/values.yaml +++ b/cluster-components/values.yaml @@ -10,19 +10,6 @@ cert-manager: # Supported in the chart, not seen on the pods... priorityClassName: cluster-components-system -# This configuration is swh specific (and independent from the prometheus configuration -# already done during terraform provisioning). When activated, this allows to relay the -# cluster's prometheus alerts to the cluster admin's alertmanager ingress irc relay -alertmanager: - enabled: false - # Supported in the chart, not seen on the pods... - priorityClassName: cluster-components-system - ircrelay: - enabled: true - host: https://alertmanager-irc-relay.internal.admin.swh.network/swh-sysadm - # .htaccess or authentication credentials - # secret: - prometheus: enabled: false # Not working somehow... Charts reference it but it's not seen in minikube @@ -35,9 +22,19 @@ prometheus: kube-state-metrics: namespaceOverride: cattle-monitoring-system +# This configuration is swh specific (and independent from the prometheus configuration +# already done during terraform provisioning). 
When activated, this allows to relay the +# cluster's prometheus alerts to the cluster admin's alertmanager ingress irc relay alertmanagerConfig: - enabled: true - host: https://alertmanager-irc-relay.admin.swh.network/swh-sysadm + enabled: false + namespace: cattle-monitoring-system + ircRelayHost: https://alertmanager-irc-relay.internal.admin.swh.network/swh-sysadm + # .htaccess or authentication credentials + authentication: + enabled: true + secretRef: alertmanager-irc-relay-config + userKeyRef: user + passwordKeyRef: password alertmanagerIrcRelay: enabled: false diff --git a/cluster-components/values/admin-rke2.yaml b/cluster-components/values/admin-rke2.yaml index a51888ef0..a507b4d06 100644 --- a/cluster-components/values/admin-rke2.yaml +++ b/cluster-components/values/admin-rke2.yaml @@ -8,7 +8,7 @@ alertmanagerIrcRelay: clusterIssuer: letsencrypt-production alertmanagerConfig: - host: https://alertmanager-irc-relay.internal.admin.swh.network/swh-sysadm + enabled: true blackboxExporter: enabled: true diff --git a/cluster-components/values/archive-production-rke2.yaml b/cluster-components/values/archive-production-rke2.yaml index 885e0f318..7290674b3 100644 --- a/cluster-components/values/archive-production-rke2.yaml +++ b/cluster-components/values/archive-production-rke2.yaml @@ -5,6 +5,9 @@ alerting: enabled: true environment: production +alertmanagerConfig: + enabled: true + scrapeExternalMetrics: enabled: true deployments: diff --git a/cluster-components/values/archive-staging-rke2.yaml b/cluster-components/values/archive-staging-rke2.yaml index 57e5b7a4c..f44c65fa0 100644 --- a/cluster-components/values/archive-staging-rke2.yaml +++ b/cluster-components/values/archive-staging-rke2.yaml @@ -2,6 +2,9 @@ alertmanager: enabled: true +alertmanagerConfig: + enabled: true + podPriority: enabled: true diff --git a/cluster-components/values/test-staging-rke2.yaml b/cluster-components/values/test-staging-rke2.yaml index eb2817b2a..7ecf05f3f 100644 --- 
a/cluster-components/values/test-staging-rke2.yaml +++ b/cluster-components/values/test-staging-rke2.yaml @@ -1,5 +1,5 @@ # Relay prometheus alerts to the admin cluster's ingress relay -alertmanager: +alertmanagerConfig: enabled: true svix: -- GitLab From 0d601505cf0251e0edc3fb1597331c0b368b23b1 Mon Sep 17 00:00:00 2001 From: Vincent SELLIER <vincent.sellier@softwareheritage.org> Date: Fri, 8 Mar 2024 00:19:02 +0100 Subject: [PATCH 2/2] cluster-component: Activate irc relay on gitlab staging This is a test to evaluate the behavior and the monitoring noise it will generate Related to swh/infra/sysadm-environment#5281 --- cluster-components/values/gitlab-staging.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cluster-components/values/gitlab-staging.yaml b/cluster-components/values/gitlab-staging.yaml index e69de29bb..60b9c53f1 100644 --- a/cluster-components/values/gitlab-staging.yaml +++ b/cluster-components/values/gitlab-staging.yaml @@ -0,0 +1,4 @@ +alertmanagerConfig: + enabled: true + namespace: monitoring + ircRelayHost: https://alertmanager-irc-relay.admin.swh.network/swh-sysadm -- GitLab