Skip to content

swh/webapp: Avoid overloading the webapps with heavy scn metrics retrieval

Vincent Sellier requested to merge scn-metrics-scraping into production

Use a custom scrape configuration so that the scn metrics are retrieved only once per scraping interval instead of once per node.

Related to swh/infra/sysadm-environment#5474 (closed)

helm diff
[swh] Comparing changes between branches production and scn-metrics-scraping (per environment)...
Your branch is up to date with 'origin/production'.
[swh] Generate config in production branch for environment staging, namespace swh...
[swh] Generate config in production branch for environment staging, namespace swh-cassandra...
[swh] Generate config in production branch for environment staging, namespace swh-cassandra-next-version...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
Your branch is up to date with 'origin/production'.
[swh] Generate config in production branch for environment production, namespace swh...
[swh] Generate config in production branch for environment production, namespace swh-cassandra...
[swh] Generate config in production branch for environment production, namespace swh-cassandra-next-version...
[swh] Generate config in scn-metrics-scraping branch for environment production...
[swh] Generate config in scn-metrics-scraping branch for environment production...
[swh] Generate config in scn-metrics-scraping branch for environment production...


------------- diff for environment staging namespace swh -------------

--- /tmp/swh-chart.swh.z91pOUCP/staging-swh.before	2024-11-05 23:15:10.198502589 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh.after	2024-11-05 23:15:12.822505912 +0100
@@ -2115,20 +2115,21 @@
   namespace: swh
   name: web-postgresql-configuration-template
 data:
   config.yml.template: |
     instance_name: webapp-postgresql.internal.staging.swh.network
     allowed_hosts:
       - webapp-postgresql.internal.staging.swh.network
       - ${POD_IP}
     staging_server_names:
       - webapp-postgresql.internal.staging.swh.network
+      - web-postgresql.swh
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-postgresql-read-only-rpc-ingress
     search:
       cls: remote
       url: http://search-rpc-ingress
     scheduler:
       cls: remote
       url: http://scheduler.internal.staging.swh.network
@@ -5775,21 +5776,21 @@
       app: web-postgresql
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-postgresql
       annotations:
-        checksum/config: b4770331a75303af14bf767616379959a4351a5e28839fd7e9ef93fd951b366b
+        checksum/config: 79a570f2cc0fd8228da9de1d335cfd5c3e39035fcda7f4ccf9da6df40eda13ce
         checksum/config-logging: 81fb24577eb1777be8690f58c1e92d701777fe4ff045bb8445feb924947b9f84
         checksum/config-utils: d75ca13b805bce6a8ab59c8e24c938f2283108f6a79134f6e71db86308651dc6
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In


------------- diff for environment staging namespace swh-cassandra -------------

--- /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra.before	2024-11-05 23:15:11.474504207 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra.after	2024-11-05 23:15:13.810507155 +0100
@@ -8139,20 +8139,21 @@
 data:
   config.yml.template: |
     instance_name: webapp.staging.swh.network
     allowed_hosts:
       - webapp.staging.swh.network
       - webapp-cassandra.internal.staging.swh.network
       - ${POD_IP}
     staging_server_names:
       - webapp.staging.swh.network
       - webapp-cassandra.internal.staging.swh.network
+      - web-cassandra.swh-cassandra
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-cassandra-read-only-ingress
     search:
       cls: remote
       url: http://search-rpc-ingress
     provenance:
       cls: remote
       url: http://webapp-provenance-ingress
@@ -8336,20 +8337,21 @@
 data:
   config.yml.template: |
     instance_name: webapp.staging.swh.network
     allowed_hosts:
       - webapp.staging.swh.network
       - webapp-cassandra.internal.staging.swh.network
       - ${POD_IP}
     staging_server_names:
       - webapp.staging.swh.network
       - webapp-cassandra.internal.staging.swh.network
+      - web-webhooks.swh-cassandra
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-cassandra-read-only-ingress
     search:
       cls: remote
       url: http://search-rpc-ingress
     scheduler:
       cls: remote
       url: http://scheduler.internal.staging.swh.network
@@ -22916,21 +22918,21 @@
       app: web-cassandra
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-cassandra
       annotations:
-        checksum/config: bc7048c8f8c4c42e0a02187ed2350802ebfdd1955bc72fb25c0f346db4e92f71
+        checksum/config: a7924234dec69ccdca9f58edaa1e4496b73d08ff63f05ed572f0620728ed0295
         checksum/config-logging: 21c90a039f27f4476045b8973a841bb2b3c0e4435be7fb9ab1d748372f8a96c8
         checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In
@@ -23184,21 +23186,21 @@
       app: web-webhooks
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-webhooks
       annotations:
-        checksum/config: 51ca8f4891b68776c7e6c6d4d5262813fb13658a325185cdbbfb25a698f1b216
+        checksum/config: 7b09773b939b9bd446fc60f1e8576d999f5c50890355d505ef4ef1454229a042
         checksum/config-logging: 8204fa505554e2a92718b6446f5335481339d9b88337df1e300a3cdc6868c0a8
         checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In
@@ -28598,20 +28600,40 @@
   maxReplicaCount: 2
   idleReplicaCount: 0
   triggers:
   - type: kafka
     metadata:
       bootstrapServers: journal2.internal.staging.swh.network:9092
       consumerGroup: swh-archive-stg-webhooks
       lagThreshold: "1000"
       offsetResetPolicy: earliest
 ---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: web-cassandra-scn-metrics
+  namespace: swh-cassandra
+  labels:
+    release: rancher-monitoring
+spec:
+  staticConfigs:
+    - labels:
+        job: web-cassandra-scn-metrics
+        namespace: swh-cassandra
+      targets:
+        -   web-cassandra.swh-cassandra:5004 # target the service
+  metricsPath: /metrics/prometheus/
+  scrapeInterval: 60s
+  scrapeTimeout: 60s
+  scheme: HTTP
+---
 # Source: swh/templates/counters/servicemonitor.yaml
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   name: counters-rpc
   namespace: swh-cassandra
   labels:
     app: "counters-rpc-sm"
 spec:
   endpoints:
@@ -28656,58 +28678,20 @@
   - path: /metrics
     port: http
     interval: 10s
   selector:
     matchLabels:
       app: prometheus-statsd-exporter
   namespaceSelector:
     matchNames:
       - swh-cassandra
 ---
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: web-cassandra-metrics
-  namespace: swh-cassandra
-spec:
-  endpoints:
-  - path: /metrics/prometheus/
-    port: rpc
-    interval: 300s
-    scrapeTimeout: 60s
-  selector:
-    matchLabels:
-      app: web-cassandra
-  namespaceSelector:
-    matchNames:
-      - swh-cassandra
----
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: web-webhooks-metrics
-  namespace: swh-cassandra
-spec:
-  endpoints:
-  - path: /metrics/prometheus/
-    port: rpc
-    interval: 300s
-    scrapeTimeout: 60s
-  selector:
-    matchLabels:
-      app: web-webhooks
-  namespaceSelector:
-    matchNames:
-      - swh-cassandra
----
 # Source: swh/templates/checker-deposit/keda-autoscaling.yaml
 apiVersion: keda.sh/v1alpha1
 kind: TriggerAuthentication
 metadata:
   name: amqp-authentication-checker-deposit
   namespace: swh-cassandra
 spec:
   secretTargetRef:
   - parameter: host            # "host" is required by the scalerObject trigger metadata
     name: common-secrets


------------- diff for environment staging namespace swh-cassandra-next-version -------------

--- /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra-next-version.before	2024-11-05 23:15:12.414505396 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra-next-version.after	2024-11-05 23:15:14.722508300 +0100
@@ -7379,20 +7379,21 @@
   namespace: swh-cassandra-next-version
   name: web-cassandra-configuration-template
 data:
   config.yml.template: |
     instance_name: webapp-cassandra-next-version.internal.staging.swh.network
     allowed_hosts:
       - webapp-cassandra-next-version.internal.staging.swh.network
       - ${POD_IP}
     staging_server_names:
       - webapp-cassandra-next-version.internal.staging.swh.network
+      - web-cassandra.swh-cassandra-next-version
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-ro-postgresql:5002
     search:
       cls: remote
       url: http://search-rpc:5010
     provenance:
       cls: remote
       url: http://webapp-provenance-ingress-next-version
@@ -20969,21 +20970,21 @@
       app: web-cassandra
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-cassandra
       annotations:
-        checksum/config: d16aa75850c7d343bd21803660445bd54290c8fd62c104f77776f77aa878ed13
+        checksum/config: 1acd620bfef834acc183aae323e0530281c12804280f143815ce3ac0fe143c99
         checksum/config-logging: f266f784128ac9c57c6d0f154a646e15f06d0ad7557f191487df0d1b385acb48
         checksum/config-utils: 94d255131467f84bef964a4c72b2b792c5ebaf711bb1c77829d7cd1007a8ac22
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In
@@ -24563,20 +24564,40 @@
     name: web-cassandra
   triggers:
   - type: prometheus
     metadata:
       serverAddress: http://prometheus-operated.cattle-monitoring-system:9090
       metricName: gunicorn_requests
       threshold: "0.1"
       # There is no environment when using the cluster's prometheus instance
       query: sum(rate(gunicorn_requests{namespace="swh-cassandra-next-version",deployment="web-cassandra"}[2m]))
 ---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: web-cassandra-scn-metrics
+  namespace: swh-cassandra-next-version
+  labels:
+    release: rancher-monitoring
+spec:
+  staticConfigs:
+    - labels:
+        job: web-cassandra-scn-metrics
+        namespace: swh-cassandra-next-version
+      targets:
+        -   web-cassandra.swh-cassandra-next-version:5004 # target the service
+  metricsPath: /metrics/prometheus/
+  scrapeInterval: 60s
+  scrapeTimeout: 60s
+  scheme: HTTP
+---
 # Source: swh/templates/counters/servicemonitor.yaml
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   name: counters-rpc
   namespace: swh-cassandra-next-version
   labels:
     app: "counters-rpc-sm"
 spec:
   endpoints:


------------- diff for environment production namespace swh -------------

--- /tmp/swh-chart.swh.z91pOUCP/production-swh.before	2024-11-05 23:15:15.318509047 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/production-swh.after	2024-11-05 23:15:16.998511143 +0100
@@ -2430,20 +2430,21 @@
   namespace: swh
   name: web-postgresql-configuration-template
 data:
   config.yml.template: |
     instance_name: webapp-postgresql.internal.softwareheritage.org
     allowed_hosts:
       - webapp-postgresql.internal.softwareheritage.org
       - ${POD_IP}
     production_server_names:
       - webapp-postgresql.internal.softwareheritage.org
+      - web-postgresql.swh
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-azure-read-only-rpc-ingress
     search:
       cls: remote
       url: http://search-rpc-ingress-swh-cassandra
     provenance:
       cls: remote
       url: http://webapp-provenance-ingress-swh-cassandra
@@ -6120,21 +6121,21 @@
       app: web-postgresql
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-postgresql
       annotations:
-        checksum/config: 9008e4ede578ac66000ace26b55c1c8cdfd87347628b04f0d3d92f12f9859063
+        checksum/config: f565b980b5d9dd441d31582228e9500c88d84ed55e06469ef45780b4163e8200
         checksum/config-logging: 81fb24577eb1777be8690f58c1e92d701777fe4ff045bb8445feb924947b9f84
         checksum/config-utils: d75ca13b805bce6a8ab59c8e24c938f2283108f6a79134f6e71db86308651dc6
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In


------------- diff for environment production namespace swh-cassandra -------------

--- /tmp/swh-chart.swh.z91pOUCP/production-swh-cassandra.before	2024-11-05 23:15:16.426510430 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/production-swh-cassandra.after	2024-11-05 23:15:18.170512598 +0100
@@ -9629,20 +9629,21 @@
       - archive.softwareheritage.org
       - base.softwareheritage.org
       - archive.internal.softwareheritage.org
       - archive-dynamic.internal.softwareheritage.org
       - ${POD_IP}
     production_server_names:
       - archive.softwareheritage.org
       - base.softwareheritage.org
       - archive.internal.softwareheritage.org
       - archive-dynamic.internal.softwareheritage.org
+      - web-archive.swh-cassandra
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-azure-read-only-rpc-ingress-swh-cassandra
     search:
       cls: remote
       url: http://search-rpc-ingress-swh-cassandra
     provenance:
       cls: remote
       url: http://webapp-provenance-ingress-swh-cassandra
@@ -9863,20 +9864,21 @@
 data:
   config.yml.template: |
     instance_name: archive.softwareheritage.org
     allowed_hosts:
       - archive.softwareheritage.org
       - archive.internal.softwareheritage.org
       - ${POD_IP}
     production_server_names:
       - archive.softwareheritage.org
       - archive.internal.softwareheritage.org
+      - web-webhooks.swh-cassandra
       - ${POD_IP}
     storage:
       cls: remote
       url: http://storage-azure-read-only-rpc-ingress-swh-cassandra
     search:
       cls: remote
       url: http://search-rpc-ingress-swh-cassandra
     scheduler:
       cls: remote
       url: http://scheduler-rpc-ingress-swh-cassandra
@@ -27420,21 +27422,21 @@
       app: web-archive
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-archive
       annotations:
-        checksum/config: 714fb4037b9c55ddf888c063040f9a7b17e5257b3f12f493a9c541a671324eb4
+        checksum/config: 81e55ed3fc49bbfb0536a19547724f94d089bc215d44513c01f28f076fe25219
         checksum/config-logging: af7bf52757798a2fcd4c237ed3de9df87c15b7f38419128a8d67d02b8a485097
         checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In
@@ -27688,21 +27690,21 @@
       app: web-webhooks
   strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 1
   template:
     metadata:
       labels:
         app: web-webhooks
       annotations:
-        checksum/config: d389ff1625c80c931e6f6bd883689d91ef612026c40efc6267948801a6ea0f8d
+        checksum/config: 3525bea865687e1d01124c55e0d634f1608700e736a942514e9fc24ae6306242
         checksum/config-logging: 8204fa505554e2a92718b6446f5335481339d9b88337df1e300a3cdc6868c0a8
         checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
     spec:
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: swh/web
                 operator: In
@@ -33389,20 +33391,40 @@
   maxReplicaCount: 5
   idleReplicaCount: 0
   triggers:
   - type: kafka
     metadata:
       bootstrapServers: kafka1.internal.softwareheritage.org:9092,kafka2.internal.softwareheritage.org:9092,kafka3.internal.softwareheritage.org:9092,kafka4.internal.softwareheritage.org:9092
       consumerGroup: swh-archive-prod-webhooks
       lagThreshold: "2000"
       offsetResetPolicy: earliest
 ---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+  name: web-archive-scn-metrics
+  namespace: swh-cassandra
+  labels:
+    release: rancher-monitoring
+spec:
+  staticConfigs:
+    - labels:
+        job: web-archive-scn-metrics
+        namespace: swh-cassandra
+      targets:
+        -   web-archive.swh-cassandra:5004 # target the service
+  metricsPath: /metrics/prometheus/
+  scrapeInterval: 120s
+  scrapeTimeout: 120s
+  scheme: HTTP
+---
 # Source: swh/templates/counters/servicemonitor.yaml
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   name: counters-rpc
   namespace: swh-cassandra
   labels:
     app: "counters-rpc-sm"
 spec:
   endpoints:
@@ -33447,58 +33469,20 @@
   - path: /metrics
     port: http
     interval: 10s
   selector:
     matchLabels:
       app: prometheus-statsd-exporter
   namespaceSelector:
     matchNames:
       - swh-cassandra
 ---
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: web-archive-metrics
-  namespace: swh-cassandra
-spec:
-  endpoints:
-  - path: /metrics/prometheus/
-    port: rpc
-    interval: 300s
-    scrapeTimeout: 60s
-  selector:
-    matchLabels:
-      app: web-archive
-  namespaceSelector:
-    matchNames:
-      - swh-cassandra
----
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: web-webhooks-metrics
-  namespace: swh-cassandra
-spec:
-  endpoints:
-  - path: /metrics/prometheus/
-    port: rpc
-    interval: 300s
-    scrapeTimeout: 60s
-  selector:
-    matchLabels:
-      app: web-webhooks
-  namespaceSelector:
-    matchNames:
-      - swh-cassandra
----
 # Source: swh/templates/checker-deposit/keda-autoscaling.yaml
 apiVersion: keda.sh/v1alpha1
 kind: TriggerAuthentication
 metadata:
   name: amqp-authentication-checker-deposit
   namespace: swh-cassandra
 spec:
   secretTargetRef:
   - parameter: host            # "host" is required by the scalerObject trigger metadata
     name: common-secrets

Note: see the Prometheus Operator documentation on the ScrapeConfig resource: https://prometheus-operator.dev/docs/developer/scrapeconfig/

Edited by Antoine R. Dumont

Merge request reports