swh/webapp: Avoid overloading the webapps with heavy scn metrics retrieval
Replace the webapp ServiceMonitors with a custom scraping configuration (a ScrapeConfig targeting the service) so the metrics are retrieved only once per scraping interval instead of once per node.
Related to swh/infra/sysadm-environment#5474 (closed)
helm diff
[swh] Comparing changes between branches production and scn-metrics-scraping (per environment)...
Your branch is up to date with 'origin/production'.
[swh] Generate config in production branch for environment staging, namespace swh...
[swh] Generate config in production branch for environment staging, namespace swh-cassandra...
[swh] Generate config in production branch for environment staging, namespace swh-cassandra-next-version...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
[swh] Generate config in scn-metrics-scraping branch for environment staging...
Your branch is up to date with 'origin/production'.
[swh] Generate config in production branch for environment production, namespace swh...
[swh] Generate config in production branch for environment production, namespace swh-cassandra...
[swh] Generate config in production branch for environment production, namespace swh-cassandra-next-version...
[swh] Generate config in scn-metrics-scraping branch for environment production...
[swh] Generate config in scn-metrics-scraping branch for environment production...
[swh] Generate config in scn-metrics-scraping branch for environment production...
------------- diff for environment staging namespace swh -------------
--- /tmp/swh-chart.swh.z91pOUCP/staging-swh.before 2024-11-05 23:15:10.198502589 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh.after 2024-11-05 23:15:12.822505912 +0100
@@ -2115,20 +2115,21 @@
namespace: swh
name: web-postgresql-configuration-template
data:
config.yml.template: |
instance_name: webapp-postgresql.internal.staging.swh.network
allowed_hosts:
- webapp-postgresql.internal.staging.swh.network
- ${POD_IP}
staging_server_names:
- webapp-postgresql.internal.staging.swh.network
+ - web-postgresql.swh
- ${POD_IP}
storage:
cls: remote
url: http://storage-postgresql-read-only-rpc-ingress
search:
cls: remote
url: http://search-rpc-ingress
scheduler:
cls: remote
url: http://scheduler.internal.staging.swh.network
@@ -5775,21 +5776,21 @@
app: web-postgresql
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-postgresql
annotations:
- checksum/config: b4770331a75303af14bf767616379959a4351a5e28839fd7e9ef93fd951b366b
+ checksum/config: 79a570f2cc0fd8228da9de1d335cfd5c3e39035fcda7f4ccf9da6df40eda13ce
checksum/config-logging: 81fb24577eb1777be8690f58c1e92d701777fe4ff045bb8445feb924947b9f84
checksum/config-utils: d75ca13b805bce6a8ab59c8e24c938f2283108f6a79134f6e71db86308651dc6
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
------------- diff for environment staging namespace swh-cassandra -------------
--- /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra.before 2024-11-05 23:15:11.474504207 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra.after 2024-11-05 23:15:13.810507155 +0100
@@ -8139,20 +8139,21 @@
data:
config.yml.template: |
instance_name: webapp.staging.swh.network
allowed_hosts:
- webapp.staging.swh.network
- webapp-cassandra.internal.staging.swh.network
- ${POD_IP}
staging_server_names:
- webapp.staging.swh.network
- webapp-cassandra.internal.staging.swh.network
+ - web-cassandra.swh-cassandra
- ${POD_IP}
storage:
cls: remote
url: http://storage-cassandra-read-only-ingress
search:
cls: remote
url: http://search-rpc-ingress
provenance:
cls: remote
url: http://webapp-provenance-ingress
@@ -8336,20 +8337,21 @@
data:
config.yml.template: |
instance_name: webapp.staging.swh.network
allowed_hosts:
- webapp.staging.swh.network
- webapp-cassandra.internal.staging.swh.network
- ${POD_IP}
staging_server_names:
- webapp.staging.swh.network
- webapp-cassandra.internal.staging.swh.network
+ - web-webhooks.swh-cassandra
- ${POD_IP}
storage:
cls: remote
url: http://storage-cassandra-read-only-ingress
search:
cls: remote
url: http://search-rpc-ingress
scheduler:
cls: remote
url: http://scheduler.internal.staging.swh.network
@@ -22916,21 +22918,21 @@
app: web-cassandra
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-cassandra
annotations:
- checksum/config: bc7048c8f8c4c42e0a02187ed2350802ebfdd1955bc72fb25c0f346db4e92f71
+ checksum/config: a7924234dec69ccdca9f58edaa1e4496b73d08ff63f05ed572f0620728ed0295
checksum/config-logging: 21c90a039f27f4476045b8973a841bb2b3c0e4435be7fb9ab1d748372f8a96c8
checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
@@ -23184,21 +23186,21 @@
app: web-webhooks
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-webhooks
annotations:
- checksum/config: 51ca8f4891b68776c7e6c6d4d5262813fb13658a325185cdbbfb25a698f1b216
+ checksum/config: 7b09773b939b9bd446fc60f1e8576d999f5c50890355d505ef4ef1454229a042
checksum/config-logging: 8204fa505554e2a92718b6446f5335481339d9b88337df1e300a3cdc6868c0a8
checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
@@ -28598,20 +28600,40 @@
maxReplicaCount: 2
idleReplicaCount: 0
triggers:
- type: kafka
metadata:
bootstrapServers: journal2.internal.staging.swh.network:9092
consumerGroup: swh-archive-stg-webhooks
lagThreshold: "1000"
offsetResetPolicy: earliest
---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+ name: web-cassandra-scn-metrics
+ namespace: swh-cassandra
+ labels:
+ release: rancher-monitoring
+spec:
+ staticConfigs:
+ - labels:
+ job: web-cassandra-scn-metrics
+ namespace: swh-cassandra
+ targets:
+ - web-cassandra.swh-cassandra:5004 # target the service
+ metricsPath: /metrics/prometheus/
+ scrapeInterval: 60s
+ scrapeTimeout: 60s
+ scheme: HTTP
+---
# Source: swh/templates/counters/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: counters-rpc
namespace: swh-cassandra
labels:
app: "counters-rpc-sm"
spec:
endpoints:
@@ -28656,58 +28678,20 @@
- path: /metrics
port: http
interval: 10s
selector:
matchLabels:
app: prometheus-statsd-exporter
namespaceSelector:
matchNames:
- swh-cassandra
---
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
- name: web-cassandra-metrics
- namespace: swh-cassandra
-spec:
- endpoints:
- - path: /metrics/prometheus/
- port: rpc
- interval: 300s
- scrapeTimeout: 60s
- selector:
- matchLabels:
- app: web-cassandra
- namespaceSelector:
- matchNames:
- - swh-cassandra
----
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
- name: web-webhooks-metrics
- namespace: swh-cassandra
-spec:
- endpoints:
- - path: /metrics/prometheus/
- port: rpc
- interval: 300s
- scrapeTimeout: 60s
- selector:
- matchLabels:
- app: web-webhooks
- namespaceSelector:
- matchNames:
- - swh-cassandra
----
# Source: swh/templates/checker-deposit/keda-autoscaling.yaml
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
name: amqp-authentication-checker-deposit
namespace: swh-cassandra
spec:
secretTargetRef:
- parameter: host # "host" is required by the scalerObject trigger metadata
name: common-secrets
------------- diff for environment staging namespace swh-cassandra-next-version -------------
--- /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra-next-version.before 2024-11-05 23:15:12.414505396 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/staging-swh-cassandra-next-version.after 2024-11-05 23:15:14.722508300 +0100
@@ -7379,20 +7379,21 @@
namespace: swh-cassandra-next-version
name: web-cassandra-configuration-template
data:
config.yml.template: |
instance_name: webapp-cassandra-next-version.internal.staging.swh.network
allowed_hosts:
- webapp-cassandra-next-version.internal.staging.swh.network
- ${POD_IP}
staging_server_names:
- webapp-cassandra-next-version.internal.staging.swh.network
+ - web-cassandra.swh-cassandra-next-version
- ${POD_IP}
storage:
cls: remote
url: http://storage-ro-postgresql:5002
search:
cls: remote
url: http://search-rpc:5010
provenance:
cls: remote
url: http://webapp-provenance-ingress-next-version
@@ -20969,21 +20970,21 @@
app: web-cassandra
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-cassandra
annotations:
- checksum/config: d16aa75850c7d343bd21803660445bd54290c8fd62c104f77776f77aa878ed13
+ checksum/config: 1acd620bfef834acc183aae323e0530281c12804280f143815ce3ac0fe143c99
checksum/config-logging: f266f784128ac9c57c6d0f154a646e15f06d0ad7557f191487df0d1b385acb48
checksum/config-utils: 94d255131467f84bef964a4c72b2b792c5ebaf711bb1c77829d7cd1007a8ac22
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
@@ -24563,20 +24564,40 @@
name: web-cassandra
triggers:
- type: prometheus
metadata:
serverAddress: http://prometheus-operated.cattle-monitoring-system:9090
metricName: gunicorn_requests
threshold: "0.1"
# There is no environment when using the cluster's prometheus instance
query: sum(rate(gunicorn_requests{namespace="swh-cassandra-next-version",deployment="web-cassandra"}[2m]))
---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+ name: web-cassandra-scn-metrics
+ namespace: swh-cassandra-next-version
+ labels:
+ release: rancher-monitoring
+spec:
+ staticConfigs:
+ - labels:
+ job: web-cassandra-scn-metrics
+ namespace: swh-cassandra-next-version
+ targets:
+ - web-cassandra.swh-cassandra-next-version:5004 # target the service
+ metricsPath: /metrics/prometheus/
+ scrapeInterval: 60s
+ scrapeTimeout: 60s
+ scheme: HTTP
+---
# Source: swh/templates/counters/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: counters-rpc
namespace: swh-cassandra-next-version
labels:
app: "counters-rpc-sm"
spec:
endpoints:
------------- diff for environment production namespace swh -------------
--- /tmp/swh-chart.swh.z91pOUCP/production-swh.before 2024-11-05 23:15:15.318509047 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/production-swh.after 2024-11-05 23:15:16.998511143 +0100
@@ -2430,20 +2430,21 @@
namespace: swh
name: web-postgresql-configuration-template
data:
config.yml.template: |
instance_name: webapp-postgresql.internal.softwareheritage.org
allowed_hosts:
- webapp-postgresql.internal.softwareheritage.org
- ${POD_IP}
production_server_names:
- webapp-postgresql.internal.softwareheritage.org
+ - web-postgresql.swh
- ${POD_IP}
storage:
cls: remote
url: http://storage-azure-read-only-rpc-ingress
search:
cls: remote
url: http://search-rpc-ingress-swh-cassandra
provenance:
cls: remote
url: http://webapp-provenance-ingress-swh-cassandra
@@ -6120,21 +6121,21 @@
app: web-postgresql
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-postgresql
annotations:
- checksum/config: 9008e4ede578ac66000ace26b55c1c8cdfd87347628b04f0d3d92f12f9859063
+ checksum/config: f565b980b5d9dd441d31582228e9500c88d84ed55e06469ef45780b4163e8200
checksum/config-logging: 81fb24577eb1777be8690f58c1e92d701777fe4ff045bb8445feb924947b9f84
checksum/config-utils: d75ca13b805bce6a8ab59c8e24c938f2283108f6a79134f6e71db86308651dc6
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
------------- diff for environment production namespace swh-cassandra -------------
--- /tmp/swh-chart.swh.z91pOUCP/production-swh-cassandra.before 2024-11-05 23:15:16.426510430 +0100
+++ /tmp/swh-chart.swh.z91pOUCP/production-swh-cassandra.after 2024-11-05 23:15:18.170512598 +0100
@@ -9629,20 +9629,21 @@
- archive.softwareheritage.org
- base.softwareheritage.org
- archive.internal.softwareheritage.org
- archive-dynamic.internal.softwareheritage.org
- ${POD_IP}
production_server_names:
- archive.softwareheritage.org
- base.softwareheritage.org
- archive.internal.softwareheritage.org
- archive-dynamic.internal.softwareheritage.org
+ - web-archive.swh-cassandra
- ${POD_IP}
storage:
cls: remote
url: http://storage-azure-read-only-rpc-ingress-swh-cassandra
search:
cls: remote
url: http://search-rpc-ingress-swh-cassandra
provenance:
cls: remote
url: http://webapp-provenance-ingress-swh-cassandra
@@ -9863,20 +9864,21 @@
data:
config.yml.template: |
instance_name: archive.softwareheritage.org
allowed_hosts:
- archive.softwareheritage.org
- archive.internal.softwareheritage.org
- ${POD_IP}
production_server_names:
- archive.softwareheritage.org
- archive.internal.softwareheritage.org
+ - web-webhooks.swh-cassandra
- ${POD_IP}
storage:
cls: remote
url: http://storage-azure-read-only-rpc-ingress-swh-cassandra
search:
cls: remote
url: http://search-rpc-ingress-swh-cassandra
scheduler:
cls: remote
url: http://scheduler-rpc-ingress-swh-cassandra
@@ -27420,21 +27422,21 @@
app: web-archive
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-archive
annotations:
- checksum/config: 714fb4037b9c55ddf888c063040f9a7b17e5257b3f12f493a9c541a671324eb4
+ checksum/config: 81e55ed3fc49bbfb0536a19547724f94d089bc215d44513c01f28f076fe25219
checksum/config-logging: af7bf52757798a2fcd4c237ed3de9df87c15b7f38419128a8d67d02b8a485097
checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
@@ -27688,21 +27690,21 @@
app: web-webhooks
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
template:
metadata:
labels:
app: web-webhooks
annotations:
- checksum/config: d389ff1625c80c931e6f6bd883689d91ef612026c40efc6267948801a6ea0f8d
+ checksum/config: 3525bea865687e1d01124c55e0d634f1608700e736a942514e9fc24ae6306242
checksum/config-logging: 8204fa505554e2a92718b6446f5335481339d9b88337df1e300a3cdc6868c0a8
checksum/config-utils: 13a26f6add17e96ce01550153c77dcd48de60241a3f4db3c93d5467234be2a7f
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: swh/web
operator: In
@@ -33389,20 +33391,40 @@
maxReplicaCount: 5
idleReplicaCount: 0
triggers:
- type: kafka
metadata:
bootstrapServers: kafka1.internal.softwareheritage.org:9092,kafka2.internal.softwareheritage.org:9092,kafka3.internal.softwareheritage.org:9092,kafka4.internal.softwareheritage.org:9092
consumerGroup: swh-archive-prod-webhooks
lagThreshold: "2000"
offsetResetPolicy: earliest
---
+# Source: swh/templates/web/scn-metrics-scraping.yaml
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: ScrapeConfig
+metadata:
+ name: web-archive-scn-metrics
+ namespace: swh-cassandra
+ labels:
+ release: rancher-monitoring
+spec:
+ staticConfigs:
+ - labels:
+ job: web-archive-scn-metrics
+ namespace: swh-cassandra
+ targets:
+ - web-archive.swh-cassandra:5004 # target the service
+ metricsPath: /metrics/prometheus/
+ scrapeInterval: 120s
+ scrapeTimeout: 120s
+ scheme: HTTP
+---
# Source: swh/templates/counters/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: counters-rpc
namespace: swh-cassandra
labels:
app: "counters-rpc-sm"
spec:
endpoints:
@@ -33447,58 +33469,20 @@
- path: /metrics
port: http
interval: 10s
selector:
matchLabels:
app: prometheus-statsd-exporter
namespaceSelector:
matchNames:
- swh-cassandra
---
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
- name: web-archive-metrics
- namespace: swh-cassandra
-spec:
- endpoints:
- - path: /metrics/prometheus/
- port: rpc
- interval: 300s
- scrapeTimeout: 60s
- selector:
- matchLabels:
- app: web-archive
- namespaceSelector:
- matchNames:
- - swh-cassandra
----
-# Source: swh/templates/web/monitoring.yaml
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
- name: web-webhooks-metrics
- namespace: swh-cassandra
-spec:
- endpoints:
- - path: /metrics/prometheus/
- port: rpc
- interval: 300s
- scrapeTimeout: 60s
- selector:
- matchLabels:
- app: web-webhooks
- namespaceSelector:
- matchNames:
- - swh-cassandra
----
# Source: swh/templates/checker-deposit/keda-autoscaling.yaml
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
name: amqp-authentication-checker-deposit
namespace: swh-cassandra
spec:
secretTargetRef:
- parameter: host # "host" is required by the scalerObject trigger metadata
name: common-secrets
Note: see the Prometheus Operator ScrapeConfig documentation: https://prometheus-operator.dev/docs/developer/scrapeconfig/
Edited by Antoine R. Dumont