swh/production: Deploy a dedicated loader to ingest large SVN repositories
helm diff
[swh] Comparing changes between branches staging and svn-large-repository (per environment)...
Your branch is up to date with 'origin/staging'.
[swh] Generate config in staging branch for environment staging, namespace swh...
[swh] Generate config in staging branch for environment staging, namespace swh-cassandra...
[swh] Generate config in staging branch for environment staging, namespace swh-cassandra-next-version...
[swh] Generate config in svn-large-repository branch for environment staging...
[swh] Generate config in svn-large-repository branch for environment staging...
[swh] Generate config in svn-large-repository branch for environment staging...
Your branch is up to date with 'origin/staging'.
[swh] Generate config in staging branch for environment production, namespace swh...
[swh] Generate config in staging branch for environment production, namespace swh-cassandra...
[swh] Generate config in staging branch for environment production, namespace swh-cassandra-next-version...
[swh] Generate config in svn-large-repository branch for environment production...
[swh] Generate config in svn-large-repository branch for environment production...
[swh] Generate config in svn-large-repository branch for environment production...
------------- diff for environment staging namespace swh -------------
No differences
------------- diff for environment staging namespace swh-cassandra -------------
No differences
------------- diff for environment staging namespace swh-cassandra-next-version -------------
No differences
------------- diff for environment production namespace swh -------------
No differences
------------- diff for environment production namespace swh-cassandra -------------
--- /tmp/swh-chart.swh.KIE6MzLu/production-swh-cassandra.before 2024-12-03 14:29:23.272991305 +0100
+++ /tmp/swh-chart.swh.KIE6MzLu/production-swh-cassandra.after 2024-12-03 14:29:24.640996129 +0100
@@ -6901,20 +6901,125 @@
swh:
level: "INFO"
celery.task:
level: "INFO"
root:
level: "INFO"
handlers:
- console
---
+# Source: swh/templates/loaders/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: loader-svn-large-repository-template
+ namespace: swh-cassandra
+data:
+ config.yml.template: |
+ storage:
+ cls: pipeline
+ steps:
+ - cls: buffer
+ min_batch_size:
+ content: 1000
+ content_bytes: 52428800
+ directory: 1000
+ directory_entries: 12000
+ extid: 1000
+ release: 1000
+ release_bytes: 52428800
+ revision: 1000
+ revision_bytes: 52428800
+ revision_parents: 2000
+ - cls: filter
+ - cls: retry
+ - cls: remote
+ url: http://storage-rw-cassandra-ingress-swh-cassandra
+ celery:
+ task_broker: amqp://swhconsumer:${AMQP_PASSWORD}@rabbitmq.internal.softwareheritage.org:5672/%2f
+ task_acks_late: false
+ task_queues:
+ - large_repository:swh.loader.svn.tasks.LoadSvnRepository
+ - large_repository:swh.loader.svn.tasks.MountAndLoadSvnRepository
+ - large_repository:swh.loader.svn.tasks.DumpMountAndLoadSvnRepository
+
+ sentry_settings_for_celery_tasks:
+ __sentry-settings-for-celery-tasks__
+ metadata_fetcher_credentials:
+ __metadata-fetcher-credentials__
+ init-container-entrypoint.sh: |
+ #!/bin/bash
+
+ set -e
+
+ CONFIG_FILE=/etc/swh/config.yml
+ CONFIG_FILE_WIP=/tmp/wip-config.yml
+
+ # substitute environment variables when creating the default config.yml
+ eval echo \""$(</etc/swh/configuration-template/config.yml.template)"\" \
+ > $CONFIG_FILE
+
+
+ SENTRY_SETTINGS_PATH=/etc/credentials/sentry-settings/sentry_settings_for_celery_tasks
+ if [ -f $SENTRY_SETTINGS_PATH ]; then
+ awk "/__sentry-settings-for-celery-tasks__/{system(\"sed 's/^/ /g' $SENTRY_SETTINGS_PATH\");next}1" $CONFIG_FILE > $CONFIG_FILE_WIP
+ mv $CONFIG_FILE_WIP $CONFIG_FILE
+ else
+ sed -i 's/__sentry-settings-for-celery-tasks__//g' $CONFIG_FILE
+ fi
+
+ CREDS_LISTER_PATH=/etc/credentials/metadata-fetcher/credentials
+ if [ -f $CREDS_LISTER_PATH ]; then
+ awk "/__metadata-fetcher-credentials__/{system(\"sed 's/^/ /g' $CREDS_LISTER_PATH\");next}1" $CONFIG_FILE > $CONFIG_FILE_WIP
+ mv $CONFIG_FILE_WIP $CONFIG_FILE
+ else
+ sed -i 's/__metadata-fetcher-credentials__//g' $CONFIG_FILE
+ fi
+
+ exit 0
+
+
+ logging-configuration.yml: |
+ version: 1
+
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: json
+ stream: ext://sys.stdout
+
+ formatters:
+ json:
+ class: pythonjsonlogger.jsonlogger.JsonFormatter
+ # python-json-logger parses the format argument to get the variables it actually expands into the json
+ format: "%(asctime)s:%(threadName)s:%(pathname)s:%(lineno)s:%(funcName)s:%(task_name)s:%(task_id)s:%(name)s:%(levelname)s:%(message)s"
+
+ loggers:
+ celery:
+ level: "INFO"
+ amqp:
+ level: WARNING
+ urllib3:
+ level: WARNING
+ azure.core.pipeline.policies.http_logging_policy:
+ level: WARNING
+ swh:
+ level: "INFO"
+ celery.task:
+ level: "INFO"
+
+ root:
+ level: "INFO"
+ handlers:
+ - console
+---
# Source: swh/templates/objstorage-replayer/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
namespace: swh-cassandra
name: objstorage-replayer-s3-template
data:
config.yml.template: |
objstorage:
cls: multiplexer
@@ -22119,20 +22224,177 @@
- key: "pre-stop-idempotent.sh"
path: "pre-stop.sh"
- name: metadata-fetcher-credentials
secret:
secretName: metadata-fetcher-credentials
optional: true
- name: sentry-settings-for-celery-tasks
secret:
secretName: sentry-settings-for-celery-tasks
optional: true
+# Use the image defined at the "typed" loader level if there is one;
+# otherwise use the globally defined image. First, "-" must be replaced
+# with "" in $loader_type to find the proper image name.
+---
+# Source: swh/templates/loaders/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: loader-svn-large-repository
+ namespace: swh-cassandra
+ labels:
+ app: loader-svn-large-repository
+spec:
+ revisionHistoryLimit: 2
+ selector:
+ matchLabels:
+ app: loader-svn-large-repository
+ strategy:
+ type: RollingUpdate
+ rollingUpdate:
+ maxSurge: 1
+ template:
+ metadata:
+ labels:
+ app: loader-svn-large-repository
+ annotations:
+ # Force a rollout upgrade if the configuration changes
+ checksum/config: a3d8c3bc4ee773e7bfc2be66d89ca641001edc56f655246a6592d169864ea158
+ spec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: swh/loader
+ operator: In
+ values:
+ - "true"
+ - key: swh/large-scratch-fs
+ operator: In
+ values:
+ - "true"
+ priorityClassName: swh-cassandra-normal-workload
+ terminationGracePeriodSeconds: 3600
+ dnsConfig:
+ options:
+ - name: ndots
+ value: "1"
+ searches:
+ - cluster.local
+ - svc.cluster.local
+ - swh-cassandra.svc.cluster.local
+ initContainers:
+ - name: prepare-configuration
+ image: debian:bullseye
+ imagePullPolicy: IfNotPresent
+ env:
+ - name: AMQP_PASSWORD
+ valueFrom:
+ secretKeyRef:
+ key: swhconsumer-password
+ name: amqp-secrets
+ optional: false
+ command:
+ - /entrypoint.sh
+ volumeMounts:
+ - name: configuration-template
+ mountPath: /entrypoint.sh
+ subPath: "init-container-entrypoint.sh"
+ readOnly: true
+ - name: configuration
+ mountPath: /etc/swh
+ - name: configuration-template
+ mountPath: /etc/swh/configuration-template
+ - name: metadata-fetcher-credentials
+ mountPath: /etc/credentials/metadata-fetcher
+ readOnly: true
+ - name: sentry-settings-for-celery-tasks
+ mountPath: /etc/credentials/sentry-settings
+ readOnly: true
+ containers:
+ - name: loaders
+ image: container-registry.softwareheritage.org/swh/infra/swh-apps/loader_svn:20241127.2
+ imagePullPolicy: IfNotPresent
+ command:
+ - /opt/swh/entrypoint.sh
+ resources:
+ requests:
+ memory: 50Gi
+ cpu: 1
+ lifecycle:
+ preStop:
+ exec:
+ command: ["/pre-stop.sh"]
+ env:
+ - name: STATSD_HOST
+ value: prometheus-statsd-exporter
+ - name: STATSD_PORT
+ value: "9125"
+ - name: STATSD_TAGS
+ value: deployment:loader-svn-large-repository
+ - name: MAX_TASKS_PER_CHILD
+ value: "10"
+ - name: SWH_LOG_LEVEL
+ value: "INFO"
+ - name: SWH_CONFIG_FILENAME
+ value: /etc/swh/config.yml
+ - name: SWH_LOG_CONFIG
+ value: /etc/swh/logging-configuration.yml
+ - name: SWH_SENTRY_ENVIRONMENT
+ value: production
+ - name: SWH_SENTRY_DISABLE_LOGGING_EVENTS
+ value: "yes"
+ volumeMounts:
+ - name: loader-utils
+ mountPath: /pre-stop.sh
+ subPath: "pre-stop.sh"
+ - name: configuration
+ mountPath: /etc/swh
+ - name: localstorage
+ mountPath: /tmp
+ - name: configuration-template
+ mountPath: /etc/swh/logging-configuration.yml
+ subPath: "logging-configuration.yml"
+ readOnly: true
+ volumes:
+ - name: localstorage
+ emptyDir:
+ sizeLimit: 100Gi
+ - name: configuration
+ emptyDir: {}
+ - name: configuration-template
+ configMap:
+ name: loader-svn-large-repository-template
+ defaultMode: 0777
+ items:
+ - key: "config.yml.template"
+ path: "config.yml.template"
+ - key: "init-container-entrypoint.sh"
+ path: "init-container-entrypoint.sh"
+ - key: "logging-configuration.yml"
+ path: "logging-configuration.yml"
+ - name: loader-utils
+ configMap:
+ name: loader-utils
+ defaultMode: 0777
+ items:
+ - key: "pre-stop-idempotent.sh"
+ path: "pre-stop.sh"
+ - name: metadata-fetcher-credentials
+ secret:
+ secretName: metadata-fetcher-credentials
+ optional: true
+ - name: sentry-settings-for-celery-tasks
+ secret:
+ secretName: sentry-settings-for-celery-tasks
+ optional: true
---
# Source: swh/templates/memcached/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: memcached
namespace: swh-cassandra
labels:
chart: "swh-0.1.0"
app: memcached
@@ -33094,20 +33356,97 @@
excludeUnacknowledged: "false" # QueueLength should include unacked messages
# Implies "http" protocol is used
value: "10"
queueName: swh.loader.svn.tasks.LoadSvnExport
vhostName: / # Optional. If not specified, use the vhost in the
# `host` connection string. Alternatively, you can
# use existing environment variables to read
# configuration from: See details in "Parameter
# list" section hostFromEnv: RABBITMQ_HOST%
---
+# Source: swh/templates/loaders/keda-autoscaling.yaml
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+ name: loader-svn-large-repository-operators
+ namespace: swh-cassandra
+spec:
+ scaleTargetRef:
+ apiVersion: apps/v1 # Optional. Default: apps/v1
+ kind: Deployment # Optional. Default: Deployment
+ # Mandatory. Must be in same namespace as ScaledObject
+ name: loader-svn-large-repository
+ # envSourceContainerName: {container-name} # Optional. Default:
+ # .spec.template.spec.containers[0]
+ pollingInterval: 30 # Optional. Default: 30 seconds
+ cooldownPeriod: 300
+ # ^ Optional. Default: 300 seconds
+ idleReplicaCount: 0 # Set to 0 to stop all the workers when
+ # there is no activity on the queue
+ minReplicaCount: 0
+ maxReplicaCount: 1
+ triggers:
+ - type: rabbitmq
+ authenticationRef:
+ name: amqp-authentication-loader-svn-large-repository
+ metadata:
+ protocol: auto # Optional. Specifies protocol to use,
+ # either amqp or http, or auto to
+ # autodetect based on the `host` value.
+ # Default value is auto.
+ mode: QueueLength # QueueLength to trigger on number of msgs in queue
+ excludeUnacknowledged: "false" # QueueLength should include unacked messages
+ # Implies "http" protocol is used
+ value: "1"
+ queueName: large_repository:swh.loader.svn.tasks.LoadSvnRepository
+ vhostName: / # Optional. If not specified, use the vhost in the
+ # `host` connection string. Alternatively, you can
+ # use existing environment variables to read
+ # configuration from: See details in "Parameter
+ # list" section hostFromEnv: RABBITMQ_HOST%
+ - type: rabbitmq
+ authenticationRef:
+ name: amqp-authentication-loader-svn-large-repository
+ metadata:
+ protocol: auto # Optional. Specifies protocol to use,
+ # either amqp or http, or auto to
+ # autodetect based on the `host` value.
+ # Default value is auto.
+ mode: QueueLength # QueueLength to trigger on number of msgs in queue
+ excludeUnacknowledged: "false" # QueueLength should include unacked messages
+ # Implies "http" protocol is used
+ value: "1"
+ queueName: large_repository:swh.loader.svn.tasks.MountAndLoadSvnRepository
+ vhostName: / # Optional. If not specified, use the vhost in the
+ # `host` connection string. Alternatively, you can
+ # use existing environment variables to read
+ # configuration from: See details in "Parameter
+ # list" section hostFromEnv: RABBITMQ_HOST%
+ - type: rabbitmq
+ authenticationRef:
+ name: amqp-authentication-loader-svn-large-repository
+ metadata:
+ protocol: auto # Optional. Specifies protocol to use,
+ # either amqp or http, or auto to
+ # autodetect based on the `host` value.
+ # Default value is auto.
+ mode: QueueLength # QueueLength to trigger on number of msgs in queue
+ excludeUnacknowledged: "false" # QueueLength should include unacked messages
+ # Implies "http" protocol is used
+ value: "1"
+ queueName: large_repository:swh.loader.svn.tasks.DumpMountAndLoadSvnRepository
+ vhostName: / # Optional. If not specified, use the vhost in the
+ # `host` connection string. Alternatively, you can
+ # use existing environment variables to read
+ # configuration from: See details in "Parameter
+ # list" section hostFromEnv: RABBITMQ_HOST%
+---
# Source: swh/templates/objstorage-replayer/keda-autoscaling.yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: objstorage-replayer-s3-scaledobject
namespace: swh-cassandra
spec:
scaleTargetRef:
name: objstorage-replayer-s3
pollingInterval: 120
@@ -34060,20 +34399,32 @@
kind: TriggerAuthentication
metadata:
name: amqp-authentication-loader-svn-export
namespace: swh-cassandra
spec:
secretTargetRef:
- parameter: host # "host" is required by the scalerObject trigger metadata
name: common-secrets
key: rabbitmq-http-host
---
+# Source: swh/templates/loaders/keda-autoscaling.yaml
+apiVersion: keda.sh/v1alpha1
+kind: TriggerAuthentication
+metadata:
+ name: amqp-authentication-loader-svn-large-repository
+ namespace: swh-cassandra
+spec:
+ secretTargetRef:
+ - parameter: host # "host" is required by the scalerObject trigger metadata
+ name: common-secrets
+ key: rabbitmq-http-host
+---
# Source: swh/templates/objstorage-replayer/keda-autoscaling.yaml
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
name: keda-objstorage-replayer-s3-authentication
namespace: swh-cassandra
spec:
secretTargetRef:
- parameter: username
name: swh-archive-broker-secret
Related to swh/infra/sysadm-environment#5497 (closed)