Skip to content

swh/production: Deploy a dedicated loader to ingest big svn repositories

Vincent Sellier requested to merge svn-large-repository into production
helm diff
[swh] Comparing changes between branches staging and svn-large-repository (per environment)...
Your branch is up to date with 'origin/staging'.
[swh] Generate config in staging branch for environment staging, namespace swh...
[swh] Generate config in staging branch for environment staging, namespace swh-cassandra...
[swh] Generate config in staging branch for environment staging, namespace swh-cassandra-next-version...
[swh] Generate config in svn-large-repository branch for environment staging...
[swh] Generate config in svn-large-repository branch for environment staging...
[swh] Generate config in svn-large-repository branch for environment staging...
Your branch is up to date with 'origin/staging'.
[swh] Generate config in staging branch for environment production, namespace swh...
[swh] Generate config in staging branch for environment production, namespace swh-cassandra...
[swh] Generate config in staging branch for environment production, namespace swh-cassandra-next-version...
[swh] Generate config in svn-large-repository branch for environment production...
[swh] Generate config in svn-large-repository branch for environment production...
[swh] Generate config in svn-large-repository branch for environment production...


------------- diff for environment staging namespace swh -------------

No differences


------------- diff for environment staging namespace swh-cassandra -------------

No differences


------------- diff for environment staging namespace swh-cassandra-next-version -------------

No differences


------------- diff for environment production namespace swh -------------

No differences


------------- diff for environment production namespace swh-cassandra -------------

--- /tmp/swh-chart.swh.KIE6MzLu/production-swh-cassandra.before	2024-12-03 14:29:23.272991305 +0100
+++ /tmp/swh-chart.swh.KIE6MzLu/production-swh-cassandra.after	2024-12-03 14:29:24.640996129 +0100
@@ -6901,20 +6901,125 @@
       swh:
         level: "INFO"
       celery.task:
         level: "INFO"
 
     root:
       level: "INFO"
       handlers:
       - console
 ---
+# Source: swh/templates/loaders/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loader-svn-large-repository-template
+  namespace: swh-cassandra
+data:
+  config.yml.template: |
+    storage:
+      cls: pipeline
+      steps:
+      - cls: buffer
+        min_batch_size:
+          content: 1000
+          content_bytes: 52428800
+          directory: 1000
+          directory_entries: 12000
+          extid: 1000
+          release: 1000
+          release_bytes: 52428800
+          revision: 1000
+          revision_bytes: 52428800
+          revision_parents: 2000
+      - cls: filter
+      - cls: retry
+      - cls: remote
+        url: http://storage-rw-cassandra-ingress-swh-cassandra
+    celery:
+      task_broker: amqp://swhconsumer:${AMQP_PASSWORD}@rabbitmq.internal.softwareheritage.org:5672/%2f
+      task_acks_late: false
+      task_queues:
+      - large_repository:swh.loader.svn.tasks.LoadSvnRepository
+      - large_repository:swh.loader.svn.tasks.MountAndLoadSvnRepository
+      - large_repository:swh.loader.svn.tasks.DumpMountAndLoadSvnRepository
+    
+      sentry_settings_for_celery_tasks:
+        __sentry-settings-for-celery-tasks__
+    metadata_fetcher_credentials:
+      __metadata-fetcher-credentials__
+  init-container-entrypoint.sh: |
+    #!/bin/bash
+
+    set -e
+
+    CONFIG_FILE=/etc/swh/config.yml
+    CONFIG_FILE_WIP=/tmp/wip-config.yml
+
+    # substitute environment variables when creating the default config.yml
+    eval echo \""$(</etc/swh/configuration-template/config.yml.template)"\" \
+      > $CONFIG_FILE
+
+    
+    SENTRY_SETTINGS_PATH=/etc/credentials/sentry-settings/sentry_settings_for_celery_tasks
+    if [ -f $SENTRY_SETTINGS_PATH ]; then
+      awk "/__sentry-settings-for-celery-tasks__/{system(\"sed 's/^/    /g' $SENTRY_SETTINGS_PATH\");next}1" $CONFIG_FILE > $CONFIG_FILE_WIP
+      mv $CONFIG_FILE_WIP $CONFIG_FILE
+    else
+      sed -i 's/__sentry-settings-for-celery-tasks__//g' $CONFIG_FILE
+    fi
+
+    CREDS_LISTER_PATH=/etc/credentials/metadata-fetcher/credentials
+    if [ -f $CREDS_LISTER_PATH ]; then
+      awk "/__metadata-fetcher-credentials__/{system(\"sed 's/^/  /g' $CREDS_LISTER_PATH\");next}1" $CONFIG_FILE > $CONFIG_FILE_WIP
+      mv $CONFIG_FILE_WIP $CONFIG_FILE
+    else
+      sed -i 's/__metadata-fetcher-credentials__//g' $CONFIG_FILE
+    fi
+
+    exit 0
+
+  
+  logging-configuration.yml: |
+    version: 1
+
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: json
+        stream: ext://sys.stdout
+
+    formatters:
+      json:
+        class: pythonjsonlogger.jsonlogger.JsonFormatter
+        # python-json-logger parses the format argument to get the variables it actually expands into the json
+        format: "%(asctime)s:%(threadName)s:%(pathname)s:%(lineno)s:%(funcName)s:%(task_name)s:%(task_id)s:%(name)s:%(levelname)s:%(message)s"
+
+    loggers:
+      celery:
+        level: "INFO"
+      amqp:
+        level: WARNING
+      urllib3:
+        level: WARNING
+      azure.core.pipeline.policies.http_logging_policy:
+        level: WARNING
+      swh:
+        level: "INFO"
+      celery.task:
+        level: "INFO"
+
+    root:
+      level: "INFO"
+      handlers:
+      - console
+---
 # Source: swh/templates/objstorage-replayer/configmap.yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
   namespace: swh-cassandra
   name: objstorage-replayer-s3-template
 data:
   config.yml.template: |
     objstorage:
       cls: multiplexer
@@ -22119,20 +22224,177 @@
           - key: "pre-stop-idempotent.sh"
             path: "pre-stop.sh"
       - name: metadata-fetcher-credentials
         secret:
           secretName: metadata-fetcher-credentials
           optional: true
       - name: sentry-settings-for-celery-tasks
         secret:
           secretName: sentry-settings-for-celery-tasks
           optional: true
+# Use the image defined at the "typed" loader level if there is one,
+# otherwise use the globally defined image. First, this needs to replace "-" in
+# $loader_type with "" to find the proper image name.
+---
+# Source: swh/templates/loaders/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: loader-svn-large-repository
+  namespace: swh-cassandra
+  labels:
+    app: loader-svn-large-repository
+spec:
+  revisionHistoryLimit: 2
+  selector:
+    matchLabels:
+      app: loader-svn-large-repository
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+  template:
+    metadata:
+      labels:
+        app: loader-svn-large-repository
+      annotations:
+        # Force a rollout upgrade if the configuration changes
+        checksum/config: a3d8c3bc4ee773e7bfc2be66d89ca641001edc56f655246a6592d169864ea158
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: swh/loader
+                operator: In
+                values:
+                - "true"
+              - key: swh/large-scratch-fs
+                operator: In
+                values:
+                - "true"
+      priorityClassName: swh-cassandra-normal-workload
+      terminationGracePeriodSeconds: 3600      
+      dnsConfig:
+        options:
+          - name: ndots
+            value: "1"
+        searches:
+          - cluster.local
+          - svc.cluster.local
+          - swh-cassandra.svc.cluster.local
+      initContainers:
+        - name: prepare-configuration
+          image: debian:bullseye
+          imagePullPolicy: IfNotPresent
+          env: 
+          - name: AMQP_PASSWORD
+            valueFrom:
+              secretKeyRef:
+                key: swhconsumer-password
+                name: amqp-secrets
+                optional: false
+          command:
+            - /entrypoint.sh
+          volumeMounts:
+          - name: configuration-template
+            mountPath: /entrypoint.sh
+            subPath: "init-container-entrypoint.sh"
+            readOnly: true
+          - name: configuration
+            mountPath: /etc/swh
+          - name: configuration-template
+            mountPath: /etc/swh/configuration-template
+          - name: metadata-fetcher-credentials
+            mountPath: /etc/credentials/metadata-fetcher
+            readOnly: true
+          - name: sentry-settings-for-celery-tasks
+            mountPath: /etc/credentials/sentry-settings
+            readOnly: true
+      containers:
+      - name: loaders
+        image: container-registry.softwareheritage.org/swh/infra/swh-apps/loader_svn:20241127.2
+        imagePullPolicy: IfNotPresent
+        command:
+          - /opt/swh/entrypoint.sh
+        resources:
+          requests:
+            memory: 50Gi
+            cpu: 1
+        lifecycle:
+          preStop:
+            exec:
+              command: ["/pre-stop.sh"]
+        env:
+        - name: STATSD_HOST
+          value: prometheus-statsd-exporter
+        - name: STATSD_PORT
+          value: "9125"
+        - name: STATSD_TAGS
+          value: deployment:loader-svn-large-repository
+        - name: MAX_TASKS_PER_CHILD
+          value: "10"
+        - name: SWH_LOG_LEVEL
+          value: "INFO"
+        - name: SWH_CONFIG_FILENAME
+          value: /etc/swh/config.yml
+        - name: SWH_LOG_CONFIG
+          value: /etc/swh/logging-configuration.yml
+        - name: SWH_SENTRY_ENVIRONMENT
+          value: production
+        - name: SWH_SENTRY_DISABLE_LOGGING_EVENTS
+          value: "yes"
+        volumeMounts:
+          - name: loader-utils
+            mountPath: /pre-stop.sh
+            subPath: "pre-stop.sh"
+          - name: configuration
+            mountPath: /etc/swh
+          - name: localstorage
+            mountPath: /tmp
+          - name: configuration-template
+            mountPath: /etc/swh/logging-configuration.yml
+            subPath: "logging-configuration.yml"
+            readOnly: true
+      volumes:
+      - name: localstorage
+        emptyDir:
+          sizeLimit: 100Gi
+      - name: configuration
+        emptyDir: {}
+      - name: configuration-template
+        configMap:
+          name: loader-svn-large-repository-template
+          defaultMode: 0777
+          items:
+          - key: "config.yml.template"
+            path: "config.yml.template"
+          - key: "init-container-entrypoint.sh"
+            path: "init-container-entrypoint.sh"
+          - key: "logging-configuration.yml"
+            path: "logging-configuration.yml"
+      - name: loader-utils
+        configMap:
+          name: loader-utils
+          defaultMode: 0777
+          items:
+          - key: "pre-stop-idempotent.sh"
+            path: "pre-stop.sh"
+      - name: metadata-fetcher-credentials
+        secret:
+          secretName: metadata-fetcher-credentials
+          optional: true
+      - name: sentry-settings-for-celery-tasks
+        secret:
+          secretName: sentry-settings-for-celery-tasks
+          optional: true
 ---
 # Source: swh/templates/memcached/deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: memcached
   namespace: swh-cassandra
   labels:
     chart: "swh-0.1.0"
     app: memcached
@@ -33094,20 +33356,97 @@
       excludeUnacknowledged: "false" # QueueLength should include unacked messages
                                      # Implies "http" protocol is used
       value: "10"
       queueName: swh.loader.svn.tasks.LoadSvnExport
       vhostName: /                   # Optional. If not specified, use the vhost in the
                                      # `host` connection string. Alternatively, you can
                                      # use existing environment variables to read
                                      # configuration from: See details in "Parameter
                                      # list" section hostFromEnv: RABBITMQ_HOST%
 ---
+# Source: swh/templates/loaders/keda-autoscaling.yaml
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: loader-svn-large-repository-operators
+  namespace: swh-cassandra
+spec:
+  scaleTargetRef:
+    apiVersion:    apps/v1     # Optional. Default: apps/v1
+    kind:          Deployment  # Optional. Default: Deployment
+    # Mandatory. Must be in same namespace as ScaledObject
+    name:          loader-svn-large-repository
+    # envSourceContainerName: {container-name} # Optional. Default:
+                                               # .spec.template.spec.containers[0]
+  pollingInterval:  30                         # Optional. Default: 30 seconds
+  cooldownPeriod:   300
+                                               # ^ Optional. Default: 300 seconds
+  idleReplicaCount: 0                          # Set to 0 to stop all the workers when
+                                               # there is no activity on the queue
+  minReplicaCount:  0
+  maxReplicaCount:  1
+  triggers:
+  - type: rabbitmq
+    authenticationRef:
+      name: amqp-authentication-loader-svn-large-repository
+    metadata:
+      protocol: auto                 # Optional. Specifies protocol to use,
+                                     # either amqp or http, or auto to
+                                     # autodetect based on the `host` value.
+                                     # Default value is auto.
+      mode: QueueLength              # QueueLength to trigger on number of msgs in queue
+      excludeUnacknowledged: "false" # QueueLength should include unacked messages
+                                     # Implies "http" protocol is used
+      value: "1"
+      queueName: large_repository:swh.loader.svn.tasks.LoadSvnRepository
+      vhostName: /                   # Optional. If not specified, use the vhost in the
+                                     # `host` connection string. Alternatively, you can
+                                     # use existing environment variables to read
+                                     # configuration from: See details in "Parameter
+                                     # list" section hostFromEnv: RABBITMQ_HOST%
+  - type: rabbitmq
+    authenticationRef:
+      name: amqp-authentication-loader-svn-large-repository
+    metadata:
+      protocol: auto                 # Optional. Specifies protocol to use,
+                                     # either amqp or http, or auto to
+                                     # autodetect based on the `host` value.
+                                     # Default value is auto.
+      mode: QueueLength              # QueueLength to trigger on number of msgs in queue
+      excludeUnacknowledged: "false" # QueueLength should include unacked messages
+                                     # Implies "http" protocol is used
+      value: "1"
+      queueName: large_repository:swh.loader.svn.tasks.MountAndLoadSvnRepository
+      vhostName: /                   # Optional. If not specified, use the vhost in the
+                                     # `host` connection string. Alternatively, you can
+                                     # use existing environment variables to read
+                                     # configuration from: See details in "Parameter
+                                     # list" section hostFromEnv: RABBITMQ_HOST%
+  - type: rabbitmq
+    authenticationRef:
+      name: amqp-authentication-loader-svn-large-repository
+    metadata:
+      protocol: auto                 # Optional. Specifies protocol to use,
+                                     # either amqp or http, or auto to
+                                     # autodetect based on the `host` value.
+                                     # Default value is auto.
+      mode: QueueLength              # QueueLength to trigger on number of msgs in queue
+      excludeUnacknowledged: "false" # QueueLength should include unacked messages
+                                     # Implies "http" protocol is used
+      value: "1"
+      queueName: large_repository:swh.loader.svn.tasks.DumpMountAndLoadSvnRepository
+      vhostName: /                   # Optional. If not specified, use the vhost in the
+                                     # `host` connection string. Alternatively, you can
+                                     # use existing environment variables to read
+                                     # configuration from: See details in "Parameter
+                                     # list" section hostFromEnv: RABBITMQ_HOST%
+---
 # Source: swh/templates/objstorage-replayer/keda-autoscaling.yaml
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
   name: objstorage-replayer-s3-scaledobject
   namespace: swh-cassandra
 spec:
   scaleTargetRef:
     name: objstorage-replayer-s3
   pollingInterval: 120
@@ -34060,20 +34399,32 @@
 kind: TriggerAuthentication
 metadata:
   name: amqp-authentication-loader-svn-export
   namespace: swh-cassandra
 spec:
   secretTargetRef:
   - parameter: host            # "host" is required by the scalerObject trigger metadata
     name: common-secrets
     key: rabbitmq-http-host
 ---
+# Source: swh/templates/loaders/keda-autoscaling.yaml
+apiVersion: keda.sh/v1alpha1
+kind: TriggerAuthentication
+metadata:
+  name: amqp-authentication-loader-svn-large-repository
+  namespace: swh-cassandra
+spec:
+  secretTargetRef:
+  - parameter: host            # "host" is required by the scalerObject trigger metadata
+    name: common-secrets
+    key: rabbitmq-http-host
+---
 # Source: swh/templates/objstorage-replayer/keda-autoscaling.yaml
 apiVersion: keda.sh/v1alpha1
 kind: TriggerAuthentication
 metadata:
   name: keda-objstorage-replayer-s3-authentication
   namespace: swh-cassandra
 spec:
   secretTargetRef:
   - parameter: username
     name: swh-archive-broker-secret

Related to swh/infra/sysadm-environment#5497 (closed)

Merge request reports

Loading