Skip to content

Add templatized opentelemetry (log) collector configuration as an argocd application

Antoine R. Dumont requested to merge staging-dev into staging

This templatized output (rendered in [2] below) matches what has been developed in [1].

After some more fixes, it's now deployed on the staging cluster through argocd.

The application can be managed through the Argo CD UI [3].

[3] https://argocd.internal.admin.swh.network/applications/archive-staging-rke2-otlp-collector?resource=health%3AProgressing&conditions=false

[1] https://gitlab.softwareheritage.org/swh/infra/ci-cd/k8s-clusters-conf/-/blob/29f32b35833085b318988fd140257dca85914d31/archive-staging-rke2/opentelemetry-collector-values.yaml

[2]

$ helm template --values values-swh-application-versions.yaml --values cluster-configuration/values.yaml --values cluster-configuration/values/archive-staging-rke2.yaml cluster-test cluster-configuration --dry-run --debug --set namespace=test 2>&1
install.go:194: [debug] Original chart version: ""
install.go:211: [debug] CHART PATH: /home/tony/work/swh/sysadm-environment/swh-charts/cluster-configuration

---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/otlp-collector/application.yaml
# Retrieve the content of the collector.yaml and pass it to the "helm" > "values" key
# config with the proper indentation spaces
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/cert-manager/cert-manager-application.yaml
# Argo CD Application installing the cert-manager Helm chart (pinned to
# v1.10.0 from charts.jetstack.io) into the "cert-manager" namespace of the
# cluster referenced by spec.destination.server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-certmanager
  namespace: argocd
spec:
  revisionHistoryLimit: 2
  project: default
  source:
    chart: cert-manager
    repoURL: https://charts.jetstack.io
    targetRevision: v1.10.0
    helm:
      # must match the application name (https://github.com/argoproj/argo-cd/issues/2871)
      releaseName: archive-staging-rke2-certmanager
      # Passed as a string parameter on top of the inline values below.
      parameters:
        - name: "installCRDs"
          value: "true"
      # Inline Helm values, handed to the chart as a single string.
      values: |
          # this is necessary to not retrieve the SOA fields of internal zones
          # when generating a certificate for a non-public domain, e.g. .internal.admin.swh.network
          extraArgs:
            - --dns01-recursive-nameservers=ns-246-a.gandi.net:53,ns-239-b.gandi.net:53,ns-228-c.gandi.net:53
            - --dns01-recursive-nameservers-only
          prometheus:
            enabled: true
            servicemonitor:
              enabled: true
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: "cert-manager"
  # Automated sync is on, but neither pruning nor self-healing is enabled.
  syncPolicy:
    automated:
      prune: false
      selfHeal: false
    syncOptions:
    - RespectIgnoreDifferences=true
  # Differences on the "control-plane" namespaceSelector expression of the
  # webhook configuration named below are ignored when comparing state.
  ignoreDifferences:
    - group: admissionregistration.k8s.io
      kind: ValidatingWebhookConfiguration
      name: archive-staging-rke2-certmanager-cert-manager-webhook
      jqPathExpressions:
        - .webhooks[].namespaceSelector.matchExpressions[] | select(.key == "control-plane")
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/cert-manager/cert-manager-webhook-gandi-application.yaml
# Argo CD Application deploying the cert-manager Gandi webhook chart, taken
# from the deploy/cert-manager-webhook-gandi directory of the SWH mirror
# repository at tag v0.2.0, into the "cert-manager" namespace.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-certmanager-webhook-gandi
  namespace: argocd
spec:
  revisionHistoryLimit: 2
  project: default
  source:
    repoURL: 'https://gitlab.softwareheritage.org/swh/infra/ci-cd/3rdparty/cert-manager-webhook-gandi.git'
    path: deploy/cert-manager-webhook-gandi
    targetRevision: v0.2.0
    helm:
      # NOTE(review): "installCRDs" looks carried over from the cert-manager
      # application; confirm that this chart actually reads it.
      parameters:
        - name: "installCRDs"
          value: "true"
      values: |
        image:
          repository: container-registry.softwareheritage.org/swh/infra/ci-cd/3rdparty/cert-manager-webhook-gandi
          tag: v0.2.0
          pullPolicy: IfNotPresent
        # Feature toggles are a top-level map of the chart values. A dotted
        # "features.apiPriorityAndFairness" key nested under "image" is not
        # expanded by Helm and would be silently ignored.
        features:
          apiPriorityAndFairness: true
        certManager:
          # Service account of the cert-manager release deployed by the
          # archive-staging-rke2-certmanager application.
          serviceAccountName: archive-staging-rke2-certmanager-cert-manager
      releaseName: certmanager-webhook-gandi
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: cert-manager
  # Automated sync is on, but neither pruning nor self-healing is enabled.
  syncPolicy:
    automated:
      prune: false
      selfHeal: false
    syncOptions:
    - RespectIgnoreDifferences=true
  # Differences on the duration of Certificate resources are ignored when
  # comparing state.
  ignoreDifferences:
    - group: cert-manager.io
      kind: Certificate
      jqPathExpressions:
        - .spec.duration
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/cluster-configuration-application.yaml
# Declare the argocd application to apply raw yamls on the cluster
# Yamls are stored on the k8s-cluster-config repository in the `clusterName` directory
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-configuration-application
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://gitlab.softwareheritage.org/swh/infra/ci-cd/k8s-clusters-conf.git
    # Tracks the master branch rather than a pinned tag.
    targetRevision: master
    path: archive-staging-rke2
    # Plain-manifest source: sub-directories of `path` are read recursively.
    directory:
      recurse: true
  # No destination namespace is set for this application.
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
  # Automated sync with self-healing; pruning stays disabled.
  syncPolicy:
    automated:
      prune: false
      selfHeal: true
      allowEmpty: false
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/cluster-secrets-application.yaml
# Configure the application to automatically apply the secrets
# on the cluster.
# Secrets are stored on the k8s-swh-private-data repository
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-secrets
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://gitlab.softwareheritage.org/infra-private/k8s-swh-private-data.git
    # Tracks the master branch rather than a pinned tag.
    targetRevision: master
    path: archive-staging-rke2
    # Plain-manifest source: sub-directories of `path` are read recursively.
    directory:
      recurse: true
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: default
  # Automated sync is on, but neither pruning nor self-healing is enabled.
  syncPolicy:
    automated:
      prune: false
      selfHeal: false
      allowEmpty: false
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/ingress-nginx/nginx-ingress-application.yaml
# Argo CD Application installing the ingress-nginx Helm chart (pinned to
# 4.4.0) into the "ingress-nginx" namespace of the cluster referenced by
# spec.destination.server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-ingress-nginx-application
  namespace: argocd
spec:
  revisionHistoryLimit: 2
  project: default
  source:
    chart: ingress-nginx
    repoURL: https://kubernetes.github.io/ingress-nginx
    targetRevision: 4.4.0
    helm:
      releaseName: ingress-nginx
      # Inline Helm values, handed to the chart as a single string.
      values: |
        controller:
          watchIngressWithoutClass: false
          service:
            externalTrafficPolicy: Local
            annotations:
              metallb.universe.tf/allow-shared-ip: clusterIP
          ingressClassResource:
            name: "nginx"
            default: true
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: ingress-nginx
  # Fully automated sync: both pruning and self-healing are enabled.
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/metallb/metallb-application.yaml
# Argo CD Application installing the metallb Helm chart (pinned to 0.13.7)
# with the chart's default values into the "metallb" namespace of the cluster
# referenced by spec.destination.server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-metallb-application
  namespace: argocd
spec:
  revisionHistoryLimit: 2
  project: default
  source:
    chart: metallb
    repoURL: https://metallb.github.io/metallb
    targetRevision: 0.13.7
    helm:
      releaseName: metallb
  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: metallb
  # Automated sync is on, but neither pruning nor self-healing is enabled.
  syncPolicy:
    automated:
      prune: false
      selfHeal: false
      allowEmpty: false
    syncOptions:
    - RespectIgnoreDifferences=true
  ignoreDifferences:
  # The ca bundle is updated by the controller during the runtime
  - group: 'apiextensions.k8s.io'
    kind: 'CustomResourceDefinition'
    jsonPointers:
    - /spec/conversion/webhook/clientConfig/caBundle
---
# Source: Argocd applications commonly used in to configure a SWH cluster/templates/otlp-collector/application.yaml
# Argo CD Application installing the opentelemetry-collector Helm chart
# (pinned to 0.57.0) as a daemonset into the "opentelemetry" namespace.
# The collector tails the pod log files of each node and ships them to
# Elasticsearch through two pipelines: "logs/swh" for the swh* namespaces
# and "logs/system" for everything else.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: archive-staging-rke2-otlp-collector
  namespace: argocd
spec:
  revisionHistoryLimit: 2
  project: default
  source:
    chart: opentelemetry-collector
    repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
    targetRevision: 0.57.0
    helm:
      releaseName: opentelemetry-collector
      values: |
        # -*- yaml -*-
        # Collector configuration to include in the "helm" > "values" keys
        # in the argocd application defined in application.yaml
        ---
        mode: daemonset
        presets:
          # Configures the collector to collect logs.
          # Adds the filelog receiver to the logs pipeline
          # and adds the necessary volumes and volume mounts.
          # Best used with mode = daemonset.
          logsCollection:
            # Not enabled as this configures too much. Only the necessary is opened below
            enabled: false
          # Configures the Kubernetes Processor to add Kubernetes metadata.
          # Adds the k8sattributes processor to all the pipelines
          # and adds the necessary rules to ClusterRole.
          # Best used with mode = daemonset.
          kubernetesAttributes:
            enabled: true
          # Configures the collector to collect host metrics.
          # Adds the hostmetrics receiver to the metrics pipeline
          # and adds the necessary volumes and volume mounts.
          # Best used with mode = daemonset.
          hostMetrics:
            # Not enabled as this configures too much. Only the necessary is opened below
            enabled: false

        # Node name of the pod, read by the k8sattributes processor filter below
        extraEnvs:
        - name: KUBE_NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName

        # Host directory holding the pod logs read by the filelog receivers
        extraVolumes:
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory

        extraVolumeMounts:
        - mountPath: /var/log/pods
          name: varlogpods

        resources:
          limits:
            cpu: 256m
            memory: 2Gi

        # The pod monitor by default scrapes the metrics port.
        # The metrics port needs to be enabled as well.
        podMonitor:
          enabled: true

        ports:
          # The metrics port is disabled by default. So we need to enable the port
          # in order to use the PodMonitor (PodMonitor.enabled)
          metrics:
            enabled: true

        config:
          exporters:
            elasticsearch/swh-log:
              endpoints:
                - http://esnode1.internal.softwareheritage.org:9200
                - http://esnode2.internal.softwareheritage.org:9200
                - http://esnode3.internal.softwareheritage.org:9200
                - http://esnode7.internal.softwareheritage.org:9200
              logs_index: staging-logs
              # Contrary to documentation, this does not work. It fails to parse the configmap
              # error with it enabled
              # retry_on_failure:
              #   enabled: true
              timeout: 10s
            elasticsearch/system-log:
              # can be replaced by using the env variable ELASTICSEARCH_URL
              endpoints:
                - http://esnode1.internal.softwareheritage.org:9200
                - http://esnode2.internal.softwareheritage.org:9200
                - http://esnode3.internal.softwareheritage.org:9200
                - http://esnode7.internal.softwareheritage.org:9200
              logs_index: staging-system-logs
              timeout: 10s

          extensions:
            # with port-forward, allows to display the pipeline status to see what's been
            # deployed (only started when listed in service.extensions below)
            zpages:
              endpoint: "0.0.0.0:8889"
            # The health_check extension is mandatory for this chart. Without the health_check
            # extension the collector will fail the readiness and liveness probes. The
            # health_check extension can be modified, but should never be removed.
            health_check: {}

          receivers:
            filelog/system:
              include:
                - /var/log/pods/*/*/*.log
              exclude:
                # Exclude 'swh*' namespaced logs
                - /var/log/pods/swh*_*/*/*.log
              start_at: beginning
              include_file_path: true
              include_file_name: false
              multiline:
                # as of now, starts as a date pattern (see parser-containerd below)
                line_start_pattern: '^[^ Z]+Z'
              operators:
              # Find out which log format is used to route it to proper parsers
              # Extract metadata from file path
              # NOTE(review): the uid accepts 32 to 36 characters here but exactly 36 in
              # filelog/swh; presumably to also match pods whose directory uses a
              # dash-less identifier - confirm
              - id: extract_metadata_from_filepath
                type: regex_parser
                regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{32,36})\/(?P<container_name>[^\._]+)\/(?P<run_id>\d+)\.log$'
                parse_from: attributes["log.file.path"]
                parse_to: resource
              # Parse CRI-Containerd format
              - id: parser-containerd
                type: regex_parser
                regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr)( (?P<logtag>[^ ]*) (?P<message>.*)|.*)$'
                timestamp:
                  parse_from: attributes.time
                  layout: '%Y-%m-%dT%H:%M:%S.%LZ'
              # e.g. redis logs are "mostly" json, but the ts entry is a timestamp that's
              # not adequately parsed. Type:"mapper_parsing_exception", Reason:"failed to
              # parse field [Attributes.ts] of type [date] in document...
              # - id: parser-json-message
              #   type: json_parser
              #   parse_from: attributes['message']
              #   parse_to: attributes
              #   if: attributes.message matches "^\\{"

            filelog/swh:
              include:
                # Only keep 'swh*' namespaces
                - /var/log/pods/swh*_*/*/*.log
              start_at: beginning
              include_file_path: true
              include_file_name: false
              multiline:
                # as of now, starts as a date pattern (see parser-containerd below)
                line_start_pattern: '^[^ Z]+Z'
              operators:
              # Find out which log format is used to route it to proper parsers
              # Extract metadata from file path
              - id: extract_metadata_from_filepath
                type: regex_parser
                regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<run_id>\d+)\.log$'
                parse_from: attributes["log.file.path"]
                parse_to: resource
              # Parse CRI-Containerd format
              - id: parser-containerd
                type: regex_parser
                regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr)( (?P<logtag>[^ ]*) (?P<message>.*)|.*)$'
                timestamp:
                  parse_from: attributes.time
                  layout: '%Y-%m-%dT%H:%M:%S.%LZ'
              # then parse the json formatted message if any
              - id: parser-json-message
                type: json_parser
                parse_from: attributes['message']
                parse_to: attributes
                if: attributes.stream == 'stdout' && attributes.message matches "^\\{"
              # Those were an attempt to inline the json further but entries 'data.kwargs' and
              # 'return_value' are python dict and not json so we cannot parse them.
              # - id: parser-json-kwargs
              #   type: json_parser
              #   parse_from: attributes.data.kwargs
              #   parse_to: attributes
              #   if: attributes.stream == 'stdout' && attributes.data?.kwargs != nil
              # - id: parser-json-return-value
              #   type: json_parser
              #   parse_from: attributes.return_value
              #   parse_to: attributes
              #   if: attributes.stream == 'stdout' && attributes?.return_value != nil
              # This deals with basic key=value logs (it's not able to deal with "multi"
              # values like key="this is a value" though, so prometheus, memcached logs
              # are not parsed so far)
              # - id: parse-key-value-message
              #   type: key_value_parser
              #   delimiter: "="
              #   pair_delimiter: " "
              #   parse_from: attributes['message']
              #   parse_to: attributes
              #   if: attributes.message matches "^ts="

          processors:
            # Copy the pod name extracted from the file path into the resource attribute
            # used by the first k8sattributes pod_association rule
            resource:
              attributes:
                - key: k8s.pod.name
                  from_attribute: pod_name
                  action: upsert
            k8sattributes:
              filter:
                node_from_env_var: KUBE_NODE_NAME
              passthrough: false
              extract:
                metadata:
                  # from https://opentelemetry.io/docs/reference/specification/resource/semantic_conventions/k8s/
                  - k8s.pod.name
                  - k8s.pod.uid
                  - k8s.deployment.name
                  - k8s.namespace.name
                  - k8s.node.name
                  - k8s.pod.start_time
                  - k8s.daemonset.name
                  - k8s.job.name
                  - k8s.cronjob.name
                  # Desired properties (but not working for now)
                  # 2023/04/26 08:54:58 collector server run finished with error: failed to
                  # build pipelines: failed to create "k8sattributes" processor, in pipeline
                  # "logs/system": "k8s.cluster.name" (or "deployment.environment" )
                  # - k8s.cluster.name
                  # - deployment.environment
              pod_association:
                - sources:
                  - from: resource_attribute
                    name: k8s.pod.name
                - sources:
                  - from: connection
                    name: k8s.pod.ip
                - sources:
                  - from: resource_attribute
                    name: k8s.pod.ip
            batch:
              # for debug
              send_batch_size: 10
            # If set to null, will be overridden with values based on k8s resource limits
            memory_limiter: null
            # Static attributes identifying where the logs come from
            attributes/insert:
              actions:
              - key: environment
                # Must be the real environment name, not the chart's placeholder
                value: staging
                action: insert
              - key: cluster
                value: archive-staging-rke2
                action: insert
            # Drop the intermediate parsing attributes before export
            attributes/clean-records:
              actions:
              - key: time
                action: delete
              - key: logtag
                action: delete
              - key: log
                action: delete
              - key: log.keyword
                action: delete
              - key: log.file.path
                action: delete
              - key: log.value
                action: delete

          service:
            telemetry:
              metrics:
                address: ${MY_POD_IP}:8888
            # Only the extensions listed here are started by the collector.
            # NOTE(review): memory_ballast is not declared in the extensions section
            # above; presumably it comes from the chart's default config - confirm
            extensions:
              - health_check
              - memory_ballast
              - zpages

            pipelines:
              logs/system:
                receivers:
                  - filelog/system
                processors:
                  - batch
                  - resource
                  - k8sattributes
                  - attributes/insert
                  - attributes/clean-records
                exporters:
                  - elasticsearch/system-log
              logs/swh:
                receivers:
                  - filelog/swh
                processors:
                  - batch
                  - resource
                  - k8sattributes
                  - attributes/insert
                  - attributes/clean-records
                exporters:
                  - elasticsearch/swh-log
              # inhibit pipelines
              logs: null
              metrics: null
              traces: null

  destination:
    server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
    namespace: opentelemetry
  # Fully automated sync: both pruning and self-healing are enabled.
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false

Refs. swh/infra/sysadm-environment#4524 (closed)

Edited by Antoine R. Dumont

Merge request reports