Add templatized OpenTelemetry (log) collector configuration as an ArgoCD application
This templatized output matches what has been developed in [1].
After some more fixes, it is now deployed on the staging cluster through ArgoCD.
Application management UI: [3].
[2]
$ helm template --values values-swh-application-versions.yaml --values cluster-configuration/values.yaml --values cluster-configuration/values/archive-staging-rke2.yaml cluster-test cluster-configuration --dry-run --debug --set namespace=test 2>&1
install.go:194: [debug] Original chart version: ""
install.go:211: [debug] CHART PATH: /home/tony/work/swh/sysadm-environment/swh-charts/cluster-configuration
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/otlp-collector/application.yaml
# Retrieve the content of the collector.yaml and pass it to the "helm" > "values" key
# of the config with the proper indentation
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/cert-manager/cert-manager-application.yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-certmanager
namespace: argocd
spec:
revisionHistoryLimit: 2
project: default
source:
chart: cert-manager
repoURL: https://charts.jetstack.io
targetRevision: v1.10.0
helm:
# must match the application name (https://github.com/argoproj/argo-cd/issues/2871)
releaseName: archive-staging-rke2-certmanager
parameters:
- name: "installCRDs"
value: "true"
values: |
# this is necessary to avoid retrieving the SOA fields of internal zones
# when generating a certificate for a non-public domain, e.g. .internal.admin.swh.network
extraArgs:
- --dns01-recursive-nameservers=ns-246-a.gandi.net:53,ns-239-b.gandi.net:53,ns-228-c.gandi.net:53
- --dns01-recursive-nameservers-only
prometheus:
enabled: true
servicemonitor:
enabled: true
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: "cert-manager"
syncPolicy:
automated:
prune: false
selfHeal: false
syncOptions:
- RespectIgnoreDifferences=true
ignoreDifferences:
- group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
name: archive-staging-rke2-certmanager-cert-manager-webhook
jqPathExpressions:
- .webhooks[].namespaceSelector.matchExpressions[] | select(.key == "control-plane")
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/cert-manager/cert-manager-webhook-gandi-application.yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-certmanager-webhook-gandi
namespace: argocd
spec:
revisionHistoryLimit: 2
project: default
source:
repoURL: 'https://gitlab.softwareheritage.org/swh/infra/ci-cd/3rdparty/cert-manager-webhook-gandi.git'
path: deploy/cert-manager-webhook-gandi
targetRevision: v0.2.0
helm:
parameters:
- name: "installCRDs"
value: "true"
values: |
image:
repository: container-registry.softwareheritage.org/swh/infra/ci-cd/3rdparty/cert-manager-webhook-gandi
tag: v0.2.0
pullPolicy: IfNotPresent
features.apiPriorityAndFairness: true
certManager:
serviceAccountName: archive-staging-rke2-certmanager-cert-manager
releaseName: certmanager-webhook-gandi
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: cert-manager
syncPolicy:
automated:
prune: false
selfHeal: false
syncOptions:
- RespectIgnoreDifferences=true
ignoreDifferences:
- group: cert-manager.io
kind: Certificate
jqPathExpressions:
- .spec.duration
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/cluster-configuration-application.yaml
# Declare the argocd application to apply raw yamls on the cluster
# YAML files are stored in the k8s-clusters-conf repository, in the `clusterName` directory
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-configuration-application
namespace: argocd
spec:
project: default
source:
repoURL: https://gitlab.softwareheritage.org/swh/infra/ci-cd/k8s-clusters-conf.git
targetRevision: master
path: archive-staging-rke2
directory:
recurse: true
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
syncPolicy:
automated:
prune: false
selfHeal: true
allowEmpty: false
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/cluster-secrets-application.yaml
# Configure the application to automatically apply the secrets
# on the cluster.
# Secrets are stored in the k8s-swh-private-data repository
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-secrets
namespace: argocd
spec:
project: default
source:
repoURL: https://gitlab.softwareheritage.org/infra-private/k8s-swh-private-data.git
targetRevision: master
path: archive-staging-rke2
directory:
recurse: true
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: default
syncPolicy:
automated:
prune: false
selfHeal: false
allowEmpty: false
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/ingress-nginx/nginx-ingress-application.yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-ingress-nginx-application
namespace: argocd
spec:
revisionHistoryLimit: 2
project: default
source:
chart: ingress-nginx
repoURL: https://kubernetes.github.io/ingress-nginx
targetRevision: 4.4.0
helm:
releaseName: ingress-nginx
values: |
controller:
watchIngressWithoutClass: false
service:
externalTrafficPolicy: Local
annotations:
metallb.universe.tf/allow-shared-ip: clusterIP
ingressClassResource:
name: "nginx"
default: true
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: ingress-nginx
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/metallb/metallb-application.yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-metallb-application
namespace: argocd
spec:
revisionHistoryLimit: 2
project: default
source:
chart: metallb
repoURL: https://metallb.github.io/metallb
targetRevision: 0.13.7
helm:
releaseName: metallb
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: metallb
syncPolicy:
automated:
prune: false
selfHeal: false
allowEmpty: false
syncOptions:
- RespectIgnoreDifferences=true
ignoreDifferences:
# The ca bundle is updated by the controller at runtime
- group: 'apiextensions.k8s.io'
kind: 'CustomResourceDefinition'
jsonPointers:
- /spec/conversion/webhook/clientConfig/caBundle
---
# Source: Argocd applications commonly used to configure a SWH cluster/templates/otlp-collector/application.yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: archive-staging-rke2-otlp-collector
namespace: argocd
spec:
revisionHistoryLimit: 2
project: default
source:
chart: opentelemetry-collector
repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
targetRevision: 0.57.0
helm:
releaseName: opentelemetry-collector
values: |
# -*- yaml -*-
# Collector configuration to include under the "helm" > "values" key
# of the argocd application defined in application.yaml
---
mode: daemonset
presets:
# Configures the collector to collect logs.
# Adds the filelog receiver to the logs pipeline
# and adds the necessary volumes and volume mounts.
# Best used with mode = daemonset.
logsCollection:
# Not enabled as this configures too much; only what is needed is enabled below
enabled: false
# Configures the Kubernetes Processor to add Kubernetes metadata.
# Adds the k8sattributes processor to all the pipelines
# and adds the necessary rules to the ClusterRole.
# Best used with mode = daemonset.
kubernetesAttributes:
enabled: true
# Configures the collector to collect host metrics.
# Adds the hostmetrics receiver to the metrics pipeline
# and adds the necessary volumes and volume mounts.
# Best used with mode = daemonset.
hostMetrics:
# Not enabled as this configures too much; only what is needed is enabled below
enabled: false
extraEnvs:
- name: KUBE_NODE_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
extraVolumes:
- name: varlogpods
hostPath:
path: /var/log/pods
type: Directory
extraVolumeMounts:
- mountPath: /var/log/pods
name: varlogpods
resources:
limits:
cpu: 256m
memory: 2Gi
# The pod monitor by default scrapes the metrics port.
# The metrics port needs to be enabled as well.
podMonitor:
enabled: true
ports:
# The metrics port is disabled by default, so we need to enable it
# in order to use the PodMonitor (podMonitor.enabled)
metrics:
enabled: true
config:
exporters:
elasticsearch/swh-log:
endpoints:
- http://esnode1.internal.softwareheritage.org:9200
- http://esnode2.internal.softwareheritage.org:9200
- http://esnode3.internal.softwareheritage.org:9200
- http://esnode7.internal.softwareheritage.org:9200
logs_index: staging-logs
# Contrary to the documentation, this does not work: the collector fails to
# parse the configmap when this is enabled
# retry_on_failure:
# enabled: true
timeout: 10s
elasticsearch/system-log:
# can be replaced by using the env variable ELASTICSEARCH_URL
endpoints:
- http://esnode1.internal.softwareheritage.org:9200
- http://esnode2.internal.softwareheritage.org:9200
- http://esnode3.internal.softwareheritage.org:9200
- http://esnode7.internal.softwareheritage.org:9200
logs_index: staging-system-logs
timeout: 10s
extensions:
# with a port-forward, this allows displaying the pipeline status to see
# what has been deployed
zpages:
endpoint: "0.0.0.0:8889"
# The health_check extension is mandatory for this chart. Without the health_check
# extension the collector will fail the readiness and liveness probes. The
# health_check extension can be modified, but should never be removed.
health_check: {}
receivers:
filelog/system:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude 'swh*' namespaced logs
- /var/log/pods/swh*_*/*/*.log
start_at: beginning
include_file_path: true
include_file_name: false
multiline:
# as of now, a log line starts with a date pattern (see parser-containerd below)
line_start_pattern: '^[^ Z]+Z'
operators:
# Find out which log format is used to route it to proper parsers
# Extract metadata from file path
- id: extract_metadata_from_filepath
type: regex_parser
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{32,36})\/(?P<container_name>[^\._]+)\/(?P<run_id>\d+)\.log$'
parse_from: attributes["log.file.path"]
parse_to: resource
# Parse CRI-Containerd format
- id: parser-containerd
type: regex_parser
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr)( (?P<logtag>[^ ]*) (?P<message>.*)|.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# e.g. redis logs are "mostly" json, but the ts entry is a timestamp that is
# not adequately parsed: Type:"mapper_parsing_exception", Reason:"failed to
# parse field [Attributes.ts] of type [date] in document...
# - id: parser-json-message
# type: json_parser
# parse_from: attributes['message']
# parse_to: attributes
# if: attributes.message matches "^\\{"
filelog/swh:
include:
# Only keep 'swh*' namespaces
- /var/log/pods/swh*_*/*/*.log
start_at: beginning
include_file_path: true
include_file_name: false
multiline:
# as of now, a log line starts with a date pattern (see parser-containerd below)
line_start_pattern: '^[^ Z]+Z'
operators:
# Find out which log format is used to route it to proper parsers
# Extract metadata from file path
- id: extract_metadata_from_filepath
type: regex_parser
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<run_id>\d+)\.log$'
parse_from: attributes["log.file.path"]
parse_to: resource
# Parse CRI-Containerd format
- id: parser-containerd
type: regex_parser
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr)( (?P<logtag>[^ ]*) (?P<message>.*)|.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# then parse the json formatted message if any
- id: parser-json-message
type: json_parser
parse_from: attributes['message']
parse_to: attributes
if: attributes.stream == 'stdout' && attributes.message matches "^\\{"
# These were an attempt to inline the json further, but the 'data.kwargs' and
# 'return_value' entries are python dicts, not json, so we cannot parse them.
# - id: parser-json-kwargs
# type: json_parser
# parse_from: attributes.data.kwargs
# parse_to: attributes
# if: attributes.stream == 'stdout' && attributes.data?.kwargs != nil
# - id: parser-json-return-value
# type: json_parser
# parse_from: attributes.return_value
# parse_to: attributes
# if: attributes.stream == 'stdout' && attributes?.return_value != nil
# This deals with basic key=value logs (it's not able to handle quoted values
# like key="this is a value" though, so prometheus and memcached logs are not
# parsed so far)
# - id: parse-key-value-message
# type: key_value_parser
# delimiter: "="
# pair_delimiter: " "
# parse_from: attributes['message']
# parse_to: attributes
# if: attributes.message matches "^ts="
processors:
resource:
attributes:
- key: k8s.pod.name
from_attribute: pod_name
action: upsert
k8sattributes:
filter:
node_from_env_var: KUBE_NODE_NAME
passthrough: false
extract:
metadata:
# from https://opentelemetry.io/docs/reference/specification/resource/semantic_conventions/k8s/
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.pod.start_time
- k8s.daemonset.name
- k8s.job.name
- k8s.cronjob.name
# Desired properties (but not working for now)
# 2023/04/26 08:54:58 collector server run finished with error: failed to
# build pipelines: failed to create "k8sattributes" processor, in pipeline
# "logs/system": "k8s.cluster.name" (or "deployment.environment" )
# - k8s.cluster.name
# - deployment.environment
pod_association:
- sources:
- from: resource_attribute
name: k8s.pod.name
- sources:
- from: connection
name: k8s.pod.ip
- sources:
- from: resource_attribute
name: k8s.pod.ip
batch:
# for debug
send_batch_size: 10
# If set to null, will be overridden with values based on k8s resource limits
memory_limiter: null
attributes/insert:
actions:
- key: environment
value: changeme
action: insert
- key: cluster
value: archive-staging-rke2
action: insert
attributes/clean-records:
actions:
- key: time
action: delete
- key: logtag
action: delete
- key: log
action: delete
- key: log.keyword
action: delete
- key: log.file.path
action: delete
- key: log.value
action: delete
service:
telemetry:
metrics:
address: ${MY_POD_IP}:8888
extensions:
- health_check
- memory_ballast
pipelines:
logs/system:
receivers:
- filelog/system
processors:
- batch
- resource
- k8sattributes
- attributes/insert
- attributes/clean-records
exporters:
- elasticsearch/system-log
logs/swh:
receivers:
- filelog/swh
processors:
- batch
- resource
- k8sattributes
- attributes/insert
- attributes/clean-records
exporters:
- elasticsearch/swh-log
# inhibit pipelines
logs: null
metrics: null
traces: null
destination:
server: https://rancher.euwest.azure.internal.softwareheritage.org/k8s/clusters/c-m-9n5h9nrf
namespace: opentelemetry
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
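Note that the `environment: changeme` attribute in the attributes/insert processor is still a placeholder; presumably it ends up filled from the per-cluster values, along these lines (a sketch; the values key is an assumption):

# cluster-configuration/values/archive-staging-rke2.yaml (hypothetical key)
environment: staging

# which the collector configuration, rendered through tpl, would pick up as:
attributes/insert:
  actions:
    - key: environment
      value: {{ .Values.environment }}
      action: insert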
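For reference, a CRI-containerd log line as matched by the parser-containerd regex looks like:

2023-04-26T08:54:58.123456789Z stdout F {"msg": "done"}

Read from a path such as /var/log/pods/swh_pod-x_0a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9/loader/0.log, the filelog/swh operator chain would yield roughly this record (illustrative names and values):

resource:
  namespace: swh
  pod_name: pod-x
  uid: 0a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9
  container_name: loader
  run_id: "0"
attributes:
  log.file.path: /var/log/pods/swh_pod-x_0a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9/loader/0.log
  time: 2023-04-26T08:54:58.123456789Z
  stream: stdout
  logtag: F
  message: '{"msg": "done"}'
  msg: done
# the record timestamp is parsed from the time attribute; the k8sattributes and
# attributes/insert processors then add the k8s metadata plus the environment
# and cluster attributes, and attributes/clean-records drops the noise fields
# (time, logtag, log.file.path)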