From 632c5ae802c99405edab2fd8654acc94e90f88aa Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <>
Date: Tue, 20 Jun 2023 11:57:46 +0200
Subject: [PATCH] Add reference for common configuration blocks

So CLI documentation can link to it instead of repeating the whole
configuration of each block.
 docs/devel/configuration.rst | 195 +++++++++++++++++++++++++++++++++++
 docs/devel/index.rst         |   1 +
 2 files changed, 196 insertions(+)
 create mode 100644 docs/devel/configuration.rst

diff --git a/docs/devel/configuration.rst b/docs/devel/configuration.rst
new file mode 100644
index 00000000..382023b0
--- /dev/null
+++ b/docs/devel/configuration.rst
@@ -0,0 +1,195 @@
+.. _cli-config:
+Configuration reference
+.. highlight:: yaml
+|swh| components are all configured with a YAML file, made of multiple blocks,
+most of which describe how to connect to other components/services.
+Most services are composable, so they can be either instantiated locally or
+accessed through |swh|'s HTTP-based RPC protocol (``cls: remote``).
+For example, a possible configuration for swh-vault is::
+    graph:
+      url: http://graph.example.org:5009/
+    storage:
+      cls: pipeline
+      steps:
+      - cls: retry
+      - cls: remote
+        url: http://storage.example.org:5002/
+    objstorage:
+      cls: s3
+      compression: gzip
+      container_name: softwareheritage
+      path_prefix: content
+All URLs in this document are examples, see :ref:`service-url` for actual values.
+.. _cli-config-graph:
+The :ref:`graph <swh-graph>` can only be accessed as a remote service, and
+its configuration block is a single key: ``url``, which is the URL to its
+HTTP endpoint; usually on port 5009 or at the path ``/graph/``.
+.. _cli-config-journal:
+The :ref:`journal <swh-journal>` can only be locally instantiated to consume
+directly from Kafka::
+    journal:
+      brokers:
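+        - broker1.journal.example.org:9093
+        - broker2.journal.example.org:9093
+        - broker3.journal.example.org:9093
+        - broker4.journal.example.org:9093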
+      prefix: swh.journal.objects
+      sasl.mechanism: "SCRAM-SHA-512"
+      security.protocol: "sasl_ssl"
+      sasl.username: "..."
+      sasl.password: "..."
+      privileged: false
+      group_id: "..."
+.. _cli-config-scheduler:
+The :ref:`scheduler <swh-scheduler>` can only be accessed as a remote service, so
+its configuration block takes ``cls: remote`` and a ``url`` key, which is the URL to its
+HTTP endpoint; usually on port 5008 or at the path ``/scheduler/``::
+    scheduler:
+      cls: remote
+      url: http://scheduler.example.org:5008/
+.. _cli-config-storage:
+The :ref:`storage <swh-storage>` has four possible classes:
+* ``cassandra``, see :class:`swh.storage.cassandra.storage.CassandraStorage`::
+    storage:
+      cls: cassandra
+      hosts: [...]
+      keyspace: swh
+      port: 9042
+      journal_writer:
+        # ...
+      # ...
+* ``postgresql``, which takes a `libpq connection string <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING>`_::
+    storage:
+      cls: postgresql
+      db: service=swh
+      journal_writer:
+        # ...
+  For optional arguments, see :class:`swh.storage.postgresql.storage.Storage`.
+* ``memory``, which stores data in-memory instead of persisting it somewhere;
+  this should only be used for debugging::
+    storage:
+      cls: memory
+      journal_writer:
+        # ...
+* ``remote``, which takes a URL to a remote service's HTTP endpoint;
+  usually on port 5002 or at the path ``/storage/``::
+    storage:
+      cls: remote
+      url: http://storage.example.org:5002/
+The ``journal_writer`` key is optional. If provided, it will be used to write all
+additions to some sort of log (usually Kafka) before any write to the main database.
+:mod:`swh.journal.writer.kafka`, which writes to Kafka::
+    cls: kafka
+    brokers:
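+      - broker1.journal.example.org:9093
+      - broker2.journal.example.org:9093
+      - broker3.journal.example.org:9093
+      - broker4.journal.example.org:9093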
+    prefix: swh.journal.objects
+    anonymize: true
+    client_id: ...
+    producer_config: ...
+:mod:`swh.journal.writer.stream`, which writes directly to a file
+(or stdout if set to ``-``)::
+    cls: stream
+    output_stream: /tmp/messages.msgpack
+:mod:`swh.journal.writer.inmemory`, which does not actually persist anywhere,
+and should only be used for tests::
+    cls: memory
+    anonymize: false
+In addition to the four storage classes listed above, "storage proxies" can be used and chained in order
+to change the behavior of accesses to the storage. They usually do not change the semantics,
+but perform optimizations such as batching calls, stripping redundant operations,
+and retrying on error.
+They are invoked through the special ``pipeline`` class, which takes as parameter
+a list of proxy configurations, ending with a backend configuration as seen above::
+    storage:
+      cls: pipeline
+      steps:
+        - cls: buffer
+          min_batch_size:
+            content: 10000
+            directory: 5000
+        - cls: filter
+        - cls: retry
+        - cls: remote
+          url: http://storage.example.org:5002/
+which is equivalent to this nested configuration::
+    storage:
+      cls: buffer
+      min_batch_size:
+        content: 10000
+        directory: 5000
+      storage:
+        cls: filter
+        storage:
+          cls: retry
+          storage:
+            cls: remote
+            url: http://storage.example.org:5002/
+See :mod:`swh.storage.proxies` for the list of proxies.
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index 3ee49c4c..9090004e 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -13,6 +13,7 @@ Development
+   configuration