Compare revisions

Commits on Source (111)
Showing with 1044 additions and 656 deletions
# Changes here will be overwritten by Copier
_commit: v0.1.6
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage Documentation
distribution_name: swh-docs
......
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/python/black
rev: 23.1.0
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 6.0.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 6.0.0
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==22.9.23]
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
additional_dependencies:
- tomli
stages: [commit]
stages: [pre-commit]
- id: codespell
name: Check commit message spelling
additional_dependencies:
......@@ -45,3 +45,13 @@ repos:
pass_filenames: false
language: system
types: [python]
- id: twine-check
name: twine check
description: call twine check when pushing an annotated release tag
entry: bash -c "ref=$(git describe) &&
[[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] &&
(python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true"
pass_filenames: false
stages: [pre-push]
language: python
additional_dependencies: [twine, build]
......@@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socio-economic status,
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
......
......@@ -18,7 +18,7 @@ BUILD_DEPS :=
BUILD_DEPS += cffi
# swh.search
BUILD_DEPS += tree-sitter
BUILD_DEPS += tree-sitter\<0.22.0
# swh.loader.bzr>breezy
BUILD_DEPS += configobj
......@@ -26,6 +26,12 @@ BUILD_DEPS += configobj
# swh.docs
BUILD_DEPS += pifpaf
# psycopg-c
BUILD_DEPS += tomli
# docutils >= 0.21
BUILD_DEPS += flit-core
pip-install-swh-dev:
python3 -m pip install --upgrade pip wheel setuptools setuptools-scm
python3 -m pip install --upgrade $(BUILD_DEPS)
......
......@@ -34,7 +34,7 @@ sphinx/%: $(apidoc_dep)
apidoc: $(apidoc_dep)
apidoc-stamp:
$(SPHINXAPIDOC) $(APIDOC_OPTS) -o $(APIDOC_DIR) ../swh $(APIDOC_SWH_EXCLUDES)
$(SPHINXAPIDOC) $(APIDOC_OPTS) -o $(APIDOC_DIR) `if [ -d ../src/swh ]; then echo ../src/swh; else echo ../swh; fi` $(APIDOC_SWH_EXCLUDES)
# to silence Sphinx warnings about apidoc documents not included in any toctree
find $(shell pwd)/apidoc -name "*.rst" | xargs sed -i '1i:orphan:\n'
touch $@
......
......@@ -18,8 +18,9 @@ by components:
swh.auth <swh-auth/index>
swh.core <swh-core/index>
swh.counters <swh-counters/index>
swh.dataset <swh-dataset/index>
swh.datasets <swh-datasets/index>
swh.deposit <swh-deposit/index>
swh.export <swh-export/index>
swh.fuse <swh-fuse/index>
swh.graph <swh-graph/index>
swh.graphql <swh-graphql/index>
......@@ -31,6 +32,7 @@ by components:
swh.objstorage <swh-objstorage/index>
swh.objstorage.replayer <swh-objstorage-replayer/index>
swh.perfecthash <swh-perfecthash/index>
swh.provenance <swh-provenance/index>
swh.scanner <swh-scanner/index>
swh.scheduler <swh-scheduler/index>
swh.scrubber <swh-scrubber/index>
......
.. _alterations:
Alterations of the Software Architecture Archive
================================================
The main objective of an archive is to store facts forever. As such, it can be
viewed as an append-only infrastructure. However, it may be necessary to alter
the content of the archive to account for removal or alteration requests that
may happen `for several reasons`_.
We currently consider two types of alterations that may have to be applied to the
archive:
- content removal: some objects stored in the archive should no longer be
visible; these can be either removed entirely or masked, depending on the
situation.
- personal identity modification: some personal information (namely the name
and email of a person) must no longer be visible.
.. note::
We will not discuss in this section the administrative process of receiving,
handling and processing an alteration request of the Software Heritage
Archive. We will only focus on the technical aspects of the processes
involved, and their impact on the architectural design.
.. _`for several reasons`: https://www.softwareheritage.org/legal/content-policy
Types of alteration
-------------------
Content removal
~~~~~~~~~~~~~~~
A content removal request starts from one (or more) origins; the whole removal
handling process is origin-based.
When dealing with a content removal request that needs to be applied to the
archive, the following steps need to be performed:
- identify all the objects in the archive (mostly in the :ref:`Merkle DAG
<swh-merkle-dag>`) that need to be removed,
- build a properly encrypted recovery bundle with all the objects listed previously,
- store and identify this bundle in a dedicated storage,
- remove all the identified :py:class:`Content <swh.model.model.Content>`
objects from all the :ref:`objstorages <swh-objstorage>` under the legal and
technical responsibility of |swh|,
- remove all the identified objects from all the :ref:`storages <swh-storage>`
under the legal and technical responsibility of |swh|,
- remove all the identified objects from all the secondary data silos, namely
the :ref:`kafka journal <swh-journal>`, the :ref:`search index
<swh-search>`, the :ref:`compressed graph <swh-graph>` and the :ref:`vault cache
<swh-vault>`,
- possibly, ensure that the origins targeted by the removal request are
excluded from any future archival.
Note that handling archive content removal can also imply masking the impacted
objects (temporarily or permanently); for example, during the examination of a
suppression request, it might be necessary to hide all the impacted objects
until a decision is made for each of them.
Name change
~~~~~~~~~~~
A person may ask for their former identity not to be published any more. When
this request has been handled and accepted, any occurrence of the former
identity of the person associated with archived version control system objects
(such as commits) will be replaced by the new one when using the public
endpoints of the archive (namely, browsing the archive, using public APIs,
using the vault).
Note that currently, only :py:class:`Revision <swh.model.model.Revision>` and
:py:class:`Release <swh.model.model.Release>` objects are affected by the
process.
Read Access - Altering results
------------------------------
The |swh| component responsible for altering returned objects is the
:py:class:`MaskingProxyStorage
<swh.storage.proxies.masking.MaskingProxyStorage>`. It handles both content
that is still present in the archive but must not be published, and the
application of active name change requests. It stores in a dedicated database
a map from email to current display name, used to alter returned Revision and
Release objects, as well as a series of tables dedicated to handling masking
requests. These make it possible not to return an object from the archive at
all while it is covered by a currently active masking request.
As such, all the publicly accessible storage instances -- be it from the web
frontend, the public API (REST and GraphQL) or the :term:`vault` service -- use
an access path that passes through the ``MaskingProxyStorage``.
Note that for services like the :term:`vault`, this can make the requested
cooking fail in some cases (especially git history cooking, where altering the
generated git content breaks its cryptographic integrity, making it invalid).
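For illustration, such a proxy is enabled as an extra step in a storage
pipeline configuration. The following is a minimal sketch only: the step alias
and the ``masking_db`` key are assumptions to check against the swh-storage
documentation.

.. code-block:: yaml

   storage:
     cls: pipeline
     steps:
       - cls: masking                       # assumed alias for MaskingProxyStorage
         masking_db: "dbname=swh-masking"   # database holding display names and masking requests
       - cls: remote                        # the actual (read-only) storage backend
         url: http://storage-ro.internal:5002/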
Write Access - Preventing ingesting origins
-------------------------------------------
When an origin has been identified as forbidden for any future archiving, we
use a dedicated storage proxy in the write path to the archive to ensure this
cannot happen. The corresponding |swh| component is the
:py:class:`BlockingProxyStorage
<swh.storage.proxies.blocking.BlockingProxyStorage>`. It is a simple proxy
storage keeping a list of forbidden origin URLs in a dedicated database, and
preventing any matching origin URL from being ingested into the archive.
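Symmetrically, a sketch of a write-path pipeline placing the blocking proxy in
front of the backend (again, the step alias and the ``blocking_db`` key are
assumptions):

.. code-block:: yaml

   storage:
     cls: pipeline
     steps:
       - cls: blocking                        # assumed alias for BlockingProxyStorage
         blocking_db: "dbname=swh-blocking"   # database listing forbidden origin URLs
       - cls: remote
         url: http://storage-rw.internal:5002/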
Citation workflow and architecture
==================================
Quick reminder on metadata objects
----------------------------------
Metadata in *Software Heritage* is explained in detail in the document
:ref:`Metadata workflow and architecture <architecture-metadata>`. There are
two types of metadata that are useful for citation: intrinsic and extrinsic.
These two types can come from two sources: the archive itself (raw metadata)
and the indexer (indexed metadata).
For each metadata type and metadata source, metadata can be extracted for a specific
object (``snapshot``, ``release``, ``revision``, ``directory``,
``content``), using its SWHID, or using the repository URL (``origin``).
In the latter case, it will return the metadata for the latest version
(latest visit snapshot) of the repository root directory on the main
branch.
Citation use cases
------------------
.. list-table:: Citation use cases
:header-rows: 1
:stub-columns: 1
* - ID
- As a
- I can
- so that
* - UC1 (v1, v2)
- Researcher
- retrieve a citation or BibTeX export for a software artifact directly on SWH interface
- the software will be cited with correct attribution
* - UC2 (v1, v2)
- Publisher (Episciences)
- retrieve a citation or BibTeX export for a software artifact programmatically
- expose BibTeX
* - UC3
- Aggregator (OpenAire)
- retrieve intrinsic metadata from SWH programmatically
- the software record will be enriched
Citation v1: data flow
----------------------
In this version, *Software Heritage* can generate a citation in BibTeX
format from the raw intrinsic metadata available in the archive. The raw
intrinsic metadata used for citation is a ``codemeta.json`` file found in the
repository or, alternatively, a ``citation.cff`` file.
As with metadata extraction:
* When given an ``origin`` URL, the citation will be generated from the latest version of the repository root directory metadata on the main branch.
* When given a SWHID object of type ``snapshot``, ``release`` or ``revision``, the citation will be generated from the repository root directory metadata, associated with that version.
* When given a ``directory`` object, if the SWHID is qualified with an anchor (explained in the document :ref:`SoftWare Heritage persistent IDentifiers (SWHIDs) <persistent-identifiers>`), the citation will be generated from the repository root directory metadata, associated with the anchor version.
.. warning::
However, if no anchor was specified, it will be generated directly from the metadata found in that directory.
* When given a ``content`` object, if the SWHID is qualified with an anchor, the citation will be generated from metadata of the repository root directory. If no anchor was specified, the citation cannot be generated due to a lack of information.
Citation v1: architecture
-------------------------
*Software Heritage* provides a web API (through :ref:`swh.web <swh-web>`) to generate
a citation, given an ``origin`` URL or a qualified SWHID.
The corresponding API endpoints are:
* ``/api/1/raw-intrinsic-metadata/citation/origin/`` (example: ``/api/1/raw-intrinsic-metadata/citation/origin/?citation_format=bibtex&origin_url=https://github.com/rdicosmo/parmap``)
* ``/api/1/raw-intrinsic-metadata/citation/swhid/`` (example: ``/api/1/raw-intrinsic-metadata/citation/swhid/?citation_format=bibtex&target_swhid=swh:1:dir:2dc0f462d191524530f5612d2935851505af41dd;origin=https://github.com/rdicosmo/parmap;visit=swh:1:snp:2128ed4f25f2d7ae7c8b7950a611d69cf4429063/``)
Currently, the only allowed citation format value is BibTeX
(``citation_format=bibtex``).
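For instance, assuming the public instance at ``archive.softwareheritage.org``
exposes these endpoints, the UC2-style programmatic retrieval boils down to a
single HTTP request:

.. code-block:: console

   $ curl "https://archive.softwareheritage.org/api/1/raw-intrinsic-metadata/citation/origin/?citation_format=bibtex&origin_url=https://github.com/rdicosmo/parmap"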
This API uses intermediate utility methods:
* in :ref:`swh.web <swh-web>`, to retrieve raw intrinsic metadata, given an ``origin`` URL or a qualified SWHID, which return original ``codemeta.json`` and ``citation.cff`` files.
* in :ref:`swh.indexer <swh-indexer>`, to convert a ``codemeta.json`` or a ``citation.cff`` file into a BibTeX citation.
Codemeta/citation.cff to BibTeX mapping
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A ``citation.cff`` file will be first converted into a ``codemeta.json``
document. The ``CFF`` to ``CodeMeta`` mapping can be found in the
`codemeta
repository <https://github.com/codemeta/codemeta/blob/master/crosswalks/Citation%20File%20Format%201.2.0.csv>`_.
The ``CodeMeta`` to ``BibTeX`` mapping, used for the converter, is
`currently under
review <https://github.com/codemeta/codemeta/pull/363>`_.
Note on BibTeX ``@software``, ``@softwareversion`` and ``@codefragment`` usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The generated BibTeX citation can be of type ``@software``,
``@softwareversion`` or ``@codefragment``. The rule is the following:
* If no SWHID is specified,

  * and a version is specified, then it will be ``@softwareversion``;
  * otherwise, it will be ``@software``.

* If a SWHID is specified,

  * and it is of type ``snapshot``, then it will be ``@software``;
  * and it is of type ``release``, ``revision`` or ``directory``, then it will be ``@softwareversion``;
  * and it is of type ``content``, then it will be ``@codefragment``.
A generated BibTeX example
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: bibtex

   @software{REPLACEME,
     author = "Di Cosmo, Roberto and Danelutto, Marco",
     organization = "Inria and University Paris Diderot and University of Pisa",
     license = "LGPL-2.0-only",
     date = "2011-07-18",
     year = "2011",
     month = "07",
     repository = "git+https://github.com/rdicosmo/parmap.git",
     title = "Parmap",
     swhid = "swh:1:snp:01b2cc89f4c423f1bda4757edd86ae4013b919b0;origin=https://github.com/rdicosmo/parmap"
   }
Citation v1: UI
---------------
Citation should be available in the webapp through a new *Citation* tab
under the *Permalinks* tab, which opens the *Permalinks/Citation*
box.
Future
------
In the current v1 version, citation is generated from raw intrinsic metadata, i.e. ``codemeta.json`` or ``citation.cff`` file.
.. mermaid::

   quadrantChart
       title Metadata types and sources for citation generation
       x-axis Raw --> Indexed
       y-axis Extrinsic --> Intrinsic
       codemeta.json: [0.25, 0.9]
       citation.cff: [0.25, 0.75]
*Metadata types and sources for citation generation v1*
The next versions of the citation feature should include:
* New supported citation formats.
* Citation styles?
* On the API/backend side:
* v2: Generating citations from indexed intrinsic and extrinsic metadata (merging behaviour to be defined).
* v3: Authorities.
......@@ -9,5 +9,7 @@ Software Architecture
:titlesonly:
overview
alterations
metadata
citation
object-storage
......@@ -5,17 +5,154 @@ Software Architecture Overview
From an end-user point of view, the |swh| platform consists of the
:term:`archive`, which can be accessed using the web interface or its REST API.
Behind the scenes (and the web app) are several components/services that expose
different aspects of the |swh| :term:`archive` as internal RPC APIs.
:term:`archive`, which can be accessed using the web interface or its public
APIs (REST or GraphQL). Behind the scenes (and the web app) are several
components/services that expose different aspects of the |swh| :term:`archive`
as internal RPC APIs.
These internal APIs have a dedicated database, usually PostgreSQL_.
These internal APIs have a dedicated database, typically PostgreSQL_ or
Cassandra_.
A global (and incomplete) view of this architecture looks like:
Big Pictures
------------
.. thumbnail:: ../images/general-architecture.svg
General view of the |swh| architecture.
The Read-Only View
^^^^^^^^^^^^^^^^^^
A global (and incomplete) view of this architecture, limited to components
involved when reading from the archive, looks like:
.. thumbnail:: ../images/general-architecture-read.svg
General view of the |swh| architecture when reading.
As you can see, there are quite a few parts in this infrastructure. We will
come back to each of them in more detail later, but here is a quick description:
- **Ingress**: HTTP requests from the end user are received by a frontend ingress service (a
reverse proxy), responsible for routing and load balancing them to the proper
backend service.
- **WebApp**: this is the main |swh| frontend tier; it is a Django based HTTP server
responsible for handling most of the frontend and public API requests (browsing
the archive or using the public REST API). Being a central component for any
user interaction, it needs to have access to most other |swh| services.
- **Authentication**: this is a Keycloak server used to handle authentication for
users that want authenticated access to the archive (benefiting from lifted
rate limiting, access to administration boards, etc.)
- **Deposit**: this is a Django-based HTTP server with a very minimal UI (a single
static documentation page), but providing a SWORD API that allows deposit partners
to upload software source code (with metadata) directly into the archive. It
also allows partners to check and get feedback on the status of previous
deposits. Since it is an authenticated-only service, it has access to and uses
the Keycloak authentication service.
- **Counters**: a simple service maintaining general archive statistics. It is used
by the frontend to generate the per-forge counters and overall evolution
curves. It uses a Redis backend (for HyperLogLog counters).
- **Scheduler**: the scheduler service. This is needed by the webapp frontend to
get feedback for services like Save Code Now and the like, or to schedule new
loading and listing tasks for these services. This service uses a database
storage.
- **Vault**: the service responsible for managing and executing retrieval queries
(when a user wants to retrieve a whole directory or a whole git history).
This service uses a database storage.
- **Indexer Storage**: a data store that keeps track of all the indexed metadata
objects in the archive. It is used directly by the webapp frontend to get
information like the mimetype or the possible license of a content. This
service uses a database storage.
- **RO Storage**: the main storage service, hosting the whole |swh| archive
structure (except the file contents themselves). In the context of read
access to the archive, the Storage used is a Read-Only storage with a Masking
Proxy. This proxy makes it possible to mask or modify on the fly objects that
need to be either hidden completely (typically when a takedown request
impacting the browsed object is being processed) or altered (typically when a
person asked for their former name not to be visible any more). The main
storage can be hosted either on a PostgreSQL database or a Cassandra one. The
main archive now uses Cassandra as its main backend storage.
- **Search**: the |swh| search service. This service uses an Elasticsearch
backend.
- **Objstorage**: this data storage is used to store all the content blobs (the
actual source code files). It is a content-addressable object storage. The
|swh| objstorage provides an abstract frontend/API for many possible
backends. Currently the main archive uses a Ceph cluster for this, with a
custom layer (named Winery) in front to account for the specificities of the
|swh| workload (handling tens of billions of small objects).
The Ingestion View
^^^^^^^^^^^^^^^^^^
When looking at how software source code is harvested and ingested into the
archive, the global picture looks like:
.. thumbnail:: ../images/general-architecture-ingestion.svg
General view of the |swh| ingestion architecture.
.. Note:: :term:`REMD` in this picture stands for :term:`raw extrinsic metadata`.
The central part of this setup is the scheduler service, responsible for
keeping track of loading, listing and a few other types of tasks. The task
execution framework uses Celery_ as backend. There are actually two completely
different task systems, provided by the scheduler and its companion services:
- one is dedicated to managing the loading of source code from origins (aka spawning
:ref:`Loader <swh-loader-core>` tasks); these are one-shot celery tasks not
reified in the scheduler database,
- the other is a generic task scheduling service mostly responsible for
recurring tasks; especially :ref:`forge listing <swh-lister>` ones, but not
only. Some one-shot loading tasks are still handled by this scheduler
(especially loading origins from :term:`save code now` requests). There are
also :ref:`vault <swh-vault>` cooking tasks and deposit checker tasks that
are using this generic scheduler.
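For instance, both systems can be driven from the ``swh scheduler`` CLI; the
two commands below are the same ones used in the Docker quick start later in
this document (they assume a properly configured scheduler instance):

.. code-block:: console

   # register a one-shot listing task with the generic task scheduler
   $ swh scheduler task add list-gitlab-full -p oneshot url=https://0xacab.org/api/v4

   # have the loading-dedicated system queue loading tasks for 100 known git origins
   $ swh scheduler origin schedule-next git 100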
A more detailed view of the latter is :ref:`available below
<source_code_scrapping>`.
One noticeable point in this schematic is the presence of the :py:class:`Blocking
Proxy <swh.storage.proxies.blocking.BlockingProxyStorage>` in the :ref:`storage
<swh-storage>` configuration. This proxy is a helper to prevent ingestion
from origins that have been disabled as a result of a takedown notice.
.. Note:: Even if not represented in this diagram, there are actually several
:term:`Scheduler Task` runner service instances running: one schedules
high priority :term:`Scheduler Task` instances (using a dedicated set of `celery
queues`_), typically for :term:`save code now` requests; one is a special case
for scheduling first visits of a newly added forge or a :term:`bulk
on-demand archival` request (also using dedicated celery queues); the last
is responsible for scheduling all other standard (non priority)
:term:`Scheduler Task` instances.
.. Note:: Loading tasks are not represented by one-shot :term:`Scheduler Task`
instances (in the scheduler database) anymore; the corresponding celery
tasks are directly spawned by the "loader scheduler" (it was not possible to
handle that many entries in the database efficiently). Deposit loading tasks
are however still managed via this generic scheduling scaffolding (mostly for
historical reasons).
The Indexation View
^^^^^^^^^^^^^^^^^^^
The |swh| archive platform also comes with a complex indexation system. A view
from this indexation side would look like:
.. thumbnail:: ../images/general-architecture-indexation.svg
General view of the |swh| indexation architecture.
See the :ref:`swh-indexer` documentation for more details.
.. _architecture-tier-1:
......@@ -35,11 +172,31 @@ It relies on the :ref:`Object Storage <swh-objstorage>` service to store
the content of source code files themselves.
Both the Storage and Object Storage are designed as abstractions over possible
backends. The former supports both PostgreSQL (the current solution in production)
and Cassandra (a more scalable option we are exploring).
backends. The former supports both PostgreSQL (the historical production
solution) and Cassandra (a more scalable option, now used as the main backend
in production).
The latter supports a large variety of "cloud" object storage as backends,
as well as a simple local filesystem.
Alterations
~~~~~~~~~~~
The main objective of an archive is to store facts forever. As such, it can be
viewed as an append-only infrastructure. However, it may be necessary to alter
the content of the archive to account for removal or alteration requests that
may happen `for several reasons`_.
We currently consider two types of alterations that may have to be applied to
the archive:
- content removal: some objects stored in the archive should no longer be
visible; these can be either removed entirely or masked, depending on the
situation.
- personal identity modification: some personal information (namely the name
and email of a person) must no longer be visible.
These requirements have an impact on the overall architecture of the archive.
Details are documented in a :ref:`dedicated section <alterations>`.
Journal
^^^^^^^
......@@ -56,6 +213,8 @@ when to visit again these repositories.
It is also the foundation of the :ref:`mirror` infrastructure, as it allows
mirrors to stay up to date.
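As a sketch of what following the journal looks like (for example, the way a
mirror tracks new origin visits), the snippet below uses the swh.journal client
API; the broker address, consumer group and exact keyword arguments are
assumptions to validate against the swh-journal documentation:

.. code-block:: python

   from swh.journal.client import get_journal_client

   # hypothetical connection parameters
   client = get_journal_client(
       "kafka",
       brokers=["broker1.journal.softwareheritage.org:9093"],
       group_id="my-mirror-client",
       object_types=["origin_visit_status"],
   )

   def process(all_objects):
       # all_objects maps an object type to a batch of journal objects
       for object_type, objects in all_objects.items():
           for obj in objects:
               print(object_type, obj)

   # consume batches until interrupted
   client.process(process)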
.. _source_code_scrapping:
Source code scraping
^^^^^^^^^^^^^^^^^^^^
......@@ -258,8 +417,8 @@ such as full-text search on origin URLs and metadata.
This service is a recent addition to the |swh| architecture based on ElasticSearch,
and is currently in use only for URL search.
Graph
^^^^^
Compressed Graph
^^^^^^^^^^^^^^^^
:ref:`swh-graph <swh-graph>` is also a recent addition to the architecture
designed to complement the Storage using a specialized backend.
......@@ -336,13 +495,16 @@ designed to keep them in sync:
in the Journal and recreate it.
.. _Cassandra: https://cassandra.apache.org
.. _celery: https://www.celeryproject.org
.. _CodeMeta: https://codemeta.github.io/
.. _CodeMeta: https://codemeta.github.io
.. _gitlab: https://gitlab.com
.. _PostgreSQL: https://www.postgresql.org/
.. _Prometheus: https://prometheus.io/
.. _PostgreSQL: https://www.postgresql.org
.. _Prometheus: https://prometheus.io
.. _publish-subscribe: https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern
.. _Redis: https://redis.io/
.. _Redis: https://redis.io
.. _SWORDv2: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html
.. _HyperLogLog: https://redislabs.com/redis-best-practices/counting/hyperloglog/
.. _WebGraph: https://webgraph.di.unimi.it/
.. _HyperLogLog: https://redislabs.com/redis-best-practices/counting/hyperloglog
.. _WebGraph: https://webgraph.di.unimi.it
.. _`for several reasons`: https://www.softwareheritage.org/legal/content-policy
.. _`celery queues`: https://docs.celeryq.dev/en/stable/getting-started/introduction.html#what-s-a-task-queue
......@@ -16,6 +16,8 @@ create_links () {
fi
if [ -d "../../../${pymodule}/swh" ] ; then
cp -r -f --symbolic-link $(realpath ../../../${pymodule}/swh/*) sources/swh/
elif [ -d "../../../${pymodule}/src/swh" ] ; then
cp -r -f --symbolic-link $(realpath ../../../${pymodule}/src/swh/*) sources/swh/
fi
pushd ../../../${pymodule}
for EXT in rst md; do
......
......@@ -12,7 +12,7 @@ Good commit messages are essential in a project as large as Software Heritage.
They are crucial to those who will review your changes and important to anyone else
who will interact with the codebase at a later time. This includes your future self!
Make sure to follow the recommandations from `How to write a Git
Make sure to follow the recommendations from `How to write a Git
commit message <http://chris.beams.io/posts/git-commit/>`_
Closing or referencing issues
......
......@@ -153,3 +153,42 @@ If you plan to
you may also want to
`upload your GPG key <https://gitlab.softwareheritage.org/-/profile/gpg_keys>`__
as well.
Make a release
--------------
.. warning:: Only staff members are allowed to make new releases
Releases are made automatically by Jenkins when a tag is pushed to a module repository.
We are using the `semantic versioning <https://semver.org>`_ scheme to name our
releases, please ensure that the name of your tag correctly indicates its compatibility
with the previous version.
Tags themselves should be signed and provide a meaningful annotation with, for example,
an itemized summary of changes (rather than rehashing the whole git log), breaking
changes in a separate section, etc.
First, create the tag:
.. code-block::
# get the latest version number
git describe --tags # returns v1.2.3-x-yyy
# list changes between master and v1.2.3
git range-diff v1.2.3...master
# use the output to write your annotation and create a new signed tag, here for a
# minor version upgrade
git tag -a -s v1.3.0
# push it
git push origin tag v1.3.0
Then you'll see jobs on Jenkins (Incoming tag, GitLab builds, Upload to PyPI)
indicating that the release process is ongoing.
Next, deployment container images are updated.
And finally a new merge request will automatically be created in
`Helm charts for swh packages`_ so that the devops team can proceed with deployment.
.. _Helm charts for swh packages: https://gitlab.softwareheritage.org/swh/infra/sysadm-environment
:orphan:
.. highlight:: bash
.. admonition:: Intended audience
:class: important
Contributors
Important
=========
We have moved our development from Phabricator to a GitLab instance at
https://gitlab.softwareheritage.org/
The content below is no longer relevant and will be updated soon.
Submitting patches
==================
`Phabricator`_ is the tool that Software Heritage uses as its
coding/collaboration forge.
Software Heritage's Phabricator instance can be found at
https://forge.softwareheritage.org/
.. _Phabricator: http://phabricator.org/
Code Review in Phabricator
--------------------------
We use the Differential application of Phabricator to perform
:ref:`code reviews <code-review>` in the context of Software Heritage.
* we use Git and ``history.immutable=true``
(but beware as that is partly a Phabricator misnomer, read on)
* when code reviews are required, developers will be allowed to push
directly to master once an accepted Differential diff exists
Configuration
+++++++++++++
.. _arcanist-configuration:
Arcanist configuration
^^^^^^^^^^^^^^^^^^^^^^
Authentication
~~~~~~~~~~~~~~
First, you should install Arcanist and authenticate it to Phabricator::
sudo apt-get install arcanist
arc set-config default https://forge.softwareheritage.org/
arc install-certificate
arc will prompt you to log in to Phabricator via the web
(which will ask for your personal Phabricator credentials).
You will then have to copy-paste the API token from the web page to arc,
and hit Enter to complete the certificate installation.
Immutability
~~~~~~~~~~~~
When using git, Arcanist by default messes with the local history,
rewriting commits at the time of first submission.
To avoid that, we use so-called `history immutability`_.
.. _history immutability: https://secure.phabricator.com/book/phabricator/article/arcanist_new_project/#history-mutability-git
To that end, you shall configure your ``arc`` accordingly::
arc set-config history.immutable true
Note that this does **not** mean that you are forbidden to rewrite
your local branches (e.g., with ``git rebase``).
Quite the contrary: you are encouraged to locally rewrite branches
before pushing to ensure that commits are logically separated
and your commit history easy to bisect.
The above setting just means that *arc* will not rewrite commit
history under your nose.
Enabling ``git push`` to our forge
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The way we've configured our review setup for continuous integration
requires you to configure git to allow pushes to our forge.
There are two ways you can do this: setting an SSH key to push over SSH,
or setting a specific password for git pushes over HTTPS.
SSH key for pushes
~~~~~~~~~~~~~~~~~~
In your forge User settings page (on the top right, click on your avatar,
then click *Settings*), you have access to an *Authentication* >
*SSH Public Keys* section (direct link:
``hxxps://forge.softwareheritage.org/settings/user/<your username>/page/ssh/``).
You then have the option to upload an SSH public key,
which will authenticate your pushes.
You then need to configure ssh/git to use that key pair,
for instance by editing the ``~/.ssh/config`` file.
Finally, you should configure git to push over ssh when pushing to
https://forge.softwareheritage.org, by running the following command::
git config --global url.git@forge.softwareheritage.org:.pushInsteadOf https://forge.softwareheritage.org
This lets git know that it should use ``git@forge.softwareheritage.org:``
as a base url when pushing repositories cloned from
forge.softwareheritage.org over https.
VCS password for pushes
~~~~~~~~~~~~~~~~~~~~~~~
.. warning:: Please, only use this if you're completely unable to use ssh.
As a fallback to the ssh setup, you have the option of setting a VCS password. This
password, *separate from your account password*, allows Phabricator to authenticate your
uploads over HTTPS.
In your forge User settings page (On the top right, click on your avatar, then click
*Settings*), you need to use the *Authentication* > *VCS Password* section to set your
VCS password (Direct link: ``hxxps://forge.softwareheritage.org/settings/user/<your
username>/page/vcspassword/``).
If you still get a 403 error on push, this means you need a forge administrator to
enable HTTPS pushes for the repository (which wasn't done by default in historical
repositories). Please drop by on IRC and let us know!
Workflow
++++++++
* work in a feature branch: ``git checkout -b my-feat``
* initial review request: hack/commit/hack/commit ;
``arc diff origin/master``
* react to change requests: hack/commit/hack/commit ;
``arc diff --update Dxx origin/master``
* landing change: ``git checkout master ; git merge my-feat ; git push``
Starting a new feature and submit it for review
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use a **one branch per feature** workflow, with well-separated
**logical commits** (:ref:`following those conventions <git-style-guide>`).
Please open one diff per logical commit to keep the diff size to a minimum.
.. code-block::
git checkout -b my-shiny-feature
... hack hack hack ...
git commit -m 'architecture skeleton for my-shiny-feature'
... hack hack hack ...
git commit -m 'my-shiny-feature: implement module foo'
... etc ...
To **submit your code for review** the first time::
arc diff origin/master
arc will prompt for a **code review message**. Provide the following information:
* first line: *short description* of the overall work
(i.e., the feature you're working on).
This will become the title of the review
* *Summary* field (optional): *long description* of the overall work;
the field can continue in subsequent lines, up to the next field.
This will become the "Summary" section of the review
* *Test Plan* field (optional): write here if something special is needed
to test your change
* *Reviewers* field (optional): the (Phabricator) name(s) of
desired reviewers.
If you don't specify one (recommended) the default reviewers will be chosen
* *Subscribers* field (optional): the (Phabricator) name(s) of people that
will be notified about changes to this review request.
In most cases it should be left empty
For example::
mercurial loader
Summary: first stab at a mercurial loader (T329)
The implementation follows the plan detailed in F2F discussion with @foo.
Performances seem decent enough for a first trial (XXX seconds for YYY repository
that contains ZZZ patches).
Test plan:
Reviewers:
Subscribers: foo
After completing the message, arc will submit the review request
and tell you its number and URL::
[...]
Created a new Differential revision:
Revision URI: https://forge.softwareheritage.org/Dxx
.. _arc-update:
Updating your branch to reflect requested changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Your feature might get accepted as is, YAY!
Or, reviewers might request changes; no big deal!
Use the Differential web UI to follow up on received comments, if needed.
To implement requested changes in the code, hack on your branch as usual by:
* adding new commits, and/or
* rewriting old commits with git rebase (to preserve a nice, easy to bisect history)
* pulling master and rebasing your branch onto it if someone landed
commits on master in the meantime:
.. code-block::
git checkout master
git pull
git checkout my-shiny-feature
git rebase master
When you're ready to **update your review request**::
arc diff --update Dxx HEAD~
Arc will prompt you for a message: **describe what you've changed
w.r.t. the previous review request**, free form.
This means you should not repeat the title of your diff (which is
often the default if you squashed/amended your commits).
Your message will become the changelog entry in Differential
for this new version of the diff, and will help reviewers
understand what changes you made since they last read your diff.
Differential only cares about the code diff, not about the commits
or their order.
Therefore each "update" can be a completely different series of commits,
possibly rewritten from the previous submission.
Dependencies between diffs
^^^^^^^^^^^^^^^^^^^^^^^^^^
Note that you can manage diff dependencies within the same module
with the following keyword in the diff description::
Depends on Dxx
This allows you to keep a logical view in your diff.
It's not strictly necessary (because the tooling now deals with it properly)
but it might help reviewers or yourself to do so.
Landing your change onto master
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Once your change has been approved in Differential,
you will be able to land it onto the master branch.
Before doing so, you're encouraged to **clean up your git commit history**,
reordering/splitting/merging commits as needed to have separate
logical commits and an easy to bisect history.
Update the diff :ref:`following the prior section <arc-update>`
(It'd be good to let the CI build finish to make sure everything is still green).
Once you're happy you can **push to origin/master** directly, e.g.::
git checkout master
git merge --ff-only my-shiny-feature
git push
``--ff-only`` is optional, and makes sure you don't unintentionally
create a merge commit.
Optionally you can then delete your local feature branch::
git branch -d my-shiny-feature
Reviewing locally / landing someone else's changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can do local reviews of code with arc patch::
arc patch Dxyz
This will create a branch **arcpatch-Dxyz** containing the changes
on your local checkout.
You can then merge those changes upstream with::
git checkout master
git merge --ff arcpatch-Dxyz
git push origin master
or, alternatively::
arc land --squash
See also
--------
* :ref:`code-review` for guidelines on how code is reviewed
when developing for Software Heritage
......@@ -3,7 +3,12 @@
Sphinx gotchas
==============
Here is a list of common gotchas when formatting Python docstrings for `Sphinx <https://www.sphinx-doc.org/en/stable/>`_ and the `Napoleon <https://www.sphinx-doc.org/en/stable/ext/napoleon.html>`_ style.
Here is a list of common gotchas when formatting Python docstrings for `Sphinx
<https://www.sphinx-doc.org/en/stable/>`_ and the `Napoleon
<https://www.sphinx-doc.org/en/stable/ext/napoleon.html>`_ style.
.. highlight:: rst
Sphinx
------
......@@ -11,12 +16,12 @@ Sphinx
Lists
+++++
All sorts of `lists <https://www.sphinx-doc.org/en/stable/rest.html#lists-and-quote-like-blocks>`_
require an empty line before the first bullet and after the last one,
to be properly interpreted as list.
No indentation is required for list elements w.r.t. surrounding text,
and line continuations should be indented like the first character
after the bullet.
All sorts of `lists
<https://www.sphinx-doc.org/en/stable/rest.html#lists-and-quote-like-blocks>`_
require an empty line before the first bullet and after the last one, to be
properly interpreted as a list. No indentation is required for list elements
w.r.t. surrounding text, and line continuations should be indented like the
first character after the bullet.
Bad::
......@@ -177,10 +182,13 @@ Good::
Args:
foo (int): first argument
bar: second argument, which happen to have a fairly
long description of what it does
long description of what it does
baz (bool): third argument
Returns
+++++++
......@@ -232,6 +240,7 @@ Good::
ValueError: if you botched it
RuntimeError: if we botched it
See also
--------
......
......@@ -22,58 +22,152 @@ Install required dependencies
-----------------------------
Software Heritage requires some dependencies that are usually packaged by your
package manager. On Debian/Ubuntu-based distributions::
sudo apt install lsb-release wget apt-transport-https
sudo wget https://www.postgresql.org/media/keys/ACCC4CF8.asc -O /etc/apt/trusted.gpg.d/postgresql.asc
echo "deb https://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" | sudo tee -a /etc/apt/sources.list.d/pgdg.list
sudo wget https://downloads.apache.org/cassandra/KEYS -O /etc/apt/trusted.gpg.d/cassandra.asc
echo "deb https://debian.cassandra.apache.org 41x main" | sudo tee -a /etc/apt/sources.list.d/cassandra.list
sudo apt update
sudo apt install \
build-essential pkg-config lzip rsync \
python3 python3-pip python3-venv virtualenvwrapper \
libpython3-dev libsystemd-dev libsvn-dev libffi-dev librdkafka-dev \
fuse3 libfuse3-dev libcmph-dev libleveldb-dev \
git myrepos \
graphviz plantuml inkscape \
postgresql libpq-dev cassandra
.. Note:: Python 3.7 or newer is required
package manager.
.. tab-set::
.. tab-item:: Debian/Ubuntu
.. code-block:: console
sudo apt install lsb-release wget apt-transport-https
sudo wget https://www.postgresql.org/media/keys/ACCC4CF8.asc -O /etc/apt/trusted.gpg.d/postgresql.asc
echo "deb https://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" | sudo tee -a /etc/apt/sources.list.d/pgdg.list
sudo wget https://downloads.apache.org/cassandra/KEYS -O /etc/apt/trusted.gpg.d/cassandra.asc
echo "deb https://debian.cassandra.apache.org 41x main" | sudo tee -a /etc/apt/sources.list.d/cassandra.list
sudo apt update
sudo apt install \
build-essential pkg-config lzip rsync \
python3 python3-pip python3-venv virtualenvwrapper \
libpython3-dev libsystemd-dev libsvn-dev libffi-dev librdkafka-dev \
fuse3 libfuse3-dev libcmph-dev libleveldb-dev \
git myrepos \
graphviz plantuml inkscape \
postgresql libpq-dev cassandra redis-server
.. tab-item:: Fedora
.. code-block:: console
sudo dnf install java-17-openjdk-headless
# Make sure the path is correct. If not, choose the alternative corresponding to java-17
sudo update-alternatives --set java /usr/lib/jvm/java-17-openjdk-17.0.13.0.11-3.fc41.x86_64/bin/java
sudo rpm --import https://downloads.apache.org/cassandra/KEYS
echo "[cassandra]
name=Apache Cassandra
baseurl=https://redhat.cassandra.apache.org/50x/
gpgcheck=1
repo_gpgcheck=0
gpgkey=https://downloads.apache.org/cassandra/KEYS" | sudo tee /etc/yum.repos.d/cassandra.repo
sudo dnf -y update
sudo dnf -y install cassandra
sudo dnf -y group install c-development
sudo dnf -y install \
pkgconf-pkg-config lzip rsync python3.11 python3-virtualenvwrapper \
python3.11-devel systemd-devel subversion-devel libffi-devel \
librdkafka fuse3 fuse3-devel leveldb-devel git myrepos graphviz \
plantuml inkscape postgresql-server postgresql-contrib libpq \
libpq-devel redis
# You will also need to install CMPH manually, as it is not (yet?) included in the Fedora repositories
wget https://sourceforge.net/projects/cmph/files/v2.0.2/cmph-2.0.2.tar.gz
tar -xvf cmph-2.0.2.tar.gz
cd cmph-2.0.2
./configure && make && sudo make install
cd ..
.. Note:: Python 3.10 or newer is required
This installs basic system utilities, Python library dependencies, development tools,
documentation tools and our main database management systems.
Cassandra and PostgreSQL will be started by tests when they need it, so you
don't need them started globally (this will save you some RAM)::
don't need them started globally (this will save you some RAM):
.. code-block:: console
sudo systemctl disable --now cassandra postgresql
If you intend to hack on the frontend part of |swh| Web Applications, you will also
need to have ``nodejs >= 18`` in your development environment. If the version in your
Debian-based distribution is lower, you can install node 18 using these commands::
You must also have ``nodejs >= 20`` in your development environment.
You can install node 20 using these commands:
.. tab-set::
.. tab-item:: Debian/Ubuntu
.. code-block:: console
curl -fsSL https://deb.nodesource.com/setup_20.x | sudo bash -
sudo apt install -y nodejs
.. tab-item:: Fedora
.. code-block:: console
sudo wget https://deb.nodesource.com/gpgkey/nodesource.gpg.key -O /etc/apt/trusted.gpg.d/nodesource.asc
echo "deb https://deb.nodesource.com/node_18.x $(lsb_release -cs) main" | sudo tee -a /etc/apt/sources.list.d/nodesource.list
sudo apt update
sudo apt install nodejs
sudo dnf -y install nodejs
Also related to Web Applications development, |swh| uses the ``yarn`` package manager
to retrieve frontend dependencies and development tools. It is recommended to install its
latest classic version using these commands::
|swh| uses the ``yarn`` package manager to retrieve frontend dependencies and development tools.
You must install its latest classic version using this command:
sudo wget https://dl.yarnpkg.com/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/yarn.asc
echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
sudo apt update
sudo apt install yarn
.. tab-set::
.. tab-item:: Debian/Ubuntu
.. code-block:: console
sudo corepack enable
.. tab-item:: Fedora
.. code-block:: console
sudo dnf -y install yarnpkg
If you intend to work on |swh| archive search features, Elasticsearch must also be
present in your development environment. Proceed as follows to install it::
present in your development environment. Proceed as follows to install it:
.. tab-set::
.. tab-item:: Debian/Ubuntu
.. code-block:: console
sudo wget https://artifacts.elastic.co/GPG-KEY-elasticsearch -O /etc/apt/trusted.gpg.d/elasticsearch.asc
echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elasticsearch.list
sudo apt update
sudo apt install elasticsearch
.. tab-item:: Fedora
sudo wget https://artifacts.elastic.co/GPG-KEY-elasticsearch -O /etc/apt/trusted.gpg.d/elasticsearch.asc
echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elasticsearch.list
sudo apt update
sudo apt install elasticsearch
.. code-block:: console
echo "[elasticsearch]
name=Elasticsearch repository for 8.x packages
baseurl=https://artifacts.elastic.co/packages/8.x/yum
gpgcheck=1
gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
autorefresh=1
type=rpm-md" | sudo tee /etc/yum.repos.d/elasticsearch.repo
sudo dnf -y update
sudo dnf -y install elasticsearch
If you intend to build the full |swh| documentation, the ``postgresql-autodoc`` utility must
also be installed; follow these `instructions <https://github.com/cbbrowne/autodoc#installation>`_
......@@ -84,30 +178,34 @@ to do so.
Checkout the source code
------------------------
Clone the |swh| environment repository::
Clone the |swh| environment repository:
.. code-block:: console
~$ git clone https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
[...]
~$ cd swh-environment
~/swh-environment$
Create a virtualenv::
Create a virtualenv:
.. code-block:: console
~/swh-environment$ source /usr/share/virtualenvwrapper/virtualenvwrapper.sh
~/swh-environment$ mkvirtualenv -p /usr/bin/python3 -a $PWD swh
[...]
(swh) ~/swh-environment$
Checkout all the swh packages source repositories::
Checkout all the swh packages source repositories:
.. code-block:: console
(swh) ~/swh-environment$ pip install pre-commit
(swh) ~/swh-environment$ ./bin/update
Use the same mypy version our tox containers use::
(swh) ~/swh-environment$ pip install mypy==1.8.0
In the future you can re-activate the created virtualenv with:
In the future you can re-activate the created virtualenv with::
.. code-block:: console
$ workon swh
(swh) ~/swh-environment$
......@@ -122,19 +220,11 @@ In the future you can re-activate the created virtualenv with::
.. _pipenv: https://pipenv.readthedocs.io/
Install all the swh packages (in development mode, with testing dependencies)::
Install all the swh packages (in development mode, with testing dependencies):
(swh) ~/swh-environment$ bin/install
.. note::
.. code-block:: console
If you experience issues with :program:`pip` dependency resolution, try with
``bin/install --use-deprecated=legacy-resolver`` (the flag will be passed on
to ``pip install``). The same flag can also be set globally in
:file:`~/.config/pip/pip.conf`::
[install]
use-deprecated=legacy-resolver
(swh) ~/swh-environment$ bin/install
Executing unit tests
......@@ -151,7 +241,9 @@ tox_. The main difference between these 2 test execution environments is:
current virtualenv, installed from the git repositories: you test your
modification against the HEAD of every swh package.
For example, running unit tests for the swh-loader-git_ package::
For example, running unit tests for the swh-loader-git_ package:
.. code-block:: console
(swh) ~/swh-environment$ cd swh-loader-git
(swh) ~/swh-environment/swh-loader-git$ pytest
......@@ -171,7 +263,9 @@ For example, running unit tests for the swh-loader-git_ package::
[...]
================== 25 passed, 12 warnings in 6.66 seconds ==================
Running the same test, plus code linting and static analysis, using tox::
Running the same test, plus code linting and static analysis, using tox:
.. code-block:: console
(swh) ~/swh-environment/swh-loader-git$ tox
GLOB sdist-make: ~/swh-environment/swh-loader-git/setup.py
......@@ -235,7 +329,9 @@ Running the same test, plus code linting and static analysis, using tox::
Beware that some swh packages require a postgresql server properly configured
to execute the tests. In this case, you will want to use pifpaf_, which will
spawn a temporary instance of postgresql, to encapsulate the call to pytest.
For example, running pytest in the swh-core package::
For example, running pytest in the swh-core package:
.. code-block:: console
(swh) ~/swh-environment$ cd swh-core
(swh) ~/swh-environment/swh-core$ pifpaf run postgresql -- pytest
......
......@@ -29,7 +29,7 @@ specific skills needed to work on any topic of your interest.
What are the minimum system requirements (hardware/software) to run SWH locally?
--------------------------------------------------------------------------------
Python 3.7 or newer is required. See the :ref:`developer setup documentation
Python 3.10 or newer is required. See the :ref:`developer setup documentation
<developer-setup>` for more details.
......@@ -126,8 +126,8 @@ Getting sample datasets
Is there a way to connect to SWH archived (production) database from my local machine?
--------------------------------------------------------------------------------------
We provide the archive as a dataset on public clouds, see the :ref:`swh-dataset
documentation <swh-dataset>`. We can
We provide the archive as a dataset on public clouds, see the :ref:`swh-export
documentation <swh-export>`. We can
also provide read access to one of the main databases on request, `contact us`_.
.. _faq_error_bugs:
......
......@@ -23,20 +23,30 @@ Dependencies
The easiest way to run a Software Heritage instance is to use Docker.
Please `ensure that you have a working recent installation first
<https://docs.docker.com/engine/install/>`_ (including the
`Compose <https://docs.docker.com/compose/>`_ plugin.
`Compose <https://docs.docker.com/compose/>`_ plugin).
Quick start
-----------
First, retrieve the Software Heritage development environment to get the
Docker configuration::
Docker configuration:
~$ git clone https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
~$ cd swh-environment/docker
.. code-block:: console
Then, start containers::
~$ git clone https://gitlab.softwareheritage.org/swh/devel/docker.git swh-docker
~$ cd swh-docker
~/swh-environment/docker$ docker compose up -d
.. note::
If you intend to hack on Software Heritage source code and test your changes with docker,
you should rather follow the instructions in section :ref:`checkout-source-code` to
install the full Software Heritage development environment, which includes the Docker configuration.
Then, start containers:
.. code-block:: console
~/swh-docker$ docker compose up -d
[...]
Creating docker_amqp_1 ... done
Creating docker_zookeeper_1 ... done
......@@ -46,9 +56,11 @@ Then, start containers::
[...]
This will build Docker images and run them. Check everything is running
fine with::
fine with:
.. code-block:: console
~/swh-environment/docker$ docker compose ps
~/swh-docker$ docker compose ps
Name Command State Ports
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
docker_amqp_1 docker-entrypoint.sh rabbi ... Up 15671/tcp, 0.0.0.0:5018->15672/tcp, 25672/tcp, 4369/tcp, 5671/tcp, 5672/tcp
......@@ -63,9 +75,11 @@ dependency-related problems. If some containers failed to start, just
run the ``docker compose up -d`` command again.
If a container really refuses to start properly, you can check why using
the ``docker compose logs`` command. For example::
the ``docker compose logs`` command. For example:
~/swh-environment/docker$ docker compose logs swh-lister
.. code-block:: console
~/swh-docker$ docker compose logs swh-lister
Attaching to docker_swh-lister_1
[...]
swh-lister_1 | Processing /src/swh-scheduler
......@@ -77,19 +91,37 @@ the ``docker compose logs`` command. For example::
For details on the various Docker images and how to work with them,
see the full :ref:`docker-environment` documentation.
Once all containers are running, you can use the web interface by
opening http://localhost:5080/ in your web browser.
Once all containers are running, you can use the web interface by opening
http://localhost:<nginx-port>/ in your web browser. ``<nginx-port>`` is the
port on which nginx is exposed to the host. By default, it is randomly
assigned by docker. Use:
.. code-block:: console
~/swh-docker$ docker compose port nginx 80
to find which port is actually used.
.. note::
Please read the "Exposed Ports" section of the README file in the
`swh-docker`_ repository for more details and options on this topic.
.. _`swh-docker`: https://gitlab.softwareheritage.org/swh/devel/docker.git
At this point, the archive is empty and needs to be filled with some
content. The simplest way to start loading software is to use the
*Save Code Now* feature of the archive web interface:
http://localhost:5080/browse/origin/save/
http://localhost:<nginx-port>/browse/origin/save/
You can also use the command line interface to inject code. For
example to retrieve projects hosted on the https://0xacab.org GitLab forge::
example to retrieve projects hosted on the https://0xacab.org GitLab forge:
.. code-block:: console
~/swh-environment/docker$ docker compose exec swh-scheduler \
~/swh-docker$ docker compose exec swh-scheduler \
swh scheduler task add list-gitlab-full \
-p oneshot url=https://0xacab.org/api/v4
......@@ -108,17 +140,21 @@ This task will scrape the forge’s project list and register origins to the sch
This takes at most a couple of minutes.
Then, you must tell the scheduler to create loading tasks for these origins.
For example, to create tasks for 100 of these origins::
For example, to create tasks for 100 of these origins:
~/swh-environment/docker$ docker compose exec swh-scheduler \
.. code-block:: console
~/swh-docker$ docker compose exec swh-scheduler \
swh scheduler origin schedule-next git 100
This will take a bit of time to complete.
To increase the speed at which git repositories are imported, you can
spawn more ``swh-loader-git`` workers::
spawn more ``swh-loader-git`` workers:
.. code-block:: console
~/swh-environment/docker$ docker compose exec swh-scheduler \
~/swh-docker$ docker compose exec swh-scheduler \
celery status
listers@50ac2185c6c9: OK
loader@b164f9055637: OK
......@@ -126,18 +162,20 @@ spawn more ``swh-loader-git`` workers::
vault@c9fef1bbfdc1: OK
4 nodes online.
~/swh-environment/docker$ docker compose exec swh-scheduler \
~/swh-docker$ docker compose exec swh-scheduler \
celery control pool_grow 3 -d loader@b164f9055637
-> loader@b164f9055637: OK
pool will grow
~/swh-environment/docker$ docker compose exec swh-scheduler \
~/swh-docker$ docker compose exec swh-scheduler \
celery inspect -d loader@b164f9055637 stats | grep prefetch_count
"prefetch_count": 4
Now there are 4 workers ingesting git repositories. You can also
increase the number of ``swh-loader-git`` containers::
increase the number of ``swh-loader-git`` containers:
.. code-block:: console
~/swh-environment/docker$ docker compose up -d --scale swh-loader=4
~/swh-docker$ docker compose up -d --scale swh-loader=4
[...]
Creating docker_swh-loader_2 ... done
Creating docker_swh-loader_3 ... done
......@@ -153,24 +191,28 @@ Heritage. When new versions of these components are released, the docker
image will not be automatically updated. In order to update all Software
Heritage components to their latest version, the docker image needs to
be explicitly rebuilt by issuing the following command from within the
``docker`` directory::
``docker`` directory:
~/swh-environment/docker$ docker build --no-cache -t swh/stack .
.. code-block:: console
~/swh-docker$ docker build --no-cache -t swh/stack .
Monitor your local installation
-------------------------------
You can monitor your local installation by looking at:
- http://localhost:5080/rabbitmq to access the rabbitmq dashboard (guest/guest),
- http://localhost:5080/grafana to explore the platform's metrics (admin/admin),
- http://localhost:<nginx-port>/rabbitmq to access the rabbitmq dashboard (guest/guest),
- http://localhost:<nginx-port>/grafana to explore the platform's metrics (admin/admin),
Shut down your local installation
---------------------------------
To shut down your Software Heritage instance, just run::
To shut down your Software Heritage instance, just run:
.. code-block:: console
~/swh-environment/docker$ docker compose down
~/swh-docker$ docker compose down
Hacking the archive
-------------------
......
......@@ -22,6 +22,14 @@ Glossary
An artifact is one of many kinds of tangible by-products produced during
the development of software.
bulk on-demand archival
A |swh| service allowing a partner to request the archival of a (possibly
large) number of origins. It consists of an authenticated API endpoint
allowing the user to upload a list of origins (as a CSV file) to be
ingested as soon as possible. The service lets the user get feedback from
the |swh| archive about the ongoing ingestion process.
content
blob
......@@ -94,6 +102,13 @@ Glossary
add new file contents into the :term:`object storage` and repository structure
in the :term:`storage database`).
loading task
A celery_ task doing the actual ingestion work; its implementation is
provided by a :term:`loader`, and it is executed by celery_ workers. These
tasks used to be backed by Scheduler Task instances in the :term:`scheduler`
database, but this is no longer the case (for performance reasons).
hash
cryptographic hash
checksum
......@@ -149,6 +164,25 @@ Glossary
of the corresponding change. A person is associated to a full name and/or
an email address.
raw extrinsic metadata
REMD
A piece of metadata concerning an object stored in the |swh| archive that
is not part of the source code from an :term:`origin`. It can come from a
software forge (information about a project that is not the source code
repository for this project), a deposited metadata file (for a
:term:`deposit`), etc. These pieces of information are kept in their
original raw format -- for archiving purposes -- but are also converted
into a minimal format (currently a subset of CodeMeta) allowing them to be
indexed and searched.
raw extrinsic metadata storage
REMD Storage
The |swh| storage dedicated to storing all the gathered extrinsic metadata
documents verbatim, in their original format. Currently, this service is
part of the main :term:`storage`.
release
tag
milestone
......@@ -165,11 +199,27 @@ Glossary
associated development metadata (e.g., author, timestamp, log message,
etc).
save code now
A publicly accessible service allowing users to ask for the immediate
archival of a given source code origin. The request can be automatically
accepted and processed if the origin is from a well-known domain, or may
require manual validation. Note that a save code now request can only
concern a supported origin type.
scheduler
The component of the |swh| architecture dedicated to the management and
prioritization of its many background tasks.
Scheduler Task
:py:class:`The object <swh.scheduler.model.Task>` (stored in the
:term:`scheduler` database) representing a background (celery_) task to be
regularly scheduled for execution. Note that not all background tasks
are backed by a Scheduler Task instance; one-shot :term:`loading tasks
<loading task>` are most of the time not represented or modeled as
Scheduler Tasks.
snapshot
the state of all visible branches during a specific visit of an origin
......@@ -211,3 +261,4 @@ Glossary
.. _`persistent identifier`: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#persistent-identifiers
.. _`Archival Resource Key`: http://n2t.net/e/ark_ids.html
.. _publish-subscribe: https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern
.. _celery: https://docs.celeryq.dev