Commits on Source (251)
Showing with 779 additions and 71 deletions
# Changes here will be overwritten by Copier
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage Documentation
distribution_name: swh-docs
have_cli: false
have_workers: false
package_root: swh/docs
project_name: swh.docs
python_minimal_version: '3.7'
readme_format: rst
# Enable black and pre-commit
# black
d71d13ea72434aa9337e2331ba0fbb034ad39acb
a2b344ef0b824894d2d7bbbed249f3ac68f89eb1
*.egg-info/
*.pyc
.coverage
.eggs/
.hypothesis
.mypy_cache
.tox
__pycache__
build/
/.eggs/
/.tox/
/swh.docs.egg-info/
/swh/__pycache__/
/swh/docs/__pycache__/
/swh/docs/sphinx/__pycache__/
/version.txt
apidoc/
dist/
# these are symlinks created by a hook in swh-docs' main sphinx conf.py
docs/README.rst
docs/README.md
# this should be a symlink for people who want to build the sphinx doc
# without using tox, generally created by the swh-env/bin/update script
docs/Makefile.sphinx
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/python/black
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 6.0.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==22.9.23]
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
stages: [commit]
additional_dependencies:
- tomli
stages: [pre-commit]
- id: codespell
name: Check commit message spelling
additional_dependencies:
- tomli
stages: [commit-msg]
- repo: local
hooks:
......@@ -28,13 +45,13 @@ repos:
pass_filenames: false
language: system
types: [python]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/python/black
rev: 22.10.0
hooks:
- id: black
- id: twine-check
name: twine check
description: call twine check when pushing an annotated release tag
entry: bash -c "ref=$(git describe) &&
[[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] &&
(python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true"
pass_filenames: false
stages: [pre-push]
language: python
additional_dependencies: [twine, build]
......@@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socio-economic status,
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
......
include Makefile
include requirements*.txt
include version.txt
......@@ -18,7 +18,7 @@ BUILD_DEPS :=
BUILD_DEPS += cffi
# swh.search
BUILD_DEPS += tree-sitter
BUILD_DEPS += tree-sitter\<0.22.0
# swh.loader.bzr>breezy
BUILD_DEPS += configobj
......@@ -26,6 +26,12 @@ BUILD_DEPS += configobj
# swh.docs
BUILD_DEPS += pifpaf
# psycopg-c
BUILD_DEPS += tomli
# docutils >= 0.21
BUILD_DEPS += flit-core
pip-install-swh-dev:
python3 -m pip install --upgrade pip wheel setuptools setuptools-scm
python3 -m pip install --upgrade $(BUILD_DEPS)
......
......@@ -2,7 +2,7 @@
# automatically build sphinx documentation.
APIDOC_DIR = apidoc
APIDOC_OPTS = --ext-viewcode --separate
APIDOC_OPTS = --ext-viewcode --separate --implicit-namespaces
SPHINXBUILD = sphinx-build
SPHINXAPIDOC = sphinx-apidoc
SPHINX_OPTS = -t standalone_package_doc
......@@ -12,7 +12,7 @@ SPHINX_SRCDIR = .
# relative to docs/ dir. Hence "." exclude the docs/ dir itself
APIDOC_EXCLUDES = conftest.py setup.py
APIDOC_EXCLUDES += */tests/* */migrations/* */wsgi.py */conftest.py
APIDOC_EXCLUDES += */deposit/settings/* */web/settings/* */dataset/*
APIDOC_EXCLUDES += */deposit/settings/* */web/settings/*
APIDOC_EXCLUDES += bin build dist utils node_modules
APIDOC_SWH_EXCLUDES = $(patsubst %,"../%",$(APIDOC_EXCLUDES))
......@@ -34,7 +34,7 @@ sphinx/%: $(apidoc_dep)
apidoc: $(apidoc_dep)
apidoc-stamp:
$(SPHINXAPIDOC) $(APIDOC_OPTS) -o $(APIDOC_DIR) .. $(APIDOC_SWH_EXCLUDES)
$(SPHINXAPIDOC) $(APIDOC_OPTS) -o $(APIDOC_DIR) `if [ -d ../src/swh ]; then echo ../src/swh; else echo ../swh; fi` $(APIDOC_SWH_EXCLUDES)
# to silence Sphinx warnings about apidoc documents not included in any toctree
find $(shell pwd)/apidoc -name "*.rst" | xargs sed -i '1i:orphan:\n'
touch $@
......
swh-docs
========
Software Heritage Technical Documentation
=========================================
This module contains (the logics for generating) the Software Heritage
development documentation.
technical documentation.
Specifically, it contains some general information about Software Heritage
internals (stuff that would not fit in any other specific software component of
the Software Heritage stack) and bundles it together with component-specific
documentation coming from other modules of the stack.
All documentation is written and typeset using [Sphinx][1]. General
All documentation is written and typeset using Sphinx_. General
documentation is shipped as part of this module. Module-specific documentation
is centralized here via symlinks to the `docs/` dirs of individual modules.
is centralized here via symlinks to the ``docs/`` dirs of individual modules.
Therefore to build the full documentation you need a working and
complete [Software Heritage development environment][2].
complete `Software Heritage development environment`_.
How to build the doc
--------------------
Install the [Software Heritage development environment][2]
Install the `Software Heritage development environment`_:
.. code-block:: shell
$ git clone https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
$ cd swh-environment
$ ./bin/update # this will clone needed git repos, inc. swh-docs
$ cd swh-docs
$ git clone https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
$ cd swh-environment
$ ./bin/update # this will clone needed git repos, inc. swh-docs
$ cd swh-docs
Ensure you have the required tools to generate images (graphviz_'s ``dot``,
plantuml_ and inkscape_). On a Debian system:
Ensure you have the required tools to generate images ([graphviz][3]'s `dot`,
[plantuml][4] and [inkscape][5]). On a Debian system:
.. code-block:: shell
$ sudo apt install plantuml graphviz
$ sudo apt install plantuml graphviz
These additional packages are required on Debian 10.x (and newer) systems:
These additional packages are required on Debian 10.x systems:
- libapr1-dev
- libaprutil1-dev
- libsvn-dev
......@@ -40,61 +44,74 @@ These additional packages are required on Debian 10.x systems:
- dia
- postgresql-autodoc
It is also recommended to build the doc using [tox][6], so ensure you have it
It is also recommended to build the doc using tox_, so ensure you have it
installed, e.g. on a Debian system:
$ sudo apt install tox
.. code-block:: shell
$ sudo apt install tox
Then (from the ``swh-environment/swh-docs/`` directory):
Then (from the `swh-environment/swh-docs/` directory):
.. code-block:: shell
$ tox -e sphinx-dev
$ tox run -e sphinx-dev
This tox environment will build the documentation from the sources available in
the parent directory (`swh-environment`).
the parent directory (``swh-environment``).
Behind the scenes, this tox environment will run the sphinx documentation
building process via [pifpaf][7] to encapsulate the need of PostgreSQL to
building process via pifpaf_ to encapsulate the need of PostgreSQL to
generate database schemas. The documentation building process itself consists
mainly of 3 steps:
### 1. Generate documentation assets for all modules
$ cd swh-environment
$ make docs-assets
.. code-block:: shell
$ cd swh-environment
$ pifpaf run postgresql -- make docs-assets
This will *not* build the documentation in each module (there is `make docs`
This will *not* build the documentation in each module (there is ``make docs``
for that).
### 2. Build the api docs for all swh python packages
$ cd swh-docs/docs
$ make apidoc
.. code-block:: shell
$ cd swh-docs/docs
$ pifpaf run postgresql -- make apidoc
### 3. Build the documentation
$ cd swh-docs/docs
$ make
.. code-block:: shell
$ cd swh-docs/docs
$ make
The HTML documentation is now available starting from `_build/html/index.html`.
The HTML documentation is now available starting from
``_build/html/index.html``.
Cleaning up
-----------
$ cd docs
$ make distclean
.. code-block:: shell
$ cd docs
$ make distclean
The former (`make clean`) will only clean the local Sphinx build, without
touching other modules. The latter (`make distclean`) will also clean Sphinx
The former (``make clean``) will only clean the local Sphinx build, without
touching other modules. The latter (``make distclean``) will also clean Sphinx
builds in all other modules.
Publishing the doc
------------------
The publication of the documentation is now managed by the [CI][7].
The publication of the documentation is now managed by the CI_.
Building standalone package documentation
......@@ -105,20 +122,25 @@ Each documentation local to a swh package can also be built with [tox][6].
For instance to build the standalone documentation of ``swh-web``, proceed as
follows:
$ cd swh-environment/swh-web
$ tox -e sphinx-dev
.. code-block:: shell
$ cd swh-environment/swh-web
$ tox run -e sphinx-dev
Sphinx warnings related to unresolved references located in other swh packages are suppressed because they are expected.
Sphinx warnings related to unresolved references located in other swh packages
are suppressed because they are expected.
Please also note that Sphinx warnings are turned into errors in that case.
The HTML documentation is now available starting from `docs/_build/html/index.html`.
The HTML documentation is now available starting from
``docs/_build/html/index.html``.
[1]: http://www.sphinx-doc.org/
[2]: https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
[3]: https://graphviz.org
[4]: http://plantuml.com
[5]: https://inkscape.org/
[6]: https://tox.readthedocs.io/
[7]: https://jenkins.softwareheritage.org/job/DDOC/
.. _Sphinx: http://www.sphinx-doc.org/
.. _`Software Heritage development environment`: https://gitlab.softwareheritage.org/swh/devel/swh-environment.git
.. _graphviz: https://graphviz.org
.. _plantuml: http://plantuml.com
.. _inkscape: https://inkscape.org/
.. _tox: https://tox.readthedocs.io/
.. _CI: https://jenkins.softwareheritage.org/job/DDOC/
.. _pifpaf: https://github.com/jd/pifpaf
*-stamp
_build/
devel/swh-*
errors.log
!/swh-loader.rst
sources/
user/software-origins/dynamic/*.inc
apidoc/
......@@ -3,7 +3,7 @@
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXOPTS ?= --jobs auto
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
......@@ -14,13 +14,16 @@ INSTALL_DIR = /srv/softwareheritage/docs/webroot
INSTALL_GROUP = swhdev
INSTALL_PERMS = g+rwX
.PHONY: help images apidoc html clean install
.PHONY: help images apidoc dynamic-rst html clean install
all: html
dynamic-rst:
make -C user dynamic-rst
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
sphinx/%: Makefile images apidoc
sphinx/%: Makefile images apidoc dynamic-rst
@$(SPHINXBUILD) -M $* "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
sphinx/clean:
......
......@@ -7,11 +7,35 @@ table.swh-logos-table img {
margin-bottom: 5px;
}
table.swh-logos-table tr td:nth-child(1) {
width: min-content;
}
table.swh-logos-table tr td:nth-child(1) p {
margin-bottom: 0;
}
table.swh-logos-table ul {
text-align: left;
margin-bottom: auto;
}
table.swh-logos-table .py {
font-family: inherit;
font-weight: inherit;
font-size: inherit;
white-space: inherit;
background-color: inherit;
border: inherit;
border-radius: inherit;
padding: inherit;
}
table.swh-logos-table .py .pre {
white-space: inherit;
}
.landing-page hr {
margin-top: 5em;
margin-bottom: 5em;
......
{% extends "!breadcrumbs.html" %}
{# Overrides RTD's default "View page source" link formatting, which
forces the full page name in the link; but we don't want it because
page names include the repo, eg. "swh-model/index".
The view_in_gitlab.py extension sets swh_source_url to
".../source/swh-model/browse/master/docs/index"
#}
{%- block breadcrumbs_aside %}
<li class="wy-breadcrumbs-aside">
{%- if hasdoc(pagename) and display_vcs_links %}
<a href="{{ swh_source_url }}{{ page_source_suffix }}?plain=1">{{ _('View page source') }}</a>
{% endif %}
</li>
{%- endblock %}
SPHINXOPTS ?= -t devel_doc
SPHINXOPTS ?= -t devel_doc --jobs auto
SPHINXOPTCOLOR ?= --color
SPHINXBUILD = sphinx-build
SOURCEDIR = .
......@@ -9,7 +9,7 @@ SPHINXAPIDOC = sphinx-apidoc
APIDOC_DIR = apidoc
APIDOC_OPTS = --ext-viewcode --separate --no-toc
APIDOC_EXCLUDES = */tests */migrations */wsgi.py */conftest.py */setup.py
APIDOC_EXCLUDES += deposit/settings/* web/settings/* dataset/*
APIDOC_EXCLUDES += deposit/settings/* web/settings/*
APIDOC_SWH_EXCLUDES = $(patsubst %,"$(SWHPKGDIR)/%",$(APIDOC_EXCLUDES))
all: html
......
......@@ -3,14 +3,24 @@
API reference
=============
.. seealso::
Looking for ways to interact with our source code archive from your
applications or research projects? Learn :ref:`how to use Software Heritage
from your applications <landing-interface>`.
Here is the reference documentation of Software Heritage's own software stack
by components:
.. toctree::
:maxdepth: 2
:maxdepth: 3
swh.alter <swh-alter/index>
swh.auth <swh-auth/index>
swh.core <swh-core/index>
swh.counters <swh-counters/index>
swh.dataset <swh-dataset/index>
swh.datasets <swh-datasets/index>
swh.deposit <swh-deposit/index>
swh.export <swh-export/index>
swh.fuse <swh-fuse/index>
swh.graph <swh-graph/index>
swh.graphql <swh-graphql/index>
......@@ -22,6 +32,7 @@ API reference
swh.objstorage <swh-objstorage/index>
swh.objstorage.replayer <swh-objstorage-replayer/index>
swh.perfecthash <swh-perfecthash/index>
swh.provenance <swh-provenance/index>
swh.scanner <swh-scanner/index>
swh.scheduler <swh-scheduler/index>
swh.scrubber <swh-scrubber/index>
......@@ -30,3 +41,4 @@ API reference
swh.vault <swh-vault/index>
swh.web <swh-web/index>
swh.web.client <swh-web-client/index>
swh.webhooks <swh-webhooks/index>
.. _alterations:
Alterations of the Software Heritage Archive
============================================
The main objective of an archive is to store facts forever. As such, it can be
viewed as an append-only infrastructure. However, it may be necessary to alter
the content of the archive to account for removal or alteration requests that
may happen `for several reasons`_.
We currently consider 2 types of alterations that may have to be applied to the
archive:
- content removal: some objects stored in the archive should not be visible any
more; these can be either removed entirely or masked, depending on the
situation.
- personal identity modification: some personal information (namely the name
and email of a person) must no longer be visible.
.. note::
We will not discuss in this section the administrative process of receiving,
handling and processing an alteration request of the Software Heritage
Archive. We will only focus on the technical aspects of the processes
involved, and their impact on the architectural design.
.. _`for several reasons`: https://www.softwareheritage.org/legal/content-policy
Types of alteration
-------------------
Content removal
~~~~~~~~~~~~~~~
A content removal request starts from one (or more) origins. The whole removal
handling process is based on origins.
When dealing with a content removal request that needs to be applied to the
archive, the following steps need to be done:
- identify all the objects in the archive (mostly in the :ref:`Merkle DAG
<swh-merkle-dag>`) that need to be removed,
- build a properly encrypted recovery bundle with all the objects listed previously,
- store and identify this bundle in a dedicated storage,
- remove all the identified :py:class:`Content <swh.model.model.Content>`
objects from all the :ref:`objstorages <swh-objstorage>` under the legal and
technical responsibility of |swh|,
- remove all the identified objects from all the :ref:`storages <swh-storage>`
under the legal and technical responsibility of |swh|,
- remove all the identified objects from all the secondary data silos, namely
the :ref:`kafka journal <swh-journal>`, the :ref:`search index
<swh-search>`, the :ref:`compressed graph <swh-graph>` and the :ref:`vault cache
<swh-vault>`,
- possibly: ensure the origins the removal request is targeting are excluded
from any future archival
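The steps above can be sketched as a small orchestration routine over in-memory stand-ins for the various data silos. All names here are hypothetical and for illustration only; the real process builds properly encrypted recovery bundles and talks to the actual services:

```python
from typing import Iterable

# Hypothetical in-memory stand-ins for the real data silos
# (objstorage, storage, journal, search index, vault cache).
SILOS = {"objstorage": {}, "storage": {}, "journal": {}, "search": {}, "vault": {}}
RECOVERY_BUNDLES = {}


def remove_objects(bundle_id: str, swhids: Iterable[str]) -> dict:
    """Build a recovery bundle for the listed objects, then remove
    them from every data silo under SWH responsibility."""
    swhids = list(swhids)
    # 1. build the recovery bundle (unencrypted in this toy version)
    # and store it under a dedicated identifier
    bundle = {
        s: [silo[s] for silo in SILOS.values() if s in silo] for s in swhids
    }
    RECOVERY_BUNDLES[bundle_id] = bundle
    # 2. remove the identified objects from all silos
    for silo in SILOS.values():
        for s in swhids:
            silo.pop(s, None)
    return bundle
```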
Note that handling archive content removal can also imply masking the affected
objects (temporarily or permanently); for example, during the examination of a
suppression request, it might be necessary to hide all the impacted objects
until a decision is made for each of them.
Name change
~~~~~~~~~~~
A person may ask for their former identity not to be published any more. When
this request has been handled and accepted, any occurrence of the former
identity of the person associated with archived version control system objects
(such as commits) will be replaced by the new one when using the public
endpoints of the archive (namely, browsing the archive, using public APIs,
using the vault).
Note that currently, only :py:class:`Revision <swh.model.model.Revision>` and
:py:class:`Release <swh.model.model.Release>` objects are affected by the
process.
Read Access - Altering results
------------------------------
The |swh| component responsible for altering returned objects is the
:py:class:`MaskingProxyStorage
<swh.storage.proxies.masking.MaskingProxyStorage>`. It handles both the case of
content that is still present in the archive but must not be published, and the
application of active name change requests. It stores in a dedicated database a
map from email to current display name, used to alter returned Revision and
Release objects, along with a series of tables dedicated to handling masking
requests. These make it possible to withhold an object from the archive
entirely while it is under a currently active masking request.
As such, all the publicly accessible storage instances -- be it from the web
frontend, the public API (REST and GraphQL) or the :term:`vault` service -- use
an access path that passes through the ``MaskingProxyStorage``.
Note that for services like the :term:`vault`, masking can cause the requested
cooking to fail in some cases (especially for git history cooking, where the
cryptographic integrity of the generated git content is altered, making it
invalid).
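The display-name part of this proxy can be sketched as follows, with plain dicts standing in for the proxy's database table. This is illustrative only; the real ``MaskingProxyStorage`` lives in :ref:`swh.storage <swh-storage>` and its schema differs:

```python
# Hypothetical map from committed email to current display name,
# standing in for the proxy's dedicated database table.
DISPLAY_NAMES = {b"old@example.org": b"New Name <new@example.org>"}


def apply_display_name(revision: dict) -> dict:
    """Return a copy of a revision-like dict whose author/committer
    fullnames are rewritten according to active name change requests."""
    rewritten = dict(revision)
    for field in ("author", "committer"):
        person = revision.get(field)
        if person and person.get("email") in DISPLAY_NAMES:
            rewritten[field] = {**person, "fullname": DISPLAY_NAMES[person["email"]]}
    return rewritten
```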
Write Access - Preventing ingesting origins
-------------------------------------------
When an origin has been identified as forbidden for any future archiving, we
use a dedicated storage proxy in the writing path to the archive to ensure this
cannot happen. The corresponding |swh| component is the
:py:class:`BlockingProxyStorage
<swh.storage.proxies.blocking.BlockingProxyStorage>`. It is a simple proxy
storage keeping a list of forbidden origin URLs in a dedicated database, and
preventing any matching origin URL from being ingested into the archive.
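A minimal sketch of such a write-path check, with a plain set standing in for the dedicated database of forbidden origin URLs (names are illustrative, not the actual ``BlockingProxyStorage`` API):

```python
# Hypothetical block list, standing in for the proxy's database.
BLOCKED_ORIGINS = {"https://example.org/blocked-repo"}


def check_origin_allowed(origin_url: str) -> None:
    """Raise if the origin is on the block list; called on the write
    path before a new origin is added to the archive."""
    if origin_url.rstrip("/") in BLOCKED_ORIGINS:
        raise PermissionError(f"origin blocked from archival: {origin_url}")
```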
Citation workflow and architecture
==================================
Quick reminder on metadata objects
----------------------------------
Metadata in *Software Heritage* is explained in detail in the document
:ref:`Metadata workflow and architecture <architecture-metadata>`. There are
two types of metadata that are useful for citation: intrinsic and extrinsic.
Both can come from two sources: the archive itself (raw metadata) and the
indexer (indexed metadata).
For each metadata type and metadata source, metadata can be extracted for a specific
object (``snapshot``, ``release``, ``revision``, ``directory``,
``content``), using its SWHID, or using the repository URL (``origin``).
In the latter case, it will return the metadata for the latest version
(latest visit snapshot) of the repository root directory on the main
branch.
Citation use cases
------------------
.. list-table:: Citation use cases
:header-rows: 1
:stub-columns: 1
* - ID
- As a
- I can
- so that
* - UC1 (v1, v2)
- Researcher
- retrieve a citation or BibTeX export for a software artifact directly on SWH interface
- the software will be cited with correct attribution
* - UC2 (v1, v2)
- Publisher (Episciences)
- retrieve a citation or BibTeX export for a software artifact programmatically
- expose BibTeX
* - UC3
- Aggregator (OpenAire)
- retrieve intrinsic metadata from SWH programmatically
- the software record will be enriched
Citation v1: data flow
----------------------
In this version, *Software Heritage* can generate a citation in BibTeX
format from the raw intrinsic metadata available in the archive. The raw
intrinsic metadata used for citation is a ``codemeta.json`` file or,
alternatively, a ``citation.cff`` file found in the repository.
As with metadata extraction:
* When given an ``origin`` URL, the citation will be generated from the latest version of the repository root directory metadata on the main branch.
* When given a SWHID object of type ``snapshot``, ``release`` or ``revision``, the citation will be generated from the repository root directory metadata, associated with that version.
* When given a ``directory`` object, if the SWHID is qualified with an anchor (explained in the document :ref:`SoftWare Heritage persistent IDentifiers (SWHIDs) <persistent-identifiers>`), the citation will be generated from the repository root directory metadata, associated with the anchor version.
.. warning::
However, if no anchor was specified, it will be generated directly from the metadata found in that directory.
* When given a ``content`` object, if the SWHID is qualified with an anchor, the citation will be generated from metadata of the repository root directory. If no anchor was specified, the citation cannot be generated due to a lack of information.
Citation v1: architecture
-------------------------
*Software Heritage* provides a web API (through :ref:`swh.web <swh-web>`) to generate
a citation, given an ``origin`` URL or a qualified SWHID.
The corresponding API endpoints are:
* ``/api/1/raw-intrinsic-metadata/citation/origin/`` (example: ``/api/1/raw-intrinsic-metadata/citation/origin/?citation_format=bibtex&origin_url=https://github.com/rdicosmo/parmap``)
* ``/api/1/raw-intrinsic-metadata/citation/swhid/SWHID/`` (example: ``/api/1/raw-intrinsic-metadata/citation/swhid/?citation_format=bibtex&target_swhid=swh:1:dir:2dc0f462d191524530f5612d2935851505af41dd;origin=https://github.com/rdicosmo/parmap;visit=swh:1:snp:2128ed4f25f2d7ae7c8b7950a611d69cf4429063/``)
Currently, the only allowed citation format value is BibTeX
(``citation_format=bibtex``).
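The query string for the origin-based endpoint above can be assembled as follows. This is a sketch: ``citation_url`` is a hypothetical helper, and ``archive.softwareheritage.org`` as the public host is an assumption not stated in this section:

```python
from urllib.parse import urlencode


def citation_url(origin_url: str, citation_format: str = "bibtex") -> str:
    """Build the citation API URL for a given origin URL
    (archive.softwareheritage.org is assumed as the public host)."""
    query = urlencode({"citation_format": citation_format, "origin_url": origin_url})
    return (
        "https://archive.softwareheritage.org"
        "/api/1/raw-intrinsic-metadata/citation/origin/?" + query
    )
```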
This API uses intermediate utility methods:
* in :ref:`swh.web <swh-web>`, to retrieve raw intrinsic metadata, given an ``origin`` URL or a qualified SWHID, which return original ``codemeta.json`` and ``citation.cff`` files.
* in :ref:`swh.indexer <swh-indexer>`, to convert a ``codemeta.json`` or a ``citation.cff`` file into a BibTeX citation.
Codemeta/citation.cff to BibTeX mapping
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A ``citation.cff`` file will first be converted into a ``codemeta.json``
document. The ``CFF`` to ``CodeMeta`` mapping can be found in the
`codemeta
repository <https://github.com/codemeta/codemeta/blob/master/crosswalks/Citation%20File%20Format%201.2.0.csv>`_.
The ``CodeMeta`` to ``BibTeX`` mapping, used for the converter, is
`currently under
review <https://github.com/codemeta/codemeta/pull/363>`_.
Note on BibTeX ``@software``, ``@softwareversion`` and ``@codefragment`` usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The generated BibTeX citation can be of type ``@software``,
``@softwareversion`` or ``@codefragment``. The rule is the following:
* If SWHID is not specified,
* And if version is specified, then it will be ``@softwareversion``.
* Otherwise, it will be ``@software``.
* If SWHID is specified
* And is of type ``snapshot``, then it will be ``@software``.
* And is of type ``release``, ``revision`` or ``directory``, then it will be ``@softwareversion``.
* And is of type ``content``, then it will be ``@codefragment``.
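The rule above can be transcribed into a small decision function. This is an illustrative sketch, not the actual :ref:`swh.indexer <swh-indexer>` implementation:

```python
from typing import Optional


def bibtex_entry_type(swhid_type: Optional[str], has_version: bool = False) -> str:
    """Map the cited object to a BibTeX entry type, following the
    rules stated above."""
    if swhid_type is None:
        # no SWHID: decide based on whether a version is specified
        return "softwareversion" if has_version else "software"
    return {
        "snapshot": "software",
        "release": "softwareversion",
        "revision": "softwareversion",
        "directory": "softwareversion",
        "content": "codefragment",
    }[swhid_type]
```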
A generated BibTeX example
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: bibtex
@software{REPLACEME,
author = "Di Cosmo, Roberto and Danelutto, Marco",
organization = "Inria and University Paris Diderot and University of Pisa",
license = "LGPL-2.0-only",
date = "2011-07-18",
year = "2011",
month = "07",
repository = "git+https://github.com/rdicosmo/parmap.git",
title = "Parmap",
swhid = "swh:1:snp:01b2cc89f4c423f1bda4757edd86ae4013b919b0;origin=https://github.com/rdicosmo/parmap"
}
Citation v1: UI
---------------
Citation should be available in the webapp through a new *Citation* tab
under the *Permalinks* tab, which should open the *Permalinks/Citation*
box.
Future
------
In the current v1 version, citation is generated from raw intrinsic metadata, i.e. ``codemeta.json`` or ``citation.cff`` file.
.. mermaid::
quadrantChart
title Metadata types and sources for citation generation
x-axis Raw --> Indexed
y-axis Extrinsic --> Intrinsic
codemeta.json: [0.25, 0.9]
citation.cff: [0.25, 0.75]
*Metadata types and sources for citation generation v1*
The next versions of the citation feature should include:
* New supported citation formats.
* Citation styles?
* On the API/backend side:
* v2: Generating citations from indexed intrinsic and extrinsic metadata (merging behaviour to be defined).
* v3: Authorities.
......@@ -9,4 +9,7 @@ Software Architecture
:titlesonly:
overview
alterations
metadata
citation
object-storage
.. _objstorage-overview:
Object Storage Overview
=======================
The Object Storage: Contents
----------------------------
All the history and context of the archive is represented by
a graph (Merkle DAG) with the following node types:
- releases,
- revisions (commits),
- directories,
- directory entries (file names).
This graph is stored in a database, commonly called "Graph" or
"Storage".
This database is currently based on PostgreSQL, and is going
to be migrated to Cassandra, which is more efficient in terms of
concurrent writing.
The source code itself (the content of the files) represents a huge
volume of data, and one can find exactly the same content in different
files. In order to avoid storing several times the same content,
contents are deduplicated: a single content is stored only once,
and all the file entries having this exact content will refer to the
same content.
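Deduplication by content addressing can be illustrated in a few lines. This is a toy sketch with hypothetical names; the real archive keys contents by the intrinsic hashes defined in :ref:`swh.model <swh-model>`:

```python
import hashlib

object_store = {}  # content hash -> content bytes, stored only once
file_entries = {}  # file path -> hash of its content


def add_file(path, content):
    """Store a file's content once, keyed by its hash; file entries
    with identical content all point at the same stored object."""
    digest = hashlib.sha1(content).hexdigest()
    object_store.setdefault(digest, content)  # no-op if already stored
    file_entries[path] = digest
    return digest
```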
Ceph
----
These contents are stored in a customized file system, called
"Object Storage", each content being considered as an object.
Until now, the actual object storage has been based on an open source
file system technology called ZFS.
The growth of the archive requires a more adapted technology,
and a few years ago we chose Ceph, a distributed storage
technology created by Red Hat.
A specificity of Software Heritage is that each content has a
small size (half of our contents are less than 3KB), which is
much smaller than the minimum space used by Ceph to store a
single file (16KB).
Using Ceph directly would hence result in a massive waste of space.
Winery
------
So we needed to create a custom layer on top of Ceph to group
the data we store, using sharding techniques: a shard is a Ceph
object that contains many contents. To retrieve individual
contents, we need a mechanism that records where each content
is located within its shard.
This layer is called Winery, and was developed especially for
Software Heritage by Easter Eggs.
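The grouping idea can be sketched as follows: contents are appended into a large shard buffer, while an index records each content's shard, offset and length so it can be read back. These are toy structures for illustration, not Winery's actual on-disk format:

```python
class Shard:
    """A toy shard: many small contents packed into one large object."""

    def __init__(self, shard_id):
        self.shard_id = shard_id
        self.data = bytearray()

    def append(self, content):
        # record where this content starts, then pack it at the end
        offset = len(self.data)
        self.data.extend(content)
        return offset


class TinyWinery:
    """Packs contents into a shard and keeps an index to find them again."""

    def __init__(self):
        self.shard = Shard(0)
        self.index = {}  # content key -> (shard_id, offset, length)

    def put(self, key, content):
        offset = self.shard.append(content)
        self.index[key] = (self.shard.shard_id, offset, len(content))

    def get(self, key):
        shard_id, offset, length = self.index[key]
        return bytes(self.shard.data[offset:offset + length])
```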
.. thumbnail:: ../images/object-storage.svg
......@@ -5,17 +5,154 @@ Software Architecture Overview
From an end-user point of view, the |swh| platform consists in the
:term:`archive`, which can be accessed using the web interface or its REST API.
Behind the scenes (and the web app) are several components/services that expose
different aspects of the |swh| :term:`archive` as internal RPC APIs.
:term:`archive`, which can be accessed using the web interface or its public
APIs (REST or GraphQL). Behind the scenes (and the web app) are several
components/services that expose different aspects of the |swh| :term:`archive`
as internal RPC APIs.
These internal APIs have a dedicated database, usually PostgreSQL_.
These internal APIs have a dedicated database, typically PostgreSQL_ or
Cassandra_.
A global (and incomplete) view of this architecture looks like:
Big Pictures
------------
.. thumbnail:: ../images/general-architecture.svg
General view of the |swh| architecture.
The Read-Only View
^^^^^^^^^^^^^^^^^^
A global (and incomplete) view of this architecture, limited to components
involved when reading from the archive, looks like:
.. thumbnail:: ../images/general-architecture-read.svg
General view of the |swh| architecture when reading.
As you can see, there are quite a few parts in this infrastructure. We will come
back to each of them in more detail later, but here is a quick description:
- **Ingress**: HTTP requests from the end user are received by a frontend ingress service (a
reverse proxy), responsible for routing and load balancing them to the proper
backend service.
- **WebApp**: this is the main |swh| frontend tier; it is a Django based HTTP server
responsible for handling most of the frontend and public API requests (browsing
the archive or using the public REST API). Being a central component for any
user interaction, it needs to have access to most other |swh| services.
- **Authentication**: this is a Keycloak server used to handle authentication for
users that want authenticated access to the archive (benefiting from lifted
rate limiting, access to administration boards, etc.)
- **Deposit**: this is a Django-based HTTP server with a very minimal UI (a single
static documentation page), providing a SWORD API that allows deposit partners
to upload software source code (with metadata) directly into the archive. It
also allows partners to check and get feedback on the status of previous
deposits. Since it is an authenticated-only service, it has access to and uses
the Keycloak authentication service.
- **Counters**: a simple service maintaining general archive statistics. It is used
by the frontend to generate the per-forge counters and overall evolution
curves. It uses a Redis backend (for Hyperloglog counters).
- **Scheduler**: the scheduler service. This is needed by the webapp frontend to
get feedback for services like Save Code Now, and to schedule new loading and
listing tasks for these services. This service uses a database
storage.
- **Vault**: the service responsible for managing and executing retrieval queries
(when a user wants to retrieve a whole directory or a whole git history).
This service uses a database storage.
- **Indexer Storage**: a data store that keeps track of all the indexed metadata
objects in the archive. It is used directly by the webapp frontend to get
information like the mimetype or the possible license of a content. This
service uses a database storage.
- **RO Storage**: the main storage service, hosting the whole |swh| archive
structure (except the file contents themselves). In the context of read
access to the archive, the Storage used is a read-only storage behind a Masking
Proxy. This proxy makes it possible to mask or modify on the fly objects that need
to be either hidden completely (typically when a takedown request impacting
the browsed object is being processed) or altered (typically when a person
asked for their former name not to be visible any more). The main storage can
be hosted either on a PostgreSQL database or a Cassandra one. The main
archive now uses Cassandra as its main backend storage.
- **Search**: the |swh| search service. This service uses an Elasticsearch
backend.
- **Objstorage**: this data storage is used to store all the content blobs (the
actual source code files). It is a content-addressable object storage. The
|swh| objstorage provides an abstract frontend/API for many possible
backends. Currently the main archive uses a Ceph cluster for this, with a
custom layer (named Winery) in front to account for the specificities of the
|swh| workload (tens of billions of small objects).
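As a rough illustration of the content-addressable idea behind the objstorage, here is a toy filesystem backend (stdlib only; the two-level fan-out layout and class names are hypothetical sketches, not the actual swh.objstorage API):

```python
import hashlib
import tempfile
from pathlib import Path


class FilesystemObjStorage:
    """Toy content-addressable object storage: each blob is stored under its
    SHA-1 hex digest, fanned out over two directory levels so that no single
    directory grows too large (illustrative layout only)."""

    def __init__(self, root: str):
        self.root = Path(root)

    def _path(self, hexdigest: str) -> Path:
        # e.g. <root>/ab/cd/abcd1234...
        return self.root / hexdigest[:2] / hexdigest[2:4] / hexdigest

    def add(self, content: bytes) -> str:
        # The object id is derived from the content itself, so storing the
        # same bytes twice is naturally deduplicated.
        hexdigest = hashlib.sha1(content).hexdigest()
        path = self._path(hexdigest)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(content)
        return hexdigest

    def get(self, hexdigest: str) -> bytes:
        return self._path(hexdigest).read_bytes()
```

Because addressing is by content hash, `add` is idempotent: re-adding identical bytes returns the same object id and overwrites the same file.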
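The HyperLogLog technique behind the Counters service described above can be sketched in a few lines. This is a minimal standalone implementation for illustration, not the Redis-backed counters actually deployed:

```python
import hashlib
import math


class HyperLogLog:
    """Minimal HyperLogLog sketch: 2**p registers each remember the maximum
    'rank' (position of the leftmost 1-bit) seen among hashed items that
    fall into that register's bucket."""

    def __init__(self, p: int = 14):
        self.p = p
        self.m = 1 << p
        self.registers = [0] * self.m
        # standard bias-correction constant for large m
        self.alpha = 0.7213 / (1 + 1.079 / self.m)

    def add(self, item: bytes) -> None:
        h = int.from_bytes(hashlib.sha1(item).digest()[:8], "big")
        bucket = h >> (64 - self.p)                # first p bits pick a register
        rest = h & ((1 << (64 - self.p)) - 1)      # remaining bits
        rank = (64 - self.p) - rest.bit_length() + 1
        self.registers[bucket] = max(self.registers[bucket], rank)

    def count(self) -> int:
        est = self.alpha * self.m ** 2 / sum(2.0 ** -r for r in self.registers)
        # small-range correction (linear counting) when many registers are empty
        zeros = self.registers.count(0)
        if est <= 2.5 * self.m and zeros:
            est = self.m * math.log(self.m / zeros)
        return int(est)
```

The point of the structure is that a few kilobytes of registers estimate the number of *distinct* items (e.g. origins per forge) within a small relative error, regardless of how many items are added.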
The Ingestion View
^^^^^^^^^^^^^^^^^^
When looking at how software source code is harvested and ingested into the
archive, the global picture looks like this:
.. thumbnail:: ../images/general-architecture-ingestion.svg
General view of the |swh| ingestion architecture.
.. Note:: :term:`REMD` in this picture stands for :term:`raw extrinsic metadata`.
The central part of this setup is the scheduler service, responsible for
keeping track of loading, listing and a few other types of tasks. The task
execution framework uses Celery_ as backend. There are actually two completely
different task systems provided by the scheduler and its side services:
- one is dedicated to managing the loading of source code from origins (aka spawning
:ref:`Loader <swh-loader-core>` tasks); these are one-shot celery tasks not
reified in the scheduler database,
- the other is a generic task scheduling service mostly responsible for
recurring tasks, especially :ref:`forge listing <swh-lister>` ones, but not
only: some one-shot loading tasks are still handled by this scheduler
(especially loading origins from :term:`save code now` requests), and
:ref:`vault <swh-vault>` cooking tasks and deposit checker tasks also use
this generic scheduler.
A more detailed view of the latter is :ref:`available below
<source_code_scrapping>`.
One noticeable point in this schematic is the presence of the :py:class:`Blocking
Proxy <swh.storage.proxies.blocking.BlockingProxyStorage>` in the :ref:`storage
<swh-storage>` configuration. This proxy prevents ingesting from origins that
have been disabled as a result of a takedown notice.
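The behaviour of such a proxy can be sketched as a thin wrapper that refuses blocked origins and forwards everything else to its backend. This is a toy model under assumed names; the real BlockingProxyStorage is configuration-driven and its API differs:

```python
class InMemoryStorage:
    """Trivial stand-in backend recording added origin URLs."""

    def __init__(self):
        self.origins = []

    def origin_add(self, url):
        self.origins.append(url)
        return url


class BlockingProxyStorage:
    """Toy blocking proxy: rejects origins on a blocklist, delegates the
    rest unchanged to the wrapped backend (illustrative names only)."""

    def __init__(self, backend, blocked_urls):
        self.backend = backend
        self.blocked = set(blocked_urls)

    def origin_add(self, url):
        if url in self.blocked:
            raise PermissionError(f"origin blocked by takedown: {url}")
        return self.backend.origin_add(url)
```

The key design point is that blocking lives in a proxy layer, so the underlying storage implementation stays unaware of takedown policy.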
.. Note:: Even if not represented in this diagram, there are actually several
:term:`Scheduler Task` runner service instances running: one schedules
high-priority :term:`Scheduler Task` instances (using a dedicated set of `celery
queues`_), typically for :term:`save code now` requests; one is a special case
for scheduling the first visits of a newly added forge or of a :term:`bulk
on-demand archival` request (also using dedicated celery queues); the last one
is responsible for scheduling all other standard (non-priority)
:term:`Scheduler Task` instances.
.. Note:: Loading tasks are no longer represented by one-shot :term:`Scheduler Task`
instances (in the scheduler database); instead, the corresponding celery
tasks are directly spawned by the "loader scheduler" (it was not possible to
handle that many entries in the database efficiently). There is however
still an exception for deposit loading tasks, which are still managed via this
generic scheduling scaffolding (mostly for historical reasons).
The Indexation View
^^^^^^^^^^^^^^^^^^^
The |swh| archive platform also comes with a complex indexation system. A view
from this indexation side would look like:
.. thumbnail:: ../images/general-architecture-indexation.svg
General view of the |swh| indexation architecture.
See the :ref:`swh-indexer` documentation for more details.
.. _architecture-tier-1:
It relies on the :ref:`Object Storage <swh-objstorage>` service to store
the content of the source code files themselves.
Both the Storage and Object Storage are designed as abstractions over possible
backends. The former supports both PostgreSQL (the former solution in production)
and Cassandra (a more scalable option, now used as main backend in production).
The latter supports a large variety of "cloud" object storage as backends,
as well as a simple local filesystem.
Alterations
~~~~~~~~~~~
The main objective of an archive is to store facts forever. As such, it can be
viewed as an append-only infrastructure. However, it may be necessary to alter
the content of the archive to account for removal or alteration requests that
may happen `for several reasons`_.
We currently consider two types of alterations that may have to be made to the
archive:
- content removal: some objects stored in the archive should not be visible any
more; these can be either removed entirely or masked, depending on the
situation.
- personal identity modification: some personal information (namely the name
and email of a person) must no longer be visible.
These requirements have an impact on the overall architecture of the archive.
Details are documented in a :ref:`dedicated section <alterations>`.
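Applied at read time, the two alteration types above can be sketched as a masking wrapper around a storage backend. All class, method, and field names here are invented for illustration; the actual swh.storage masking proxy API differs:

```python
class InMemoryRevisionStorage:
    """Trivial stand-in backend mapping revision ids to dicts."""

    def __init__(self, revisions):
        self.revisions = revisions

    def revision_get(self, rev_id):
        return self.revisions.get(rev_id)


class MaskingProxyStorage:
    """Toy masking proxy: masked object ids return nothing (content
    removal), and author names with a recorded replacement are rewritten
    on the fly (identity modification)."""

    def __init__(self, backend, masked_ids=(), renamed=None):
        self.backend = backend
        self.masked = set(masked_ids)
        self.renamed = dict(renamed or {})

    def revision_get(self, rev_id):
        if rev_id in self.masked:
            return None  # object hidden, e.g. pending takedown
        rev = self.backend.revision_get(rev_id)
        if rev and rev["author"] in self.renamed:
            rev = {**rev, "author": self.renamed[rev["author"]]}
        return rev
```

Doing the alteration in a proxy keeps the underlying archive append-only: the stored facts are untouched, only what readers see changes.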
Journal
^^^^^^^
when to visit these repositories again.
It is also the foundation of the :ref:`mirror` infrastructure, as it allows
mirrors to stay up to date.
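The replay mechanism that lets a mirror catch up and then stay current can be sketched with a toy append-only log keeping one read offset per consumer (stdlib only; the production journal is Kafka-based, and this in-memory model omits partitioning entirely):

```python
class MiniJournal:
    """Toy append-only journal with per-consumer offsets, illustrating how
    a mirror can replay all messages from the start and then keep up."""

    def __init__(self):
        self.log = []      # append-only list of messages
        self.offsets = {}  # consumer name -> next index to read

    def publish(self, message):
        self.log.append(message)

    def poll(self, consumer):
        """Return every message this consumer has not seen yet."""
        start = self.offsets.get(consumer, 0)
        batch = self.log[start:]
        self.offsets[consumer] = len(self.log)
        return batch
```

A brand-new consumer starts at offset 0 and therefore replays the whole history, which is exactly what bootstrapping a mirror requires; subsequent polls only deliver the delta.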
.. _source_code_scrapping:
Source code scraping
^^^^^^^^^^^^^^^^^^^^
such as full-text search on origin URLs and metadata.
This service is a recent addition to the |swh| architecture based on Elasticsearch,
and is currently used only for URL search.
Compressed Graph
^^^^^^^^^^^^^^^^
:ref:`swh-graph <swh-graph>` is also a recent addition to the architecture
designed to complement the Storage using a specialized backend.
designed to keep them in sync:
in the Journal and recreate it.
.. _Cassandra: https://cassandra.apache.org
.. _celery: https://www.celeryproject.org
.. _CodeMeta: https://codemeta.github.io
.. _gitlab: https://gitlab.com
.. _PostgreSQL: https://www.postgresql.org
.. _Prometheus: https://prometheus.io
.. _publish-subscribe: https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern
.. _Redis: https://redis.io
.. _SWORDv2: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html
.. _HyperLogLog: https://redislabs.com/redis-best-practices/counting/hyperloglog
.. _WebGraph: https://webgraph.di.unimi.it
.. _`for several reasons`: https://www.softwareheritage.org/legal/content-policy
.. _`celery queues`: https://docs.celeryq.dev/en/stable/getting-started/introduction.html#what-s-a-task-queue