diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3e223cac1cb300df89c5eb5035b6dc4fbd475977..02181e7e0576348658117b476fbfe09340eca0d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,19 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml - - repo: https://gitlab.com/pycqa/flake8 - rev: 4.0.1 + - repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 - additional_dependencies: [flake8-bugbear==22.3.23] + additional_dependencies: [flake8-bugbear==22.9.23] - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.2 hooks: - id: codespell name: Check source code spelling @@ -31,11 +31,11 @@ repos: types: [python] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/python/black - rev: 22.3.0 + rev: 22.10.0 hooks: - id: black diff --git a/README.md b/README.md index f4f248183d29825cd197f8c3368968ba0499f7f4..56e255b0a6f4726f7936ed7b90b34629361b1a77 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,10 @@ swh-indexer Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - - ctags - - language - fossology-license - metadata -- revision: - - metadata +- origin: + - metadata (intrinsic, using the content indexer; and extrinsic) An indexer is in charge of: - looking up objects @@ -32,18 +30,13 @@ Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype -- language (queue swh_indexer_content_language): detect the - programming language - -- ctags (queue swh_indexer_content_ctags): compute tags information - - fossology-license (queue swh_indexer_fossology_license): compute the license -- metadata: translate file into translated_metadata dict +- metadata: translate files from ecosystem-specific formats to JSON-LD + (using the schema.org/CodeMeta vocabulary) -Current revision indexers: +Current origin indexers: -- metadata: detects files containing metadata and retrieves translated_metadata - in content_metadata table in storage or run content indexer to translate - files. +- metadata: translate files from ecosystem-specific formats to JSON-LD + (using the schema.org/CodeMeta and ForgeFed vocabularies) diff --git a/docs/dev-info.rst b/docs/dev-info.rst index 9ef8497b905308b41cb9574090977fc93ce8342a..4720098873a7569890053a41f18abcb8d68ad518 100644 --- a/docs/dev-info.rst +++ b/docs/dev-info.rst @@ -26,15 +26,9 @@ commands: .. code-block:: yaml indexers: - # language: - # batch_size: 10 - # check_presence: false fossology_license: batch_size: 10 check_presence: false - # ctags: - # batch_size: 2 - # check_presence: false - Mimetype indexer at ``~/.config/swh/indexer/mimetype.yml`` @@ -132,8 +126,6 @@ commands: - swh_indexer_orchestrator_content_all - swh_indexer_orchestrator_content_text - swh_indexer_content_mimetype - - swh_indexer_content_language - - swh_indexer_content_ctags - swh_indexer_content_fossology_license - swh_loader_svn_mount_and_load - swh_loader_git_express diff --git a/docs/index.rst b/docs/index.rst index 9cc3d625a9f3e49345239fe53a0d34d1ce8ba779..37623521eb23dbdf2d2404fa051d9ab1e52d094b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,6 +14,7 @@ information from archive source code artifacts. README.md dev-info.rst metadata-workflow.rst + swhpkg.rst mesocore.rst @@ -24,4 +25,12 @@ Reference Documentation :maxdepth: 2 cli - /apidoc/swh.indexer + +..
only:: standalone_package_doc + + Indices and tables + ------------------ + + * :ref:`genindex` + * :ref:`modindex` + * :ref:`search` diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst index 4d99106134c484a89b5ed7bbe88f394e85af613c..96bf24f29c85fde26fdb22a6db2ddeab0d10447b 100644 --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -69,7 +69,11 @@ Translation from ecosystem-specific metadata to CodeMeta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Intrinsic metadata is extracted from files provided with a project's source -code, and translated using `CodeMeta`_'s `crosswalk table`_. +code, and translated using `CodeMeta`_'s `crosswalk table`_; which is vendored +in :file:`swh/indexer/data/codemeta/codemeta.csv`. +Ecosystems not yet included in Codemeta's crosswalk have their own +:file:`swh/indexer/data/*.csv` file, with one row for each CodeMeta property, +even when not supported by the ecosystem. All input formats supported so far are straightforward dictionaries (eg. JSON) or can be accessed as such (eg. XML); and the first part of the translation is diff --git a/docs/swhpkg.rst b/docs/swhpkg.rst new file mode 100644 index 0000000000000000000000000000000000000000..bbec70e5f8c2ac6fcdc1359c473f7fe2cbe3814e --- /dev/null +++ b/docs/swhpkg.rst @@ -0,0 +1,117 @@ +SwhPkg Vocabulary +================================ + +.. note:: This is an early draft and hasn't been implemented yet + + +SwhPkg is a vocabulary that complements ontologies like schema.org and CodeMeta +in describing software projects. While the latter are meant to describe +source code projects, SwhPkg describes relationships between different packages released +by such projects. + +The namespace is ``https://www.softwareheritage.org/schema/2023/packages/``; +and it is meant to be used primarily alongside CodeMeta/schema.org +and ForgeFed/ActivityStreams. + + +The following prefixes are used throughout this document for readability: + +.. code-block:: json + + { + "schema": "http://schema.org/", + "codemeta": "https://codemeta.github.io/terms/", + "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", + "swhpackages": "https://archive.softwareheritage.org/packages/", + } + +For example, here is a document using all three together: + +.. 
code-block:: json + + { + "@context": { + "schema": "http://schema.org/", + "codemeta": "https://codemeta.github.io/terms/", + "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", + "swhpackages": "https://archive.softwareheritage.org/packages/", + "package": {"@id": "swhpkg:package", "@type": "@id"}, + "release": {"@id": "swhpkg:release", "@type": "@id"}, + "dependencies": {"@id": "swhpkg:dependencies"}, + "dependency": {"@id": "swhpkg:dependency", "@type": "@id"}, + "dependent": {"@id": "swhpkg:dependent", "@type": "@id"}, + "kind": {"@id": "swhpkg:kind"}, + "optional": {"@id": "swhpkg:optional"} + }, + "@type": "schema:SoftwareSourceCode", + "@id": "https://npmjs.com/package/d3@7.8.2", + "package": "swhpackages:js/d3", + "release": "swhpackages:js/d3@7.8.2", + "schema:name": "d3", + "schema:version": "7.8.2", + "schema:description": "Data-Driven Documents", + "dependencies": [ + { + "@type": "swhpkg:dependencies", + "@id": "swhpackages:js/d3@7.8.2#d3-array", + "dependent": "swhpackages:js/d3@7.8.2", + "dependency": "swhpackages:js/d3-array", + "constraint": "^3.0.0", + "kind": "runtime", + "optional": false + }, + { + "@type": "swhpkg:dependencies", + "@id": "swhpackages:js/d3@7.8.2#mocha", + "dependent": "swhpackages:js/d3@7.8.2", + "dependency": "swhpackages:js/mocha", + "constraint": ">10.0.0", + "kind": "development", + "optional": true + } + ] + } + +SwhPkg Terms +------------ + +.. list-table:: + :header-rows: 1 + + * - Property + - Type + - Examples + - Description + * - ``package`` + - ``swhpkg:package`` + - ``swhpackages:js/d3``, ``swhpackages:python/numpy`` + - Package that is released by the SoftwareSourceCode/SoftwareApplication. + * - ``release`` + - ``swhpkg:release`` + - ``swhpackages:js/d3@7.8.2``, ``swhpackages:python/numpy@1.24.2`` + - Specific version of the package that is released by the SoftwareSourceCode/SoftwareApplication. + * - ``dependencies`` + - ``swhpkg:dependencies`` + - d3 depends on d3-array and mocha. + - Dependencies of the project. There can be many of them. + * - ``dependent`` + - ``swhpkg:release`` + - ``swhpackages:js/d3@7.8.2`` + - A reference to the package release that depends on the dependency. + * - ``dependency`` + - ``swhpkg:package`` + - ``swhpackages:js/d3``, ``swhpackages:python/django`` + - A reference to the package that is depended on. + * - ``constraint`` + - Text + - ``^3.0.0``, ``>10.0.0`` + - The constraint on a dependency relation. It can be a version range, a git commit hash, or even a file path. + * - ``kind`` + - Text + - ``runtime``, ``development`` + - The type of dependency relation. Common values are ``runtime`` and ``development``. + * - ``optional`` + - boolean + - ``true``, ``false`` + - Whether the dependency is optional or not.
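Since the ``@context`` in the example above is self-contained, such a document can be processed with any JSON-LD library. The following is a minimal sketch (illustration only; SwhPkg is still a draft and not implemented in swh-indexer) showing how the ``swhpkg`` and ``swhpackages`` prefixes expand to full IRIs, using the ``pyld`` library that swh-indexer already depends on:

.. code-block:: python

    import json

    from pyld import jsonld

    # A trimmed-down version of the d3 example above; the @context is inlined,
    # so expansion needs no network access.
    doc = {
        "@context": {
            "schema": "http://schema.org/",
            "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/",
            "swhpackages": "https://archive.softwareheritage.org/packages/",
            "package": {"@id": "swhpkg:package", "@type": "@id"},
            "release": {"@id": "swhpkg:release", "@type": "@id"},
        },
        "@id": "https://npmjs.com/package/d3@7.8.2",
        "package": "swhpackages:js/d3",
        "release": "swhpackages:js/d3@7.8.2",
        "schema:name": "d3",
    }

    # "package" and "release" expand to full swhpkg property IRIs, and their
    # values resolve against the swhpackages namespace.
    print(json.dumps(jsonld.expand(doc), indent=2))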
+ diff --git a/mypy.ini b/mypy.ini index d63e78953bd4973585a369650d034112c3e17408..28c26fbae6feb0024ef68459574c13312e3657de 100644 --- a/mypy.ini +++ b/mypy.ini @@ -11,6 +11,9 @@ ignore_missing_imports = True [mypy-confluent_kafka.*] ignore_missing_imports = True +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-magic.*] ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt index 52654a75308d2104402e4a5eaab50c1067462440..0f868e0e63b54acb8365ef596d8ea5b1ff852856 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ -swh.core[db,http] >= 2.9 +swh.core[db,http] >= 2.20.0 swh.model >= 0.0.15 swh.objstorage >= 0.2.2 swh.scheduler >= 0.5.2 diff --git a/requirements.txt b/requirements.txt index 4dd61a2c280cd34b5ddefd2ae1204e7af8b9aa87..1cfc8ea75d4c3474fa3886ade2458c59753b7fc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ click # the version 2.1.2 is causing segmentation faults # cf https://forge.softwareheritage.org/T3815 frozendict != 2.1.2 +iso8601 pyld rdflib sentry-sdk diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index f5c8889e6cb4114ab8604118f768e141476f2a1c..939b4b1fc8fc83854925963018ad0eefc0f55347 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -247,6 +247,13 @@ def schedule_origin_metadata_reindex( type=int, help="Maximum number of objects to replay. Default is to run forever.", ) +@click.option( + "--batch-size", + "-b", + default=None, + type=int, + help="Batch size. Default is 200.", +) @click.pass_context def journal_client( ctx, @@ -257,6 +264,7 @@ def journal_client( prefix: str, group_id: str, stop_after_objects: Optional[int], + batch_size: Optional[int], ): """ Listens for new objects from the SWH Journal, and either: @@ -280,16 +288,22 @@ def journal_client( scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url) - brokers = brokers or journal_cfg.get("brokers") - if not brokers: + if brokers: + journal_cfg["brokers"] = brokers + if not journal_cfg.get("brokers"): raise ValueError("The brokers configuration is mandatory.") - prefix = prefix or journal_cfg.get("prefix") - group_id = group_id or journal_cfg.get("group_id") + if prefix: + journal_cfg["prefix"] = prefix + if group_id: + journal_cfg["group_id"] = group_id origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get( "origin_metadata_task_type" ) - stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects") + if stop_after_objects: + journal_cfg["stop_after_objects"] = stop_after_objects + if batch_size: + journal_cfg["batch_size"] = batch_size object_types = set() worker_fns: List[Callable[[ObjectsDict], Dict]] = [] @@ -350,11 +364,8 @@ def journal_client( client = get_journal_client( cls="kafka", - brokers=brokers, - prefix=prefix, - group_id=group_id, object_types=list(object_types), - stop_after_objects=stop_after_objects, + **journal_cfg, ) def worker_fn(objects: ObjectsDict): diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py index f1d00b1461172379ebe31c73f08aa52102bf4599..d7ddb72d72c420afa98bbe981e28603ebc163c96 100644 --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -9,7 +9,7 @@ import itertools import json import os.path import re -from typing import Any, List +from typing import Any, Dict, List, Set, TextIO, Tuple from pyld import jsonld import rdflib @@ -66,7 +66,15 @@ def make_absolute_uri(local_name): return uri -def _read_crosstable(fd): +def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]: + 
""" + Given a file-like object to a `CodeMeta crosswalk table` (either the main + cross-table with all columns, or an auxiliary table with just the CodeMeta + column and one ecosystem-specific table); returns a list of all CodeMeta + terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}`` + + .. _CodeMeta crosswalk table: <https://codemeta.github.io/crosswalk/ + """ reader = csv.reader(fd) try: header = next(reader) @@ -75,7 +83,9 @@ def _read_crosstable(fd): data_sources = set(header) - {"Parent Type", "Property", "Type", "Description"} - codemeta_translation = {data_source: {} for data_source in data_sources} + codemeta_translation: Dict[str, Dict[str, rdflib.URIRef]] = { + data_source: {} for data_source in data_sources + } terms = set() for line in reader: # For each canonical name @@ -101,7 +111,7 @@ def _read_crosstable(fd): with open(CROSSWALK_TABLE_PATH) as fd: - (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, CROSSWALK_TABLE) = read_crosstable(fd) def _document_loader(url, options=None): diff --git a/swh/indexer/data/Gitea.csv b/swh/indexer/data/Gitea.csv new file mode 100644 index 0000000000000000000000000000000000000000..4fe89fe07bdb9c583f3a67d89c040ee53f2b021b --- /dev/null +++ b/swh/indexer/data/Gitea.csv @@ -0,0 +1,68 @@ +Property,Gitea +codeRepository,clone_url +programmingLanguage,languages +runtimePlatform, +targetProduct, +applicationCategory, +applicationSubCategory, +downloadUrl, +fileSize, +installUrl, +memoryRequirements, +operatingSystem, +permissions, +processorRequirements, +releaseNotes, +softwareHelp, +softwareRequirements, +softwareVersion, +storageRequirements, +supportingData, +author,owner +citation, +contributor, +copyrightHolder, +copyrightYear, +dateCreated,created_at +dateModified,updated_at +datePublished, +editor, +encoding, +fileFormat, +funder, +keywords, +license, +producer, +provider, +publisher, +sponsor, +version, +isAccessibleForFree, +isPartOf, +hasPart, +position, +description,description +identifier, +name,name +sameAs, +url,website +relatedLink, +givenName, +familyName, +email, +affiliation, +identifier, +name,name +address, +type, +id, +softwareSuggestions, +maintainer, +contIntegration, +buildInstructions, +developmentStatus, +embargoDate, +funding, +issueTracker, +referencePublication, +readme, diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 65f730c5e6105fc4129726ba17b75864e4f4ad42..fbc0e1f5d3e9b2424a5426407e17e2c82db595f0 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -344,6 +344,9 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): sentry_sdk.capture_exception() summary["status"] = "failed" return summary + else: + # Reset tag after we finished processing the given content + sentry_sdk.set_tag("swh-indexer-content-sha1", "") summary_persist = self.persist_index_computations(results) self.results = results @@ -406,6 +409,9 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): self.log.exception("Problem when reading contents metadata.") sentry_sdk.capture_exception() summary["status"] = "failed" + else: + # Reset tag after we finished processing the given content + sentry_sdk.set_tag("swh-indexer-content-sha1", "") return summary @@ -493,6 +499,7 @@ class ContentPartitionIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult continue sentry_sdk.set_tag("swh-indexer-content-sha1", sha1) yield from self.index(sha1, raw_content, **kwargs) + sentry_sdk.set_tag("swh-indexer-content-sha1", "") def 
_index_with_skipping_already_done( self, partition_id: int, nb_partitions: int @@ -642,6 +649,7 @@ class OriginIndexer(BaseIndexer[str, None, TResult], Generic[TResult]): for origin in origins: sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) results.extend(self.index(origin.url, **kwargs)) + sentry_sdk.set_tag("swh-indexer-origin-url", "") return results @@ -710,6 +718,8 @@ class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult self.log.exception("Problem when processing directory") sentry_sdk.capture_exception() summary["status"] = "failed" + else: + sentry_sdk.set_tag("swh-indexer-directory-swhid", "") summary_persist = self.persist_index_computations(results) if summary_persist: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 960b6a5c228bd21513b5ab279170e4f3cff01ede..5a7a25c8e6102541783344bdd5eb6c59930a7060 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -4,6 +4,9 @@ # See top-level LICENSE file for more information from copy import deepcopy +import hashlib +import logging +import time from typing import ( Any, Callable, @@ -18,6 +21,7 @@ from typing import ( ) from urllib.parse import urlparse +import pkg_resources import sentry_sdk from swh.core.config import merge_configs @@ -55,6 +59,8 @@ ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") +logger = logging.getLogger(__name__) + def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], @@ -73,21 +79,20 @@ class ExtrinsicMetadataIndexer( def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: - results = [] + results = {} for item in objects.get("raw_extrinsic_metadata", []): - # Drop attribute 'type' (from older model versions) no longer allowed. - item.pop("type", None) remd = RawExtrinsicMetadata.from_dict(item) - sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid()) - results.extend(self.index(remd.id, data=remd)) + sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) + for result in self.index(remd.id, data=remd): + results[result.id] = result except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary - summary_persist = self.persist_index_computations(results) - self.results = results + self.results = list(results.values()) + summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: @@ -105,11 +110,18 @@ class ExtrinsicMetadataIndexer( raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) - if data.target.object_type != ExtendedObjectType.ORIGIN: + if data.target.object_type == ExtendedObjectType.ORIGIN: + origin_sha1 = data.target.object_id + elif data.origin is not None: + # HACK: As swh-search does (yet?) not support searching on directories + # and traversing back to origins, we index metadata on non-origins with + # an origin context as if they were on the origin itself. 
+ origin_sha1 = hashlib.sha1(data.origin.encode()).digest() + else: # other types are not supported yet return [] - if data.authority.type != MetadataAuthorityType.FORGE: + if data.authority.type == MetadataAuthorityType.REGISTRY: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) @@ -131,12 +143,21 @@ class ExtrinsicMetadataIndexer( return [] # TODO: batch requests to origin_get_by_sha1() - origins = self.storage.origin_get_by_sha1([data.target.object_id]) - try: - (origin,) = origins - if origin is None: - raise ValueError() - except ValueError: + for _ in range(6): + origins = self.storage.origin_get_by_sha1([origin_sha1]) + try: + (origin,) = origins + if origin is not None: + break + except ValueError: + pass + # The origin does not exist. This may be due to some replication lag + # between the loader's DB/journal and the DB we are consuming from. + # Wait a bit and try again + logger.debug("Origin %s not found, sleeping for 10s.", data.target) + time.sleep(10) + else: + # Does not exist, or replication lag > 60s. raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: @@ -239,8 +260,8 @@ class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): DEFAULT_CONFIG: Dict[str, Any] = { "tools": { - "name": "swh-metadata-detector", - "version": "0.0.2", + "name": "swh.indexer.metadata", + "version": pkg_resources.get_distribution("swh.indexer").version, "configuration": {}, }, } @@ -356,23 +377,20 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """ metadata = [] - tool = { - "name": "swh-metadata-translator", - "version": "0.0.2", - "configuration": {}, - } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content - config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} - config["tools"] = [tool] + config = { + k: self.config[k] + for k in [INDEXER_CFG_KEY, "objstorage", "storage", "tools"] + } all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) - cfg["tools"][0]["configuration"]["context"] = mapping_name + cfg["tools"]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] @@ -523,25 +541,27 @@ class OriginMetadataIndexer( results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories - dir_metadata: List[DirectoryIntrinsicMetadataRow] = [] - orig_metadata: List[OriginIntrinsicMetadataRow] = [] + dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} + orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets - if dir_item not in dir_metadata: - dir_metadata.append(dir_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) + if dir_item.id not in dir_metadata: + dir_metadata[dir_item.id] = dir_item + if orig_item.id not in orig_metadata: + orig_metadata[orig_item.id] = orig_item if dir_metadata: summary_dir = 
self.idx_storage.directory_intrinsic_metadata_add( - dir_metadata + list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: - summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) + summary_ori = self.idx_storage.origin_intrinsic_metadata_add( + list(orig_metadata.values()) + ) summary.update(summary_ori) return summary diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py index 99c2504c2a9cb9333185fa273b27a88f4b859a74..715362418efaf7f0b82b1e47cb507a4ec49816dc 100644 --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -8,7 +8,19 @@ from typing import Dict, Type import click -from . import cff, codemeta, composer, dart, github, maven, npm, nuget, python, ruby +from . import ( + cff, + codemeta, + composer, + dart, + gitea, + github, + maven, + npm, + nuget, + python, + ruby, +) from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { @@ -24,6 +36,7 @@ INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { } EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { + "GiteaMapping": gitea.GiteaMapping, "GitHubMapping": github.GitHubMapping, "JsonSwordCodemetaMapping": codemeta.JsonSwordCodemetaMapping, "SwordCodemetaMapping": codemeta.SwordCodemetaMapping, diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py index 418c2ecb82f8f2779a6ef880ed554ffce3263305..e992bb521468ed8482e22593abcfbf7493a958f3 100644 --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -5,7 +5,7 @@ import json import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar +from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union import uuid import xml.parsers.expat @@ -19,6 +19,11 @@ from swh.indexer.codemeta import _document_loader, compact from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 +from .utils import add_url_if_valid + +TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" +"""Prefix used to generate temporary URIs for root nodes being translated.""" + class DirectoryLsEntry(TypedDict): target: Sha1 @@ -126,16 +131,21 @@ class BaseIntrinsicMapping(BaseMapping): class SingleFileIntrinsicMapping(BaseIntrinsicMapping): """Base class for all intrinsic metadata mappings that use a single file as input.""" - @property - def filename(self): - """The .json file to extract metadata from.""" - raise NotImplementedError(f"{self.__class__.__name__}.filename") + filename: Union[bytes, Pattern[bytes]] @classmethod def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].lower() == cls.filename: - return [entry["sha1"]] + filename = cls.filename + # Check if filename is a regex or bytes: + if isinstance(filename, bytes): + for entry in file_entries: + if entry["name"].lower() == filename: + return [entry["sha1"]] + else: + for entry in file_entries: + if filename.match(entry["name"]): + return [entry["sha1"]] + return [] @@ -147,6 +157,10 @@ class DictMapping(BaseMapping): """List of fields that are simple strings, and don't need any normalization.""" + date_fields: List[str] = [] + """List of fields that are strings that should be typed as http://schema.org/Date + """ + uri_fields: List[str] = [] """List of fields 
that are simple URIs, and don't need any normalization.""" @@ -166,7 +180,7 @@ class DictMapping(BaseMapping): simple_terms = { str(term) for (key, term) in cls.mapping.items() - if key in cls.string_fields + cls.uri_fields + if key in cls.string_fields + cls.date_fields + cls.uri_fields or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) } @@ -180,6 +194,21 @@ class DictMapping(BaseMapping): return simple_terms | complex_terms + def get_root_uri(self, content_dict: Dict) -> rdflib.URIRef: + """Returns an URI for the SoftwareSourceCode or Repository being described. + + The default implementation uses a temporary URI that is stripped before + normalization by :meth:`_translate_dict`. + """ + # The main object being described (the SoftwareSourceCode) does not necessarily + # may or may not have an id. + # If it does, it will need to be set by a subclass. + # If it doesn't we temporarily use this URI to identify it. Unfortunately, + # we cannot use a blank node as we need to use it for JSON-LD framing later, + # and blank nodes cannot be used for framing in JSON-LD >= 1.1 + root_id = TMP_ROOT_URI_PREFIX + str(uuid.uuid4()) + return rdflib.URIRef(root_id) + def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: """ Translates content by parsing content from a dict object @@ -195,16 +224,47 @@ class DictMapping(BaseMapping): """ graph = rdflib.Graph() - # The main object being described (the SoftwareSourceCode) does not necessarily - # may or may not have an id. - # Either way, we temporarily use this URI to identify it. Unfortunately, - # we cannot use a blank node as we need to use it for JSON-LD framing later, - # and blank nodes cannot be used for framing in JSON-LD >= 1.1 - root_id = ( - "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" - + str(uuid.uuid4()) + root = self.get_root_uri(content_dict) + + self._translate_to_graph(graph, root, content_dict) + + self.sanitize(graph) + + # Convert from rdflib's internal graph representation to JSON + s = graph.serialize(format="application/ld+json") + + # Load from JSON to a list of Python objects + jsonld_graph = json.loads(s) + + # Use JSON-LD framing to turn the graph into a rooted tree + # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} + translated_metadata = jsonld.frame( + jsonld_graph, + {"@id": str(root)}, + options={ + "documentLoader": _document_loader, + "processingMode": "json-ld-1.1", + }, ) - root = rdflib.URIRef(root_id) + + # Remove the temporary id we added at the beginning + assert isinstance(translated_metadata["@id"], str) + if translated_metadata["@id"].startswith(TMP_ROOT_URI_PREFIX): + del translated_metadata["@id"] + + return self.normalize_translation(translated_metadata) + + def _translate_to_graph( + self, graph: rdflib.Graph, root: rdflib.term.Identifier, content_dict: Dict + ) -> None: + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping to the graph passed as parameter + + Args: + content_dict (dict): content dict to translate + + """ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) for k, v in content_dict.items(): @@ -231,53 +291,54 @@ class DictMapping(BaseMapping): pass elif isinstance(v, list): for item in reversed(v): - graph.add((root, codemeta_key, item)) + if isinstance(item, rdflib.URIRef): + add_url_if_valid(graph, root, codemeta_key, str(item)) + else: + graph.add((root, codemeta_key, item)) else: - graph.add((root, codemeta_key, v)) + if isinstance(v, rdflib.URIRef): + add_url_if_valid(graph, root, 
codemeta_key, str(v)) + else: + graph.add((root, codemeta_key, v)) elif k in self.string_fields and isinstance(v, str): graph.add((root, codemeta_key, rdflib.Literal(v))) elif k in self.string_fields and isinstance(v, list): for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) + elif k in self.date_fields and isinstance(v, str): + typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_v)) + elif k in self.date_fields and isinstance(v, list): + for item in v: + if isinstance(item, str): + typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_item)) elif k in self.uri_fields and isinstance(v, str): - graph.add((root, codemeta_key, rdflib.URIRef(v))) + add_url_if_valid(graph, root, codemeta_key, v) elif k in self.uri_fields and isinstance(v, list): for item in v: - if isinstance(item, str): - graph.add((root, codemeta_key, rdflib.URIRef(item))) + add_url_if_valid(graph, root, codemeta_key, item) else: continue self.extra_translation(graph, root, content_dict) - # Convert from rdflib's internal graph representation to JSON - s = graph.serialize(format="application/ld+json") + def sanitize(self, graph: rdflib.Graph) -> None: + # Remove triples that make PyLD crash + for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): + graph.remove((subject, predicate, rdflib.URIRef(""))) - # Load from JSON to a list of Python objects - jsonld_graph = json.loads(s) - - # Use JSON-LD framing to turn the graph into a rooted tree - # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} - translated_metadata = jsonld.frame( - jsonld_graph, - {"@id": root_id}, - options={ - "documentLoader": _document_loader, - "processingMode": "json-ld-1.1", - }, - ) - - # Remove the temporary id we added at the beginning - if isinstance(translated_metadata["@id"], list): - translated_metadata["@id"].remove(root_id) - else: - del translated_metadata["@id"] - - return self.normalize_translation(translated_metadata) + # Should not happen, but we'd better check, as this may lead to incorrect data + invalid = False + for triple in graph.triples((rdflib.URIRef(""), None, None)): + invalid = True + logging.error("Empty triple subject URI: %r", triple) + if invalid: + raise ValueError("Empty triple subject(s)") def extra_translation( self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any] - ): + ) -> None: """Called at the end of the translation process, and may add arbitrary triples to ``graph`` based on the input dictionary (passed as ``d``).
""" @@ -332,14 +393,14 @@ class SafeLoader(yaml.SafeLoader): } -class YamlMapping(DictMapping, SingleFileIntrinsicMapping): +class YamlMapping(DictMapping): """Base class for all mappings that use Yaml data as input.""" def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]: raw_content_string: str = raw_content.decode() try: content_dict = yaml.load(raw_content_string, Loader=SafeLoader) - except yaml.scanner.ScannerError: + except (yaml.scanner.ScannerError, yaml.parser.ParserError): return None if isinstance(content_dict, dict): diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py index 12121cc0293b90b328f4c1eadbdaae236c1cb402..0d730e883af02061214ff4ea22e623a99f4b79d7 100644 --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from typing import List +import urllib.parse from rdflib import BNode, Graph, Literal, URIRef import rdflib.term @@ -11,25 +12,30 @@ import rdflib.term from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import RDF, SCHEMA -from .base import YamlMapping +from .base import SingleFileIntrinsicMapping, YamlMapping from .utils import add_map DOI = URIRef("https://doi.org/") SPDX = URIRef("https://spdx.org/licenses/") -class CffMapping(YamlMapping): +class CffMapping(YamlMapping, SingleFileIntrinsicMapping): """Dedicated class for Citation (CITATION.cff) mapping and translation""" name = "cff" filename = b"CITATION.cff" mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"] string_fields = ["keywords", "license", "abstract", "version", "doi"] + date_fields = ["date-released"] uri_fields = ["repository-code"] def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node: node: rdflib.term.Node - if "orcid" in author and isinstance(author["orcid"], str): + if ( + "orcid" in author + and isinstance(author["orcid"], str) + and urllib.parse.urlparse(author["orcid"]).netloc + ): node = URIRef(author["orcid"]) else: node = BNode() @@ -57,7 +63,3 @@ class CffMapping(YamlMapping): def normalize_license(self, s: str) -> URIRef: if isinstance(s, str): return SPDX + s - - def normalize_date_released(self, s: str) -> Literal: - if isinstance(s, str): - return Literal(s, datatype=SCHEMA.Date) diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py index 4da5eb6a9512c60c8c86795f9d3eba1df1fbb16c..1fc613f74a6c0345179f0aee72c0ecd0b7957ac8 100644 --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -5,10 +5,12 @@ import collections import json +import logging import re -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET +import iso8601 import xmltodict from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand @@ -19,6 +21,9 @@ ATOM_URI = "http://www.w3.org/2005/Atom" _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) +_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") + +logger = logging.getLogger(__name__) class CodemetaMapping(SingleFileIntrinsicMapping): @@ -61,8 +66,13 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): def supported_terms(cls) -> List[str]: return [term for term in CODEMETA_TERMS if not term.startswith("@")] - def xml_to_jsonld(self, e: ET.Element) -> Dict[str, 
Any]: - doc: Dict[str, List[Dict[str, Any]]] = collections.defaultdict(list) + # Keys are JSON-LD property names (URIs or terms). + # Values are either a single string (if key is "type") or a list of + # other dicts with the same type recursively. + # To simplify annotations, we omit the single string case here. + doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) + for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" @@ -83,7 +93,42 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): # It is a term defined by the context; write is as-is and JSON-LD # expansion will convert it to a full URI based on # "@context": CODEMETA_CONTEXT_URL - doc[localname].append(self.xml_to_jsonld(child)) + jsonld_child = self.xml_to_jsonld(child) + if ( + localname + in ( + "dateCreated", + "dateModified", + "datePublished", + ) + and isinstance(jsonld_child, str) + and _DATE_RE.match(jsonld_child) + ): + # Dates missing a leading zero for their day/month used + # to be allowed by the deposit, so we need to reformat them + # to be valid ISO8601. + jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() + if localname == "id": + # JSON-LD only allows a single id, and they have to be strings. + if localname in doc: + logger.error( + "Duplicate <id>s in SWORD document: %r and %r", + doc[localname], + jsonld_child, + ) + continue + elif not jsonld_child: + logger.error("Empty <id> value in SWORD document") + continue + elif not isinstance(jsonld_child, str): + logger.error( + "Unexpected <id> value in SWORD document: %r", jsonld_child + ) + continue + else: + doc[localname] = jsonld_child # type: ignore[assignment] + else: + doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) @@ -95,7 +140,7 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): text = e.text.strip() if e.text else None if text: # TODO: check doc is empty, and raise mixed-content error otherwise?
- doc_["@value"] = text + return text return doc_ @@ -106,6 +151,8 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): # Transform to JSON-LD document doc = self.xml_to_jsonld(root) + assert isinstance(doc, dict), f"Root object is not a dict: {doc}" + # Add @context to JSON-LD expansion replaces the "codemeta:" prefix # hash (which uses the context URL as namespace URI for historical # reasons) into properties in `http://schema.org/` and diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py index a43fc23ea777320528a9cab030dd060e280ccbee..0c9b08b4e6eaff49e3bccb76999671280200532c 100644 --- a/swh/indexer/metadata_dictionary/composer.py +++ b/swh/indexer/metadata_dictionary/composer.py @@ -8,7 +8,7 @@ from typing import Optional from rdflib import BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import RDF, SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping @@ -20,7 +20,7 @@ SPDX = URIRef("https://spdx.org/licenses/") COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv") with open(COMPOSER_TABLE_PATH) as fd: - (CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, COMPOSER_TABLE) = read_crosstable(fd) class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping): diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py index ec6dfb26186de263d5a0115afe64712dca362121..01f28c7cc866380a79dd0157cbf626cc8f6e65a4 100644 --- a/swh/indexer/metadata_dictionary/dart.py +++ b/swh/indexer/metadata_dictionary/dart.py @@ -8,10 +8,10 @@ import re from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA -from .base import YamlMapping +from .base import SingleFileIntrinsicMapping, YamlMapping from .utils import add_map SPDX = URIRef("https://spdx.org/licenses/") @@ -19,7 +19,7 @@ SPDX = URIRef("https://spdx.org/licenses/") PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv") with open(PUB_TABLE_PATH) as fd: - (CODEMETA_TERMS, PUB_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd) def name_to_person(name): @@ -29,7 +29,7 @@ def name_to_person(name): } -class PubspecMapping(YamlMapping): +class PubspecMapping(YamlMapping, SingleFileIntrinsicMapping): name = "pubspec" filename = b"pubspec.yaml" diff --git a/swh/indexer/metadata_dictionary/gitea.py b/swh/indexer/metadata_dictionary/gitea.py new file mode 100644 index 0000000000000000000000000000000000000000..4f6e648d02831b54ab232af1ed72e11eb791886d --- /dev/null +++ b/swh/indexer/metadata_dictionary/gitea.py @@ -0,0 +1,124 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from typing import Any, Tuple + +from rdflib import RDF, BNode, Graph, Literal, URIRef + +from swh.indexer.codemeta import _DATA_DIR, read_crosstable +from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA + +from .base import BaseExtrinsicMapping, JsonMapping, produce_terms +from .utils import prettyprint_graph # noqa + +SPDX = URIRef("https://spdx.org/licenses/") + + +GITEA_TABLE_PATH = os.path.join(_DATA_DIR, 
"Gitea.csv") + +with open(GITEA_TABLE_PATH) as fd: + (CODEMETA_TERMS, GITEA_TABLE) = read_crosstable(fd) + + +class GiteaMapping(BaseExtrinsicMapping, JsonMapping): + name = "gitea" + mapping = GITEA_TABLE["Gitea"] + uri_fields = [ + "website", + "clone_url", + ] + date_fields = [ + "created_at", + "updated_at", + ] + string_fields = [ + "name", + "full_name", + "languages", + "description", + ] + + @classmethod + def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: + return ("gitea-project-json", "gogs-project-json") + + def extra_translation(self, graph, root, content_dict): + graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) + graph.add((root, RDF.type, FORGEFED.Repository)) + + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"Gitea/Gogs metadata has invalid/missing html_url: {content_dict}" + ) + + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) + def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_forks_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://forgefed.org/ns#forks": { + "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, FORGEFED.forks, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) + + @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems) + def translate_stars_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_stars_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://www.w3.org/ns/activitystreams#likes": { + "@type": "https://www.w3.org/ns/activitystreams#Collection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, ACTIVITYSTREAMS.likes, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) + + @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems) + def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_watchers_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://www.w3.org/ns/activitystreams#followers": { + "@type": "https://www.w3.org/ns/activitystreams#Collection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, ACTIVITYSTREAMS.followers, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py index fe3b87ee5e292a12b13e5f1dbb9c1fd49f49b2c8..0435c4154736132945a7d0982365d493c816ad56 100644 --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ 
-8,25 +8,32 @@ from typing import Any, Tuple from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE -from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA +from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA from .base import BaseExtrinsicMapping, JsonMapping, produce_terms -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") class GitHubMapping(BaseExtrinsicMapping, JsonMapping): name = "github" - mapping = CROSSWALK_TABLE["GitHub"] - string_fields = [ - "archive_url", + mapping = { + **CROSSWALK_TABLE["GitHub"], + "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk + "clone_url": SCHEMA.codeRepository, + } + uri_fields = [ + "clone_url", + ] + date_fields = [ "created_at", "updated_at", + ] + string_fields = [ "description", "full_name", - "html_url", - "issues_url", + "topics", ] @classmethod @@ -37,6 +44,22 @@ class GitHubMapping(BaseExtrinsicMapping, JsonMapping): graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) + if content_dict.get("has_issues"): + add_url_if_valid( + graph, + root, + CODEMETA.issueTracker, + URIRef(content_dict["html_url"] + "/issues"), + ) + + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"GitHub metadata has missing/invalid html_url: {content_dict}" + ) + + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py index a374a5e7b4dad7d3688314cf3fc178e7b63a30ef..5575ba9260a88e692c90d45fbb5981974a17ee2c 100644 --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -6,13 +6,13 @@ import os from typing import Any, Dict -from rdflib import Graph, Literal, URIRef +from rdflib import Graph, Literal from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import SingleFileIntrinsicMapping, XmlMapping -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): @@ -75,7 +75,10 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): and isinstance(artifact_id, str) ): repo = os.path.join(url, *group_id.split("."), artifact_id) - graph.add((root, SCHEMA.codeRepository, URIRef(repo))) + if "${" in repo: + # Often used as templating in pom.xml files collected from VCSs + return + add_url_if_valid(graph, root, SCHEMA.codeRepository, repo) def normalize_groupId(self, id_): """https://maven.apache.org/pom.html#Maven_Coordinates @@ -91,6 +94,7 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): >>> import xmltodict >>> import json + >>> from rdflib import URIRef >>> d = xmltodict.parse(''' ... <licenses> ...
<license> @@ -155,5 +159,5 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): elif not isinstance(licenses, list): return for license in licenses: - if isinstance(license, dict) and isinstance(license.get("url"), str): - graph.add((root, SCHEMA.license, URIRef(license["url"]))) + if isinstance(license, dict): + add_url_if_valid(graph, root, SCHEMA.license, license.get("url")) diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py index 1540ef6ad4996f1a223db44f5c86d0530243f1c6..b838e5aef86363b582cb8d3be4cd7c739d629b0f 100644 --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import re -import urllib.parse from rdflib import RDF, BNode, Graph, Literal, URIRef @@ -12,7 +11,7 @@ from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping -from .utils import add_list, prettyprint_graph # noqa +from .utils import add_list, add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") @@ -88,11 +87,13 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): rdflib.term.URIRef('https://example.org/bugs/') """ if isinstance(d, dict) and isinstance(d.get("url"), str): - return URIRef(d["url"]) + url = d["url"] elif isinstance(d, str): - return URIRef(d) + url = d else: - return None + url = "" + + return URIRef(url) _parse_author = re.compile( r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$" @@ -185,12 +186,7 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): graph.add((author, SCHEMA.name, Literal(name))) if email and isinstance(email, str): graph.add((author, SCHEMA.email, Literal(email))) - if url and isinstance(url, str): - # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop - # URLs that are blatantly invalid early, so PyLD does not crash. - parsed_url = urllib.parse.urlparse(url) - if parsed_url.netloc: - graph.add((author, SCHEMA.url, URIRef(url))) + add_url_if_valid(graph, author, SCHEMA.url, url) add_list(graph, root, SCHEMA.author, [author]) @@ -270,6 +266,16 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): rdflib.term.URIRef('https://spdx.org/licenses/MIT') """ if isinstance(s, str): + if s.startswith("SEE LICENSE IN "): + # Very common pattern, because it is an example in the specification. + # It is followed by the filename; and the indexer architecture currently + # does not allow accessing that from metadata mappings. 
+ # (Plus, an hypothetical license mapping would eventually pick it up) + return + if " " in s: + # Either an SPDX expression, or unusable data + # TODO: handle it + return return SPDX + s def normalize_keywords(self, lst): diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py index 62f7ea97e4bf22470c212b8faeef5d419323a4f6..6d52c4ac7a249e4e64e5938e169e2b597f3d3b69 100644 --- a/swh/indexer/metadata_dictionary/nuget.py +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -5,31 +5,31 @@ import os.path import re -from typing import Any, Dict, List +from typing import Any, Dict from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA -from swh.indexer.storage.interface import Sha1 -from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping -from .utils import add_list +from .base import SingleFileIntrinsicMapping, XmlMapping +from .utils import add_list, add_url_if_valid NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") with open(NUGET_TABLE_PATH) as fd: - (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, NUGET_TABLE) = read_crosstable(fd) SPDX = URIRef("https://spdx.org/licenses/") -class NuGetMapping(XmlMapping, BaseIntrinsicMapping): +class NuGetMapping(XmlMapping, SingleFileIntrinsicMapping): """ dedicated class for NuGet (.nuspec) mapping and translation """ name = "nuget" + filename = re.compile(rb".*\.nuspec") mapping = NUGET_TABLE["NuGet"] mapping["copyright"] = URIRef("http://schema.org/copyrightNotice") mapping["language"] = URIRef("http://schema.org/inLanguage") @@ -45,20 +45,13 @@ class NuGetMapping(XmlMapping, BaseIntrinsicMapping): ] uri_fields = ["projectUrl", "licenseUrl"] - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].endswith(b".nuspec"): - return [entry["sha1"]] - return [] - def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]: return super()._translate_dict(d.get("package", {}).get("metadata", {})) def translate_repository(self, graph, root, v): if isinstance(v, dict) and isinstance(v["@url"], str): codemeta_key = URIRef(self.mapping["repository.url"]) - graph.add((root, codemeta_key, URIRef(v["@url"]))) + add_url_if_valid(graph, root, codemeta_key, v["@url"]) def normalize_license(self, v): if isinstance(v, dict) and v["@type"] == "expression": diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py index 71a0b104008d337f82455770063764a9e2ffee66..7031a0a2900ee5fbe3600c86df7b581794f468e2 100644 --- a/swh/indexer/metadata_dictionary/ruby.py +++ b/swh/indexer/metadata_dictionary/ruby.py @@ -6,16 +6,13 @@ import ast import itertools import re -from typing import List from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE -from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.namespaces import SCHEMA -from swh.indexer.storage.interface import Sha1 -from .base import BaseIntrinsicMapping, DictMapping +from .base import DictMapping, SingleFileIntrinsicMapping from .utils import add_map SPDX = URIRef("https://spdx.org/licenses/") @@ -30,8 +27,9 @@ def name_to_person(graph: Graph, name): return author -class GemspecMapping(BaseIntrinsicMapping, DictMapping): +class GemspecMapping(DictMapping, SingleFileIntrinsicMapping): 
name = "gemspec" + filename = re.compile(rb".*\.gemspec") mapping = CROSSWALK_TABLE["Ruby Gem"] string_fields = ["name", "version", "description", "summary", "email"] uri_fields = ["homepage"] @@ -39,13 +37,6 @@ class GemspecMapping(BaseIntrinsicMapping, DictMapping): _re_spec_new = re.compile(r".*Gem::Specification.new +(do|\{) +\|.*\|.*") _re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)") - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].endswith(b".gemspec"): - return [entry["sha1"]] - return [] - def translate(self, raw_content): try: raw_content = raw_content.decode() diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py index 173b1461e31f4c5a4281aea9b5ca040c76b3a9f9..6aaf4fd587dd0f84df7d5291cee1c82f43e78c1d 100644 --- a/swh/indexer/metadata_dictionary/utils.py +++ b/swh/indexer/metadata_dictionary/utils.py @@ -5,7 +5,8 @@ import json -from typing import Callable, Iterable, Optional, Sequence, TypeVar +from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar +import urllib.parse from pyld import jsonld from rdflib import RDF, Graph, URIRef @@ -70,3 +71,46 @@ def add_map( """Helper for :func:`add_list` that takes a mapper function ``f``.""" nodes = [f(graph, value) for value in values] add_list(graph, subject, predicate, [node for node in nodes if node]) + + +def add_url_if_valid( + graph: Graph, + subject: rdflib.term.Node, + predicate: rdflib.term.Identifier, + url: Any, +) -> None: + """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed. + + This is meant as a workaround for https://github.com/digitalbazaar/pyld/issues/91 + to drop URLs that are blatantly invalid early, so PyLD does not crash. + + >>> from pprint import pprint + >>> graph = Graph() + >>> subject = rdflib.term.URIRef("http://example.org/test-software") + >>> predicate = rdflib.term.URIRef("http://schema.org/license") + >>> add_url_if_valid( + ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, 42 + ... 
) + >>> pprint(set(graph.triples((subject, predicate, None)))) + {(rdflib.term.URIRef('http://example.org/test-software'), + rdflib.term.URIRef('http://schema.org/license'), + rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} + """ + if not isinstance(url, str): + return + try: + parsed_url = urllib.parse.urlparse(url) + except Exception: + return + if " " in url or not parsed_url.netloc: + return + graph.add((subject, predicate, rdflib.term.URIRef(url))) diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 2d9ff6dafe509eed9d269b2376a2327be1672e13..82ac133069f91f5300da610f9534204cd7a3ee5f 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -4,15 +4,16 @@ # See top-level LICENSE file for more information import re -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union -from swh.model.model import SnapshotBranch, TargetType +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.interface import PartialBranches, StorageInterface -def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: +def get_head_swhid(storage: StorageInterface, origin_url: str) -> Optional[CoreSWHID]: """Returns the SWHID of the head revision or release of an origin""" visit_status = origin_get_latest_visit_status( storage, origin_url, allowed_statuses=["full"], require_snapshot=True @@ -20,14 +21,24 @@ def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: if not visit_status: return None assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) - if snapshot is None: - return None if visit_status.type == "ftp": - return _try_get_ftp_head(dict(snapshot.branches)) + # We need to fetch all branches in order to find the largest one + snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) + if snapshot is None: + return None + return _try_get_ftp_head(storage, snapshot) else: - return _try_get_head_generic(dict(snapshot.branches)) + # Peak into the snapshot, without fetching too many refs. + # If the snapshot is small, this gets all of it in a single request. + # If the snapshot is large, we will query specific branches as we need them. + partial_branches = storage.snapshot_get_branches( + visit_status.snapshot, branches_count=100 + ) + if partial_branches is None: + # Snapshot does not exist + return None + return _try_get_head_generic(storage, partial_branches) _archive_filename_re = re.compile( @@ -78,31 +89,56 @@ def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]: def _try_get_ftp_head( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, snapshot: Snapshot ) -> Optional[CoreSWHID]: - archive_names = list(branches) + archive_names = list(snapshot.branches) max_archive_name = max(archive_names, key=_parse_version) - return _try_resolve_target(branches, max_archive_name) + return _try_resolve_target( + storage, + {"id": snapshot.id, "branches": dict(snapshot.branches), "next_branch": None}, + branch_name=max_archive_name, + ) def _try_get_head_generic( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, partial_branches: PartialBranches ) -> Optional[CoreSWHID]: # Works on 'deposit', 'pypi', and VCSs. 
- return _try_resolve_target(branches, b"HEAD") or _try_resolve_target( - branches, b"master" - ) + return _try_resolve_target( + storage, partial_branches, branch_name=b"HEAD" + ) or _try_resolve_target(storage, partial_branches, branch_name=b"master") + + +def _get_branch( + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes +) -> Optional[SnapshotBranch]: + """Given a ``branch_name``, gets it from ``partial_branches`` if present, + and fetches it from the storage otherwise.""" + if branch_name in partial_branches["branches"]: + return partial_branches["branches"][branch_name] + elif partial_branches["next_branch"] is not None: + # Branch is not in `partial_branches`, and `partial_branches` indeed partial + res = storage.snapshot_get_branches( + partial_branches["id"], branches_from=branch_name, branches_count=1 + ) + assert res is not None, "Snapshot does not exist anymore" + return res["branches"].get(branch_name) + else: + # Branch is not in `partial_branches`, but `partial_branches` is the full + # list of branches, which means it is a dangling reference. + return None def _try_resolve_target( - branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes ) -> Optional[CoreSWHID]: try: - branch = branches[branch_name] + branch = _get_branch(storage, partial_branches, branch_name) if branch is None: return None + while branch.target_type == TargetType.ALIAS: - branch = branches[branch.target] + branch = _get_branch(storage, partial_branches, branch.target) if branch is None: return None diff --git a/swh/indexer/sql/20-enums.sql b/swh/indexer/sql/20-enums.sql index a357eb51c8ac755ea2ef52fea18ba122da664769..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/swh/indexer/sql/20-enums.sql +++ b/swh/indexer/sql/20-enums.sql @@ -1,100 +0,0 @@ -create type languages as enum ( 'abap', 'abnf', 'actionscript', - 'actionscript-3', 'ada', 'adl', 'agda', 'alloy', 'ambienttalk', - 'antlr', 'antlr-with-actionscript-target', 'antlr-with-c#-target', - 'antlr-with-cpp-target', 'antlr-with-java-target', - 'antlr-with-objectivec-target', 'antlr-with-perl-target', - 'antlr-with-python-target', 'antlr-with-ruby-target', 'apacheconf', - 'apl', 'applescript', 'arduino', 'aspectj', 'aspx-cs', 'aspx-vb', - 'asymptote', 'autohotkey', 'autoit', 'awk', 'base-makefile', 'bash', - 'bash-session', 'batchfile', 'bbcode', 'bc', 'befunge', - 'blitzbasic', 'blitzmax', 'bnf', 'boo', 'boogie', 'brainfuck', - 'bro', 'bugs', 'c', 'c#', 'c++', 'c-objdump', 'ca65-assembler', - 'cadl', 'camkes', 'cbm-basic-v2', 'ceylon', 'cfengine3', - 'cfstatement', 'chaiscript', 'chapel', 'cheetah', 'cirru', 'clay', - 'clojure', 'clojurescript', 'cmake', 'cobol', 'cobolfree', - 'coffeescript', 'coldfusion-cfc', 'coldfusion-html', 'common-lisp', - 'component-pascal', 'coq', 'cpp-objdump', 'cpsa', 'crmsh', 'croc', - 'cryptol', 'csound-document', 'csound-orchestra', 'csound-score', - 'css', 'css+django/jinja', 'css+genshi-text', 'css+lasso', - 'css+mako', 'css+mozpreproc', 'css+myghty', 'css+php', 'css+ruby', - 'css+smarty', 'cuda', 'cypher', 'cython', 'd', 'd-objdump', - 'darcs-patch', 'dart', 'debian-control-file', 'debian-sourcelist', - 'delphi', 'dg', 'diff', 'django/jinja', 'docker', 'dtd', 'duel', - 'dylan', 'dylan-session', 'dylanlid', 'earl-grey', 'easytrieve', - 'ebnf', 'ec', 'ecl', 'eiffel', 'elixir', 'elixir-iex-session', - 'elm', 'emacslisp', 'embedded-ragel', 'erb', 'erlang', - 'erlang-erl-session', 'evoque', 
'ezhil', 'factor', 'fancy', - 'fantom', 'felix', 'fish', 'fortran', 'fortranfixed', 'foxpro', - 'fsharp', 'gap', 'gas', 'genshi', 'genshi-text', 'gettext-catalog', - 'gherkin', 'glsl', 'gnuplot', 'go', 'golo', 'gooddata-cl', 'gosu', - 'gosu-template', 'groff', 'groovy', 'haml', 'handlebars', 'haskell', - 'haxe', 'hexdump', 'html', 'html+cheetah', 'html+django/jinja', - 'html+evoque', 'html+genshi', 'html+handlebars', 'html+lasso', - 'html+mako', 'html+myghty', 'html+php', 'html+smarty', 'html+twig', - 'html+velocity', 'http', 'hxml', 'hy', 'hybris', 'idl', 'idris', - 'igor', 'inform-6', 'inform-6-template', 'inform-7', 'ini', 'io', - 'ioke', 'irc-logs', 'isabelle', 'j', 'jade', 'jags', 'jasmin', - 'java', 'java-server-page', 'javascript', 'javascript+cheetah', - 'javascript+django/jinja', 'javascript+genshi-text', - 'javascript+lasso', 'javascript+mako', 'javascript+mozpreproc', - 'javascript+myghty', 'javascript+php', 'javascript+ruby', - 'javascript+smarty', 'jcl', 'json', 'json-ld', 'julia', - 'julia-console', 'kal', 'kconfig', 'koka', 'kotlin', 'lasso', - 'lean', 'lesscss', 'lighttpd-configuration-file', 'limbo', 'liquid', - 'literate-agda', 'literate-cryptol', 'literate-haskell', - 'literate-idris', 'livescript', 'llvm', 'logos', 'logtalk', 'lsl', - 'lua', 'makefile', 'mako', 'maql', 'mask', 'mason', 'mathematica', - 'matlab', 'matlab-session', 'minid', 'modelica', 'modula-2', - 'moinmoin/trac-wiki-markup', 'monkey', 'moocode', 'moonscript', - 'mozhashpreproc', 'mozpercentpreproc', 'mql', 'mscgen', - 'msdos-session', 'mupad', 'mxml', 'myghty', 'mysql', 'nasm', - 'nemerle', 'nesc', 'newlisp', 'newspeak', - 'nginx-configuration-file', 'nimrod', 'nit', 'nix', 'nsis', 'numpy', - 'objdump', 'objdump-nasm', 'objective-c', 'objective-c++', - 'objective-j', 'ocaml', 'octave', 'odin', 'ooc', 'opa', - 'openedge-abl', 'pacmanconf', 'pan', 'parasail', 'pawn', 'perl', - 'perl6', 'php', 'pig', 'pike', 'pkgconfig', 'pl/pgsql', - 'postgresql-console-(psql)', 'postgresql-sql-dialect', 'postscript', - 'povray', 'powershell', 'powershell-session', 'praat', 'prolog', - 'properties', 'protocol-buffer', 'puppet', 'pypy-log', 'python', - 'python-3', 'python-3.0-traceback', 'python-console-session', - 'python-traceback', 'qbasic', 'qml', 'qvto', 'racket', 'ragel', - 'ragel-in-c-host', 'ragel-in-cpp-host', 'ragel-in-d-host', - 'ragel-in-java-host', 'ragel-in-objective-c-host', - 'ragel-in-ruby-host', 'raw-token-data', 'rconsole', 'rd', 'rebol', - 'red', 'redcode', 'reg', 'resourcebundle', 'restructuredtext', - 'rexx', 'rhtml', 'roboconf-graph', 'roboconf-instances', - 'robotframework', 'rpmspec', 'rql', 'rsl', 'ruby', - 'ruby-irb-session', 'rust', 's', 'sass', 'scala', - 'scalate-server-page', 'scaml', 'scheme', 'scilab', 'scss', 'shen', - 'slim', 'smali', 'smalltalk', 'smarty', 'snobol', 'sourcepawn', - 'sparql', 'sql', 'sqlite3con', 'squidconf', 'stan', 'standard-ml', - 'supercollider', 'swift', 'swig', 'systemverilog', 'tads-3', 'tap', - 'tcl', 'tcsh', 'tcsh-session', 'tea', 'termcap', 'terminfo', - 'terraform', 'tex', 'text-only', 'thrift', 'todotxt', - 'trafficscript', 'treetop', 'turtle', 'twig', 'typescript', - 'urbiscript', 'vala', 'vb.net', 'vctreestatus', 'velocity', - 'verilog', 'vgl', 'vhdl', 'viml', 'x10', 'xml', 'xml+cheetah', - 'xml+django/jinja', 'xml+evoque', 'xml+lasso', 'xml+mako', - 'xml+myghty', 'xml+php', 'xml+ruby', 'xml+smarty', 'xml+velocity', - 'xquery', 'xslt', 'xtend', 'xul+mozpreproc', 'yaml', 'yaml+jinja', - 'zephir', 'unknown' -); -comment on type languages is 'Languages 
recognized by language indexer'; - -create type ctags_languages as enum ( 'Ada', 'AnsiblePlaybook', 'Ant', - 'Asm', 'Asp', 'Autoconf', 'Automake', 'Awk', 'Basic', 'BETA', 'C', - 'C#', 'C++', 'Clojure', 'Cobol', 'CoffeeScript [disabled]', 'CSS', - 'ctags', 'D', 'DBusIntrospect', 'Diff', 'DosBatch', 'DTS', 'Eiffel', - 'Erlang', 'Falcon', 'Flex', 'Fortran', 'gdbinit [disabled]', - 'Glade', 'Go', 'HTML', 'Iniconf', 'Java', 'JavaProperties', - 'JavaScript', 'JSON', 'Lisp', 'Lua', 'M4', 'Make', 'man [disabled]', - 'MatLab', 'Maven2', 'Myrddin', 'ObjectiveC', 'OCaml', 'OldC - [disabled]', 'OldC++ [disabled]', 'Pascal', 'Perl', 'Perl6', 'PHP', - 'PlistXML', 'pod', 'Protobuf', 'Python', 'PythonLoggingConfig', 'R', - 'RelaxNG', 'reStructuredText', 'REXX', 'RpmSpec', 'Ruby', 'Rust', - 'Scheme', 'Sh', 'SLang', 'SML', 'SQL', 'SVG', 'SystemdUnit', - 'SystemVerilog', 'Tcl', 'Tex', 'TTCN', 'Vera', 'Verilog', 'VHDL', - 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir' -); -comment on type ctags_languages is 'Languages recognized by ctags indexer'; diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql index 08587c3900547b387d8d1b73faeb6949b57d26e3..318fb695f1d46aa49414a60c598a9427a001d4fa 100644 --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -36,35 +36,6 @@ comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; --- Language metadata -create table content_language ( - id sha1 not null, - lang languages not null, - indexer_configuration_id bigint not null -); - -comment on table content_language is 'Language information on a raw content'; -comment on column content_language.lang is 'Language information'; -comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; - --- ctags information per content -create table content_ctags ( - id sha1 not null, - name text not null, - kind text not null, - line bigint not null, - lang ctags_languages not null, - indexer_configuration_id bigint not null -); - -comment on table content_ctags is 'Ctags information on a raw content'; -comment on column content_ctags.id is 'Content identifier'; -comment on column content_ctags.name is 'Symbol name'; -comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; -comment on column content_ctags.line is 'Symbol line'; -comment on column content_ctags.lang is 'Language information for that content'; -comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; - create table fossology_license( id smallserial, name text not null diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql index d459a4ab2c2bd32d5a7f63a1fc554413bebc90a9..85f292c6d802a9c9397a262a7cdeedf962c1239b 100644 --- a/swh/indexer/sql/50-func.sql +++ b/swh/indexer/sql/50-func.sql @@ -58,6 +58,7 @@ begin insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set mimetype = excluded.mimetype, encoding = excluded.encoding; @@ -69,118 +70,6 @@ $$; comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; --- add tmp_content_language entries to content_language, overwriting 
duplicates. --- --- If filtering duplicates is in order, the call to --- swh_content_language_missing must take place before calling this --- function. --- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_content_language, 2. call this function -create or replace function swh_content_language_add() - returns bigint - language plpgsql -as $$ -declare - res bigint; -begin - insert into content_language (id, lang, indexer_configuration_id) - select id, lang, indexer_configuration_id - from tmp_content_language tcl - on conflict(id, indexer_configuration_id) - do update set lang = excluded.lang; - - get diagnostics res = ROW_COUNT; - return res; -end -$$; - -comment on function swh_content_language_add() IS 'Add new content languages'; - --- create a temporary table for retrieving content_language -create or replace function swh_mktemp_content_language() - returns void - language sql -as $$ - create temporary table if not exists tmp_content_language ( - like content_language including defaults - ) on commit delete rows; -$$; - -comment on function swh_mktemp_content_language() is 'Helper table to add content language'; - - --- create a temporary table for content_ctags tmp_content_ctags, -create or replace function swh_mktemp_content_ctags() - returns void - language sql -as $$ - create temporary table if not exists tmp_content_ctags ( - like content_ctags including defaults - ) on commit delete rows; -$$; - -comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; - - --- add tmp_content_ctags entries to content_ctags, overwriting duplicates --- --- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, --- 2. call this function -create or replace function swh_content_ctags_add() - returns bigint - language plpgsql -as $$ -declare - res bigint; -begin - insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) - select id, name, kind, line, lang, indexer_configuration_id - from tmp_content_ctags tct - on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) - do nothing; - - get diagnostics res = ROW_COUNT; - return res; -end -$$; - -comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; - -create type content_ctags_signature as ( - id sha1, - name text, - kind text, - line bigint, - lang ctags_languages, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Search within ctags content. 
--- -create or replace function swh_content_ctags_search( - expression text, - l integer default 10, - last_sha1 sha1 default '\x0000000000000000000000000000000000000000') - returns setof content_ctags_signature - language sql -as $$ - select c.id, name, kind, line, lang, - i.id as tool_id, tool_name, tool_version, tool_configuration - from content_ctags c - inner join indexer_configuration i on i.id = c.indexer_configuration_id - where hash_sha1(name) = hash_sha1(expression) - and c.id > last_sha1 - order by id - limit l; -$$; - -comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; - - -- create a temporary table for content_fossology_license tmp_content_fossology_license, create or replace function swh_mktemp_content_fossology_license() returns void @@ -218,6 +107,7 @@ begin (select id from fossology_license where name = tcl.license) as license, indexer_configuration_id from tmp_content_fossology_license tcl + order by tcl.id, license, indexer_configuration_id on conflict(id, license_id, indexer_configuration_id) do update set license_id = excluded.license_id; @@ -237,7 +127,7 @@ comment on function swh_content_fossology_license_add() IS 'Add new content lice -- swh_content_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(content_metadata), 1. COPY to -- tmp_content_metadata, 2. call this function create or replace function swh_content_metadata_add() returns bigint @@ -249,6 +139,7 @@ begin insert into content_metadata (id, metadata, indexer_configuration_id) select id, metadata, indexer_configuration_id from tmp_content_metadata tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata; @@ -280,7 +171,7 @@ comment on function swh_mktemp_content_metadata() is 'Helper table to add conten -- swh_directory_intrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(directory_intrinsic_metadata), 1. COPY to -- tmp_directory_intrinsic_metadata, 2. call this function create or replace function swh_directory_intrinsic_metadata_add() returns bigint @@ -292,6 +183,7 @@ begin insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id from tmp_directory_intrinsic_metadata tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -345,7 +237,7 @@ $$; -- swh_origin_intrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(origin_intrinsic_metadata), 1. COPY to -- tmp_origin_intrinsic_metadata, 2. call this function create or replace function swh_origin_intrinsic_metadata_add() returns bigint @@ -360,6 +252,7 @@ begin select id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -418,7 +311,7 @@ $$; -- swh_origin_extrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. 
COPY to +-- operates in bulk: 0. swh_mktemp(origin_extrinsic_metadata), 1. COPY to -- tmp_origin_extrinsic_metadata, 2. call this function create or replace function swh_origin_extrinsic_metadata_add() returns bigint @@ -433,6 +326,7 @@ begin select id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings from tmp_origin_extrinsic_metadata + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -475,6 +369,7 @@ as $$ begin insert into indexer_configuration(tool_name, tool_version, tool_configuration) select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + order by tool_name, tool_version, tool_configuration on conflict(tool_name, tool_version, tool_configuration) do nothing; return query diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql index 5b42af79eeeae36a1aca94db1c1637af9d1816ad..20fe3ca93fc21de91cb46e0ec2c69b1d7765e49a 100644 --- a/swh/indexer/sql/60-indexes.sql +++ b/swh/indexer/sql/60-indexes.sql @@ -10,14 +10,6 @@ alter table indexer_configuration add primary key using index indexer_configurat create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); --- content_ctags -create index on content_ctags(id); -create index on content_ctags(hash_sha1(name)); -create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); - -alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; - -- content_metadata create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); alter table content_metadata add primary key using index content_metadata_pkey; @@ -41,13 +33,6 @@ alter table content_mimetype validate constraint content_mimetype_indexer_config create index on content_mimetype(id) where mimetype like 'text/%'; --- content_language -create unique index content_language_pkey on content_language(id, indexer_configuration_id); -alter table content_language add primary key using index content_language_pkey; - -alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table content_language validate constraint content_language_indexer_configuration_id_fkey; - -- content_fossology_license create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); alter table content_fossology_license add primary key using index content_fossology_license_pkey; diff --git a/swh/indexer/sql/upgrades/136.sql b/swh/indexer/sql/upgrades/136.sql new file mode 100644 index 0000000000000000000000000000000000000000..01499ac2574bc03a3747b6f7bec562fd4cf2d69c --- /dev/null +++ b/swh/indexer/sql/upgrades/136.sql @@ -0,0 +1,214 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 135 +-- to_version: 136 +-- description: Insert from temporary tables in consistent order + +insert into dbversion(version, release, description) + values(136, now(), 'Work In Progress'); + + +create or replace function swh_content_mimetype_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select 
id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_language_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_ctags_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) + select id, name, kind, line, lang, indexer_configuration_id + from tmp_content_ctags tct + order by id, hash_sha1(name), kind, line, lang, indexer_configuration_id + on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) + do nothing; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_fossology_license_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + -- insert unknown licenses first + insert into fossology_license (name) + select distinct license from tmp_content_fossology_license tmp + where not exists (select 1 from fossology_license where name=tmp.license) + on conflict(name) do nothing; + + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + order by tcl.id, license, indexer_configuration_id + on conflict(id, license_id, indexer_configuration_id) + do update set license_id = excluded.license_id; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_directory_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_directory_intrinsic_metadata tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_origin_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, 
indexer_configuration_id, from_directory, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_directory = excluded.from_directory; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_origin_extrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_extrinsic_metadata_compute_tsvector(); + + insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_remd_id, + metadata_tsvector, mappings + from tmp_origin_extrinsic_metadata + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_remd_id = excluded.from_remd_id; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_indexer_configuration_add() + returns setof indexer_configuration + language plpgsql +as $$ +begin + insert into indexer_configuration(tool_name, tool_version, tool_configuration) + select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + order by tool_name, tool_version, tool_configuration + on conflict(tool_name, tool_version, tool_configuration) do nothing; + + return query + select id, tool_name, tool_version, tool_configuration + from tmp_indexer_configuration join indexer_configuration + using(tool_name, tool_version, tool_configuration); + + return; +end +$$; + + diff --git a/swh/indexer/sql/upgrades/137.sql b/swh/indexer/sql/upgrades/137.sql new file mode 100644 index 0000000000000000000000000000000000000000..152ae0ee4966b1c31a4853e8d0ff8e104bab3d09 --- /dev/null +++ b/swh/indexer/sql/upgrades/137.sql @@ -0,0 +1,19 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 136 +-- to_version: 137 +-- description: Drop content_language and content_ctags tables and related functions + +drop function if exists swh_content_language_add; +drop function if exists swh_mktemp_content_language(); +drop function if exists swh_mktemp_content_ctags(); +drop function if exists swh_content_ctags_add(); +drop function if exists swh_content_ctags_search; + +drop type if exists content_ctags_signature; + +drop table if exists content_language; +drop table if exists content_ctags; + +drop type if exists languages; +drop type if exists ctags_languages; + diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 2c7bbc238e64b4f70dd726d6ca7329a4d98b9002..261dc525b5f38ed4724cea7fdcd143e70089ecde 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -9,6 +9,7 @@ import json from typing import Dict, Iterable, List, Optional, Tuple, Union import warnings +import attr import psycopg2 import psycopg2.pool @@ -115,17 +116,19 @@ def check_id_duplicates(data): Args: data (List[dict]): List of dictionaries to be inserted + >>> tool1 = {"name": "foo", "version": "1.2.3", "configuration": {}} + >>> tool2 = {"name": "foo", "version": "1.2.4", "configuration": {}} >>> check_id_duplicates([ - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="GPL"), - ... 
ContentLicenseRow(id=b'foo', indexer_configuration_id=32, license="GPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="GPL"), + ... ContentLicenseRow(id=b'foo', tool=tool2, license="GPL"), ... ]) >>> check_id_duplicates([ - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"), - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="AGPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="AGPL"), ... ]) Traceback (most recent call last): ... - swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42, 'license': 'AGPL'}] + swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'license': 'AGPL', 'tool_configuration': '{}', 'tool_name': 'foo', 'tool_version': '1.2.3'}] """ # noqa counter = Counter(tuple(sorted(item.unique_key().items())) for item in data) @@ -137,7 +140,7 @@ def check_id_duplicates(data): class IndexerStorage: """SWH Indexer Storage Datastore""" - current_version = 135 + current_version = 137 def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None): """ @@ -147,7 +150,7 @@ class IndexerStorage: `swh.journal.writer.get_journal_writer` """ - self.journal_writer = JournalWriter(self._tool_get_from_id, journal_writer) + self.journal_writer = JournalWriter(journal_writer) try: if isinstance(db, psycopg2.extensions.connection): self._pool = None @@ -169,6 +172,32 @@ class IndexerStorage: if db is not self._db: db.put_conn() + def _join_indexer_configuration(self, entries, db, cur): + """Replaces ``entry.indexer_configuration_id`` with a full tool dict + in ``entry.tool``.""" + joined_entries = [] + + # usually, all the additions in a batch are from the same indexer, + # so this cache allows doing a single query for all the entries. 
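# Editor's sketch (not part of the patch): the caller-visible effect of the
# tool join. The sha1, tool id, and tool name/version below are made-up values
# used only for illustration.
import attr
from swh.indexer.storage.model import ContentMimetypeRow

row = ContentMimetypeRow(
    id=b"\x01" * 20,
    mimetype="text/plain",
    encoding="us-ascii",
    indexer_configuration_id=42,
)
# After _join_indexer_configuration(), the row handed to the journal carries
# the full tool dict instead of the database-local id, e.g.:
journalled = attr.evolve(
    row,
    tool={"name": "file", "version": "5.22", "configuration": {}},
    indexer_configuration_id=None,
)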
+ tool_cache = {} + + for entry in entries: + # get the tool used to generate this addition + tool_id = entry.indexer_configuration_id + assert tool_id + if tool_id not in tool_cache: + tool_cache[tool_id] = dict( + self._tool_get_from_id(tool_id, db=db, cur=cur) + ) + del tool_cache[tool_id]["id"] + entry = attr.evolve( + entry, tool=tool_cache[tool_id], indexer_configuration_id=None + ) + + joined_entries.append(entry) + + return joined_entries + @timed @db_transaction() def check_config(self, *, check_write, db=None, cur=None): @@ -293,9 +322,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(mimetypes) - mimetypes.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_mimetype", mimetypes) + mimetypes_with_tools = self._join_indexer_configuration( + mimetypes, db=db, cur=cur + ) + check_id_duplicates(mimetypes_with_tools) + self.journal_writer.write_additions("content_mimetype", mimetypes_with_tools) db.mktemp_content_mimetype(cur) db.copy_to( [m.to_dict() for m in mimetypes], @@ -341,9 +372,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(licenses) - licenses.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_fossology_license", licenses) + licenses_with_tools = self._join_indexer_configuration(licenses, db=db, cur=cur) + check_id_duplicates(licenses_with_tools) + self.journal_writer.write_additions( + "content_fossology_license", licenses_with_tools + ) db.mktemp_content_fossology_license(cur) db.copy_to( [license.to_dict() for license in licenses], @@ -406,9 +439,9 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions("content_metadata", metadata_with_tools) db.mktemp_content_metadata(cur) @@ -460,9 +493,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "directory_intrinsic_metadata", metadata_with_tools + ) db.mktemp_directory_intrinsic_metadata(cur) @@ -504,9 +539,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "origin_intrinsic_metadata", metadata_with_tools + ) db.mktemp_origin_intrinsic_metadata(cur) @@ -646,9 +683,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("origin_extrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "origin_extrinsic_metadata", metadata_with_tools + ) db.mktemp_origin_extrinsic_metadata(cur) diff --git 
a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 020dd2bc47787fabd95d4ec3c21a7fc29c92df00..4bad74c424c4f316fdfe7f6cff3fb1fc9665adb9 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -42,6 +42,9 @@ def my_error_handler(exception): return error_handler(exception, encode_data) +app.setup_psycopg2_errorhandlers() + + @app.errorhandler(IndexerStorageArgumentException) def argument_error_handler(exception): return error_handler(exception, encode_data, status_code=400) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index fc4c9ef3e73e0df2d4f111097f44cd93ba903b4c..a99083796f10c7bdf8961bf5fc3f4ab6c654e31f 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -23,6 +23,8 @@ from typing import ( Union, ) +import attr + from swh.core.collections import SortedList from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import SHA1_SIZE @@ -83,6 +85,30 @@ class SubStorage(Generic[TValue]): self._journal_writer = journal_writer self._tools_per_id = defaultdict(set) + def _join_indexer_configuration(self, entries): + """Replaces ``entry.indexer_configuration_id`` with a full tool dict + in ``entry.tool``.""" + joined_entries = [] + + for entry in entries: + # get the tool used to generate this addition + tool_id = entry.indexer_configuration_id + assert tool_id + tool = self._tools[tool_id] + entry = attr.evolve( + entry, + tool={ + "name": tool["tool_name"], + "version": tool["tool_version"], + "configuration": tool["tool_configuration"], + }, + indexer_configuration_id=None, + ) + + joined_entries.append(entry) + + return joined_entries + def _key_from_dict(self, d) -> Tuple: """Like the global _key_from_dict, but filters out dict keys that don't belong in the unique key.""" @@ -210,15 +236,16 @@ class SubStorage(Generic[TValue]): """ data = list(data) - check_id_duplicates(data) + data_with_tools = self._join_indexer_configuration(data) + check_id_duplicates(data_with_tools) object_type = self.row_class.object_type # type: ignore - self._journal_writer.write_additions(object_type, data) + self._journal_writer.write_additions(object_type, data_with_tools) count = 0 - for obj in data: + for (obj, obj_with_tool) in zip(data, data_with_tools): item = obj.to_dict() id_ = item.pop("id") tool_id = item["indexer_configuration_id"] - key = _key_from_dict(obj.unique_key()) + key = _key_from_dict(obj_with_tool.unique_key()) self._data[id_][key] = item self._tools_per_id[id_].add(tool_id) count += 1 @@ -233,16 +260,7 @@ class IndexerStorage: def __init__(self, journal_writer=None): self._tools = {} - def tool_getter(id_): - tool = self._tools[id_] - return { - "id": tool["id"], - "name": tool["tool_name"], - "version": tool["tool_version"], - "configuration": tool["tool_configuration"], - } - - self.journal_writer = JournalWriter(tool_getter, journal_writer) + self.journal_writer = JournalWriter(journal_writer) args = (self._tools, self.journal_writer) self._mimetypes = SubStorage(ContentMimetypeRow, *args) self._licenses = SubStorage(ContentLicenseRow, *args) diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py index c05071ab595a7ce4c9d4ff69459a8a5f1a66a19a..ab8fa89f18f79fa1c2ecb94e14deed26591f98ca 100644 --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -8,6 +8,7 @@ used for the interface of the idx-storage in the near future.""" from __future__ import annotations +import json from typing import Any, Dict, List, 
Optional, Tuple, Type, TypeVar import attr @@ -20,7 +21,7 @@ TSelf = TypeVar("TSelf") @attr.s class BaseRow: - UNIQUE_KEY_FIELDS: Tuple = ("id", "indexer_configuration_id") + UNIQUE_KEY_FIELDS: Tuple = ("id",) id = attr.ib(type=Any) indexer_configuration_id = attr.ib(type=Optional[int], default=None, kw_only=True) @@ -55,15 +56,24 @@ class BaseRow: return cls(**d) def unique_key(self) -> Dict: - obj = self + if not self.tool: + raise ValueError( + f"Cannot compute unique_key of {self.__class__.__name__} with no tool " + f"dictionary (indexer_configuration_id was given instead)" + ) - # tool["id"] and obj.indexer_configuration_id are the same value, but - # only one of them is set for any given object - if obj.indexer_configuration_id is None: - assert obj.tool # constructors ensures tool XOR indexer_configuration_id - obj = attr.evolve(obj, indexer_configuration_id=obj.tool["id"], tool=None) + tool_dict = { + "tool_name": self.tool["name"], + "tool_version": self.tool["version"], + "tool_configuration": json.dumps( + self.tool["configuration"], sort_keys=True + ), + } - return {key: getattr(obj, key) for key in self.UNIQUE_KEY_FIELDS} + return { + **{key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS}, + **tool_dict, + } @attr.s @@ -78,7 +88,7 @@ class ContentMimetypeRow(BaseRow): @attr.s class ContentLicenseRow(BaseRow): object_type: Final = "content_fossology_license" - UNIQUE_KEY_FIELDS = ("id", "indexer_configuration_id", "license") + UNIQUE_KEY_FIELDS = ("id", "license") id = attr.ib(type=Sha1Git) license = attr.ib(type=str) diff --git a/swh/indexer/storage/writer.py b/swh/indexer/storage/writer.py index b4fa3658a63255467828479e2c4761a32359cf6c..e0897592f74276366732a9f3973d2760e2804cd4 100644 --- a/swh/indexer/storage/writer.py +++ b/swh/indexer/storage/writer.py @@ -1,11 +1,9 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Callable, Dict, Iterable, Optional - -import attr +from typing import Any, Dict, Iterable, Optional try: from swh.journal.writer import JournalWriterInterface, get_journal_writer @@ -24,15 +22,12 @@ class JournalWriter: journal: Optional[JournalWriterInterface] - def __init__(self, tool_getter: Callable[[int], Dict[str, Any]], journal_writer): + def __init__(self, journal_writer: Dict[str, Any]): """ Args: - tool_getter: a callable that takes a tool_id and return a dict representing - a tool object journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ - self._tool_getter = tool_getter if journal_writer: if get_journal_writer is None: raise EnvironmentError( @@ -50,20 +45,25 @@ class JournalWriter: if not self.journal: return - # usually, all the additions in a batch are from the same indexer, - # so this cache allows doing a single query for all the entries. 
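# Editor's sketch (not part of the patch): the reworked BaseRow.unique_key()
# flattens the tool dict into the key instead of using the internal id.
# The tool name/version below are made-up values.
from swh.indexer.storage.model import ContentLicenseRow

row = ContentLicenseRow(
    id=b"foo",
    license="GPL",
    tool={"name": "nomos", "version": "3.1", "configuration": {}},
)
assert row.unique_key() == {
    "id": b"foo",
    "license": "GPL",
    "tool_name": "nomos",
    "tool_version": "3.1",
    "tool_configuration": "{}",
}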
- tool_cache = {} + translated = [] for entry in entries: assert entry.object_type == obj_type # type: ignore - # get the tool used to generate this addition - tool_id = entry.indexer_configuration_id - assert tool_id - if tool_id not in tool_cache: - tool_cache[tool_id] = self._tool_getter(tool_id) - entry = attr.evolve( - entry, tool=tool_cache[tool_id], indexer_configuration_id=None - ) - # write to kafka - self.journal.write_addition(obj_type, entry) + # ids are internal to the database and should not be sent to postgresql + if entry.indexer_configuration_id is not None: + raise ValueError( + f"{entry} passed to JournalWriter.write_additions has " + f"indexer_configuration_id instead of full tool dict" + ) + assert entry.tool, "Missing both indexer_configuration_id and tool dict" + if "id" in entry.tool: + raise ValueError( + f"{entry} passed to JournalWriter.write_additions " + f"contains a tool id" + ) + + translated.append(entry) + + # write to kafka + self.journal.write_additions(obj_type, translated) diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index d1ec3ba3d643328f111e815daeacfdd9e12f2029..29a8de7507d794a11240c48a1260d23f4ab0f753 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -11,7 +11,6 @@ from unittest.mock import patch import pytest from pytest_postgresql import factories -import sentry_sdk import yaml from swh.core.db.pytest_plugin import initialize_database_for_module @@ -131,40 +130,3 @@ def swh_config(swh_indexer_config, monkeypatch, tmp_path): f.write(yaml.dump(swh_indexer_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) return conffile - - -@pytest.fixture -def sentry_init(): - # Inspired by - # https://github.com/getsentry/sentry-python/blob/1.5.9/tests/conftest.py#L168-L184 - - initialized = False - - def inner(*a, **kw): - nonlocal initialized - assert not initialized, "already initialized" - initialized = True - hub = sentry_sdk.Hub.current - client = sentry_sdk.Client(*a, **kw) - hub.bind_client(client) - client.transport = TestTransport() - - class TestTransport: - def __init__(self): - self.events = [] - self.envelopes = [] - - def capture_event(self, event): - self.events.append(event) - - def capture_envelope(self, envelope): - self.envelopes.append(envelope) - - with sentry_sdk.Hub(None): - yield inner - - -@pytest.fixture -def sentry_events(monkeypatch, sentry_init): - sentry_init() - return sentry_sdk.Hub.current.client.transport.events diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index 21865ee4bea30d6189909ac6be046b8ecc61c639..6c9d6def061c33197d9d21f80b04e6cb24760dff 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -6,6 +6,7 @@ import json from hypothesis import HealthCheck, given, settings +import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata @@ -213,6 +214,7 @@ def test_sword_basics(): <codemeta:author> <codemeta:name>Author 2</codemeta:name> </codemeta:author> + <codemeta:dateCreated>2022-10-26</codemeta:dateCreated> <author> <name>Author 3</name> <email>bar@example.org</email> @@ -229,6 +231,7 @@ def test_sword_basics(): {"name": "Author 2"}, {"name": "Author 3", "email": "bar@example.org"}, ], + "dateCreated": "2022-10-26", } @@ -252,6 +255,117 @@ def test_sword_mixed(): } +@pytest.mark.parametrize("id_", ["", " ", "\n"]) +def 
test_sword_invalid_id(id_): + content = f"""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>{id_}</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +@pytest.mark.parametrize( + "id_", + [ + "foo", + "42", + "http://example.org/", + "http://example.org/foo", + "https://example.org/", + "https://example.org/foo", + ], +) +def test_sword_id(id_): + content = f"""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>{id_}</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": id_, + "name": "My Software", + } + + +def test_sword_multiple_ids(): + """JSON-LD only allows a single id, so we ignore all but the first one.""" + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>http://example.org/foo</id> + <id>http://example.org/bar</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/foo", + "name": "My Software", + } + + +def test_sword_type(): + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <type>http://schema.org/WebSite</type> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "schema:WebSite", + "name": "My Software", + } + + +def test_sword_multiple_type(): + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <type>http://schema.org/WebSite</type> + <type>http://schema.org/SoftwareSourceCode</type> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result in ( + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["schema:WebSite", "SoftwareSourceCode"], + "name": "My Software", + }, + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["SoftwareSourceCode", "schema:WebSite"], + "name": "My Software", + }, + ) + + def test_sword_schemaorg_in_codemeta(): content = """<?xml version="1.0"?> <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" @@ -273,13 +387,16 @@ def test_sword_schemaorg_in_codemeta(): def test_sword_schemaorg_in_codemeta_constrained(): """Resulting property has the compact URI 'schema:url' instead of just the term 'url', because term 'url' is defined by the Codemeta schema - has having type '@id'.""" + has having type '@id'. 
+ Ditto for dates (with type http://schema.org/Date).""" content = """<?xml version="1.0"?> <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" xmlns="https://doi.org/10.5063/schema/codemeta-2.0" xmlns:schema="http://schema.org/"> <name>My Software</name> <schema:url>http://example.org/my-software</schema:url> + <schema:dateCreated>foo</schema:dateCreated> + <schema:dateModified>2022-10-26</schema:dateModified> </atom:entry> """ @@ -288,6 +405,8 @@ def test_sword_schemaorg_in_codemeta_constrained(): "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "schema:url": "http://example.org/my-software", + "schema:dateCreated": "foo", + "schema:dateModified": "2022-10-26", } @@ -351,6 +470,54 @@ def test_sword_multiple_names(): } +def test_sword_propertyvalue(): + content = """<?xml version="1.0"?> + <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>Name</name> + <schema:identifier> + <codemeta:type>schema:PropertyValue</codemeta:type> + <schema:propertyID>HAL-ID</schema:propertyID> + <schema:value>hal-03780423</schema:value> + </schema:identifier> + </entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "Name", + "identifier": { + "schema:propertyID": "HAL-ID", + "schema:value": "hal-03780423", + "type": "schema:PropertyValue", + }, + } + + +def test_sword_fix_date(): + content = """<?xml version="1.0"?> + <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>Name</name> + <codemeta:dateModified>2020-12-1</codemeta:dateModified> + <codemeta:dateCreated>2020-12-2</codemeta:dateCreated> + <codemeta:datePublished>2020-12-3</codemeta:datePublished> + </entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "Name", + "dateModified": "2020-12-01", + "dateCreated": "2020-12-02", + "datePublished": "2020-12-03", + } + + def test_json_sword(): content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}""" # noqa result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content) @@ -362,6 +529,6 @@ def test_json_sword(): ], "license": {"name": "GNU General Public License v3.0 or later"}, "name": "The assignment problem", - "schema:url": "http://example.org/", + "url": "http://example.org/", "name": "The assignment problem", } diff --git a/swh/indexer/tests/metadata_dictionary/test_dart.py b/swh/indexer/tests/metadata_dictionary/test_dart.py index 956d0885329d19da726916485ebc5c33a514c6f0..9dad26379bc5a8801ff9b15db4d836282f548269 100644 --- a/swh/indexer/tests/metadata_dictionary/test_dart.py +++ b/swh/indexer/tests/metadata_dictionary/test_dart.py @@ -9,7 +9,7 @@ from swh.indexer.metadata_dictionary import MAPPINGS def test_compute_metadata_pubspec(): - raw_content = """ + raw_content = b""" --- name: newtify description: >- @@ -37,9 +37,7 @@ dependencies: dev_dependencies: test: 
'>=1.15.0 <2.0.0' - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -66,11 +64,9 @@ for.""", def test_normalize_author_pubspec(): - raw_content = """ + raw_content = b""" author: Atlee Pine <atlee@example.org> - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -86,13 +82,11 @@ def test_normalize_author_pubspec(): def test_normalize_authors_pubspec(): - raw_content = """ + raw_content = b""" authors: - Vicky Merzown <vmz@example.org> - Ron Bilius Weasley - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -113,14 +107,12 @@ def test_normalize_authors_pubspec(): @pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547") def test_normalize_author_authors_pubspec(): - raw_content = """ + raw_content = b""" authors: - Vicky Merzown <vmz@example.org> - Ron Bilius Weasley author: Hermione Granger - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -144,11 +136,9 @@ def test_normalize_author_authors_pubspec(): def test_normalize_empty_authors(): - raw_content = """ + raw_content = b""" authors: - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -158,3 +148,14 @@ def test_normalize_empty_authors(): } assert result == expected + + +def test_invalid_yaml(): + raw_content = b""" + name: smartech_push + license: { :type => "Commercial", :file => "LICENSE" } + """ + + result = MAPPINGS["PubMapping"]().translate(raw_content) + + assert result is None diff --git a/swh/indexer/tests/metadata_dictionary/test_gitea.py b/swh/indexer/tests/metadata_dictionary/test_gitea.py new file mode 100644 index 0000000000000000000000000000000000000000..b1dec7c0d227260cbd9f264f9db3059ab8dad9ea --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_gitea.py @@ -0,0 +1,143 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + +CONTEXT = [ + "https://doi.org/10.5063/schema/codemeta-2.0", + { + "as": "https://www.w3.org/ns/activitystreams#", + "forge": "https://forgefed.org/ns#", + }, +] + + +def test_compute_metadata_none(): + """ + testing content empty content is empty + should return None + """ + content = b"" + + # None if no metadata was found or an error occurred + declared_metadata = None + result = MAPPINGS["GiteaMapping"]().translate(content) + assert declared_metadata == result + + +def test_supported_terms(): + terms = MAPPINGS["GiteaMapping"].supported_terms() + assert { + "http://schema.org/name", + "http://schema.org/dateCreated", + "https://forgefed.org/ns#forks", + "https://www.w3.org/ns/activitystreams#totalItems", + } <= terms + + +def test_compute_metadata_gitea(): + content = b""" +{ + "id": 48043, + "owner": { + "id": 48018, + "login": "ForgeFed", + "full_name": "", + "email": "", + "avatar_url": "https://codeberg.org/avatars/c20f7a6733a6156304137566ee35ef33", + "language": "", + "is_admin": false, + "last_login": "0001-01-01T00:00:00Z", + "created": "2022-04-30T20:13:17+02:00", + "restricted": false, + "active": false, + "prohibit_login": false, + "location": "", + "website": "https://forgefed.org/", + "description": "", + "visibility": "public", + "followers_count": 0, + "following_count": 0, + "starred_repos_count": 
0, + "username": "ForgeFed" + }, + "name": "ForgeFed", + "full_name": "ForgeFed/ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "empty": false, + "private": false, + "fork": false, + "template": false, + "parent": null, + "mirror": false, + "size": 3780, + "language": "CSS", + "languages_url": "https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed/languages", + "html_url": "https://codeberg.org/ForgeFed/ForgeFed", + "ssh_url": "git@codeberg.org:ForgeFed/ForgeFed.git", + "clone_url": "https://codeberg.org/ForgeFed/ForgeFed.git", + "original_url": "https://notabug.org/peers/forgefed", + "website": "https://forgefed.org", + "stars_count": 30, + "forks_count": 6, + "watchers_count": 11, + "open_issues_count": 61, + "open_pr_counter": 10, + "release_counter": 0, + "default_branch": "main", + "archived": false, + "created_at": "2022-06-13T18:54:26+02:00", + "updated_at": "2022-09-02T03:57:22+02:00", + "permissions": { + "admin": false, + "push": false, + "pull": true + }, + "has_issues": true, + "internal_tracker": { + "enable_time_tracker": true, + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true + }, + "has_wiki": false, + "has_pull_requests": true, + "has_projects": true, + "ignore_whitespace_conflicts": false, + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": false, + "allow_squash_merge": true, + "default_merge_style": "squash", + "avatar_url": "", + "internal": false, + "mirror_interval": "", + "mirror_updated": "0001-01-01T00:00:00Z", + "repo_transfer": null +} + """ + result = MAPPINGS["GiteaMapping"]().translate(content) + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://codeberg.org/ForgeFed/ForgeFed", + "forge:forks": { + "as:totalItems": 6, + "type": "as:OrderedCollection", + }, + "as:likes": { + "as:totalItems": 30, + "type": "as:Collection", + }, + "as:followers": { + "as:totalItems": 11, + "type": "as:Collection", + }, + "name": "ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", + "dateCreated": "2022-06-13T18:54:26+02:00", + "dateModified": "2022-09-02T03:57:22+02:00", + "url": "https://forgefed.org", + } diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py index c0592dccd79555e82a3a5e02741e667f6d1f9fe2..0ab595f47c5dc6c2ddb8771203099c5476b10721 100644 --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -32,15 +32,13 @@ def test_supported_terms(): assert { "http://schema.org/name", "http://schema.org/license", + "http://schema.org/dateCreated", "https://forgefed.org/ns#forks", "https://www.w3.org/ns/activitystreams#totalItems", } <= terms def test_compute_metadata_github(): - """ - testing only computation of metadata with hard_mapping_npm - """ content = b""" { "id": 80521091, @@ -65,6 +63,8 @@ def test_compute_metadata_github(): "created_at": "2017-01-31T13:05:39Z", "updated_at": "2022-06-22T08:02:20Z", "pushed_at": "2022-06-29T09:01:08Z", + "archive_url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer/{archive_format}{/ref}", + "issues_url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer/issues{/number}", "git_url": "git://github.com/SoftwareHeritage/swh-indexer.git", "ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git", "clone_url": 
"https://github.com/SoftwareHeritage/swh-indexer.git", @@ -116,11 +116,12 @@ def test_compute_metadata_github(): "subscribers_count": 6 } - """ + """ # noqa result = MAPPINGS["GitHubMapping"]().translate(content) assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", "forge:forks": { "as:totalItems": 1, "type": "as:OrderedCollection", @@ -136,7 +137,42 @@ def test_compute_metadata_github(): "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", - "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", - "schema:dateCreated": "2017-01-31T13:05:39Z", - "schema:dateModified": "2022-06-22T08:02:20Z", + "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer.git", + "dateCreated": "2017-01-31T13:05:39Z", + "dateModified": "2022-06-22T08:02:20Z", + } + + +def test_github_topics(): + content = b""" +{ + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", + "topics": [ + "foo", + "bar" + ] +} + """ + result = MAPPINGS["GitHubMapping"]().translate(content) + assert set(result.pop("keywords", [])) == {"foo", "bar"}, result + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", + } + + +def test_github_issues(): + content = b""" +{ + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", + "has_issues": true +} + """ + result = MAPPINGS["GitHubMapping"]().translate(content) + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", + "issueTracker": "https://github.com/SoftwareHeritage/swh-indexer/issues", } diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py index 0267e95214591fe86422012a6e2b00072f45fee5..afde286d8ebcfe5b949fb5cbefc1af5954a3f8ad 100644 --- a/swh/indexer/tests/metadata_dictionary/test_maven.py +++ b/swh/indexer/tests/metadata_dictionary/test_maven.py @@ -353,6 +353,47 @@ def test_compute_metadata_maven_multiple(): } +def test_compute_metadata_maven_invalid_repository(): + raw_content = b""" + <project> + <name>Maven Default Project</name> + <modelVersion>4.0.0</modelVersion> + <groupId>com.mycompany.app</groupId> + <artifactId>my-app</artifactId> + <version>1.2.3</version> + <repositories> + <repository> + <id>tcc-transaction-internal-releases</id> + <name>internal repository for released artifacts</name> + <url>${repo.internal.releases.url}</url> + <snapshots> + <enabled>false</enabled> + </snapshots> + <releases> + <enabled>true</enabled> + </releases> + </repository> + </repositories> + <licenses> + <license> + <name>Apache License, Version 2.0</name> + <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + <comments>A business-friendly OSS license</comments> + </license> + </licenses> + </project>""" + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "schema:identifier": "com.mycompany.app", + "version": "1.2.3", + "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", + } + + @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py 
b/swh/indexer/tests/metadata_dictionary/test_npm.py index b0ead256dc0aa5af21847a858499942a949e1248..08f8ea668c38987af5f33d726153eac17bf51712 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -294,6 +294,131 @@ def test_npm_repository_normalization(): } +def test_npm_author(): + package_json = rb"""{ + "version": "1.0.0", + "author": "Foo Bar (@example)" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "Foo Bar", "type": "Person"}], + "version": "1.0.0", + } + + +def test_npm_invalid_uris(): + package_json = rb"""{ + "version": "1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http://example.org", + "author": { + "name": "foo", + "url": "" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "url": "http://example.org", + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "" + }, + "bugs": "" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http:example.org", + "author": { + "name": "foo", + "url": "http:example.com" + }, + "bugs": { + "url": "http:example.com" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+https://g ithub.com/foo/bar.git" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+http://\\u001b[D\\u001b[D\\u001b[Ds\\u001b[C\\u001b[C\\u001b[D\\u001b://github.com/dearzoe/array-combination" +}""" # noqa + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + + +def test_npm_invalid_licenses(): + package_json = rb"""{ + "version": "1.0.0", + "license": "SEE LICENSE IN LICENSE.md", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + 
+
+
 @settings(suppress_health_check=[HealthCheck.too_slow])
 @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping)))  # type: ignore
 def test_npm_adversarial(doc):
diff --git a/swh/indexer/tests/storage/test_api_client.py b/swh/indexer/tests/storage/test_api_client.py
index 250b6d870b58f1112a1a11f56b66387fb29f726c..3620d73fab85abf27a407e24222e7d9a055a8565 100644
--- a/swh/indexer/tests/storage/test_api_client.py
+++ b/swh/indexer/tests/storage/test_api_client.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2015-2019 The Software Heritage developers
+# Copyright (C) 2015-2023 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import psycopg2
 import pytest
 
+from swh.core.api import RemoteException, TransientRemoteException
 from swh.indexer.storage import get_indexer_storage
 from swh.indexer.storage.api.client import RemoteStorage
 import swh.indexer.storage.api.server as server
@@ -54,3 +56,46 @@ def swh_indexer_storage(swh_rpc_client, app_server):
     storage.journal_writer = app_server.storage.journal_writer
     yield storage
     storage.journal_writer = journal_writer
+
+
+def test_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises unknown exceptions as a :exc:`RemoteException`"""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=ValueError("crash"),
+    )
+    with pytest.raises(RemoteException) as e:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert not isinstance(e.value, TransientRemoteException)
+
+
+def test_operationalerror_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises as a :exc:`TransientRemoteException`
+    rather than the base :exc:`RemoteException`, so the retrying proxy
+    retries for longer."""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=psycopg2.errors.AdminShutdown("cluster is shutting down"),
+    )
+    with pytest.raises(RemoteException) as excinfo:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert isinstance(excinfo.value, TransientRemoteException)
+
+
+def test_querycancelled_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises as the base :exc:`RemoteException`
+    rather than a :exc:`TransientRemoteException`, so the retrying proxy
+    does not retry for longer."""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=psycopg2.errors.QueryCanceled("too big!"),
+    )
+    with pytest.raises(RemoteException) as excinfo:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert not isinstance(excinfo.value, TransientRemoteException)
diff --git a/swh/indexer/tests/storage/test_model.py b/swh/indexer/tests/storage/test_model.py
index d33e5294b98770076fcde4c976241e8fbc3d9e79..981546d750ebf5998fed6968246b6bf17e1e192b 100644
--- a/swh/indexer/tests/storage/test_model.py
+++ b/swh/indexer/tests/storage/test_model.py
@@ -1,26 +1,57 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information +import pytest + from swh.indexer.storage.model import BaseRow, ContentLicenseRow +def test_unique_key__no_tool_dict(): + with pytest.raises(ValueError, match="indexer_configuration_id"): + BaseRow(id=12, indexer_configuration_id=34).unique_key() + with pytest.raises(ValueError, match="indexer_configuration_id"): + ContentLicenseRow( + id=12, indexer_configuration_id=34, license="BSD" + ).unique_key() + + def test_unique_key(): - assert BaseRow(id=12, indexer_configuration_id=34).unique_key() == { + assert BaseRow( + id=12, tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}} + ).unique_key() == { "id": 12, - "indexer_configuration_id": 34, + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": "{}", } - assert BaseRow(id=12, tool={"id": 34, "name": "foo"}).unique_key() == { + assert ContentLicenseRow( + id=12, + tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}}, + license="BSD", + ).unique_key() == { "id": 12, - "indexer_configuration_id": 34, + "license": "BSD", + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": "{}", } assert ContentLicenseRow( - id=12, indexer_configuration_id=34, license="BSD" - ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"} - - assert ContentLicenseRow( - id=12, tool={"id": 34, "name": "foo"}, license="BSD" - ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"} + id=12, + tool={ + "id": 34, + "name": "foo", + "version": "1.2.3", + "configuration": {"foo": 1, "bar": 2}, + }, + license="BSD", + ).unique_key() == { + "id": 12, + "license": "BSD", + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": '{"bar": 2, "foo": 1}', + } diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index a2b671453af7f5a5c08950bcece951745807b4a6..e7d20972e83d4ba62a1f6880f0aa8d5b6c1dd558 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -24,6 +24,15 @@ from swh.indexer.storage.model import ( from swh.model.hashutil import hash_to_bytes +def _remove_tool_ids(rows): + results = [] + for row in rows: + tool = dict(row.tool) + del tool["id"] + results.append(attr.evolve(row, tool=tool)) + return results + + def prepare_mimetypes_from_licenses( fossology_licenses: List[ContentLicenseRow], ) -> List[ContentMimetypeRow]: @@ -358,11 +367,13 @@ class StorageETypeTester: assert actual_data == expected_data + expected_journal_data = _remove_tool_ids(expected_data) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_data = [ obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type ] - assert list(sorted(actual_journal_data)) == list(sorted(expected_data)) + assert list(sorted(actual_journal_data)) == list(sorted(expected_journal_data)) class TestIndexerStorageContentMimetypes(StorageETypeTester): @@ -574,11 +585,13 @@ class TestIndexerStorageContentMetadata(StorageETypeTester): assert actual_data in (expected_data_postgresql, expected_data_verbatim) + expected_journal_data = _remove_tool_ids(expected_data_verbatim) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_data = [ obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type ] - assert list(sorted(actual_journal_data)) == list(sorted(expected_data_verbatim)) + assert list(sorted(actual_journal_data)) == 
list(sorted(expected_journal_data)) class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester): @@ -912,13 +925,17 @@ class TestIndexerStorageOriginIntrinsicMetadata: assert actual_metadata == expected_metadata + expected_journal_metadata = _remove_tool_ids(expected_metadata) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_metadata = [ obj for (obj_type, obj) in journal_objects if obj_type == "origin_intrinsic_metadata" ] - assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata)) + assert list(sorted(actual_journal_metadata)) == list( + sorted(expected_journal_metadata) + ) def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] @@ -1527,13 +1544,17 @@ class TestIndexerStorageOriginExtrinsicMetadata: assert actual_metadata == expected_metadata + expected_journal_metadata = _remove_tool_ids(expected_metadata) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_metadata = [ obj for (obj_type, obj) in journal_objects if obj_type == "origin_extrinsic_metadata" ] - assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata)) + assert list(sorted(actual_journal_metadata)) == list( + sorted(expected_journal_metadata) + ) def test_origin_extrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index 6bbab408be74afc82b22c8f2c498ad4fd2c4418d..439a683b15378cad72ae4c1ff93c3c60f875229d 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -28,7 +28,7 @@ from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes from swh.model.model import Content, Origin, OriginVisitStatus -from .test_metadata import REMD +from .test_metadata import GITHUB_REMD from .utils import ( DIRECTORY2, RAW_CONTENT_IDS, @@ -110,6 +110,7 @@ def test_cli_mapping_list(cli_runner, swh_config): "codemeta", "composer", "gemspec", + "gitea", "github", "json-sword-codemeta", "maven", @@ -710,7 +711,7 @@ def test_cli_journal_client_index__origin_extrinsic_metadata( origin = Origin("http://example.org/repo.git") storage.origin_add([origin]) - raw_extrinsic_metadata = attr.evolve(REMD, target=origin.swhid()) + raw_extrinsic_metadata = attr.evolve(GITHUB_REMD, target=origin.swhid()) raw_extrinsic_metadata = attr.evolve( raw_extrinsic_metadata, id=raw_extrinsic_metadata.compute_hash() ) @@ -749,6 +750,7 @@ def test_cli_journal_client_index__origin_extrinsic_metadata( mappings=["github"], metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 20c49c092be38a481c82d8d8cea16385c11a9cb9..61c71cdc39423ffeeb23745d078334add8a897fb 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -59,7 +59,38 @@ DIRECTORY_METADATA_CONFIG = { "tools": TRANSLATOR_TOOL, } -REMD = RawExtrinsicMetadata( +DEPOSIT_REMD = RawExtrinsicMetadata( + target=ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, + object_id=b"\x02" * 20, + ), + discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), + authority=MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url="https://example.org/", + ), 
+ fetcher=MetadataFetcher( + name="example-fetcher", + version="1.0.0", + ), + format="sword-v2-atom-codemeta-v2", + metadata="""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0"> + <name>My Software</name> + <author> + <name>Author 1</name> + <email>foo@example.org</email> + </author> + <author> + <name>Author 2</name> + </author> + </atom:entry> + """.encode(), + origin="https://example.org/jdoe/myrepo", +) + +GITHUB_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, @@ -74,7 +105,7 @@ REMD = RawExtrinsicMetadata( version="1.0.0", ), format="application/vnd.github.v3+json", - metadata=b'{"full_name": "test software"}', + metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) @@ -199,7 +230,7 @@ class TestMetadata: metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") - remd = attr.evolve(REMD, format="unknown format") + remd = attr.evolve(GITHUB_REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) @@ -221,7 +252,7 @@ class TestMetadata: assert tool is not None assert metadata_indexer.process_journal_objects( - {"raw_extrinsic_metadata": [REMD.to_dict()]} + {"raw_extrinsic_metadata": [GITHUB_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ @@ -237,22 +268,98 @@ class TestMetadata: tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, - from_remd_id=REMD.id, + from_remd_id=GITHUB_REMD.id, mappings=["github"], ) ] + def test_extrinsic_metadata_indexer_firstparty_deposit(self, mocker): + """Also nominal case, calling the mapping and storing the result""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + assert metadata_indexer.storage.method_calls == [ + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) + ] + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert results == [ + OriginExtrinsicMetadataRow( + id="https://example.org/jdoe/myrepo", + tool={"id": tool["id"], **TRANSLATOR_TOOL}, + metadata={ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "author": [ + {"email": "foo@example.org", "name": "Author 1"}, + {"name": "Author 2"}, + ], + "name": "My Software", + }, + from_remd_id=DEPOSIT_REMD.id, + mappings=["sword-codemeta"], + ) + ] + + def test_extrinsic_metadata_indexer_thirdparty_deposit(self, mocker): + """Metadata-only deposit: currently ignored""" + origin = "https://not-from-example.org/jdoe/myrepo" + + metadata_indexer = 
ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} + ) == {"status": "uneventful", "origin_extrinsic_metadata:add": 0} + + assert metadata_indexer.storage.method_calls == [ + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) + ] + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert results == [] + def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( - REMD, - authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY), + GITHUB_REMD, + authority=attr.evolve( + GITHUB_REMD.authority, type=MetadataAuthorityType.REGISTRY + ), ) results = metadata_indexer.index(remd.id, data=remd) @@ -275,9 +382,71 @@ class TestMetadata: ) assert tool is not None - results = metadata_indexer.index(REMD.id, data=REMD) + results = metadata_indexer.index(GITHUB_REMD.id, data=GITHUB_REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] + + def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): + """Two metadata objects with the same origin target""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + GITHUB_REMD.to_dict(), + {**GITHUB_REMD.to_dict(), "id": b"\x00" * 20}, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20 + + def test_extrinsic_directory_metadata_indexer_duplicate_origin(self, mocker): + """Two metadata objects on directories, but with an origin context""" + origin = DEPOSIT_REMD.origin + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + DEPOSIT_REMD.to_dict(), + { + **DEPOSIT_REMD.to_dict(), + "id": b"\x00" * 20, + "target": "swh:1:dir:" + "01" 
* 20, + }, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20 diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 999084bb4982eebdcea0daf7752223a580c3431a..e44ca71244ac28be435815c6ae85fddedda933e2 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from datetime import datetime, timezone +import itertools import pytest @@ -20,6 +21,13 @@ from swh.model.model import ( from swh.model.swhids import CoreSWHID from swh.storage.utils import now + +@pytest.fixture +def swh_storage_backend_config(): + """In-memory storage, to make tests go faster.""" + return {"cls": "memory"} + + SAMPLE_SNAPSHOT = Snapshot( branches={ b"foo": None, @@ -31,6 +39,28 @@ SAMPLE_SNAPSHOT = Snapshot( ) +def _add_snapshot_to_origin(storage, origin_url, visit_type, snapshot): + storage.origin_add([Origin(url=origin_url)]) + visit = storage.origin_visit_add( + [ + OriginVisit( + origin=origin_url, + date=datetime(2019, 2, 27, tzinfo=timezone.utc), + type="pypi", + ) + ] + )[0] + storage.snapshot_add([snapshot]) + visit_status = OriginVisitStatus( + origin=origin_url, + visit=visit.visit, + date=now(), + status="full", + snapshot=snapshot.id, + ) + storage.origin_visit_status_add([visit_status]) + + @pytest.fixture def storage(swh_storage): fill_storage(swh_storage) @@ -77,31 +107,115 @@ def test_vcs_missing_snapshot(storage): def test_pypi_missing_branch(storage): origin_url = "https://pypi.org/project/abcdef/" - storage.origin_add( - [ - Origin( - url=origin_url, - ) - ] + _add_snapshot_to_origin(storage, origin_url, "pypi", SAMPLE_SNAPSHOT) + assert get_head_swhid(storage, origin_url) is None + + +@pytest.mark.parametrize( + "branches_start,branches_middle,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot(storage, branches_start, branches_middle, branches_end): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(branches_middle)] + + [ + ( + b"refs/heads/foo", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) ) - visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=datetime(2019, 2, 27, tzinfo=timezone.utc), - type="pypi", - ) - ] - )[0] - storage.snapshot_add([SAMPLE_SNAPSHOT]) - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit.visit, - date=now(), - status="full", - snapshot=SAMPLE_SNAPSHOT.id, + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) - storage.origin_visit_status_add([visit_status]) + + +def test_large_snapshot_chained_aliases(storage): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(200)] 
+ + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/alias2" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/alias2", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/branch" + ), + ) + ] + + [(f"refs/heads/bbbb{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/branch", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + ) + + +@pytest.mark.parametrize( + "branches_start,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot_dangling_alias(storage, branches_start, branches_end): + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + assert get_head_swhid(storage, origin_url) is None diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 567f479adbc8bf2165326ee34d903c7239e8f414..4b7057e1796215525142e1141dc9f942169a79bb 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,7 @@ import copy from unittest.mock import patch +import attr import pytest from swh.indexer.metadata import OriginMetadataIndexer @@ -213,6 +214,58 @@ def test_origin_metadata_indexer_duplicate_directory( assert len(orig_results) == 2 +def test_origin_metadata_indexer_duplicate_directory_different_result( + swh_indexer_config, + idx_storage: IndexerStorageInterface, + storage: StorageInterface, + obj_storage, + mocker, +) -> None: + """Same as above, but indexing the same directory twice resulted in different + data (because list order differs). 
+ """ + indexer = OriginMetadataIndexer(config=swh_indexer_config) + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.catch_exceptions = False + origin1 = "https://github.com/librariesio/yarn-parser" + origin2 = "https://github.com/librariesio/yarn-parser.git" + + directory_index = indexer.directory_metadata_indexer.index + + nb_calls = 0 + + def side_effect(dir_id): + nonlocal nb_calls + if nb_calls == 0: + keywords = ["foo", "bar"] + elif nb_calls == 1: + keywords = ["bar", "foo"] + else: + assert False, nb_calls + nb_calls += 1 + return [ + attr.evolve(row, metadata={**row.metadata, "keywords": keywords}) + for row in directory_index(dir_id) + ] + + mocker.patch.object( + indexer.directory_metadata_indexer, "index", side_effect=side_effect + ) + + indexer.run([origin1, origin2]) + + dir_id = DIRECTORY2.id + + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert len(dir_results) == 1 + + orig_results = list( + indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) + ) + assert len(orig_results) == 2 + + def test_origin_metadata_indexer_no_metadata_file( swh_indexer_config, idx_storage: IndexerStorageInterface, diff --git a/tox.ini b/tox.ini index f0bda88c43a56d26850573d587eba2812e509376..b135fcc60c7362d419d26c5b2464ef89692ec11f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,4 +1,6 @@ [tox] +requires = + tox>4 envlist=black,flake8,mypy,py3 [testenv] @@ -20,15 +22,16 @@ commands = [testenv:black] skip_install = true deps = - black==22.3.0 + black==22.10.0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = - flake8==4.0.1 - flake8-bugbear==22.3.23 + flake8==5.0.4 + flake8-bugbear==22.9.23 + pycodestyle==2.9.1 commands = {envpython} -m flake8 @@ -36,7 +39,7 @@ commands = extras = testing deps = - mypy==0.942 + mypy==1.0 commands = mypy swh @@ -44,14 +47,12 @@ commands = # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] -whitelist_externals = make +allowlist_externals = make usedevelop = true extras = testing deps = - # fetch and install swh-docs in develop mode - -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs - + -e git+https://gitlab.softwareheritage.org/swh/devel/swh-docs.git\#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors @@ -59,18 +60,16 @@ setenv = commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs - # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] -whitelist_externals = make +allowlist_externals = make usedevelop = true extras = testing deps = # install swh-docs in develop mode -e ../swh-docs - setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors