From c87a210dfb3b5048c5c4d9acb5dfaea609d1dc37 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 13 Dec 2024 17:29:56 +0100
Subject: [PATCH] indexer: Use CompositeObjId as object id type for
 ContentIndexer

Previously, content indexers only used the content sha1 as object
identifier, but this could lead to errors in production when attempting
to fetch content data from an object storage whose key is the sha256
checksum of the content (winery for instance).

So use a CompositeObjId dictionary as the content object identifier
to avoid this kind of issue when indexing content objects.
---
 swh/indexer/fossology_license.py              | 23 ++++--------
 swh/indexer/indexer.py                        | 22 ++++++------
 swh/indexer/metadata.py                       | 25 +++++++------
 swh/indexer/metadata_detector.py              |  6 ++--
 swh/indexer/metadata_dictionary/base.py       | 30 ++++++++++++----
 swh/indexer/mimetype.py                       | 15 ++++----
 .../metadata_dictionary/test_codemeta.py      |  5 ++-
 .../tests/metadata_dictionary/test_npm.py     | 15 +++++---
 .../tests/metadata_dictionary/test_nuget.py   |  5 +--
 .../tests/metadata_dictionary/test_python.py  |  5 +--
 swh/indexer/tests/test_fossology_license.py   | 10 +++---
 swh/indexer/tests/test_indexer.py             |  7 ++--
 swh/indexer/tests/test_metadata.py            |  4 +--
 swh/indexer/tests/utils.py                    | 35 ++++++++++++-------
 14 files changed, 121 insertions(+), 86 deletions(-)

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index 8ccf9d5c..20a9fa84 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2023  The Software Heritage developers
+# Copyright (C) 2016-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -10,9 +10,10 @@ from typing import Any, Dict, List, Optional
 import sentry_sdk
 
 from swh.core.config import merge_configs
-from swh.indexer.storage.interface import IndexerStorageInterface, Sha1
+from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import ContentLicenseRow
 from swh.model import hashutil
+from swh.objstorage.interface import CompositeObjId
 
 from .indexer import ContentIndexer, write_to_temp
 
@@ -86,7 +87,7 @@ class MixinFossologyLicenseIndexer:
         self.working_directory = self.config["workdir"]
 
     def index(
-        self, id: Sha1, data: Optional[bytes] = None, **kwargs
+        self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs
     ) -> List[ContentLicenseRow]:
         """Index sha1s' content and store result.
 
@@ -105,14 +106,14 @@ class MixinFossologyLicenseIndexer:
         """
         assert data is not None
         with write_to_temp(
-            filename=hashutil.hash_to_hex(id),  # use the id as pathname
+            filename=hashutil.hash_to_hex(id["sha1"]),  # use the id as pathname
             data=data,
             working_directory=self.working_directory,
         ) as content_path:
             properties = compute_license(path=content_path)
         return [
             ContentLicenseRow(
-                id=id,
+                id=id["sha1"],
                 indexer_configuration_id=self.tool["id"],
                 license=license,
             )
@@ -148,14 +149,4 @@ class FossologyLicenseIndexer(
 
     """
 
-    def filter(self, ids):
-        """Filter out known sha1s and return only missing ones."""
-        yield from self.idx_storage.content_fossology_license_missing(
-            (
-                {
-                    "id": sha1,
-                    "indexer_configuration_id": self.tool["id"],
-                }
-                for sha1 in ids
-            )
-        )
+    pass
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index bbb753e4..37e51ee0 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2023  The Software Heritage developers
+# Copyright (C) 2016-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -15,12 +15,12 @@ import sentry_sdk
 from typing_extensions import TypedDict
 
 from swh.core.config import load_from_envvar, merge_configs
-from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
+from swh.indexer.storage import INDEXER_CFG_KEY, get_indexer_storage
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.model import hashutil
 from swh.model.model import Directory, Origin, Sha1Git
 from swh.objstorage.factory import get_objstorage
-from swh.objstorage.interface import objid_from_dict
+from swh.objstorage.interface import CompositeObjId, objid_from_dict
 from swh.storage import get_storage
 from swh.storage.interface import StorageInterface
 
@@ -279,7 +279,7 @@ class BaseIndexer(Generic[TId, TData, TResult], metaclass=abc.ABCMeta):
         raise NotImplementedError()
 
 
-class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
+class ContentIndexer(BaseIndexer[CompositeObjId, bytes, TResult], Generic[TResult]):
     """A content indexer working on the journal (method `process_journal_objects`) or on
     a list of ids directly (method `run`).
 
@@ -293,10 +293,12 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
         """Read content objects from the journal, retrieve their raw content and compute
         content indexing (e.g. mimetype, fossology license, ...).
         """
-        summary, _ = self.run([obj_id["sha1"] for obj_id in objects.get("content", [])])
+        summary, _ = self.run(
+            [objid_from_dict(obj) for obj in objects.get("content", [])]
+        )
         return summary
 
-    def run(self, ids: List[Sha1], **kwargs) -> Tuple[Dict, List]:
+    def run(self, ids: List[CompositeObjId], **kwargs) -> Tuple[Dict, List]:
         """Given a list of ids:
 
         - retrieve the content from the storage
@@ -314,18 +316,16 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
         summary: Dict[str, Any] = {"status": "uneventful"}
         results = []
         try:
-            content_data = self.objstorage.get_batch(
-                [objid_from_dict({"sha1": id}) for id in ids]
-            )
+            content_data = self.objstorage.get_batch(ids)
             for item, raw_content in zip(ids, content_data):
                 id_ = item
                 sentry_sdk.set_tag(
-                    "swh-indexer-content-sha1", hashutil.hash_to_hex(id_)
+                    "swh-indexer-content-sha1", hashutil.hash_to_hex(id_["sha1"])
                 )
                 if not raw_content:
                     self.log.warning(
                         "Content %s not found in objstorage",
-                        hashutil.hash_to_hex(id_),
+                        hashutil.hash_to_hex(id_["sha1"]),
                     )
                     continue
 
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 2b37e7ed..f95e7b38 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -41,7 +41,7 @@ from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
 from swh.indexer.origin_head import get_head_swhid
-from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
+from swh.indexer.storage import INDEXER_CFG_KEY
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     DirectoryIntrinsicMetadataRow,
@@ -58,6 +58,7 @@ from swh.model.model import (
     Sha1Git,
 )
 from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
+from swh.objstorage.interface import CompositeObjId
 
 REVISION_GET_BATCH_SIZE = 10
 RELEASE_GET_BATCH_SIZE = 10
@@ -232,21 +233,21 @@ class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
 
     """
 
-    def filter(self, ids):
+    def filter(self, ids: List[CompositeObjId]):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.content_metadata_missing(
             (
                 {
-                    "id": sha1,
+                    "id": id["sha1"],
                     "indexer_configuration_id": self.tool["id"],
                 }
-                for sha1 in ids
+                for id in ids
             )
         )
 
     def index(
         self,
-        id: Sha1,
+        id: CompositeObjId,
         data: Optional[bytes] = None,
         log_suffix="unknown directory",
         **kwargs,
@@ -263,24 +264,24 @@ class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
             be returned as None
 
         """
-        assert isinstance(id, bytes)
+        assert "sha1" in id
         assert data is not None
         metadata = None
         try:
             mapping_name = self.tool["tool_configuration"]["context"]
-            log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
+            log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id["sha1"])
             metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data)
         except Exception:
             self.log.exception(
                 "Problem during metadata translation "
-                "for content %s" % hashutil.hash_to_hex(id)
+                "for content %s" % hashutil.hash_to_hex(id["sha1"])
             )
             sentry_sdk.capture_exception()
         if metadata is None:
             return []
         return [
             ContentMetadataRow(
-                id=id,
+                id=id["sha1"],
                 indexer_configuration_id=self.tool["id"],
                 metadata=metadata,
             )
@@ -429,7 +430,9 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
             c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
             sha1s_in_storage = []
-            metadata_generator = self.idx_storage.content_metadata_get(detected_files)
+            metadata_generator = self.idx_storage.content_metadata_get(
+                [f["sha1"] for f in detected_files]
+            )
             for c in metadata_generator:
                 # extracting metadata
                 sha1 = c.id
@@ -440,7 +443,7 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
                     metadata.append(local_metadata)
 
             sha1s_filtered = [
-                item for item in detected_files if item not in sha1s_in_storage
+                item for item in detected_files if item["sha1"] not in sha1s_in_storage
             ]
 
             if sha1s_filtered:
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
index d0ca829c..ceabdf92 100644
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 The Software Heritage developers
+# Copyright (C) 2017-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,10 +7,10 @@ from typing import Dict, List
 
 from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
-from swh.indexer.storage.interface import Sha1
+from swh.objstorage.interface import CompositeObjId
 
 
-def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[Sha1]]:
+def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[CompositeObjId]]:
     """
     Detects files potentially containing metadata
 
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
index d3e8d201..f356ed9e 100644
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,11 +1,22 @@
-# Copyright (C) 2017-2023  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import logging
-from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Pattern,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
 import uuid
 import xml.parsers.expat
 
@@ -18,6 +29,7 @@ import yaml
 from swh.indexer.codemeta import _document_loader, compact
 from swh.indexer.namespaces import RDF, SCHEMA
 from swh.indexer.storage.interface import Sha1
+from swh.objstorage.interface import CompositeObjId, objid_from_dict
 
 from .utils import add_url_if_valid
 
@@ -28,6 +40,8 @@ TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-
 class DirectoryLsEntry(TypedDict):
     target: Sha1
     sha1: Optional[Sha1]
+    sha1_git: Optional[bytes]
+    sha256: Optional[bytes]
     name: bytes
     type: str
 
@@ -118,7 +132,9 @@ class BaseIntrinsicMapping(BaseMapping):
     """
 
     @classmethod
-    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
+    def detect_metadata_files(
+        cls, file_entries: List[DirectoryLsEntry]
+    ) -> List[CompositeObjId]:
         """
         Returns the sha1 hashes of files which can be translated by this mapping
         """
@@ -134,19 +150,21 @@ class SingleFileIntrinsicMapping(BaseIntrinsicMapping):
     filename: Union[bytes, Pattern[bytes]]
 
     @classmethod
-    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
+    def detect_metadata_files(
+        cls, file_entries: List[DirectoryLsEntry]
+    ) -> List[CompositeObjId]:
         filename = cls.filename
         # Check if filename is a regex or bytes:
         if isinstance(filename, bytes):
             for entry in file_entries:
                 if entry["name"].lower() == filename.lower():
                     if entry["sha1"] is not None:  # ignore skipped_content and dangling
-                        return [entry["sha1"]]
+                        return [objid_from_dict(cast(dict, entry))]
         else:
             for entry in file_entries:
                 if filename.match(entry["name"]):
                     if entry["sha1"] is not None:  # ignore skipped_content and dangling
-                        return [entry["sha1"]]
+                        return [objid_from_dict(cast(dict, entry))]
 
         return []
 
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
index db97ec84..7cb9c3ab 100644
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2023  The Software Heritage developers
+# Copyright (C) 2016-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,8 +8,9 @@ from typing import Any, Dict, List, Optional
 import magic
 
 from swh.core.config import merge_configs
-from swh.indexer.storage.interface import IndexerStorageInterface, Sha1
+from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import ContentMimetypeRow
+from swh.objstorage.interface import CompositeObjId
 
 from .indexer import ContentIndexer
 
@@ -67,7 +68,7 @@ class MixinMimetypeIndexer:
         self.config = merge_configs(DEFAULT_CONFIG, self.config)
 
     def index(
-        self, id: Sha1, data: Optional[bytes] = None, **kwargs
+        self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs
     ) -> List[ContentMimetypeRow]:
         """Index sha1s' content and store result.
 
@@ -87,7 +88,7 @@ class MixinMimetypeIndexer:
         properties = compute_mimetype_encoding(data)
         return [
             ContentMimetypeRow(
-                id=id,
+                id=id["sha1"],
                 indexer_configuration_id=self.tool["id"],
                 mimetype=properties["mimetype"],
                 encoding=properties["encoding"],
@@ -120,14 +121,14 @@ class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]):
 
     """
 
-    def filter(self, ids):
+    def filter(self, ids: List[CompositeObjId]):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.content_mimetype_missing(
             (
                 {
-                    "id": sha1,
+                    "id": id["sha1"],
                     "indexer_configuration_id": self.tool["id"],
                 }
-                for sha1 in ids
+                for id in ids
             )
         )
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
index 8c89bad6..a56e2de4 100644
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -12,6 +12,7 @@ import pytest
 from swh.indexer.codemeta import CODEMETA_TERMS
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.objstorage.interface import CompositeObjId
 
 from ..utils import json_document_strategy
 
@@ -173,7 +174,9 @@ def test_detect_metadata_codemeta_json_uppercase():
     ]
     results = detect_metadata(df)
 
-    expected_results = {"CodemetaMapping": [b"bcd"]}
+    expected_results = {
+        "CodemetaMapping": [CompositeObjId(sha1=b"bcd", sha1_git=b"aab")]
+    }
     assert expected_results == results
 
 
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index 8d8d6259..63680540 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@ import pytest
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.storage.model import ContentMetadataRow
+from swh.objstorage.interface import CompositeObjId
 
 from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
 from ..utils import (
@@ -112,11 +113,15 @@ def test_index_content_metadata_npm(storage, obj_storage):
     config["tools"] = [TRANSLATOR_TOOL]
     metadata_indexer = ContentMetadataTestIndexer(config=config)
     metadata_indexer.run(sha1s, log_suffix="unknown content")
-    results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
+    results = list(
+        metadata_indexer.idx_storage.content_metadata_get(
+            [sha1["sha1"] for sha1 in sha1s]
+        )
+    )
 
     expected_results = [
         ContentMetadataRow(
-            id=sha1s[0],
+            id=sha1s[0]["sha1"],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
@@ -128,7 +133,7 @@ def test_index_content_metadata_npm(storage, obj_storage):
             },
         ),
         ContentMetadataRow(
-            id=sha1s[1],
+            id=sha1s[1]["sha1"],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
@@ -456,7 +461,7 @@ def test_detect_metadata_package_json(filename):
     ]
     results = detect_metadata(df)
 
-    expected_results = {"NpmMapping": [b"cde"]}
+    expected_results = {"NpmMapping": [CompositeObjId(sha1=b"cde", sha1_git=b"aab")]}
     assert expected_results == results
 
 
diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py
index b03cf9d0..8e45d115 100644
--- a/swh/indexer/tests/metadata_dictionary/test_nuget.py
+++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,6 +7,7 @@ import pytest
 
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.objstorage.interface import CompositeObjId
 
 
 def test_compute_metadata_nuget():
@@ -104,7 +105,7 @@ def test_detect_metadata_package_nuspec(filename):
     ]
     results = detect_metadata(df)
 
-    expected_results = {"NuGetMapping": [b"cde"]}
+    expected_results = {"NuGetMapping": [CompositeObjId(sha1=b"cde", sha1_git=b"aab")]}
     assert expected_results == results
 
 
diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py
index a8d2153d..0877338f 100644
--- a/swh/indexer/tests/metadata_dictionary/test_python.py
+++ b/swh/indexer/tests/metadata_dictionary/test_python.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2023  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,6 +6,7 @@
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
 from swh.model.hashutil import hash_to_bytes
+from swh.objstorage.interface import CompositeObjId
 
 
 def test_compute_metadata_pkginfo():
@@ -123,4 +124,4 @@ def test_detect_metadata_files():
         sha1=hash_to_bytes("2" * 40),
     )
     result = MAPPINGS["PythonPkginfoMapping"]().detect_metadata_files([dir_entry])
-    assert result == [dir_entry["sha1"]]
+    assert result == [CompositeObjId(sha1=dir_entry["sha1"])]
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
index c3fc4fc2..52a9ece6 100644
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2023  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -91,12 +91,12 @@ class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
         # then
         self.expected_results = [
             *[
-                ContentLicenseRow(id=self.id0, tool=tool, license=license)
-                for license in SHA1_TO_LICENSES[self.id0]
+                ContentLicenseRow(id=self.id0["sha1"], tool=tool, license=license)
+                for license in SHA1_TO_LICENSES[self.id0["sha1"]]
             ],
             *[
-                ContentLicenseRow(id=self.id1, tool=tool, license=license)
-                for license in SHA1_TO_LICENSES[self.id1]
+                ContentLicenseRow(id=self.id1["sha1"], tool=tool, license=license)
+                for license in SHA1_TO_LICENSES[self.id1["sha1"]]
             ],
             *[],  # self.id2
         ]
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
index 770ab6d1..f9be40bf 100644
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2023  The Software Heritage developers
+# Copyright (C) 2020-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@ import sentry_sdk
 
 from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
 from swh.indexer.storage import Sha1
+from swh.objstorage.interface import CompositeObjId
 
 from .utils import BASE_TEST_CONFIG, DIRECTORY2
 
@@ -64,7 +65,7 @@ def test_content_indexer_catch_exceptions(sentry_events):
     sha1 = b"\x12" * 20
 
     # As task, catching exceptions
-    assert indexer.run([sha1]) == ({"status": "failed"}, [])
+    assert indexer.run([CompositeObjId(sha1=sha1)]) == ({"status": "failed"}, [])
     check_sentry(sentry_events, {"swh-indexer-content-sha1": sha1.hex()})
 
     # As journal client, catching exceptions
@@ -77,7 +78,7 @@ def test_content_indexer_catch_exceptions(sentry_events):
 
     # As task, not catching exceptions
     with pytest.raises(_TestException):
-        indexer.run([sha1])
+        indexer.run([CompositeObjId(sha1=sha1)])
     assert sentry_events == []
 
     # As journal client, not catching exceptions
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index b3553532..69d75726 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -136,7 +136,7 @@ class TestMetadata:
                 ContentMetadataRow(
                     id=MAPPING_DESCRIPTION_CONTENT_SHA1[
                         "json:yarn-parser-package.json"
-                    ],
+                    ]["sha1"],
                     indexer_configuration_id=tool["id"],
                     metadata=YARN_PARSER_METADATA,
                 )
@@ -199,7 +199,7 @@ class TestMetadata:
                 ContentMetadataRow(
                     id=MAPPING_DESCRIPTION_CONTENT_SHA1[
                         "json:yarn-parser-package.json"
-                    ],
+                    ]["sha1"],
                     indexer_configuration_id=tool["id"],
                     metadata=YARN_PARSER_METADATA,
                 )
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index eba40428..0f2264e8 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -31,6 +31,7 @@ from swh.model.model import (
     SnapshotTargetType,
     TimestampWithTimezone,
 )
+from swh.objstorage.interface import CompositeObjId
 from swh.storage.utils import now
 
 BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = {
@@ -182,13 +183,15 @@ OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = {
 }
 
 MAPPING_DESCRIPTION_CONTENT_SHA1GIT: Dict[str, bytes] = {}
-MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, bytes] = {}
+MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, CompositeObjId] = {}
 OBJ_STORAGE_DATA: Dict[bytes, bytes] = {}
 
 for key_description, data in OBJ_STORAGE_RAW_CONTENT.items():
     content = Content.from_data(data)
     MAPPING_DESCRIPTION_CONTENT_SHA1GIT[key_description] = content.sha1_git
-    MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = content.sha1
+    MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = CompositeObjId(
+        sha1=content.sha1
+    )
     OBJ_STORAGE_DATA[content.sha1] = data
 
 
@@ -211,21 +214,21 @@ RAW_CONTENT_METADATA = [
 ]
 
 RAW_CONTENTS: Dict[bytes, Tuple] = {}
-RAW_CONTENT_IDS: List[bytes] = []
+RAW_CONTENT_IDS: List[CompositeObjId] = []
 
 for index, raw_content_d in enumerate(RAW_CONTENT_METADATA):
     raw_content = raw_content_d[0]
     content = Content.from_data(raw_content)
     RAW_CONTENTS[content.sha1] = raw_content_d
-    RAW_CONTENT_IDS.append(content.sha1)
+    RAW_CONTENT_IDS.append(CompositeObjId(sha1=content.sha1))
     # and write it to objstorage data so it's flushed in the objstorage
     OBJ_STORAGE_DATA[content.sha1] = raw_content
 
 
 SHA1_TO_LICENSES: Dict[bytes, List[str]] = {
-    RAW_CONTENT_IDS[0]: ["GPL"],
-    RAW_CONTENT_IDS[1]: ["AGPL"],
-    RAW_CONTENT_IDS[2]: [],
+    RAW_CONTENT_IDS[0]["sha1"]: ["GPL"],
+    RAW_CONTENT_IDS[1]["sha1"]: ["AGPL"],
+    RAW_CONTENT_IDS[2]["sha1"]: [],
 }
 
 
@@ -634,7 +637,7 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta):
         return self.indexer.idx_storage.state
 
     def assert_results_ok(self, sha1s, expected_results=None):
-        sha1s = [hash_to_bytes(sha1) for sha1 in sha1s]
+        sha1s = [sha1["sha1"] for sha1 in sha1s]
         actual_results = list(self.get_indexer_results(sha1s))
 
         if expected_results is None:
@@ -663,15 +666,23 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta):
         """Unknown sha1s are not indexed"""
         sha1s = [
             self.id1,
-            bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15"),  # unknown
-            bytes.fromhex("800a5ef812c53907562fe379d4b3851e69c7cb15"),  # unknown
+            CompositeObjId(
+                sha1=bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15")
+            ),  # unknown
+            CompositeObjId(
+                sha1=bytes.fromhex("800a5ef812c53907562fe379d4b3851e69c7cb15")
+            ),  # unknown
         ]  # unknown
 
         # when
         self.indexer.run(sha1s)
 
         # then
-        expected_results = [res for res in self.expected_results if res.id in sha1s]
+        expected_results = [
+            res
+            for res in self.expected_results
+            if res.id in [sha1["sha1"] for sha1 in sha1s]
+        ]
 
         self.assert_results_ok(sha1s, expected_results)
 
-- 
GitLab