From b6d008e840b8989533b37774e3c5c1739e5d909c Mon Sep 17 00:00:00 2001
From: Nicolas Dandrimont <nicolas@dandrimont.eu>
Date: Wed, 19 Feb 2025 16:38:56 +0100
Subject: [PATCH] tests: disambiguate sha1s for indexer access, and composite
 object ids for objstorage access

The previous bulk migration left some `sha1` variables containing
composite object identifiers, which made the logic hard to follow.
---
 .../tests/metadata_dictionary/test_npm.py     | 18 +++---
 swh/indexer/tests/test_cli.py                 | 25 ++++----
 swh/indexer/tests/test_fossology_license.py   |  4 +-
 swh/indexer/tests/test_metadata.py            |  6 +-
 swh/indexer/tests/test_mimetype.py            | 14 ++---
 swh/indexer/tests/utils.py                    | 60 +++++++++----------
 6 files changed, 64 insertions(+), 63 deletions(-)

diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index 63680540..70722fdc 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -16,7 +16,7 @@ from swh.objstorage.interface import CompositeObjId
 from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
 from ..utils import (
     BASE_TEST_CONFIG,
-    MAPPING_DESCRIPTION_CONTENT_SHA1,
+    MAPPING_DESCRIPTION_CONTENT_OBJID,
     json_document_strategy,
 )
 
@@ -101,10 +101,10 @@ def test_index_content_metadata_npm(storage, obj_storage):
     - one sha1 uses a file that can't be translated to metadata and
       should return None in the translated metadata
     """
-    sha1s = [
-        MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"],
-        MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"],
-        MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"],
+    obj_ids = [
+        MAPPING_DESCRIPTION_CONTENT_OBJID["json:test-metadata-package.json"],
+        MAPPING_DESCRIPTION_CONTENT_OBJID["json:npm-package.json"],
+        MAPPING_DESCRIPTION_CONTENT_OBJID["python:code"],
     ]
 
     # this metadata indexer computes only metadata for package.json
@@ -112,16 +112,16 @@ def test_index_content_metadata_npm(storage, obj_storage):
     config = BASE_TEST_CONFIG.copy()
     config["tools"] = [TRANSLATOR_TOOL]
     metadata_indexer = ContentMetadataTestIndexer(config=config)
-    metadata_indexer.run(sha1s, log_suffix="unknown content")
+    metadata_indexer.run(obj_ids, log_suffix="unknown content")
     results = list(
         metadata_indexer.idx_storage.content_metadata_get(
-            [sha1["sha1"] for sha1 in sha1s]
+            [sha1["sha1"] for sha1 in obj_ids]
         )
     )
 
     expected_results = [
         ContentMetadataRow(
-            id=sha1s[0]["sha1"],
+            id=obj_ids[0]["sha1"],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
@@ -133,7 +133,7 @@ def test_index_content_metadata_npm(storage, obj_storage):
             },
         ),
         ContentMetadataRow(
-            id=sha1s[1]["sha1"],
+            id=obj_ids[1]["sha1"],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
index 003dae9b..094b49d0 100644
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -29,7 +29,7 @@ from swh.model.model import Content, Origin, OriginVisitStatus
 from .test_metadata import GITHUB_REMD
 from .utils import (
     DIRECTORY2,
-    RAW_CONTENT_IDS,
+    RAW_CONTENT_OBJIDS,
     RAW_CONTENTS,
     REVISION,
     SHA1_TO_LICENSES,
@@ -402,9 +402,9 @@ def test_cli_journal_client_index__content_mimetype(
     contents = []
     expected_results = []
     content_ids = []
-    for content_id, (raw_content, mimetypes, encoding) in RAW_CONTENTS.items():
+    for content_id, raw_content, mimetypes, encoding in RAW_CONTENTS:
         content = Content.from_data(raw_content)
-        assert content_id == content.sha1
+        assert content_id == content.hashes()
 
         contents.append(content)
         content_ids.append(content_id)
@@ -455,7 +455,7 @@ def test_cli_journal_client_index__content_mimetype(
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
-    results = idx_storage.content_mimetype_get(content_ids)
+    results = idx_storage.content_mimetype_get([id["sha1"] for id in content_ids])
     assert len(results) == len(contents)
     for result in results:
         assert result in expected_results
@@ -489,22 +489,23 @@ def test_cli_journal_client_index__fossology_license(
 
     tool = {"id": 1, **swh_indexer_config["tools"]}
 
-    id0, id1, id2 = RAW_CONTENT_IDS
+    id0, id1, id2 = RAW_CONTENT_OBJIDS
 
     contents = []
-    content_ids = []
+    content_sha1s = []
     expected_results = []
-    for content_id, (raw_content, _, _) in RAW_CONTENTS.items():
+    for content_id, raw_content, _, _ in RAW_CONTENTS:
         content = Content.from_data(raw_content)
-        assert content_id == content.sha1
+        assert content_id == content.hashes()
 
+        content_sha1 = content.sha1
         contents.append(content)
-        content_ids.append(content_id)
+        content_sha1s.append(content_sha1)
 
         expected_results.extend(
             [
-                ContentLicenseRow(id=content_id, tool=tool, license=license)
-                for license in SHA1_TO_LICENSES[content_id]
+                ContentLicenseRow(id=content_sha1, tool=tool, license=license)
+                for license in SHA1_TO_LICENSES[content_sha1]
             ]
         )
 
@@ -536,7 +537,7 @@ def test_cli_journal_client_index__fossology_license(
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
-    results = idx_storage.content_fossology_license_get(content_ids)
+    results = idx_storage.content_fossology_license_get(content_sha1s)
     assert len(results) == len(expected_results)
     for result in results:
         assert result in expected_results
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
index 52a9ece6..9a112308 100644
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -14,7 +14,7 @@ from swh.indexer.fossology_license import FossologyLicenseIndexer, compute_licen
 from swh.indexer.storage.model import ContentLicenseRow
 from swh.indexer.tests.utils import (
     BASE_TEST_CONFIG,
-    RAW_CONTENT_IDS,
+    RAW_CONTENT_OBJIDS,
     SHA1_TO_LICENSES,
     CommonContentIndexerTest,
     fill_obj_storage,
@@ -84,7 +84,7 @@ class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
         fill_storage(self.indexer.storage)
         fill_obj_storage(self.indexer.objstorage)
 
-        self.id0, self.id1, self.id2 = RAW_CONTENT_IDS
+        self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS
 
         tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()}
 
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 69d75726..bb1af208 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -32,7 +32,7 @@ from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
 
 from .utils import (
     BASE_TEST_CONFIG,
-    MAPPING_DESCRIPTION_CONTENT_SHA1,
+    MAPPING_DESCRIPTION_CONTENT_OBJID,
     MAPPING_DESCRIPTION_CONTENT_SHA1GIT,
     YARN_PARSER_METADATA,
     fill_obj_storage,
@@ -134,7 +134,7 @@ class TestMetadata:
         metadata_indexer.idx_storage.content_metadata_add(
             [
                 ContentMetadataRow(
-                    id=MAPPING_DESCRIPTION_CONTENT_SHA1[
+                    id=MAPPING_DESCRIPTION_CONTENT_OBJID[
                         "json:yarn-parser-package.json"
                     ]["sha1"],
                     indexer_configuration_id=tool["id"],
@@ -197,7 +197,7 @@ class TestMetadata:
         metadata_indexer.idx_storage.content_metadata_add(
             [
                 ContentMetadataRow(
-                    id=MAPPING_DESCRIPTION_CONTENT_SHA1[
+                    id=MAPPING_DESCRIPTION_CONTENT_OBJID[
                         "json:yarn-parser-package.json"
                     ]["sha1"],
                     indexer_configuration_id=tool["id"],
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
index f59a3331..b0ac7d49 100644
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -12,7 +12,7 @@ from swh.indexer.mimetype import MimetypeIndexer, compute_mimetype_encoding
 from swh.indexer.storage.model import ContentMimetypeRow
 from swh.indexer.tests.utils import (
     BASE_TEST_CONFIG,
-    RAW_CONTENT_IDS,
+    RAW_CONTENT_OBJIDS,
     RAW_CONTENTS,
     CommonContentIndexerTest,
     fill_obj_storage,
@@ -22,10 +22,10 @@ from swh.indexer.tests.utils import (
 
 
 @pytest.mark.parametrize(
-    "raw_text,mimetypes,encoding",
-    RAW_CONTENTS.values(),
+    "content_id,raw_text,mimetypes,encoding",
+    RAW_CONTENTS,
 )
-def test_compute_mimetype_encoding(raw_text, mimetypes, encoding):
+def test_compute_mimetype_encoding(content_id, raw_text, mimetypes, encoding):
     """Compute mimetype encoding should return results"""
     actual_result = compute_mimetype_encoding(raw_text)
 
@@ -68,12 +68,12 @@ class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
         fill_storage(self.indexer.storage)
         fill_obj_storage(self.indexer.objstorage)
 
-        self.id0, self.id1, self.id2 = RAW_CONTENT_IDS
+        self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS
 
         tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()}
 
         results = []
-        for raw_content_id, (raw_content, mimetypes, encoding) in RAW_CONTENTS.items():
+        for raw_content_id, raw_content, mimetypes, encoding in RAW_CONTENTS:
             # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye:
             # 1:5.39-3) returns different results. This allows to deal with such a case
             # when executing tests on different environments machines (e.g. ci tox, ci
@@ -83,7 +83,7 @@ class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
             results.extend(
                 [
                     ContentMimetypeRow(
-                        id=raw_content_id,
+                        id=raw_content_id["sha1"],
                         tool=tool,
                         mimetype=mimetype,
                         encoding=encoding,
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index 0f2264e8..13b1f44b 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -6,7 +6,7 @@
 import abc
 import datetime
 import functools
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 import unittest
 
 from hypothesis import strategies
@@ -31,7 +31,7 @@ from swh.model.model import (
     SnapshotTargetType,
     TimestampWithTimezone,
 )
-from swh.objstorage.interface import CompositeObjId
+from swh.objstorage.interface import CompositeObjId, objid_from_dict
 from swh.storage.utils import now
 
 BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = {
@@ -183,19 +183,18 @@ OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = {
 }
 
 MAPPING_DESCRIPTION_CONTENT_SHA1GIT: Dict[str, bytes] = {}
-MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, CompositeObjId] = {}
-OBJ_STORAGE_DATA: Dict[bytes, bytes] = {}
+MAPPING_DESCRIPTION_CONTENT_OBJID: Dict[str, CompositeObjId] = {}
+OBJ_STORAGE_DATA: List[Tuple[CompositeObjId, bytes]] = []
 
 for key_description, data in OBJ_STORAGE_RAW_CONTENT.items():
     content = Content.from_data(data)
+    content_id = objid_from_dict(content.hashes())
     MAPPING_DESCRIPTION_CONTENT_SHA1GIT[key_description] = content.sha1_git
-    MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = CompositeObjId(
-        sha1=content.sha1
-    )
-    OBJ_STORAGE_DATA[content.sha1] = data
+    MAPPING_DESCRIPTION_CONTENT_OBJID[key_description] = content_id
+    OBJ_STORAGE_DATA.append((content_id, data))
 
 
-RAW_CONTENT_METADATA = [
+RAW_CONTENT_METADATA: List[Tuple[bytes, Union[str, Tuple[str, ...]], str]] = [
     (
         "du français".encode(),
         "text/plain",
@@ -213,22 +212,23 @@ RAW_CONTENT_METADATA = [
     ),
 ]
 
-RAW_CONTENTS: Dict[bytes, Tuple] = {}
-RAW_CONTENT_IDS: List[CompositeObjId] = []
+RAW_CONTENTS: List[Tuple[CompositeObjId, bytes, Union[str, Tuple[str, ...]], str]] = []
+RAW_CONTENT_OBJIDS: List[CompositeObjId] = []
 
 for index, raw_content_d in enumerate(RAW_CONTENT_METADATA):
     raw_content = raw_content_d[0]
     content = Content.from_data(raw_content)
-    RAW_CONTENTS[content.sha1] = raw_content_d
-    RAW_CONTENT_IDS.append(CompositeObjId(sha1=content.sha1))
+    content_id = objid_from_dict(content.hashes())
+    RAW_CONTENTS.append((content_id, *raw_content_d))
+    RAW_CONTENT_OBJIDS.append(content_id)
     # and write it to objstorage data so it's flushed in the objstorage
-    OBJ_STORAGE_DATA[content.sha1] = raw_content
+    OBJ_STORAGE_DATA.append((content_id, raw_content))
 
 
 SHA1_TO_LICENSES: Dict[bytes, List[str]] = {
-    RAW_CONTENT_IDS[0]["sha1"]: ["GPL"],
-    RAW_CONTENT_IDS[1]["sha1"]: ["AGPL"],
-    RAW_CONTENT_IDS[2]["sha1"]: [],
+    RAW_CONTENT_OBJIDS[0]["sha1"]: ["GPL"],
+    RAW_CONTENT_OBJIDS[1]["sha1"]: ["AGPL"],
+    RAW_CONTENT_OBJIDS[2]["sha1"]: [],
 }
 
 
@@ -602,13 +602,13 @@ def filter_dict(d, keys):
 
 def fill_obj_storage(obj_storage):
     """Add some content in an object storage."""
-    for obj_id, content in OBJ_STORAGE_DATA.items():
+    for obj_id, content in OBJ_STORAGE_DATA:
         obj_storage.add(content, obj_id)
 
 
 def fill_storage(storage):
     """Fill in storage with consistent test dataset."""
-    storage.content_add([Content.from_data(data) for data in OBJ_STORAGE_DATA.values()])
+    storage.content_add([Content.from_data(data) for _, data in OBJ_STORAGE_DATA])
     storage.directory_add([DIRECTORY, DIRECTORY2])
     storage.revision_add(REVISIONS)
     storage.release_add(RELEASES)
@@ -636,8 +636,8 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta):
         """Override this for indexers that don't have a mock storage."""
         return self.indexer.idx_storage.state
 
-    def assert_results_ok(self, sha1s, expected_results=None):
-        sha1s = [sha1["sha1"] for sha1 in sha1s]
+    def assert_results_ok(self, objids, expected_results=None):
+        sha1s = [sha1["sha1"] for sha1 in objids]
         actual_results = list(self.get_indexer_results(sha1s))
 
         if expected_results is None:
@@ -650,21 +650,21 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta):
 
     def test_index(self):
         """Known sha1 have their data indexed"""
-        sha1s = [self.id0, self.id1, self.id2]
+        obj_ids = [self.id0, self.id1, self.id2]
 
         # when
-        self.indexer.run(sha1s)
+        self.indexer.run(obj_ids)
 
-        self.assert_results_ok(sha1s)
+        self.assert_results_ok(obj_ids)
 
         # 2nd pass
-        self.indexer.run(sha1s)
+        self.indexer.run(obj_ids)
 
-        self.assert_results_ok(sha1s)
+        self.assert_results_ok(obj_ids)
 
     def test_index_one_unknown_sha1(self):
         """Unknown sha1s are not indexed"""
-        sha1s = [
+        obj_ids = [
             self.id1,
             CompositeObjId(
                 sha1=bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15")
@@ -675,16 +675,16 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta):
         ]  # unknown
 
         # when
-        self.indexer.run(sha1s)
+        self.indexer.run(obj_ids)
 
         # then
         expected_results = [
             res
             for res in self.expected_results
-            if res.id in [sha1["sha1"] for sha1 in sha1s]
+            if res.id in [sha1["sha1"] for sha1 in obj_ids]
         ]
 
-        self.assert_results_ok(sha1s, expected_results)
+        self.assert_results_ok(obj_ids, expected_results)
 
 
 class CommonContentIndexerPartitionTest:
-- 
GitLab