From b6d008e840b8989533b37774e3c5c1739e5d909c Mon Sep 17 00:00:00 2001 From: Nicolas Dandrimont <nicolas@dandrimont.eu> Date: Wed, 19 Feb 2025 16:38:56 +0100 Subject: [PATCH] tests: disambiguate sha1s for indexer access, and composite object ids for objstorage access The previous bulk migration left some `sha1` variables containing composite object identifiers, making following the logic a bit precarious. --- .../tests/metadata_dictionary/test_npm.py | 18 +++--- swh/indexer/tests/test_cli.py | 25 ++++---- swh/indexer/tests/test_fossology_license.py | 4 +- swh/indexer/tests/test_metadata.py | 6 +- swh/indexer/tests/test_mimetype.py | 14 ++--- swh/indexer/tests/utils.py | 60 +++++++++---------- 6 files changed, 64 insertions(+), 63 deletions(-) diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py index 63680540..70722fdc 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -16,7 +16,7 @@ from swh.objstorage.interface import CompositeObjId from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer from ..utils import ( BASE_TEST_CONFIG, - MAPPING_DESCRIPTION_CONTENT_SHA1, + MAPPING_DESCRIPTION_CONTENT_OBJID, json_document_strategy, ) @@ -101,10 +101,10 @@ def test_index_content_metadata_npm(storage, obj_storage): - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ - sha1s = [ - MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"], - MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"], - MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"], + obj_ids = [ + MAPPING_DESCRIPTION_CONTENT_OBJID["json:test-metadata-package.json"], + MAPPING_DESCRIPTION_CONTENT_OBJID["json:npm-package.json"], + MAPPING_DESCRIPTION_CONTENT_OBJID["python:code"], ] # this metadata indexer computes only metadata for package.json @@ -112,16 +112,16 @@ def test_index_content_metadata_npm(storage, obj_storage): config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) - metadata_indexer.run(sha1s, log_suffix="unknown content") + metadata_indexer.run(obj_ids, log_suffix="unknown content") results = list( metadata_indexer.idx_storage.content_metadata_get( - [sha1["sha1"] for sha1 in sha1s] + [sha1["sha1"] for sha1 in obj_ids] ) ) expected_results = [ ContentMetadataRow( - id=sha1s[0]["sha1"], + id=obj_ids[0]["sha1"], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", @@ -133,7 +133,7 @@ def test_index_content_metadata_npm(storage, obj_storage): }, ), ContentMetadataRow( - id=sha1s[1]["sha1"], + id=obj_ids[1]["sha1"], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index 003dae9b..094b49d0 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -29,7 +29,7 @@ from swh.model.model import Content, Origin, OriginVisitStatus from .test_metadata import GITHUB_REMD from .utils import ( DIRECTORY2, - RAW_CONTENT_IDS, + RAW_CONTENT_OBJIDS, RAW_CONTENTS, REVISION, SHA1_TO_LICENSES, @@ -402,9 +402,9 @@ def test_cli_journal_client_index__content_mimetype( contents = [] expected_results = [] content_ids = [] - for content_id, (raw_content, mimetypes, encoding) in RAW_CONTENTS.items(): + for content_id, raw_content, mimetypes, encoding in RAW_CONTENTS: content = Content.from_data(raw_content) - assert content_id == content.sha1 + assert content_id == content.hashes() contents.append(content) content_ids.append(content_id) @@ -455,7 +455,7 @@ def test_cli_journal_client_index__content_mimetype( assert result.exit_code == 0, result.output assert result.output == expected_output - results = idx_storage.content_mimetype_get(content_ids) + results = idx_storage.content_mimetype_get([id["sha1"] for id in content_ids]) assert len(results) == len(contents) for result in results: assert result in expected_results @@ -489,22 +489,23 @@ def test_cli_journal_client_index__fossology_license( tool = {"id": 1, **swh_indexer_config["tools"]} - id0, id1, id2 = RAW_CONTENT_IDS + id0, id1, id2 = RAW_CONTENT_OBJIDS contents = [] - content_ids = [] + content_sha1s = [] expected_results = [] - for content_id, (raw_content, _, _) in RAW_CONTENTS.items(): + for content_id, raw_content, _, _ in RAW_CONTENTS: content = Content.from_data(raw_content) - assert content_id == content.sha1 + assert content_id == content.hashes() + content_sha1 = content.sha1 contents.append(content) - content_ids.append(content_id) + content_sha1s.append(content_sha1) expected_results.extend( [ - ContentLicenseRow(id=content_id, tool=tool, license=license) - for license in SHA1_TO_LICENSES[content_id] + ContentLicenseRow(id=content_sha1, tool=tool, license=license) + for license in SHA1_TO_LICENSES[content_sha1] ] ) @@ -536,7 +537,7 @@ def test_cli_journal_client_index__fossology_license( assert result.exit_code == 0, result.output assert result.output == expected_output - results = idx_storage.content_fossology_license_get(content_ids) + results = idx_storage.content_fossology_license_get(content_sha1s) assert len(results) == len(expected_results) for result in results: assert result in expected_results diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index 52a9ece6..9a112308 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -14,7 +14,7 @@ from swh.indexer.fossology_license import FossologyLicenseIndexer, compute_licen from swh.indexer.storage.model import ContentLicenseRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, - RAW_CONTENT_IDS, + RAW_CONTENT_OBJIDS, SHA1_TO_LICENSES, CommonContentIndexerTest, fill_obj_storage, @@ -84,7 +84,7 @@ class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0, self.id1, self.id2 = RAW_CONTENT_IDS + self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 69d75726..bb1af208 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -32,7 +32,7 @@ from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from .utils import ( BASE_TEST_CONFIG, - MAPPING_DESCRIPTION_CONTENT_SHA1, + MAPPING_DESCRIPTION_CONTENT_OBJID, MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, @@ -134,7 +134,7 @@ class TestMetadata: metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( - id=MAPPING_DESCRIPTION_CONTENT_SHA1[ + id=MAPPING_DESCRIPTION_CONTENT_OBJID[ "json:yarn-parser-package.json" ]["sha1"], indexer_configuration_id=tool["id"], @@ -197,7 +197,7 @@ class TestMetadata: metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( - id=MAPPING_DESCRIPTION_CONTENT_SHA1[ + id=MAPPING_DESCRIPTION_CONTENT_OBJID[ "json:yarn-parser-package.json" ]["sha1"], indexer_configuration_id=tool["id"], diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index f59a3331..b0ac7d49 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -12,7 +12,7 @@ from swh.indexer.mimetype import MimetypeIndexer, compute_mimetype_encoding from swh.indexer.storage.model import ContentMimetypeRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, - RAW_CONTENT_IDS, + RAW_CONTENT_OBJIDS, RAW_CONTENTS, CommonContentIndexerTest, fill_obj_storage, @@ -22,10 +22,10 @@ from swh.indexer.tests.utils import ( @pytest.mark.parametrize( - "raw_text,mimetypes,encoding", - RAW_CONTENTS.values(), + "content_id,raw_text,mimetypes,encoding", + RAW_CONTENTS, ) -def test_compute_mimetype_encoding(raw_text, mimetypes, encoding): +def test_compute_mimetype_encoding(content_id, raw_text, mimetypes, encoding): """Compute mimetype encoding should return results""" actual_result = compute_mimetype_encoding(raw_text) @@ -68,12 +68,12 @@ class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0, self.id1, self.id2 = RAW_CONTENT_IDS + self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} results = [] - for raw_content_id, (raw_content, mimetypes, encoding) in RAW_CONTENTS.items(): + for raw_content_id, raw_content, mimetypes, encoding in RAW_CONTENTS: # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye: # 1:5.39-3) returns different results. This allows to deal with such a case # when executing tests on different environments machines (e.g. ci tox, ci @@ -83,7 +83,7 @@ class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): results.extend( [ ContentMimetypeRow( - id=raw_content_id, + id=raw_content_id["sha1"], tool=tool, mimetype=mimetype, encoding=encoding, diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index 0f2264e8..13b1f44b 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -6,7 +6,7 @@ import abc import datetime import functools -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, Union import unittest from hypothesis import strategies @@ -31,7 +31,7 @@ from swh.model.model import ( SnapshotTargetType, TimestampWithTimezone, ) -from swh.objstorage.interface import CompositeObjId +from swh.objstorage.interface import CompositeObjId, objid_from_dict from swh.storage.utils import now BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { @@ -183,19 +183,18 @@ OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = { } MAPPING_DESCRIPTION_CONTENT_SHA1GIT: Dict[str, bytes] = {} -MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, CompositeObjId] = {} -OBJ_STORAGE_DATA: Dict[bytes, bytes] = {} +MAPPING_DESCRIPTION_CONTENT_OBJID: Dict[str, CompositeObjId] = {} +OBJ_STORAGE_DATA: List[Tuple[CompositeObjId, bytes]] = [] for key_description, data in OBJ_STORAGE_RAW_CONTENT.items(): content = Content.from_data(data) + content_id = objid_from_dict(content.hashes()) MAPPING_DESCRIPTION_CONTENT_SHA1GIT[key_description] = content.sha1_git - MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = CompositeObjId( - sha1=content.sha1 - ) - OBJ_STORAGE_DATA[content.sha1] = data + MAPPING_DESCRIPTION_CONTENT_OBJID[key_description] = content_id + OBJ_STORAGE_DATA.append((content_id, data)) -RAW_CONTENT_METADATA = [ +RAW_CONTENT_METADATA: List[Tuple[bytes, Union[str, Tuple[str, ...]], str]] = [ ( "du français".encode(), "text/plain", @@ -213,22 +212,23 @@ RAW_CONTENT_METADATA = [ ), ] -RAW_CONTENTS: Dict[bytes, Tuple] = {} -RAW_CONTENT_IDS: List[CompositeObjId] = [] +RAW_CONTENTS: List[Tuple[CompositeObjId, bytes, Union[str, Tuple[str, ...]], str]] = [] +RAW_CONTENT_OBJIDS: List[CompositeObjId] = [] for index, raw_content_d in enumerate(RAW_CONTENT_METADATA): raw_content = raw_content_d[0] content = Content.from_data(raw_content) - RAW_CONTENTS[content.sha1] = raw_content_d - RAW_CONTENT_IDS.append(CompositeObjId(sha1=content.sha1)) + content_id = objid_from_dict(content.hashes()) + RAW_CONTENTS.append((content_id, *raw_content_d)) + RAW_CONTENT_OBJIDS.append(content_id) # and write it to objstorage data so it's flushed in the objstorage - OBJ_STORAGE_DATA[content.sha1] = raw_content + OBJ_STORAGE_DATA.append((content_id, raw_content)) SHA1_TO_LICENSES: Dict[bytes, List[str]] = { - RAW_CONTENT_IDS[0]["sha1"]: ["GPL"], - RAW_CONTENT_IDS[1]["sha1"]: ["AGPL"], - RAW_CONTENT_IDS[2]["sha1"]: [], + RAW_CONTENT_OBJIDS[0]["sha1"]: ["GPL"], + RAW_CONTENT_OBJIDS[1]["sha1"]: ["AGPL"], + RAW_CONTENT_OBJIDS[2]["sha1"]: [], } @@ -602,13 +602,13 @@ def filter_dict(d, keys): def fill_obj_storage(obj_storage): """Add some content in an object storage.""" - for obj_id, content in OBJ_STORAGE_DATA.items(): + for obj_id, content in OBJ_STORAGE_DATA: obj_storage.add(content, obj_id) def fill_storage(storage): """Fill in storage with consistent test dataset.""" - storage.content_add([Content.from_data(data) for data in OBJ_STORAGE_DATA.values()]) + storage.content_add([Content.from_data(data) for _, data in OBJ_STORAGE_DATA]) storage.directory_add([DIRECTORY, DIRECTORY2]) storage.revision_add(REVISIONS) storage.release_add(RELEASES) @@ -636,8 +636,8 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state - def assert_results_ok(self, sha1s, expected_results=None): - sha1s = [sha1["sha1"] for sha1 in sha1s] + def assert_results_ok(self, objids, expected_results=None): + sha1s = [sha1["sha1"] for sha1 in objids] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: @@ -650,21 +650,21 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta): def test_index(self): """Known sha1 have their data indexed""" - sha1s = [self.id0, self.id1, self.id2] + obj_ids = [self.id0, self.id1, self.id2] # when - self.indexer.run(sha1s) + self.indexer.run(obj_ids) - self.assert_results_ok(sha1s) + self.assert_results_ok(obj_ids) # 2nd pass - self.indexer.run(sha1s) + self.indexer.run(obj_ids) - self.assert_results_ok(sha1s) + self.assert_results_ok(obj_ids) def test_index_one_unknown_sha1(self): """Unknown sha1s are not indexed""" - sha1s = [ + obj_ids = [ self.id1, CompositeObjId( sha1=bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15") @@ -675,16 +675,16 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta): ] # unknown # when - self.indexer.run(sha1s) + self.indexer.run(obj_ids) # then expected_results = [ res for res in self.expected_results - if res.id in [sha1["sha1"] for sha1 in sha1s] + if res.id in [sha1["sha1"] for sha1 in obj_ids] ] - self.assert_results_ok(sha1s, expected_results) + self.assert_results_ok(obj_ids, expected_results) class CommonContentIndexerPartitionTest: -- GitLab