From c87a210dfb3b5048c5c4d9acb5dfaea609d1dc37 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Fri, 13 Dec 2024 17:29:56 +0100 Subject: [PATCH] indexer: Use CompositeObjId as object id type for ContentIndexer Previously, content indexers were only using the content sha1 as the object identifier, but this could lead to errors in production when attempting to fetch content data from an object storage where the key is the sha256 checksum of the content (winery for instance). So make sure to use a CompositeObjId dictionary as the content object identifier to avoid this kind of issue when indexing content objects. --- swh/indexer/fossology_license.py | 23 ++++-------- swh/indexer/indexer.py | 22 ++++++------ swh/indexer/metadata.py | 25 +++++++------ swh/indexer/metadata_detector.py | 6 ++-- swh/indexer/metadata_dictionary/base.py | 30 ++++++++++++---- swh/indexer/mimetype.py | 15 ++++---- .../metadata_dictionary/test_codemeta.py | 5 ++- .../tests/metadata_dictionary/test_npm.py | 15 +++++--- .../tests/metadata_dictionary/test_nuget.py | 5 +-- .../tests/metadata_dictionary/test_python.py | 5 +-- swh/indexer/tests/test_fossology_license.py | 10 +++--- swh/indexer/tests/test_indexer.py | 7 ++-- swh/indexer/tests/test_metadata.py | 4 +-- swh/indexer/tests/utils.py | 35 ++++++++++++------- 14 files changed, 121 insertions(+), 86 deletions(-) diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 8ccf9d5c..20a9fa84 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2023 The Software Heritage developers +# Copyright (C) 2016-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,9 +10,10 @@ from typing import Any, Dict, List, Optional import sentry_sdk from swh.core.config 
import merge_configs -from swh.indexer.storage.interface import IndexerStorageInterface, Sha1 +from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ContentLicenseRow from swh.model import hashutil +from swh.objstorage.interface import CompositeObjId from .indexer import ContentIndexer, write_to_temp @@ -86,7 +87,7 @@ class MixinFossologyLicenseIndexer: self.working_directory = self.config["workdir"] def index( - self, id: Sha1, data: Optional[bytes] = None, **kwargs + self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs ) -> List[ContentLicenseRow]: """Index sha1s' content and store result. @@ -105,14 +106,14 @@ class MixinFossologyLicenseIndexer: """ assert data is not None with write_to_temp( - filename=hashutil.hash_to_hex(id), # use the id as pathname + filename=hashutil.hash_to_hex(id["sha1"]), # use the id as pathname data=data, working_directory=self.working_directory, ) as content_path: properties = compute_license(path=content_path) return [ ContentLicenseRow( - id=id, + id=id["sha1"], indexer_configuration_id=self.tool["id"], license=license, ) @@ -148,14 +149,4 @@ class FossologyLicenseIndexer( """ - def filter(self, ids): - """Filter out known sha1s and return only missing ones.""" - yield from self.idx_storage.content_fossology_license_missing( - ( - { - "id": sha1, - "indexer_configuration_id": self.tool["id"], - } - for sha1 in ids - ) - ) + pass diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index bbb753e4..37e51ee0 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2023 The Software Heritage developers +# Copyright (C) 2016-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,12 +15,12 @@ import sentry_sdk from typing_extensions import 
TypedDict from swh.core.config import load_from_envvar, merge_configs -from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage +from swh.indexer.storage import INDEXER_CFG_KEY, get_indexer_storage from swh.indexer.storage.interface import IndexerStorageInterface from swh.model import hashutil from swh.model.model import Directory, Origin, Sha1Git from swh.objstorage.factory import get_objstorage -from swh.objstorage.interface import objid_from_dict +from swh.objstorage.interface import CompositeObjId, objid_from_dict from swh.storage import get_storage from swh.storage.interface import StorageInterface @@ -279,7 +279,7 @@ class BaseIndexer(Generic[TId, TData, TResult], metaclass=abc.ABCMeta): raise NotImplementedError() -class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): +class ContentIndexer(BaseIndexer[CompositeObjId, bytes, TResult], Generic[TResult]): """A content indexer working on the journal (method `process_journal_objects`) or on a list of ids directly (method `run`). @@ -293,10 +293,12 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): """Read content objects from the journal, retrieve their raw content and compute content indexing (e.g. mimetype, fossology license, ...). 
""" - summary, _ = self.run([obj_id["sha1"] for obj_id in objects.get("content", [])]) + summary, _ = self.run( + [objid_from_dict(obj) for obj in objects.get("content", [])] + ) return summary - def run(self, ids: List[Sha1], **kwargs) -> Tuple[Dict, List]: + def run(self, ids: List[CompositeObjId], **kwargs) -> Tuple[Dict, List]: """Given a list of ids: - retrieve the content from the storage @@ -314,18 +316,16 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): summary: Dict[str, Any] = {"status": "uneventful"} results = [] try: - content_data = self.objstorage.get_batch( - [objid_from_dict({"sha1": id}) for id in ids] - ) + content_data = self.objstorage.get_batch(ids) for item, raw_content in zip(ids, content_data): id_ = item sentry_sdk.set_tag( - "swh-indexer-content-sha1", hashutil.hash_to_hex(id_) + "swh-indexer-content-sha1", hashutil.hash_to_hex(id_["sha1"]) ) if not raw_content: self.log.warning( "Content %s not found in objstorage", - hashutil.hash_to_hex(id_), + hashutil.hash_to_hex(id_["sha1"]), ) continue diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 2b37e7ed..f95e7b38 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -41,7 +41,7 @@ from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.origin_head import get_head_swhid -from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 +from swh.indexer.storage import INDEXER_CFG_KEY from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, @@ -58,6 +58,7 @@ from swh.model.model import ( Sha1Git, ) from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType +from swh.objstorage.interface import CompositeObjId REVISION_GET_BATCH_SIZE = 10 RELEASE_GET_BATCH_SIZE = 10 @@ -232,21 +233,21 @@ class 
ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """ - def filter(self, ids): + def filter(self, ids: List[CompositeObjId]): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { - "id": sha1, + "id": id["sha1"], "indexer_configuration_id": self.tool["id"], } - for sha1 in ids + for id in ids ) ) def index( self, - id: Sha1, + id: CompositeObjId, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, @@ -263,24 +264,24 @@ class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): be returned as None """ - assert isinstance(id, bytes) + assert "sha1" in id assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] - log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) + log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id["sha1"]) metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " - "for content %s" % hashutil.hash_to_hex(id) + "for content %s" % hashutil.hash_to_hex(id["sha1"]) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( - id=id, + id=id["sha1"], indexer_configuration_id=self.tool["id"], metadata=metadata, ) @@ -429,7 +430,9 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] - metadata_generator = self.idx_storage.content_metadata_get(detected_files) + metadata_generator = self.idx_storage.content_metadata_get( + [f["sha1"] for f in detected_files] + ) for c in metadata_generator: # extracting metadata sha1 = c.id @@ -440,7 +443,7 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): metadata.append(local_metadata) sha1s_filtered = [ - item for item in detected_files if item not in 
sha1s_in_storage + item for item in detected_files if item["sha1"] not in sha1s_in_storage ] if sha1s_filtered: diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py index d0ca829c..ceabdf92 100644 --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,10 +7,10 @@ from typing import Dict, List from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry -from swh.indexer.storage.interface import Sha1 +from swh.objstorage.interface import CompositeObjId -def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[Sha1]]: +def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[CompositeObjId]]: """ Detects files potentially containing metadata diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py index d3e8d201..f356ed9e 100644 --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -1,11 +1,22 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging -from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Pattern, + Tuple, + TypeVar, + Union, + cast, +) import uuid import xml.parsers.expat @@ -18,6 +29,7 @@ import yaml from swh.indexer.codemeta 
import _document_loader, compact from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 +from swh.objstorage.interface import CompositeObjId, objid_from_dict from .utils import add_url_if_valid @@ -28,6 +40,8 @@ TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp- class DirectoryLsEntry(TypedDict): target: Sha1 sha1: Optional[Sha1] + sha1_git: Optional[bytes] + sha256: Optional[bytes] name: bytes type: str @@ -118,7 +132,9 @@ class BaseIntrinsicMapping(BaseMapping): """ @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: + def detect_metadata_files( + cls, file_entries: List[DirectoryLsEntry] + ) -> List[CompositeObjId]: """ Returns the sha1 hashes of files which can be translated by this mapping """ @@ -134,19 +150,21 @@ class SingleFileIntrinsicMapping(BaseIntrinsicMapping): filename: Union[bytes, Pattern[bytes]] @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: + def detect_metadata_files( + cls, file_entries: List[DirectoryLsEntry] + ) -> List[CompositeObjId]: filename = cls.filename # Check if filename is a regex or bytes: if isinstance(filename, bytes): for entry in file_entries: if entry["name"].lower() == filename.lower(): if entry["sha1"] is not None: # ignore skipped_content and dangling - return [entry["sha1"]] + return [objid_from_dict(cast(dict, entry))] else: for entry in file_entries: if filename.match(entry["name"]): if entry["sha1"] is not None: # ignore skipped_content and dangling - return [entry["sha1"]] + return [objid_from_dict(cast(dict, entry))] return [] diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index db97ec84..7cb9c3ab 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2023 The Software Heritage developers +# Copyright (C) 2016-2024 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,8 +8,9 @@ from typing import Any, Dict, List, Optional import magic from swh.core.config import merge_configs -from swh.indexer.storage.interface import IndexerStorageInterface, Sha1 +from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ContentMimetypeRow +from swh.objstorage.interface import CompositeObjId from .indexer import ContentIndexer @@ -67,7 +68,7 @@ class MixinMimetypeIndexer: self.config = merge_configs(DEFAULT_CONFIG, self.config) def index( - self, id: Sha1, data: Optional[bytes] = None, **kwargs + self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs ) -> List[ContentMimetypeRow]: """Index sha1s' content and store result. @@ -87,7 +88,7 @@ class MixinMimetypeIndexer: properties = compute_mimetype_encoding(data) return [ ContentMimetypeRow( - id=id, + id=id["sha1"], indexer_configuration_id=self.tool["id"], mimetype=properties["mimetype"], encoding=properties["encoding"], @@ -120,14 +121,14 @@ class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]): """ - def filter(self, ids): + def filter(self, ids: List[CompositeObjId]): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_mimetype_missing( ( { - "id": sha1, + "id": id["sha1"], "indexer_configuration_id": self.tool["id"], } - for sha1 in ids + for id in ids ) ) diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index 8c89bad6..a56e2de4 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -12,6 +12,7 @@ import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import 
MAPPINGS +from swh.objstorage.interface import CompositeObjId from ..utils import json_document_strategy @@ -173,7 +174,9 @@ def test_detect_metadata_codemeta_json_uppercase(): ] results = detect_metadata(df) - expected_results = {"CodemetaMapping": [b"bcd"]} + expected_results = { + "CodemetaMapping": [CompositeObjId(sha1=b"bcd", sha1_git=b"aab")] + } assert expected_results == results diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py index 8d8d6259..63680540 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,6 +11,7 @@ import pytest from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.storage.model import ContentMetadataRow +from swh.objstorage.interface import CompositeObjId from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer from ..utils import ( @@ -112,11 +113,15 @@ def test_index_content_metadata_npm(storage, obj_storage): config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) metadata_indexer.run(sha1s, log_suffix="unknown content") - results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) + results = list( + metadata_indexer.idx_storage.content_metadata_get( + [sha1["sha1"] for sha1 in sha1s] + ) + ) expected_results = [ ContentMetadataRow( - id=sha1s[0], + id=sha1s[0]["sha1"], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", @@ -128,7 +133,7 @@ def test_index_content_metadata_npm(storage, obj_storage): 
}, ), ContentMetadataRow( - id=sha1s[1], + id=sha1s[1]["sha1"], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", @@ -456,7 +461,7 @@ def test_detect_metadata_package_json(filename): ] results = detect_metadata(df) - expected_results = {"NpmMapping": [b"cde"]} + expected_results = {"NpmMapping": [CompositeObjId(sha1=b"cde", sha1_git=b"aab")]} assert expected_results == results diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py index b03cf9d0..8e45d115 100644 --- a/swh/indexer/tests/metadata_dictionary/test_nuget.py +++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,6 +7,7 @@ import pytest from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS +from swh.objstorage.interface import CompositeObjId def test_compute_metadata_nuget(): @@ -104,7 +105,7 @@ def test_detect_metadata_package_nuspec(filename): ] results = detect_metadata(df) - expected_results = {"NuGetMapping": [b"cde"]} + expected_results = {"NuGetMapping": [CompositeObjId(sha1=b"cde", sha1_git=b"aab")]} assert expected_results == results diff --git a/swh/indexer/tests/metadata_dictionary/test_python.py b/swh/indexer/tests/metadata_dictionary/test_python.py index a8d2153d..0877338f 100644 --- a/swh/indexer/tests/metadata_dictionary/test_python.py +++ b/swh/indexer/tests/metadata_dictionary/test_python.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # 
License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,7 @@ from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.model.hashutil import hash_to_bytes +from swh.objstorage.interface import CompositeObjId def test_compute_metadata_pkginfo(): @@ -123,4 +124,4 @@ def test_detect_metadata_files(): sha1=hash_to_bytes("2" * 40), ) result = MAPPINGS["PythonPkginfoMapping"]().detect_metadata_files([dir_entry]) - assert result == [dir_entry["sha1"]] + assert result == [CompositeObjId(sha1=dir_entry["sha1"])] diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index c3fc4fc2..52a9ece6 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2023 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -91,12 +91,12 @@ class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): # then self.expected_results = [ *[ - ContentLicenseRow(id=self.id0, tool=tool, license=license) - for license in SHA1_TO_LICENSES[self.id0] + ContentLicenseRow(id=self.id0["sha1"], tool=tool, license=license) + for license in SHA1_TO_LICENSES[self.id0["sha1"]] ], *[ - ContentLicenseRow(id=self.id1, tool=tool, license=license) - for license in SHA1_TO_LICENSES[self.id1] + ContentLicenseRow(id=self.id1["sha1"], tool=tool, license=license) + for license in SHA1_TO_LICENSES[self.id1["sha1"]] ], *[], # self.id2 ] diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py index 770ab6d1..f9be40bf 100644 --- a/swh/indexer/tests/test_indexer.py +++ 
b/swh/indexer/tests/test_indexer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2023 The Software Heritage developers +# Copyright (C) 2020-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,6 +11,7 @@ import sentry_sdk from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer from swh.indexer.storage import Sha1 +from swh.objstorage.interface import CompositeObjId from .utils import BASE_TEST_CONFIG, DIRECTORY2 @@ -64,7 +65,7 @@ def test_content_indexer_catch_exceptions(sentry_events): sha1 = b"\x12" * 20 # As task, catching exceptions - assert indexer.run([sha1]) == ({"status": "failed"}, []) + assert indexer.run([CompositeObjId(sha1=sha1)]) == ({"status": "failed"}, []) check_sentry(sentry_events, {"swh-indexer-content-sha1": sha1.hex()}) # As journal client, catching exceptions @@ -77,7 +78,7 @@ def test_content_indexer_catch_exceptions(sentry_events): # As task, not catching exceptions with pytest.raises(_TestException): - indexer.run([sha1]) + indexer.run([CompositeObjId(sha1=sha1)]) assert sentry_events == [] # As journal client, not catching exceptions diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index b3553532..69d75726 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -136,7 +136,7 @@ class TestMetadata: ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" - ], + ]["sha1"], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) @@ -199,7 +199,7 @@ class TestMetadata: ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" - ], + ]["sha1"], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index 
eba40428..0f2264e8 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -31,6 +31,7 @@ from swh.model.model import ( SnapshotTargetType, TimestampWithTimezone, ) +from swh.objstorage.interface import CompositeObjId from swh.storage.utils import now BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { @@ -182,13 +183,15 @@ OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = { } MAPPING_DESCRIPTION_CONTENT_SHA1GIT: Dict[str, bytes] = {} -MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, bytes] = {} +MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, CompositeObjId] = {} OBJ_STORAGE_DATA: Dict[bytes, bytes] = {} for key_description, data in OBJ_STORAGE_RAW_CONTENT.items(): content = Content.from_data(data) MAPPING_DESCRIPTION_CONTENT_SHA1GIT[key_description] = content.sha1_git - MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = content.sha1 + MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = CompositeObjId( + sha1=content.sha1 + ) OBJ_STORAGE_DATA[content.sha1] = data @@ -211,21 +214,21 @@ RAW_CONTENT_METADATA = [ ] RAW_CONTENTS: Dict[bytes, Tuple] = {} -RAW_CONTENT_IDS: List[bytes] = [] +RAW_CONTENT_IDS: List[CompositeObjId] = [] for index, raw_content_d in enumerate(RAW_CONTENT_METADATA): raw_content = raw_content_d[0] content = Content.from_data(raw_content) RAW_CONTENTS[content.sha1] = raw_content_d - RAW_CONTENT_IDS.append(content.sha1) + RAW_CONTENT_IDS.append(CompositeObjId(sha1=content.sha1)) # and write it to objstorage data so it's flushed in the objstorage OBJ_STORAGE_DATA[content.sha1] = raw_content SHA1_TO_LICENSES: Dict[bytes, List[str]] = { - RAW_CONTENT_IDS[0]: ["GPL"], - RAW_CONTENT_IDS[1]: ["AGPL"], - RAW_CONTENT_IDS[2]: [], + 
RAW_CONTENT_IDS[0]["sha1"]: ["GPL"], + RAW_CONTENT_IDS[1]["sha1"]: ["AGPL"], + RAW_CONTENT_IDS[2]["sha1"]: [], } @@ -634,7 +637,7 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta): return self.indexer.idx_storage.state def assert_results_ok(self, sha1s, expected_results=None): - sha1s = [hash_to_bytes(sha1) for sha1 in sha1s] + sha1s = [sha1["sha1"] for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: @@ -663,15 +666,23 @@ class CommonContentIndexerTest(metaclass=abc.ABCMeta): """Unknown sha1s are not indexed""" sha1s = [ self.id1, - bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15"), # unknown - bytes.fromhex("800a5ef812c53907562fe379d4b3851e69c7cb15"), # unknown + CompositeObjId( + sha1=bytes.fromhex("799a5ef812c53907562fe379d4b3851e69c7cb15") + ), # unknown + CompositeObjId( + sha1=bytes.fromhex("800a5ef812c53907562fe379d4b3851e69c7cb15") + ), # unknown ] # unknown # when self.indexer.run(sha1s) # then - expected_results = [res for res in self.expected_results if res.id in sha1s] + expected_results = [ + res + for res in self.expected_results + if res.id in [sha1["sha1"] for sha1 in sha1s] + ] self.assert_results_ok(sha1s, expected_results) -- GitLab