diff --git a/swh/storage/replay.py b/swh/storage/replay.py index 0516ba853cc323f15035bb50a6bf617e5a5f4a36..52ce8dd10388c6cfbf86bf44052e16e0b6d505ff 100644 --- a/swh/storage/replay.py +++ b/swh/storage/replay.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information import logging -from typing import Any, Callable, Dict, Iterable, List +from typing import Any, Callable, Container, Dict, Iterable, List try: from systemd.daemon import notify @@ -103,6 +103,11 @@ def collision_aware_content_add( logger.error("Collision detected: %(collision)s", {"collision": collision}) +def dict_key_dropper(d: Dict, keys_to_drop: Container) -> Dict: + """Returns a copy of the dict d without any key listed in keys_to_drop""" + return {k: v for (k, v) in d.items() if k not in keys_to_drop} + + def _insert_objects(object_type: str, objects: List[Dict], storage) -> None: """Insert objects of type object_type in the storage. @@ -146,6 +151,18 @@ def _insert_objects(object_type: str, objects: List[Dict], storage) -> None: storage.metadata_authority_add(authorities) storage.metadata_fetcher_add(fetchers) storage.raw_extrinsic_metadata_add(converted) + elif object_type == "revision": + # drop the metadata field from the revision (is any); this field is + # about to be dropped from the data model (in favor of + # raw_extrinsic_metadata) and there can be bogus values in the existing + # journal (metadata with \0000 in it) + method = getattr(storage, object_type + "_add") + method( + [ + object_converter_fn[object_type](dict_key_dropper(o, ("metadata",))) + for o in objects + ] + ) elif object_type in ( "directory", "extid", diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py index cda12ed342c4d62f767db8639f296e5b158598d5..6562f2aa253e1ac52c455628e4485f2ea765f9f3 100644 --- a/swh/storage/tests/test_replay.py +++ b/swh/storage/tests/test_replay.py @@ -14,8 +14,15 @@ import pytest from swh.journal.client import JournalClient from swh.journal.serializers import key_to_kafka, value_to_kafka -from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex -from swh.model.tests.swh_model_data import DUPLICATE_CONTENTS, TEST_OBJECTS +from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes, hash_to_hex +from swh.model.model import Revision, RevisionType +from swh.model.tests.swh_model_data import ( + COMMITTERS, + DATES, + DUPLICATE_CONTENTS, + REVISIONS, +) +from swh.model.tests.swh_model_data import TEST_OBJECTS as _TEST_OBJECTS from swh.storage import get_storage from swh.storage.cassandra.model import ContentRow, SkippedContentRow from swh.storage.in_memory import InMemoryStorage @@ -23,6 +30,23 @@ from swh.storage.replay import process_replay_objects UTC = datetime.timezone.utc +TEST_OBJECTS = _TEST_OBJECTS.copy() +TEST_OBJECTS["revision"] = list(_TEST_OBJECTS["revision"]) + [ + Revision( + id=hash_to_bytes("a569b03ebe6e5f9f2f6077355c40d89bd6986d0c"), + message=b"hello again", + date=DATES[1], + committer=COMMITTERS[1], + author=COMMITTERS[0], + committer_date=DATES[0], + type=RevisionType.GIT, + directory=b"\x03" * 20, + synthetic=False, + metadata={"something": "interesting"}, + parents=(REVISIONS[0].id,), + ), +] + def nullify_ctime(obj): if isinstance(obj, (ContentRow, SkippedContentRow)): @@ -212,6 +236,10 @@ def check_replayed( author=row.author.anonymize(), committer=row.committer.anonymize(), ) + if attr == "revisions": + # the replayer should now drop the metadata attribute; see + # swh/storgae/replay.py:_insert_objects() + row.metadata = "null" return row