Skip to content
Snippets Groups Projects
Commit 39507b24 authored by David Douard's avatar David Douard
Browse files

Make the replayer drop the Revision.metadata

This attribute is deprecated and on the verge of being replaced by
RawExtrinsicMetadata objects, and the kafka journal currently in production
contains a few invalid metadata entries that make the replayer unhappy.

Closes T3201.
parent 84dcbe3d
No related branches found
Tags v0.27.2
1 merge request!1004Make the replayer drop the Revision.metadata
......@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import logging
from typing import Any, Callable, Dict, Iterable, List
from typing import Any, Callable, Container, Dict, Iterable, List
try:
from systemd.daemon import notify
......@@ -103,6 +103,11 @@ def collision_aware_content_add(
logger.error("Collision detected: %(collision)s", {"collision": collision})
def dict_key_dropper(d: Dict, keys_to_drop: Container) -> Dict:
    """Return a copy of the dict ``d`` without any key listed in ``keys_to_drop``.

    The input dict is left untouched; values are not copied, so the result
    shares them with ``d``.

    Args:
        d: the dict to filter.
        keys_to_drop: container of keys to leave out. Membership is tested
            with ``in``, so pass a tuple, list, set or frozenset of keys. A
            bare ``str`` would be matched by substring, which is almost never
            what is wanted (``"metadata"`` would also drop a ``"meta"`` key).

    Returns:
        A new dict holding every ``(key, value)`` pair of ``d`` whose key is
        not in ``keys_to_drop``, in the same iteration order as ``d``.
    """
    return {k: v for (k, v) in d.items() if k not in keys_to_drop}
def _insert_objects(object_type: str, objects: List[Dict], storage) -> None:
"""Insert objects of type object_type in the storage.
......@@ -146,6 +151,18 @@ def _insert_objects(object_type: str, objects: List[Dict], storage) -> None:
storage.metadata_authority_add(authorities)
storage.metadata_fetcher_add(fetchers)
storage.raw_extrinsic_metadata_add(converted)
elif object_type == "revision":
# drop the metadata field from the revision (if any); this field is
# about to be dropped from the data model (in favor of
# raw_extrinsic_metadata) and there can be bogus values in the existing
# journal (metadata with \0000 in it)
method = getattr(storage, object_type + "_add")
method(
[
object_converter_fn[object_type](dict_key_dropper(o, ("metadata",)))
for o in objects
]
)
elif object_type in (
"directory",
"extid",
......
......@@ -14,8 +14,15 @@ import pytest
from swh.journal.client import JournalClient
from swh.journal.serializers import key_to_kafka, value_to_kafka
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex
from swh.model.tests.swh_model_data import DUPLICATE_CONTENTS, TEST_OBJECTS
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes, hash_to_hex
from swh.model.model import Revision, RevisionType
from swh.model.tests.swh_model_data import (
COMMITTERS,
DATES,
DUPLICATE_CONTENTS,
REVISIONS,
)
from swh.model.tests.swh_model_data import TEST_OBJECTS as _TEST_OBJECTS
from swh.storage import get_storage
from swh.storage.cassandra.model import ContentRow, SkippedContentRow
from swh.storage.in_memory import InMemoryStorage
......@@ -23,6 +30,23 @@ from swh.storage.replay import process_replay_objects
# Short alias for the UTC timezone object.
UTC = datetime.timezone.utc
# Work on a copy of the imported test-object mapping, so that replacing the
# "revision" entry below does not rebind that key on the imported object.
TEST_OBJECTS = _TEST_OBJECTS.copy()
# Extend the revision list with one extra revision that carries a non-empty
# ``metadata`` dict. NOTE(review): presumably this is here to exercise the
# replayer dropping Revision.metadata (check_replayed resets the attribute on
# the expected revisions) -- confirm against the tests using TEST_OBJECTS.
TEST_OBJECTS["revision"] = list(_TEST_OBJECTS["revision"]) + [
Revision(
id=hash_to_bytes("a569b03ebe6e5f9f2f6077355c40d89bd6986d0c"),
message=b"hello again",
date=DATES[1],
committer=COMMITTERS[1],
author=COMMITTERS[0],
committer_date=DATES[0],
type=RevisionType.GIT,
directory=b"\x03" * 20,
synthetic=False,
# the one field this extra revision is really about
metadata={"something": "interesting"},
parents=(REVISIONS[0].id,),
),
]
def nullify_ctime(obj):
if isinstance(obj, (ContentRow, SkippedContentRow)):
......@@ -212,6 +236,10 @@ def check_replayed(
author=row.author.anonymize(),
committer=row.committer.anonymize(),
)
if attr == "revisions":
# the replayer should now drop the metadata attribute; see
# swh/storage/replay.py:_insert_objects()
row.metadata = "null"
return row
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment