diff --git a/swh/storage/replay.py b/swh/storage/replay.py --- a/swh/storage/replay.py +++ b/swh/storage/replay.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information import logging -from typing import Any, Callable, Dict, Iterable, List +from typing import Any, Callable, Container, Dict, Iterable, List try: from systemd.daemon import notify @@ -103,6 +103,11 @@ logger.error("Collision detected: %(collision)s", {"collision": collision}) +def dict_key_dropper(d: Dict, keys_to_drop: Container) -> Dict: + """Return a shallow copy of d without any key listed in keys_to_drop.""" + return {k: v for (k, v) in d.items() if k not in keys_to_drop} + + def _insert_objects(object_type: str, objects: List[Dict], storage) -> None: """Insert objects of type object_type in the storage. @@ -146,6 +151,18 @@ storage.metadata_authority_add(authorities) storage.metadata_fetcher_add(fetchers) storage.raw_extrinsic_metadata_add(converted) + elif object_type == "revision": + # drop the metadata field from the revision (if any); this field is + # about to be dropped from the data model (in favor of + # raw_extrinsic_metadata) and there can be bogus values in the existing + # journal (metadata with \0000 in it) + method = getattr(storage, object_type + "_add") + method( + [ + object_converter_fn[object_type](dict_key_dropper(o, ("metadata",))) + for o in objects + ] + ) elif object_type in ( "directory", "extid", diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py --- a/swh/storage/tests/test_replay.py +++ b/swh/storage/tests/test_replay.py @@ -14,8 +14,15 @@ from swh.journal.client import JournalClient from swh.journal.serializers import key_to_kafka, value_to_kafka -from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex -from swh.model.tests.swh_model_data import DUPLICATE_CONTENTS, TEST_OBJECTS +from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes, hash_to_hex +from swh.model.model import Revision, 
RevisionType +from swh.model.tests.swh_model_data import ( + COMMITTERS, + DATES, + DUPLICATE_CONTENTS, + REVISIONS, +) +from swh.model.tests.swh_model_data import TEST_OBJECTS as _TEST_OBJECTS from swh.storage import get_storage from swh.storage.cassandra.model import ContentRow, SkippedContentRow from swh.storage.in_memory import InMemoryStorage @@ -23,6 +30,23 @@ UTC = datetime.timezone.utc +TEST_OBJECTS = _TEST_OBJECTS.copy() +TEST_OBJECTS["revision"] = list(_TEST_OBJECTS["revision"]) + [ + Revision( + id=hash_to_bytes("a569b03ebe6e5f9f2f6077355c40d89bd6986d0c"), + message=b"hello again", + date=DATES[1], + committer=COMMITTERS[1], + author=COMMITTERS[0], + committer_date=DATES[0], + type=RevisionType.GIT, + directory=b"\x03" * 20, + synthetic=False, + metadata={"something": "interesting"}, + parents=(REVISIONS[0].id,), + ), +] + def nullify_ctime(obj): if isinstance(obj, (ContentRow, SkippedContentRow)): @@ -212,6 +236,10 @@ author=row.author.anonymize(), committer=row.committer.anonymize(), ) + if attr == "revisions": + # the replayer should now drop the metadata attribute; see + # swh/storage/replay.py:_insert_objects() + row.metadata = "null" return row