Page MenuHomeSoftware Heritage

D5414.id19400.diff
No OneTemporary

D5414.id19400.diff

diff --git a/swh/storage/replay.py b/swh/storage/replay.py
--- a/swh/storage/replay.py
+++ b/swh/storage/replay.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import logging
-from typing import Any, Callable, Dict, Iterable, List
+from typing import Any, Callable, Container, Dict, Iterable, List
try:
from systemd.daemon import notify
@@ -103,6 +103,11 @@
logger.error("Collision detected: %(collision)s", {"collision": collision})
+def dict_key_dropper(d: Dict, keys_to_drop: Container) -> Dict:
+ """Returns a copy of the dict d without any key listed in keys_to_drop"""
+ return {k: v for (k, v) in d.items() if k not in keys_to_drop}
+
+
def _insert_objects(object_type: str, objects: List[Dict], storage) -> None:
"""Insert objects of type object_type in the storage.
@@ -146,6 +151,18 @@
storage.metadata_authority_add(authorities)
storage.metadata_fetcher_add(fetchers)
storage.raw_extrinsic_metadata_add(converted)
+ elif object_type == "revision":
+ # drop the metadata field from the revision (if any); this field is
+ # about to be dropped from the data model (in favor of
+ # raw_extrinsic_metadata) and there can be bogus values in the existing
+ # journal (metadata with \0000 in it)
+ method = getattr(storage, object_type + "_add")
+ method(
+ [
+ object_converter_fn[object_type](dict_key_dropper(o, ("metadata",)))
+ for o in objects
+ ]
+ )
elif object_type in (
"directory",
"extid",
diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py
--- a/swh/storage/tests/test_replay.py
+++ b/swh/storage/tests/test_replay.py
@@ -14,8 +14,15 @@
from swh.journal.client import JournalClient
from swh.journal.serializers import key_to_kafka, value_to_kafka
-from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex
-from swh.model.tests.swh_model_data import DUPLICATE_CONTENTS, TEST_OBJECTS
+from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes, hash_to_hex
+from swh.model.model import Revision, RevisionType
+from swh.model.tests.swh_model_data import (
+ COMMITTERS,
+ DATES,
+ DUPLICATE_CONTENTS,
+ REVISIONS,
+)
+from swh.model.tests.swh_model_data import TEST_OBJECTS as _TEST_OBJECTS
from swh.storage import get_storage
from swh.storage.cassandra.model import ContentRow, SkippedContentRow
from swh.storage.in_memory import InMemoryStorage
@@ -23,6 +30,23 @@
UTC = datetime.timezone.utc
+TEST_OBJECTS = _TEST_OBJECTS.copy()
+TEST_OBJECTS["revision"] = list(_TEST_OBJECTS["revision"]) + [
+ Revision(
+ id=hash_to_bytes("a569b03ebe6e5f9f2f6077355c40d89bd6986d0c"),
+ message=b"hello again",
+ date=DATES[1],
+ committer=COMMITTERS[1],
+ author=COMMITTERS[0],
+ committer_date=DATES[0],
+ type=RevisionType.GIT,
+ directory=b"\x03" * 20,
+ synthetic=False,
+ metadata={"something": "interesting"},
+ parents=(REVISIONS[0].id,),
+ ),
+]
+
def nullify_ctime(obj):
if isinstance(obj, (ContentRow, SkippedContentRow)):
@@ -212,6 +236,10 @@
author=row.author.anonymize(),
committer=row.committer.anonymize(),
)
+ if attr == "revisions":
+ # the replayer should now drop the metadata attribute; see
+ # swh/storage/replay.py:_insert_objects()
+ row.metadata = "null"
return row

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 6:51 PM (2 d, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221868

Event Timeline