diff --git a/swh/journal/replay.py b/swh/journal/replay.py
--- a/swh/journal/replay.py
+++ b/swh/journal/replay.py
@@ -9,7 +9,7 @@
 
 from swh.core.statsd import statsd
 from swh.model.identifiers import normalize_timestamp
-from swh.model.hashutil import hash_to_hex
+from swh.model.hashutil import hash_to_hex, hash_to_bytes
 from swh.model.model import SHA1_SIZE
 from swh.objstorage.objstorage import ID_HASH_ALGO
 from swh.storage import HashCollision
@@ -44,6 +44,22 @@
     return rev
 
 
+def _fix_revision_transplant_source(rev):
+    """Converts hex-string 'transplant_source' extra headers to bytes.
+
+    Values that are not strings (eg. already bytes) are left untouched, so
+    applying the fix twice is harmless. The input dict is never mutated.
+    """
+    if rev.get('extra_headers'):
+        rev = rev.copy()
+        rev['extra_headers'] = [
+            [key, hash_to_bytes(value)]
+            if key == 'transplant_source' and isinstance(value, str)
+            else [key, value]
+            for (key, value) in rev['extra_headers']]
+    return rev
+
+
 def _check_date(date):
     """Returns whether the date can be represented in backends with sane
     limits on timestamps and timezeones (resp. signed 64-bits and
@@ -65,6 +81,7 @@
     good_revisions = []
     for rev in revisions:
         rev = _fix_revision_pypi_empty_string(rev)
+        rev = _fix_revision_transplant_source(rev)
         if not _check_revision_date(rev):
             logging.warning('Excluding revision (invalid date): %r', rev)
             continue
@@ -91,7 +108,7 @@
 
 
 def fix_objects(object_type, objects):
-    """Converts a possibly old object from the journal to its current
+    r"""Converts a possibly old object from the journal to its current
     expected format.
 
     List of conversions:
@@ -119,6 +136,21 @@
       'date': {'offset': 0,
                'timestamp': {'microseconds': 0, 'seconds': 1565096932}}}]
 
+    Fix type of 'transplant_source' extra headers:
+
+    >>> revs = fix_objects('revision', [{
+    ...     'author': {'email': '', 'fullname': b'', 'name': ''},
+    ...     'committer': {'email': '', 'fullname': b'', 'name': ''},
+    ...     'date': date,
+    ...     'committer_date': date,
+    ...     'extra_headers': [
+    ...         ['time_offset_seconds', b'-3600'],
+    ...         ['transplant_source', '29c154a012a70f49df983625090434587622b39e']]
+    ... }])
+    >>> pprint(revs[0]['extra_headers'])
+    [['time_offset_seconds', b'-3600'],
+     ['transplant_source', b')\xc1T\xa0\x12\xa7\x0fI\xdf\x986%\t\x044Xv"\xb3\x9e']]
+
     Filter out revisions with invalid dates:
 
     >>> from copy import deepcopy