diff --git a/swh/journal/replay.py b/swh/journal/replay.py --- a/swh/journal/replay.py +++ b/swh/journal/replay.py @@ -44,6 +44,17 @@ return rev +def _fix_revision_transplant_source(rev): + if rev.get('extra_headers'): + rev = rev.copy() + rev['extra_headers'] = [ + [key, value.encode('ascii')] + if key == 'transplant_source' and isinstance(value, str) + else [key, value] + for (key, value) in rev['extra_headers']] + return rev + + def _check_date(date): """Returns whether the date can be represented in backends with sane limits on timestamps and timezeones (resp. signed 64-bits and @@ -65,6 +76,7 @@ good_revisions = [] for rev in revisions: rev = _fix_revision_pypi_empty_string(rev) + rev = _fix_revision_transplant_source(rev) if not _check_revision_date(rev): logging.warning('Excluding revision (invalid date): %r', rev) continue @@ -91,7 +103,7 @@ def fix_objects(object_type, objects): - """Converts a possibly old object from the journal to its current + r"""Converts a possibly old object from the journal to its current expected format. List of conversions: @@ -119,6 +131,21 @@ 'date': {'offset': 0, 'timestamp': {'microseconds': 0, 'seconds': 1565096932}}}] + Fix type of 'transplant_source' extra headers: + + >>> revs = fix_objects('revision', [{ + ... 'author': {'email': '', 'fullname': b'', 'name': ''}, + ... 'committer': {'email': '', 'fullname': b'', 'name': ''}, + ... 'date': date, + ... 'committer_date': date, + ... 'extra_headers': [ + ... ['time_offset_seconds', b'-3600'], + ... ['transplant_source', '29c154a012a70f49df983625090434587622b39e']] + ... }]) + >>> pprint(revs[0]['extra_headers']) + [['time_offset_seconds', b'-3600'], + ['transplant_source', b'29c154a012a70f49df983625090434587622b39e']] + Filter out revisions with invalid dates: >>> from copy import deepcopy