def _fix_revision(rev):
    """Replace empty-``str`` author/committer names and emails with ``b''``.

    The PyPI loader failed to encode empty strings as bytes, see
    swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9
    or https://forge.softwareheritage.org/D1772

    The revision dict is modified in place.  A missing or non-dict
    ``author`` / ``committer`` entry is left untouched instead of raising.
    """
    for role in ('author', 'committer'):
        person = rev.get(role)
        if not isinstance(person, dict):
            # Nothing to normalise for this role.
            continue
        for field in ('email', 'name'):
            # Only the exact empty str is rewritten; b'' and non-empty
            # values are left as they are.
            if person.get(field) == '':
                person[field] = b''


def _fix_origin_visit(visit):
    """Upgrade an old-format origin visit dict, in place.

    Two legacy shapes are handled:

    * ``visit['origin']`` is a bare URL string: it is wrapped into
      ``{'url': <that string>}``.
    * ``visit['type']`` is missing but ``visit['origin']['type']`` exists:
      the origin's type is copied to ``visit['type']``.

    A visit whose origin carries no ``'type'`` is left without one rather
    than raising ``KeyError``.
    """
    origin = visit['origin']
    if isinstance(origin, str):
        # Note that this will crash with the pg and in-mem storages if the
        # origin is not already known, but there is no other choice because
        # we can't add an origin without knowing its type.  Non-pg storages
        # don't use a numeric FK internally.
        visit['origin'] = {'url': origin}
    elif ('type' not in visit
          and isinstance(origin, dict)
          and 'type' in origin):
        visit['type'] = origin['type']


def fix_object(object_type, obj):
    """Converts a possibly old object from the journal to its current
    expected format.

    The object is modified in place and also returned.  Object types other
    than ``'revision'`` and ``'origin_visit'`` are returned unchanged.

    List of conversions:

    Empty author name/email in PyPI releases:

    >>> from pprint import pprint
    >>> pprint(fix_object('revision', {
    ...     'author': {'email': '', 'fullname': b'', 'name': ''},
    ...     'committer': {'email': '', 'fullname': b'', 'name': ''},
    ... }))
    {'author': {'email': b'', 'fullname': b'', 'name': b''},
     'committer': {'email': b'', 'fullname': b'', 'name': b''}}

    `visit['origin']` is a URL instead of a dict:

    >>> fix_object('origin_visit', {'origin': 'http://foo'})
    {'origin': {'url': 'http://foo'}}

    `visit['type']` is missing, but `visit['origin']['type']` exists:

    >>> pprint(fix_object(
    ...     'origin_visit',
    ...     {'origin': {'type': 'hg', 'url': 'http://foo'}}))
    {'origin': {'type': 'hg', 'url': 'http://foo'}, 'type': 'hg'}
    """
    if object_type == 'revision':
        _fix_revision(obj)
    elif object_type == 'origin_visit':
        _fix_origin_visit(obj)
    return obj


# NOTE(review): the patch these helpers come from also changes
# ``_insert_objects``, which is only partially visible and therefore not
# reproduced here.  Per the visible hunks it now calls
# ``fix_object(object_type, obj)`` on every object before dispatching on
# ``object_type``, and for 'origin_visit' it only calls
# ``storage.origin_add_one(visit['origin'])`` when the origin dict has a
# 'type' key, before ``storage.origin_visit_upsert(objects)``.