diff --git a/swh/journal/replay.py b/swh/journal/replay.py
--- a/swh/journal/replay.py
+++ b/swh/journal/replay.py
@@ -8,6 +8,7 @@
 from concurrent.futures import ThreadPoolExecutor
 
 from swh.storage import HashCollision
+from swh.model.identifiers import normalize_timestamp
 from swh.model.hashutil import hash_to_hex
 from swh.objstorage.objstorage import ID_HASH_ALGO
 from swh.core.statsd import statsd
@@ -20,19 +21,52 @@
         _insert_objects(object_type, objects, storage)
 
 
+def _fix_revision_pypi_empty_string(rev):
+    """PyPI loader failed to encode empty strings as bytes, see:
+    swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9
+    or https://forge.softwareheritage.org/D1772
+    """
+    if rev['author'].get('email') == '':
+        rev['author']['email'] = b''
+    if rev['author'].get('name') == '':
+        rev['author']['name'] = b''
+    if rev['committer'].get('email') == '':
+        rev['committer']['email'] = b''
+    if rev['committer'].get('name') == '':
+        rev['committer']['name'] = b''
+
+
+def _check_date(date):
+    """Return whether `date` fits within the limits enforced here.
+
+    Checks that seconds fit a signed 64-bit integer, that microseconds
+    are in [0, 10**6) and that the offset fits a signed 16-bit integer.
+    A missing date (None) is considered valid.
+    """
+    if date is None:
+        return True
+    date = normalize_timestamp(date)
+    return (-2**63 <= date['timestamp']['seconds'] < 2**63) \
+        and (0 <= date['timestamp']['microseconds'] < 10**6) \
+        and (-2**15 <= date['offset'] < 2**15)
+
+
+def _check_revision_date(rev):
+    """Exclude revisions with invalid dates.
+    See https://forge.softwareheritage.org/T1339"""
+    return _check_date(rev['date']) and _check_date(rev['committer_date'])
+
+
 def _fix_revisions(revisions):
+    """Fix known-bad revisions in place and drop those with invalid dates."""
+    good_revisions = []
     for rev in revisions:
-        # PyPI loader failed to encode empty strings as bytes, see:
-        # swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9
-        # or https://forge.softwareheritage.org/D1772
-        if rev['author'].get('email') == '':
-            rev['author']['email'] = b''
-        if rev['author'].get('name') == '':
-            rev['author']['name'] = b''
-        if rev['committer'].get('email') == '':
-            rev['committer']['email'] = b''
-        if rev['committer'].get('name') == '':
-            rev['committer']['name'] = b''
+        _fix_revision_pypi_empty_string(rev)
+        if not _check_revision_date(rev):
+            logging.warning('Excluding revision: %r', rev)
+            continue
+        good_revisions.append(rev)
+    revisions[:] = good_revisions
 
 
 def _fix_origin_visits(visits):
diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py
--- a/swh/journal/tests/conftest.py
+++ b/swh/journal/tests/conftest.py
@@ -35,7 +35,7 @@
     },
 ]
 
-COMMITTER = [
+COMMITTERS = [
     {
         'id': 1,
         'fullname': b'foo',
@@ -46,36 +46,41 @@
     }
 ]
 
+DATES = [
+    {
+        'timestamp': {
+            'seconds': 1234567891,
+            'microseconds': 0,
+        },
+        'offset': 120,
+        'negative_utc': None,
+    },
+    {
+        'timestamp': {
+            'seconds': 1234567892,
+            'microseconds': 0,
+        },
+        'offset': 120,
+        'negative_utc': None,
+    }
+]
+
 REVISIONS = [
     {
         'id': hash_to_bytes('7026b7c1a2af56521e951c01ed20f255fa054238'),
         'message': b'hello',
-        'date': {
-            'timestamp': {
-                'seconds': 1234567891,
-                'microseconds': 0,
-            },
-            'offset': 120,
-            'negative_utc': None,
-        },
-        'committer': COMMITTER[0],
-        'author': COMMITTER[0],
-        'committer_date': None,
+        'date': DATES[0],
+        'committer': COMMITTERS[0],
+        'author': COMMITTERS[0],
+        'committer_date': DATES[0],
     },
     {
         'id': hash_to_bytes('368a48fe15b7db2383775f97c6b247011b3f14f4'),
         'message': b'hello again',
-        'date': {
-            'timestamp': {
-                'seconds': 1234567892,
-                'microseconds': 0,
-            },
-            'offset': 120,
-            'negative_utc': None,
-        },
-        'committer': COMMITTER[1],
-        'author': COMMITTER[1],
-        'committer_date': None,
+        'date': DATES[1],
+        'committer': COMMITTERS[1],
+        'author': COMMITTERS[1],
+        'committer_date': DATES[1],
     },
 ]
 
@@ -91,7 +96,7 @@
             'offset': 120,
             'negative_utc': None,
         },
-        'author': COMMITTER[0],
+        'author': COMMITTERS[0],
     },
 ]
 