def _fix_revision_pypi_empty_string(rev):
    """Replace empty-string author/committer fields of *rev* by empty bytes.

    PyPI loader failed to encode empty strings as bytes, see:
    swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9
    or https://forge.softwareheritage.org/D1772

    The revision dict is modified in place; nothing is returned.
    """
    for role in ('author', 'committer'):
        person = rev[role]
        for field in ('email', 'name'):
            if person.get(field) == '':
                person[field] = b''


def _is_normalized_date_in_range(date):
    """Return whether every field of an already-normalized date is in range.

    Args:
        date: dict holding a 'timestamp' dict (with 'seconds' and
            'microseconds') and an 'offset' entry.

    Returns:
        True when seconds fit a signed 64-bit integer, microseconds are in
        [0, 10**6) and the offset fits a signed 16-bit integer.
    """
    timestamp = date['timestamp']
    # NOTE: the microseconds bound must be a comparison (`<`); a shift (`<<`)
    # would accept any non-negative value and build a million-bit integer.
    return (-2**63 <= timestamp['seconds'] < 2**63) \
        and (0 <= timestamp['microseconds'] < 10**6) \
        and (-2**15 <= date['offset'] < 2**15)


def _check_date(date):
    """Return whether *date* is acceptable for insertion.

    Args:
        date: a date in any representation accepted by
            ``swh.model.identifiers.normalize_timestamp``, or None.

    Returns:
        True for a missing date (there is no field that could be out of
        range), otherwise the result of the range check performed on the
        normalized date.
    """
    if date is None:
        return True
    # Imported here so this helper does not depend on the module header.
    from swh.model.identifiers import normalize_timestamp
    return _is_normalized_date_in_range(normalize_timestamp(date))


def _check_revision_date(rev):
    """Exclude revisions with invalid dates.

    See https://forge.softwareheritage.org/T1339

    Returns:
        True when both rev['date'] and rev['committer_date'] pass
        ``_check_date``.
    """
    return _check_date(rev['date']) and _check_date(rev['committer_date'])


def _fix_revisions(revisions):
    """Repair known-bad revisions and drop the ones with invalid dates.

    Each revision first gets its empty-string author/committer fields
    converted to bytes; revisions whose dates fail ``_check_revision_date``
    are logged and removed.

    Args:
        revisions: list of revision dicts; it is filtered in place (slice
            assignment) so callers holding the list see the result.
    """
    good_revisions = []
    for rev in revisions:
        _fix_revision_pypi_empty_string(rev)
        if not _check_revision_date(rev):
            logging.warning('Excluding revision: %r', rev)
            continue
        good_revisions.append(rev)
    revisions[:] = good_revisions