Changeset View
Changeset View
Standalone View
Standalone View
swh/journal/replay.py
Show First 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | |||||
def _check_revision_date(rev): | def _check_revision_date(rev): | ||||
"""Exclude revisions with invalid dates. | """Exclude revisions with invalid dates. | ||||
See https://forge.softwareheritage.org/T1339""" | See https://forge.softwareheritage.org/T1339""" | ||||
return _check_date(rev['date']) and _check_date(rev['committer_date']) | return _check_date(rev['date']) and _check_date(rev['committer_date']) | ||||
def _fix_revisions(revisions: List[Dict]) -> List[Dict]: | def _fix_revision(revision: Dict[str, Any]) -> Optional[Revision]: | ||||
"""Adapt revisions into a list of (current) storage compatible dicts. | """Adapt revision into an swh revision model object (current storage | ||||
compatible). | |||||
Fix author/committer person: | |||||
>>> from pprint import pprint | >>> from pprint import pprint | ||||
>>> date = { | >>> date = { | ||||
... 'timestamp': { | ... 'timestamp': { | ||||
... 'seconds': 1565096932, | ... 'seconds': 1565096932, | ||||
... 'microseconds': 0, | ... 'microseconds': 0, | ||||
... }, | ... }, | ||||
... 'offset': 0, | ... 'offset': 0, | ||||
... } | ... } | ||||
>>> pprint(_fix_revisions([{ | >>> rev0 = _fix_revision({ | ||||
... 'id': b'rev-id', | |||||
... 'author': {'email': '', 'fullname': b'', 'name': ''}, | ... 'author': {'email': '', 'fullname': b'', 'name': ''}, | ||||
... 'committer': {'email': '', 'fullname': b'', 'name': ''}, | ... 'committer': {'email': '', 'fullname': b'', 'name': ''}, | ||||
... 'date': date, | ... 'date': date, | ||||
... 'committer_date': date, | ... 'committer_date': date, | ||||
... }])) | ... 'type': 'git', | ||||
[{'author': {'email': b'', 'fullname': b'', 'name': b''}, | ... 'message': '', | ||||
'committer': {'email': b'', 'fullname': b'', 'name': b''}, | ... 'directory': b'dir-id', | ||||
'committer_date': {'offset': 0, | ... 'synthetic': False, | ||||
'timestamp': {'microseconds': 0, 'seconds': 1565096932}}, | ... }).to_dict() | ||||
'date': {'offset': 0, | >>> rev0['author'] | ||||
'timestamp': {'microseconds': 0, 'seconds': 1565096932}}}] | {'fullname': b'', 'name': b'', 'email': b''} | ||||
>>> rev0['committer'] | |||||
{'fullname': b'', 'name': b'', 'email': b''} | |||||
Fix type of 'transplant_source' extra headers: | Fix type of 'transplant_source' extra headers: | ||||
>>> revs = _fix_revisions([{ | >>> rev1 = _fix_revision({ | ||||
... 'id': b'rev-id', | |||||
... 'author': {'email': '', 'fullname': b'', 'name': ''}, | ... 'author': {'email': '', 'fullname': b'', 'name': ''}, | ||||
... 'committer': {'email': '', 'fullname': b'', 'name': ''}, | ... 'committer': {'email': '', 'fullname': b'', 'name': ''}, | ||||
... 'date': date, | ... 'date': date, | ||||
... 'committer_date': date, | ... 'committer_date': date, | ||||
... 'metadata': { | ... 'metadata': { | ||||
... 'extra_headers': [ | ... 'extra_headers': [ | ||||
... ['time_offset_seconds', b'-3600'], | ... ['time_offset_seconds', b'-3600'], | ||||
... ['transplant_source', '29c154a012a70f49df983625090434587622b39e'] # noqa | ... ['transplant_source', '29c154a012a70f49df983625090434587622b39e'] | ||||
... ]} | ... ]}, | ||||
... }]) | ... 'type': 'git', | ||||
>>> pprint(revs[0]['metadata']['extra_headers']) | ... 'message': '', | ||||
... 'directory': b'dir-id', | |||||
... 'synthetic': False, | |||||
... }) | |||||
>>> pprint(rev1.metadata['extra_headers']) | |||||
[['time_offset_seconds', b'-3600'], | [['time_offset_seconds', b'-3600'], | ||||
['transplant_source', b'29c154a012a70f49df983625090434587622b39e']] | ['transplant_source', b'29c154a012a70f49df983625090434587622b39e']] | ||||
Filter out revisions with invalid dates: | Revisions with invalid dates are filtered: | ||||
>>> from copy import deepcopy | >>> from copy import deepcopy | ||||
>>> invalid_date1 = deepcopy(date) | >>> invalid_date1 = deepcopy(date) | ||||
>>> invalid_date1['timestamp']['microseconds'] = 1000000000 # > 10^6 | >>> invalid_date1['timestamp']['microseconds'] = 1000000000 # > 10^6 | ||||
>>> _fix_revisions([{ | >>> rev = _fix_revision({ | ||||
... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'date': invalid_date1, | ... 'date': invalid_date1, | ||||
... 'committer_date': date, | ... 'committer_date': date, | ||||
... }]) | ... }) | ||||
[] | >>> rev is None | ||||
True | |||||
>>> invalid_date2 = deepcopy(date) | >>> invalid_date2 = deepcopy(date) | ||||
>>> invalid_date2['timestamp']['seconds'] = 2**70 # > 2^63 | >>> invalid_date2['timestamp']['seconds'] = 2**70 # > 2^63 | ||||
>>> _fix_revisions([{ | >>> rev = _fix_revision({ | ||||
... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'date': invalid_date2, | ... 'date': invalid_date2, | ||||
... 'committer_date': date, | ... 'committer_date': date, | ||||
... }]) | ... }) | ||||
[] | >>> rev is None | ||||
True | |||||
>>> invalid_date3 = deepcopy(date) | >>> invalid_date3 = deepcopy(date) | ||||
>>> invalid_date3['offset'] = 2**20 # > 2^15 | >>> invalid_date3['offset'] = 2**20 # > 2^15 | ||||
>>> _fix_revisions([{ | >>> rev = _fix_revision({ | ||||
... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ... 'author': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ... 'committer': {'email': '', 'fullname': b'', 'name': b''}, | ||||
... 'date': date, | ... 'date': date, | ||||
... 'committer_date': invalid_date3, | ... 'committer_date': invalid_date3, | ||||
... }]) | ... }) | ||||
[] | >>> rev is None | ||||
True | |||||
""" | """ # noqa | ||||
good_revisions: List = [] | rev = _fix_revision_pypi_empty_string(revision) | ||||
for rev in revisions: | |||||
rev = _fix_revision_pypi_empty_string(rev) | |||||
rev = _fix_revision_transplant_source(rev) | rev = _fix_revision_transplant_source(rev) | ||||
if not _check_revision_date(rev): | if not _check_revision_date(rev): | ||||
logging.warning('Excluding revision (invalid date): %r', rev) | logger.warning('Invalid revision date detected: %(revision)s', { | ||||
continue | 'revision': rev | ||||
if rev not in good_revisions: | }) | ||||
good_revisions.append(rev) | return None | ||||
return good_revisions | return Revision.from_dict(rev) | ||||
def _fix_origin_visit(visit: Dict) -> OriginVisit: | def _fix_origin_visit(visit: Dict) -> OriginVisit: | ||||
"""Adapt origin visits into a list of current storage compatible | """Adapt origin visits into a list of current storage compatible | ||||
OriginVisits. | OriginVisits. | ||||
`visit['origin']` is a dict instead of an URL: | `visit['origin']` is a dict instead of an URL: | ||||
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines | if object_type == 'content': | ||||
else: | else: | ||||
contents.append(c) | contents.append(c) | ||||
collision_aware_content_add( | collision_aware_content_add( | ||||
storage.skipped_content_add, skipped_contents) | storage.skipped_content_add, skipped_contents) | ||||
collision_aware_content_add( | collision_aware_content_add( | ||||
storage.content_add_metadata, contents) | storage.content_add_metadata, contents) | ||||
elif object_type == 'revision': | elif object_type == 'revision': | ||||
storage.revision_add( | revisions: List[Revision] = [] | ||||
Revision.from_dict(r) for r in _fix_revisions(objects) | for revision in objects: | ||||
) | rev = _fix_revision(revision) | ||||
if rev: | |||||
revisions.append(rev) | |||||
storage.revision_add(revisions) | |||||
elif object_type == 'origin_visit': | elif object_type == 'origin_visit': | ||||
visits = [_fix_origin_visit(v) for v in objects] | visits = [_fix_origin_visit(v) for v in objects] | ||||
storage.origin_add(Origin(url=v.origin) for v in visits) | storage.origin_add(Origin(url=v.origin) for v in visits) | ||||
storage.origin_visit_upsert(visits) | storage.origin_visit_upsert(visits) | ||||
elif object_type in ('directory', 'release', 'snapshot', 'origin'): | elif object_type in ('directory', 'release', 'snapshot', 'origin'): | ||||
method = getattr(storage, object_type + '_add') | method = getattr(storage, object_type + '_add') | ||||
method(object_converter_fn[object_type](o) for o in objects) | method(object_converter_fn[object_type](o) for o in objects) | ||||
else: | else: | ||||
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines |