Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/fixer.py
# Copyright (C) 2020 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import copy | import copy | ||||
import datetime | |||||
import logging | import logging | ||||
from typing import Any, Dict, List, Optional | from typing import Any, Dict, List, Optional | ||||
from swh.model.identifiers import normalize_timestamp | from swh.model.identifiers import normalize_timestamp | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]: | def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]: | ||||
"""Filters-out invalid 'perms' key that leaked from swh.model.from_disk | """Filters-out invalid 'perms' key that leaked from swh.model.from_disk | ||||
▲ Show 20 Lines • Show All 215 Lines • ▼ Show 20 Lines | def _fix_origin_visit(visit: Dict) -> Dict: | ||||
... })) | ... })) | ||||
{'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc), | {'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc), | ||||
'metadata': None, | 'metadata': None, | ||||
'origin': 'http://foo', | 'origin': 'http://foo', | ||||
'snapshot': None, | 'snapshot': None, | ||||
'status': 'ongoing', | 'status': 'ongoing', | ||||
'type': 'hg'} | 'type': 'hg'} | ||||
>>> pprint(_fix_origin_visit( | |||||
... {'origin': {'type': 'hg', 'url': 'http://foo'}, | |||||
... 'date': '2020-02-27 14:39:19+00:00', | |||||
... 'status': 'ongoing', | |||||
... 'snapshot': None, | |||||
... })) | |||||
{'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc), | |||||
'metadata': None, | |||||
'origin': 'http://foo', | |||||
'snapshot': None, | |||||
'status': 'ongoing', | |||||
'type': 'hg'} | |||||
Old visit format (origin_visit with no type) raises: | Old visit format (origin_visit with no type) raises: | ||||
>>> _fix_origin_visit({ | >>> _fix_origin_visit({ | ||||
... 'origin': {'url': 'http://foo'}, | ... 'origin': {'url': 'http://foo'}, | ||||
... 'date': date, | ... 'date': date, | ||||
... 'status': 'ongoing', | ... 'status': 'ongoing', | ||||
... 'snapshot': None | ... 'snapshot': None | ||||
... }) | ... }) | ||||
Show All 25 Lines | if "type" not in visit: | ||||
# such visits. If it does, the replayer must crash so we can fix | # such visits. If it does, the replayer must crash so we can fix | ||||
# the journal's topic. | # the journal's topic. | ||||
raise ValueError(f"Old origin visit format detected: {visit}") | raise ValueError(f"Old origin visit format detected: {visit}") | ||||
if isinstance(visit["origin"], dict): | if isinstance(visit["origin"], dict): | ||||
# Old version of the schema: visit['origin'] was a dict. | # Old version of the schema: visit['origin'] was a dict. | ||||
visit["origin"] = visit["origin"]["url"] | visit["origin"] = visit["origin"]["url"] | ||||
if "metadata" not in visit: | if "metadata" not in visit: | ||||
visit["metadata"] = None | visit["metadata"] = None | ||||
date = visit["date"] | |||||
if isinstance(date, str): | |||||
ardumont: `datetime.datetime.fromisoformat` is new in Python 3.7 [1]
[1] https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat | |||||
visit["date"] = datetime.datetime.fromisoformat(date) | |||||
return visit | return visit | ||||
def fix_objects(object_type: str, objects: List[Dict]) -> List[Dict]: | def fix_objects(object_type: str, objects: List[Dict]) -> List[Dict]: | ||||
""" | """ | ||||
Fix legacy objects from the journal to bring them up to date with the | Fix legacy objects from the journal to bring them up to date with the | ||||
latest storage schema. | latest storage schema. | ||||
""" | """ | ||||
Show All 11 Lines |
`datetime.datetime.fromisoformat` is new in Python 3.7 [1]
[1] https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat