Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/fixer.py
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import copy | import copy | ||||||||||
import datetime | import datetime | ||||||||||
import logging | import logging | ||||||||||
from typing import Any, Dict, List, Optional | from typing import Any, Dict, List, Optional | ||||||||||
from swh.model.identifiers import normalize_timestamp | from swh.model.identifiers import normalize_timestamp | ||||||||||
from swh.model.model import Origin | |||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||||||
def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]: | def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]: | ||||||||||
"""Filters-out invalid 'perms' key that leaked from swh.model.from_disk | """Filters-out invalid 'perms' key that leaked from swh.model.from_disk | ||||||||||
to the journal. | to the journal. | ||||||||||
▲ Show 20 Lines • Show All 265 Lines • ▼ Show 20 Lines | def _fix_origin_visit(visit: Dict) -> Dict: | ||||||||||
if isinstance(date, str): | if isinstance(date, str): | ||||||||||
visit["date"] = datetime.datetime.fromisoformat(date) | visit["date"] = datetime.datetime.fromisoformat(date) | ||||||||||
# Those are no longer part of the model | # Those are no longer part of the model | ||||||||||
for key in ["status", "snapshot", "metadata"]: | for key in ["status", "snapshot", "metadata"]: | ||||||||||
visit.pop(key, None) | visit.pop(key, None) | ||||||||||
return visit | return visit | ||||||||||
def _fix_raw_extrinsic_metadata(obj_dict: Dict) -> Dict:
    """Fix legacy RawExtrinsicMetadata with type which is no longer part of the model.

    The ``type`` key is removed from a shallow copy of ``obj_dict``; the dict
    passed in is left untouched. When the removed value was ``"origin"``, the
    ``target`` entry is replaced by ``str(Origin(target).swhid())``.

    >>> _fix_raw_extrinsic_metadata({
    ...     'type': 'directory',
    ...     'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243',
    ... })
    {'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243'}

    >>> _fix_raw_extrinsic_metadata({
    ...     'type': 'origin',
    ...     'target': 'https://inria.halpreprod.archives-ouvertes.fr/hal-01667309',
    ... })
    {'target': 'swh:1:ori:155291d5b9ada4570672510509f93fcfd9809882'}

    Args:
        obj_dict: dict form of a raw extrinsic metadata object, possibly still
            carrying the legacy ``type`` key.

    Returns:
        A new dict without the ``type`` key.

    Raises:
        KeyError: if ``type`` is ``"origin"`` and ``target`` is missing.
    """
    fixed = obj_dict.copy()
    # Drop the legacy key as its own statement so that the condition below
    # has no side effect.
    legacy_type = fixed.pop("type", None)
    if legacy_type == "origin":
        fixed["target"] = str(Origin(fixed["target"]).swhid())
    return fixed
def fix_objects(object_type: str, objects: List[Dict]) -> List[Dict]:
    """
    Fix legacy objects from the journal to bring them up to date with the
    latest storage schema.

    Args:
        object_type: kind of the objects in ``objects``. The types handled
            are ``content``, ``revision``, ``origin``, ``origin_visit`` and
            ``raw_extrinsic_metadata``.
        objects: dict representations of the objects to fix.

    Returns:
        A new list holding the fixed objects. For ``revision``, entries for
        which the fixer returns ``None`` are left out, so the result may be
        shorter than the input. For any other object type, ``objects`` itself
        is returned unchanged.
    """
    if object_type == "content":
        return [_fix_content(v) for v in objects]
    elif object_type == "revision":
        revisions = [_fix_revision(v) for v in objects]
        # The revision fixer may return None; such entries are dropped.
        return [rev for rev in revisions if rev is not None]
    elif object_type == "origin":
        return [_fix_origin(v) for v in objects]
    elif object_type == "origin_visit":
        return [_fix_origin_visit(v) for v in objects]
    elif object_type == "raw_extrinsic_metadata":
        return [_fix_raw_extrinsic_metadata(v) for v in objects]
    else:
        return objects
works too