Changeset View
Changeset View
Standalone View
Standalone View
swh/deposit/migrations/0018_migrate_swhids.py
- This file was added.
# -*- coding: utf-8 -*- | |||||
from __future__ import unicode_literals | |||||
import os | |||||
import logging | |||||
from django.db import migrations, models | |||||
from typing import Optional | |||||
from swh.core import config | |||||
from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS | |||||
from swh.model.hashutil import hash_to_bytes, hash_to_hex | |||||
from swh.model.identifiers import ( | |||||
parse_persistent_identifier, | |||||
persistent_identifier, | |||||
DIRECTORY, | |||||
REVISION, | |||||
SNAPSHOT, | |||||
) | |||||
from swh.storage import get_storage as get_storage_client | |||||
logger = logging.getLogger(__name__)

# Module-level cache: the storage client is instantiated once and then reused.
swh_storage = None


def get_storage():
    """Instantiate (once) and return the storage client.

    The configuration is loaded from the file designated by the
    ``SWH_CONFIG_FILENAME`` environment variable and the client is built from
    its ``storage`` entry. The resulting client is cached in the module-level
    ``swh_storage`` variable, so later calls return the same instance.

    Returns:
        The storage client built from the ``storage`` configuration entry.

    Raises:
        ValueError: if ``SWH_CONFIG_FILENAME`` is unset, if the designated file
            does not exist, if the loaded configuration is empty or if it has
            no ``storage`` entry.

    """
    global swh_storage

    # Already instantiated by a previous call: reuse it.
    if swh_storage:
        return swh_storage

    config_file = os.environ.get("SWH_CONFIG_FILENAME")
    if not config_file:
        raise ValueError(
            "Production: SWH_CONFIG_FILENAME must be set to the"
            " configuration file needed!"
        )

    if not os.path.exists(config_file):
        raise ValueError(
            "Production: configuration file %s does not exist!" % (config_file,)
        )

    conf = config.load_named_config(config_file)
    if not conf:
        raise ValueError(
            "Production: configuration %s does not exist." % (config_file,)
        )

    storage_config = conf.get("storage")
    if not storage_config:
        raise ValueError(
            "Production: invalid configuration; missing 'storage' config entry."
        )

    swh_storage = get_storage_client(**storage_config)
    return swh_storage
def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]:
    """Retrieve the snapshot targeting the revision_id for the given origin.

    Args:
        storage: storage client providing ``origin_visit_get`` and
            ``snapshot_get``
        origin: origin url whose visits are inspected
        revision_id: hexadecimal identifier of the revision to look for

    Returns:
        The hexadecimal identifier of the first snapshot found with a branch of
        type "revision" targeting ``revision_id``; None when no visit of the
        origin references such a snapshot.

    """
    for visit in storage.origin_visit_get(origin):
        snapshot_id = visit["snapshot"]
        if not snapshot_id:
            # Visit without snapshot, nothing to inspect
            continue

        snapshot = storage.snapshot_get(snapshot_id)
        if not snapshot:
            continue

        # NOTE(review): assumes snapshot_get returns a dict whose "branches"
        # entry maps branch names to {"target", "target_type"} dicts (None for
        # dangling branches) -- confirm against the swh.storage version in use.
        for branch in snapshot["branches"].values():
            if not branch or branch["target_type"] != "revision":
                continue
            # Compare the branch target with the revision we are looking for
            if hash_to_hex(branch["target"]) == revision_id:
                return hash_to_hex(snapshot_id)

    return None
def migrate_deposit_swhid_context_not_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Only deposits with status done and swh_id_context not null are concerned.
    For each of them, swh_id_context is rebuilt as the directory SWHID
    qualified with the origin, the visit (snapshot SWHID), the anchor (revision
    SWHID) and the root path. Deposits whose snapshot cannot be found in the
    archive are logged and left untouched.

    Args:
        apps: registry of historical models provided by the migration framework
        schema_editor: unused

    """
    storage = get_storage()

    # Use the historical model from the app registry: django.db.models does not
    # expose any Deposit model.
    Deposit = apps.get_model("deposit", "Deposit")

    deposits = Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False
    )
    for deposit in deposits:
        obj_dir = parse_persistent_identifier(deposit.swh_id_context)
        assert obj_dir.object_type == DIRECTORY

        obj_rev = parse_persistent_identifier(deposit.swh_anchor_id)
        assert obj_rev.object_type == REVISION

        dir_id = obj_dir.object_id
        origin = obj_dir.metadata["origin"]
        rev_id = obj_rev.object_id

        # Find the snapshot targeting the revision
        snp_id = get_snapshot(storage, origin, rev_id)
        if not snp_id:
            logger.warning("Snapshot not found for deposit id %s!", deposit.id)
            continue

        deposit.swh_id_context = persistent_identifier(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": persistent_identifier(SNAPSHOT, snp_id),
                "anchor": persistent_identifier(REVISION, rev_id),
                "path": "/",
            },
        )
        deposit.save()
def migrate_deposit_swhid_context_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Migrate deposits whose swh_id_context is not set (initial deposits not
    migrated at the time). Only deposits with status done and swh_id_context
    null are concerned.

    Note: Those deposits have their swh_id being the SWHID of the revision! So
    we can align them as well: swh_id becomes the directory SWHID,
    swh_id_context the qualified directory SWHID, and swh_anchor_id /
    swh_anchor_id_context the revision SWHIDs.

    Deposits whose revision or snapshot cannot be found in the archive are
    logged and left untouched.

    Args:
        apps: registry of historical models provided by the migration framework
        schema_editor: unused

    """
    storage = get_storage()

    # Use the historical model from the app registry: django.db.models does not
    # expose any Deposit model.
    Deposit = apps.get_model("deposit", "Deposit")

    deposits = Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True
    )
    for deposit in deposits:
        obj_rev = parse_persistent_identifier(deposit.swh_id)
        assert obj_rev.object_type == REVISION
        rev_id = obj_rev.object_id

        # NOTE(review): assumes revision_get takes a list of revision ids and
        # yields one entry (dict, or None when unknown) per requested id --
        # confirm against the swh.storage version in use.
        revisions = list(storage.revision_get([hash_to_bytes(rev_id)]))
        revision = revisions[0] if revisions else None
        if not revision:
            logger.warning("Revision not found for deposit id %s!", deposit.id)
            continue

        provider_url = deposit.client.provider_url
        external_id = deposit.external_id
        origin = f"{provider_url}/{external_id}"
        dir_id = hash_to_hex(revision["directory"])

        # Retrieve the snapshot from the archive before touching the deposit,
        # so a skipped deposit is never left partially modified.
        snp_id = get_snapshot(storage, origin, rev_id)
        if not snp_id:
            logger.warning("Snapshot not found for deposit id %s!", deposit.id)
            continue

        # Align all deposit SWHIDs
        deposit.swh_id = persistent_identifier(DIRECTORY, dir_id)
        deposit.swh_id_context = persistent_identifier(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": persistent_identifier(SNAPSHOT, snp_id),
                "anchor": persistent_identifier(REVISION, rev_id),
                "path": "/",
            },
        )

        # Realign the remaining deposit fields
        deposit.swh_anchor_id = persistent_identifier(REVISION, rev_id)
        deposit.swh_anchor_id_context = persistent_identifier(
            REVISION, rev_id, metadata={"origin": origin}
        )
        deposit.save()
class Migration(migrations.Migration):
    """Data migration rewriting the deposit SWHIDs to the new format."""

    dependencies = [
        # Migrations are referenced by module name, without the ".py" extension
        ("deposit", "0017_auto_20190925_0906"),
    ]

    operations = [
        migrations.RunPython(migrate_deposit_swhid_context_not_null),
        # Not enabled yet:
        # migrations.RunPython(migrate_deposit_swhid_context_null),
    ]
moranegg: So first you give all deposits all the ids?
Because the swh_anchor_id should be deleted...
Done Inline Actions
ardumont: Yes, I keep those (swh_anchor_id, swh_anchor_id_context) at first. I don't know yet whether they are used by the existing deposit clients. We can always do the migration dropping the unneeded fields later.
So first you give all deposits all the ids?
Because the swh_anchor_id should be deleted...