Changeset View
Changeset View
Standalone View
Standalone View
swh/deposit/migrations/0018_deposit_migrate_swhids.py
- This file was added.
# -*- coding: utf-8 -*- | |||||
from __future__ import unicode_literals | |||||
import logging | |||||
from django.db import migrations, models | |||||
from typing import Optional | |||||
from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS | |||||
from swh.model.hashutil import hash_to_bytes, hash_to_hex | |||||
from swh.model.identifiers import ( | |||||
parse_persistent_identifier, | |||||
persistent_identifier, | |||||
DIRECTORY, | |||||
REVISION, | |||||
SNAPSHOT, | |||||
) | |||||
from swh.storage import get_storage as get_storage_client | |||||
logger = logging.getLogger(__name__)

# Storage client shared by the migration steps; created on first use.
swh_storage = None


def get_storage():
    """Return the storage client used by this migration.

    The client is instantiated on the first call and cached in the
    module-level ``swh_storage`` so later calls reuse the same instance.

    Returns:
        The remote storage client built by ``get_storage_client``.

    """
    global swh_storage
    if swh_storage is None:
        # Hostname fixed: the domain is "softwareheritage", not "sofwareheritage".
        swh_storage = get_storage_client(
            cls="remote", url="http://uffizi.internal.softwareheritage.org:5002"
        )
    return swh_storage
def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]:
    """Retrieve the snapshot targeting ``revision_id`` for the given origin.

    Every visit of the origin is inspected in the order returned by the
    storage; the first snapshot holding a branch of type "revision" whose
    target matches ``revision_id`` wins.

    Args:
        storage: storage client exposing ``origin_visit_get`` and
            ``snapshot_get``
        origin: origin url whose visits are inspected
        revision_id: hexadecimal identifier of the revision to look for

    Returns:
        The hexadecimal identifier of the matching snapshot, or None when no
        snapshot of the origin targets the revision.

    """
    for visit in storage.origin_visit_get(origin):
        snapshot_id = visit["snapshot"]
        if not snapshot_id:
            # Visit without any snapshot, nothing to look into
            continue
        snapshot = storage.snapshot_get(snapshot_id)
        if not snapshot:
            continue
        # Branch descriptions are stored under the "branches" key of the
        # snapshot (mapping: branch name -> {"target", "target_type"}), so
        # iterate over that mapping's values rather than the snapshot itself.
        for branch in snapshot["branches"].values():
            # Skip empty branch entries and branches not targeting a revision
            if not branch or branch["target_type"] != "revision":
                continue
            # Compare against the revision we are looking for (the original
            # code compared the branch target with itself).
            if hash_to_hex(branch["target"]) == revision_id:
                return hash_to_hex(snapshot_id)
    return None
def migrate_deposit_swhid_context_not_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Only deposits with status done and a non-null swh_id_context are
    concerned. Their swh_id_context is rewritten as a directory identifier
    qualified with the origin, the visit (snapshot), the anchor (revision) and
    the root path.

    Args:
        apps: registry of historical models provided by the migration runner
        schema_editor: unused, part of the RunPython callable signature

    """
    # Data migrations must use the historical model from the app registry;
    # `django.db.models` does not expose project models.
    Deposit = apps.get_model("deposit", "Deposit")
    storage = get_storage()

    deposits = Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False
    )
    for deposit in deposits:
        obj_dir = parse_persistent_identifier(deposit.swh_id_context)
        assert obj_dir.object_type == DIRECTORY

        obj_rev = parse_persistent_identifier(deposit.swh_anchor_id)
        assert obj_rev.object_type == REVISION

        dir_id = obj_dir.object_id
        origin = obj_dir.metadata["origin"]
        rev_id = obj_rev.object_id

        # Find the snapshot targeting the revision
        snp_id = get_snapshot(storage, origin, rev_id)
        if not snp_id:
            logger.warning("Snapshot not found for deposit id %s!", deposit.id)
            continue

        deposit.swh_id_context = persistent_identifier(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": persistent_identifier(SNAPSHOT, snp_id),
                "anchor": persistent_identifier(REVISION, rev_id),
                "path": "/",
            },
        )
        deposit.save()
def migrate_deposit_swhid_context_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Migrate deposits whose swh_id_context is not set (initial deposits not
    migrated at the time). Only deposits with status done and a null
    swh_id_context are concerned.

    Note: Those deposits have their swh_id being the SWHID of the revision, so
    all four identifier fields (swh_id, swh_id_context, swh_anchor_id,
    swh_anchor_id_context) are realigned here.

    Args:
        apps: registry of historical models provided by the migration runner
        schema_editor: unused, part of the RunPython callable signature

    """
    # Data migrations must use the historical model from the app registry;
    # `django.db.models` does not expose project models.
    Deposit = apps.get_model("deposit", "Deposit")
    storage = get_storage()

    # This step targets the deposits *without* context (isnull=True); the
    # ones with a context are handled by the other migration step.
    deposits = Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True
    )
    for deposit in deposits:
        obj_rev = parse_persistent_identifier(deposit.swh_id)
        assert obj_rev.object_type == REVISION
        rev_id = obj_rev.object_id

        # revision_get expects a collection of identifiers and returns one
        # entry per requested identifier.
        revisions = list(storage.revision_get([hash_to_bytes(rev_id)]))
        revision = revisions[0] if revisions else None
        if not revision:
            logger.warning(
                "Revision %s not found for deposit id %s!", rev_id, deposit.id
            )
            continue

        provider_url = deposit.client.provider_url
        external_id = deposit.external_id
        origin = f"{provider_url}/{external_id}"
        dir_id = hash_to_hex(revision["directory"])

        # Retrieve the snapshot from the archive before touching the deposit
        # so that a skipped deposit is left entirely unmodified.
        snp_id = get_snapshot(storage, origin, rev_id)
        if not snp_id:
            logger.warning("Snapshot not found for deposit id %s!", deposit.id)
            continue

        # Align all deposit SWHIDs
        deposit.swh_id = persistent_identifier(DIRECTORY, dir_id)
        deposit.swh_id_context = persistent_identifier(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": persistent_identifier(SNAPSHOT, snp_id),
                "anchor": persistent_identifier(REVISION, rev_id),
                "path": "/",
            },
        )
        # Realign the remaining deposit fields
        deposit.swh_anchor_id = persistent_identifier(REVISION, rev_id)
        deposit.swh_anchor_id_context = persistent_identifier(
            REVISION, rev_id, metadata={"origin": origin}
        )
        deposit.save()
class Migration(migrations.Migration):
    """Data migration rewriting the SWHIDs of successfully loaded deposits."""

    # Migration names are module names, without the ".py" extension.
    dependencies = [
        ("deposit", "0017_auto_20190925_0906"),
    ]

    # Deposits with a context are migrated first, then the ones without.
    operations = [
        migrations.RunPython(migrate_deposit_swhid_context_not_null),
        migrations.RunPython(migrate_deposit_swhid_context_null),
    ]