diff --git a/swh/deposit/migrations/0018_deposit_migrate_swhids.py b/swh/deposit/migrations/0018_deposit_migrate_swhids.py new file mode 100644 --- /dev/null +++ b/swh/deposit/migrations/0018_deposit_migrate_swhids.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import logging + +from django.db import migrations, models +from typing import Optional + +from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.identifiers import ( + parse_persistent_identifier, + persistent_identifier, + DIRECTORY, + REVISION, + SNAPSHOT, +) +from swh.storage import get_storage as get_storage_client + + +logger = logging.getLogger(__name__) + + +swh_storage = None + + +def get_storage(): + """Instantiate a storage client + + """ + global swh_storage + if not swh_storage: + swh_storage = get_storage_client( + cls="remote", url="http://uffizi.internal.sofwareheritage.org:5002" + ) + return swh_storage + + +def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]: + """Retrieve the snapshot targetting the revision_id for the given origin. + + """ + all_visits = storage.origin_visit_get(origin) + for visit in all_visits: + if not visit["snapshot"]: + continue + detail_snapshot = storage.snapshot_get(visit["snapshot"]) + if not detail_snapshot: + continue + for branch in detail_snapshot: + if branch["target_type"] == "revision": + revision = branch["target"] + if revision == hash_to_hex(revision): + # Found the snapshot + return hash_to_hex(visit["snapshot"]) + return None + + +def migrate_deposit_swhid_context_not_null(apps, schema_editor): + """Migrate deposit SWHIDs to the new format. + + Migrate deposit SWHIDs to the new format. Only deposit with status done and + swh_id_context not null are concerned. + + """ + storage = get_storage() + for deposit in models.Deposit.objects.filter( + status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False + ): + obj_dir = parse_persistent_identifier(deposit.swh_id_context) + assert obj_dir.object_type == DIRECTORY + obj_rev = parse_persistent_identifier(deposit.swh_anchor_id) + assert obj_rev.object_type == REVISION + + dir_id = obj_dir.object_id + origin = obj_dir.metadata["origin"] + rev_id = obj_rev.object_id + + # Find the snapshot targetting the revision + snp_id = get_snapshot(storage, origin, rev_id) + if not snp_id: + logger.warning("Snapshot not found for deposit id %s!", deposit.id) + continue + deposit.swh_id_context = persistent_identifier( + DIRECTORY, + dir_id, + metadata={ + "origin": origin, + "visit": persistent_identifier(SNAPSHOT, snp_id), + "anchor": persistent_identifier(REVISION, rev_id), + "path": "/", + }, + ) + + deposit.save() + + +def migrate_deposit_swhid_context_null(apps, schema_editor): + """Migrate deposit SWHIDs to the new format. + + Migrate deposit whose swh_id_context is not set (initial deposits not migrated at + the time). Only deposit with status done and swh_id_context null are concerned. + + Note: Those deposits have their swh_id being the SWHPIDs of the revision! So we can + align them as well. + + """ + storage = get_storage() + for deposit in models.Deposit.objects.filter( + status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False + ): + obj_rev = parse_persistent_identifier(deposit.swh_id) + assert obj_rev.object_type == REVISION + + rev_id = obj_rev.object_id + revision = storage.revision_get(hash_to_bytes(rev_id)) + if not revision: + logger.warning("Snapshot not found for deposit id %s!", deposit.id) + continue + + provider_url = deposit.client.provider_url + external_id = deposit.external_id + origin = f"{provider_url}/{external_id}" + + dir_id = hash_to_hex(revision["directory"]) + + # Align all deposit SWHIDs + deposit.swh_id = persistent_identifier(DIRECTORY, dir_id) + # retrieve the snapshot from the archive + snp_id = get_snapshot(storage, origin, rev_id) + if not snp_id: + logger.warning("Snapshot not found for deposit id %s!", deposit.id) + continue + deposit.swh_id_context = persistent_identifier( + DIRECTORY, + dir_id, + metadata={ + "origin": origin, + "visit": persistent_identifier(SNAPSHOT, snp_id), + "anchor": persistent_identifier(REVISION, rev_id), + "path": "/", + }, + ) + # Realign the remaining deposit fields + deposit.swh_anchor_id = persistent_identifier(REVISION, rev_id) + deposit.swh_anchor_id_context = persistent_identifier( + REVISION, rev_id, metadata={"origin": origin,} + ) + deposit.save() + + +class Migration(migrations.Migration): + dependencies = [ + ("deposit", "0017_auto_20190925_0906.py"), + ] + + operations = [ + migrations.RunPython(migrate_deposit_swhid_context_not_null), + migrations.RunPython(migrate_deposit_swhid_context_null), + ]