diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -27,11 +27,12 @@ from swh.deposit.models import Deposit from swh.deposit.utils import compute_metadata_context from swh.model import hashutil -from swh.model.identifiers import SWHID, ValidationError +from swh.model.identifiers import ExtendedSWHID, QualifiedSWHID, ValidationError from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, + Origin, RawExtrinsicMetadata, ) from swh.scheduler.utils import create_oneshot_task_dict @@ -67,7 +68,7 @@ ) from ..models import DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml -from ..utils import parse_swh_reference +from ..utils import extended_swhid_from_qualified, parse_swh_reference ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] @@ -608,11 +609,11 @@ def _store_metadata_deposit( self, deposit: Deposit, - swhid_reference: Union[str, SWHID], + swhid_reference: Union[str, QualifiedSWHID], metadata: Dict, raw_metadata: bytes, deposit_origin: Optional[str] = None, - ) -> Tuple[Union[SWHID, str], Union[SWHID, str], Deposit, DepositRequest]: + ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]: """When all user inputs pass the checks, this associates the raw_metadata to the swhid_reference in the raw extrinsic metadata storage. In case of any issues, a bad request response is returned to the user with the details. @@ -636,7 +637,7 @@ (e.g. functionally invalid metadata, ...) Returns: - Tuple of core swhid, swhid context, deposit and deposit request + Tuple of target swhid, deposit, and deposit request """ metadata_ok, error_details = check_metadata(metadata) @@ -669,20 +670,20 @@ # actually add the metadata to the completed deposit deposit_request = self._deposit_request_put(deposit, deposit_request_data) - object_type, metadata_context = compute_metadata_context(swhid_reference) - if deposit_origin: # metadata deposit update on completed deposit - metadata_context["origin"] = deposit_origin - - swhid_core: Union[str, SWHID] + target_swhid: ExtendedSWHID # origin URL or CoreSWHID if isinstance(swhid_reference, str): - swhid_core = swhid_reference + target_swhid = Origin(swhid_reference).swhid() + metadata_context = {} else: - swhid_core = attr.evolve(swhid_reference, metadata={}) + metadata_context = compute_metadata_context(swhid_reference) + if deposit_origin: # metadata deposit update on completed deposit + metadata_context["origin"] = deposit_origin + + target_swhid = extended_swhid_from_qualified(swhid_reference) # store that metadata to the metadata storage metadata_object = RawExtrinsicMetadata( - type=object_type, - target=swhid_core, # core swhid or origin + target=target_swhid, # core swhid or origin discovery_date=deposit_request.date, authority=metadata_authority, fetcher=metadata_fetcher, @@ -696,7 +697,7 @@ self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) - return (swhid_core, swhid_reference, deposit, deposit_request) + return (target_swhid, deposit, deposit_request) def _atom_entry( self, @@ -752,13 +753,13 @@ # Determine if we are in the metadata-only deposit case try: - swhid = parse_swh_reference(metadata) + swhid_ref = parse_swh_reference(metadata) except ValidationError as e: raise DepositError( PARSING_ERROR, "Invalid SWHID reference", str(e), ) - if swhid is not None and ( + if swhid_ref is not None and ( deposit.origin_url or deposit.parent or deposit.external_id ): raise DepositError( @@ -768,15 +769,15 @@ "code deposits, only one may be used on a given deposit.", ) - if swhid is not None: + if swhid_ref is not None: deposit.save() # We need a deposit id - swhid, swhid_ref, depo, depo_request = self._store_metadata_deposit( - deposit, swhid, metadata, raw_metadata + target_swhid, depo, depo_request = self._store_metadata_deposit( + deposit, swhid_ref, metadata, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS - if isinstance(swhid_ref, SWHID): - deposit.swhid = str(swhid) + if isinstance(swhid_ref, QualifiedSWHID): + deposit.swhid = str(extended_swhid_from_qualified(swhid_ref)) deposit.swhid_context = str(swhid_ref) deposit.complete_date = depo_request.date deposit.reception_date = depo_request.date diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py --- a/swh/deposit/api/edit.py +++ b/swh/deposit/api/edit.py @@ -6,7 +6,7 @@ from rest_framework.request import Request from swh.deposit.models import Deposit -from swh.model.identifiers import parse_swhid +from swh.model.identifiers import QualifiedSWHID from ..config import DEPOSIT_STATUS_LOAD_SUCCESS from ..errors import BAD_REQUEST, DepositError, ParserError @@ -125,8 +125,12 @@ "If the body is empty, there is no metadata.", ) - _, _, deposit, deposit_request = self._store_metadata_deposit( - deposit, parse_swhid(swhid), metadata, raw_metadata, deposit.origin_url, + _, deposit, deposit_request = self._store_metadata_deposit( + deposit, + QualifiedSWHID.from_string(swhid), + metadata, + raw_metadata, + deposit.origin_url, ) def process_delete(self, req, collection_name: str, deposit: Deposit) -> None: diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -14,6 +14,7 @@ from swh.core import tarball from swh.deposit.utils import normalize_date from swh.model import identifiers +from swh.model.hashutil import hash_to_hex from swh.model.model import MetadataAuthorityType from . import APIPrivateView, DepositReadMixin @@ -163,8 +164,8 @@ if deposit.parent: parent_swhid = deposit.parent.swhid assert parent_swhid is not None - swhid = identifiers.parse_swhid(parent_swhid) - parent_revision = swhid.object_id + swhid = identifiers.CoreSWHID.from_string(parent_swhid) + parent_revision = hash_to_hex(swhid.object_id) parents = [parent_revision] else: parents = [] diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -5,7 +5,8 @@ from rest_framework.parsers import JSONParser -from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID from . import APIPrivateView from ...errors import BAD_REQUEST, DepositError @@ -85,21 +86,28 @@ origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] - dir_id = swhid(DIRECTORY, directory_id) - snp_id = swhid(SNAPSHOT, data["snapshot_id"]) - rev_id = swhid(REVISION, revision_id) + dir_id = CoreSWHID( + object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id) + ) + snp_id = CoreSWHID( + object_type=ObjectType.SNAPSHOT, + object_id=hash_to_bytes(data["snapshot_id"]), + ) + rev_id = CoreSWHID( + object_type=ObjectType.REVISION, object_id=hash_to_bytes(revision_id) + ) - deposit.swhid = dir_id + deposit.swhid = str(dir_id) # new id with contextual information - deposit.swhid_context = swhid( - DIRECTORY, - directory_id, - metadata={ - "origin": origin_url, - "visit": snp_id, - "anchor": rev_id, - "path": "/", - }, + deposit.swhid_context = str( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=hash_to_bytes(directory_id), + origin=origin_url, + visit=snp_id, + anchor=rev_id, + path="/", + ) ) else: # rejected deposit.status = status diff --git a/swh/deposit/migrations/0018_migrate_swhids.py b/swh/deposit/migrations/0018_migrate_swhids.py --- a/swh/deposit/migrations/0018_migrate_swhids.py +++ b/swh/deposit/migrations/0018_migrate_swhids.py @@ -15,7 +15,7 @@ from swh.core import config from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, parse_swhid, swhid +from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID from swh.storage import get_storage as get_storage_client from swh.storage.algos.snapshot import snapshot_id_get_from_revision @@ -68,7 +68,7 @@ return swh_storage -def migrate_deposit_swhid_context_not_null(apps, schema_editor): +def migrate_deposit_swhid_context_not_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit SWHIDs to the new format. Only deposit with status done and @@ -84,13 +84,13 @@ for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): - obj_dir = parse_swhid(deposit.swh_id_context) - assert obj_dir.object_type == DIRECTORY + obj_dir = QualifiedSWHID.from_string(deposit.swh_id_context) + assert obj_dir.object_type == ObjectType.DIRECTORY - obj_rev = parse_swhid(deposit.swh_anchor_id) - assert obj_rev.object_type == REVISION + obj_rev = CoreSWHID.from_string(deposit.swh_anchor_id) + assert obj_rev.object_type == ObjectType.REVISION - if set(obj_dir.metadata.keys()) != {"origin"}: + if set(obj_dir.qualifiers()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id @@ -100,7 +100,9 @@ # Starting migration dir_id = obj_dir.object_id - origin = obj_dir.metadata["origin"] + origin = obj_dir.origin + + assert origin check_origin = storage.origin_get([origin])[0] if not check_origin: @@ -125,15 +127,15 @@ old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update - deposit.swh_id_context = swhid( - DIRECTORY, - dir_id, - metadata={ - "origin": origin, - "visit": swhid(SNAPSHOT, snp_id.hex()), - "anchor": swhid(REVISION, rev_id), - "path": "/", - }, + deposit.swh_id_context = QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=dir_id, + origin=origin, + visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), + anchor=CoreSWHID( + object_type=ObjectType.REVISION, object_id=hash_to_bytes(rev_id) + ), + path=b"/", ) # Ensure only deposit.swh_id_context changed @@ -211,7 +213,7 @@ return map_origin.get(key, f"{provider_url.rstrip('/')}/{external_id}") -def migrate_deposit_swhid_context_null(apps, schema_editor): +def migrate_deposit_swhid_context_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit whose swh_id_context is not set (initial deposits not migrated at @@ -229,8 +231,8 @@ for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True ): - obj_rev = parse_swhid(deposit.swh_id) - if obj_rev.object_type == DIRECTORY: + obj_rev = CoreSWHID.from_string(deposit.swh_id) + if obj_rev.object_type == ObjectType.DIRECTORY: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id @@ -238,7 +240,7 @@ continue # Ensuring Migration not done - assert obj_rev.object_type == REVISION + assert obj_rev.object_type == ObjectType.REVISION assert deposit.swh_id is not None assert deposit.swh_id_context is None @@ -280,21 +282,25 @@ continue # New SWHIDs ids - deposit.swh_id = swhid(DIRECTORY, dir_id) - deposit.swh_id_context = swhid( - DIRECTORY, - dir_id, - metadata={ - "origin": origin, - "visit": swhid(SNAPSHOT, snp_id.hex()), - "anchor": swhid(REVISION, rev_id), - "path": "/", - }, + deposit.swh_id = CoreSWHID( + object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(dir_id) + ) + deposit.swh_id_context = QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=dir_id, + origin=origin, + visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), + anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes), + path=b"/", ) # Realign the remaining deposit SWHIDs fields - deposit.swh_anchor_id = swhid(REVISION, rev_id) - deposit.swh_anchor_id_context = swhid( - REVISION, rev_id, metadata={"origin": origin,} + deposit.swh_anchor_id = str( + CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes) + ) + deposit.swh_anchor_id_context = str( + QualifiedSWHID( + object_type=ObjectType.REVISION, object_id=rev_id_bytes, origin=origin + ) ) # Ensure only deposit.swh_id_context changed diff --git a/swh/deposit/tests/api/test_collection_post_atom.py b/swh/deposit/tests/api/test_collection_post_atom.py --- a/swh/deposit/tests/api/test_collection_post_atom.py +++ b/swh/deposit/tests/api/test_collection_post_atom.py @@ -22,13 +22,13 @@ from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom -from swh.deposit.utils import compute_metadata_context -from swh.model.identifiers import SWHID, parse_swhid +from swh.deposit.utils import compute_metadata_context, extended_swhid_from_qualified +from swh.model.identifiers import QualifiedSWHID from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, - MetadataTargetType, + Origin, RawExtrinsicMetadata, ) from swh.storage.interface import PagedResult @@ -452,63 +452,28 @@ @pytest.mark.parametrize( - "swhid,target_type", + "swhid", [ - ( - "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.CONTENT, - ), - ( - "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.DIRECTORY, - ), - ( - "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.REVISION, - ), - ( - "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.RELEASE, - ), - ( - "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.SNAPSHOT, - ), - ( - "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", - MetadataTargetType.CONTENT, - ), - ( - "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa - MetadataTargetType.DIRECTORY, - ), - ( - "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", - MetadataTargetType.REVISION, - ), - ( - "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", - MetadataTargetType.RELEASE, - ), - ( - "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", - MetadataTargetType.SNAPSHOT, - ), + "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", + "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", + "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", + "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", + "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", + "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa + "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_swhid( - swhid, - target_type, - authenticated_client, - deposit_collection, - atom_dataset, - swh_storage, + swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw extrinsic metadata storage """ - swhid_reference = parse_swhid(swhid) - swhid_core = attr.evolve(swhid_reference, metadata={}) + swhid_reference = QualifiedSWHID.from_string(swhid) + swhid_target = extended_swhid_from_qualified(swhid_reference) xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) deposit_client = authenticated_client.deposit_client @@ -525,8 +490,7 @@ # Ensure the deposit is finalized deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) - assert isinstance(swhid_core, SWHID) - assert deposit.swhid == str(swhid_core) + assert deposit.swhid == str(swhid_target) assert deposit.swhid_context == str(swhid_reference) assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None @@ -557,19 +521,18 @@ assert actual_fetcher == metadata_fetcher page_results = swh_storage.raw_extrinsic_metadata_get( - target_type, swhid_core, metadata_authority + swhid_target, metadata_authority ) discovery_date = page_results.results[0].discovery_date assert len(page_results.results) == 1 assert page_results.next_page_token is None - object_type, metadata_context = compute_metadata_context(swhid_reference) + metadata_context = compute_metadata_context(swhid_reference) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( - type=object_type, - target=swhid_core, + target=swhid_target, discovery_date=discovery_date, authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), @@ -593,6 +556,7 @@ """ xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) + origin_swhid = Origin(url).swhid() deposit_client = authenticated_client.deposit_client response = post_atom( authenticated_client, @@ -637,7 +601,7 @@ assert actual_fetcher == metadata_fetcher page_results = swh_storage.raw_extrinsic_metadata_get( - MetadataTargetType.ORIGIN, url, metadata_authority + origin_swhid, metadata_authority ) discovery_date = page_results.results[0].discovery_date @@ -647,8 +611,7 @@ assert page_results == PagedResult( results=[ RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, - target=url, + target=origin_swhid, discovery_date=discovery_date, authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py --- a/swh/deposit/tests/api/test_deposit_private_update_status.py +++ b/swh/deposit/tests/api/test_deposit_private_update_status.py @@ -16,7 +16,6 @@ PRIVATE_PUT_DEPOSIT, ) from swh.deposit.models import Deposit -from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" @@ -50,13 +49,13 @@ "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): - dir_id = swhid(DIRECTORY, directory_id) - rev_id = swhid(REVISION, revision_id) - snp_id = swhid(SNAPSHOT, snapshot_id) - expected_swhid = "swh:1:dir:%s" % directory_id expected_swhid_context = ( - f"{dir_id};origin={origin_url};" + f"visit={snp_id};anchor={rev_id};path=/" + f"{expected_swhid}" + f";origin={origin_url}" + f";visit=swh:1:snp:{snapshot_id}" + f";anchor=swh:1:rev:{revision_id}" + f";path=/" ) response = authenticated_client.put( diff --git a/swh/deposit/tests/api/test_deposit_update_atom.py b/swh/deposit/tests/api/test_deposit_update_atom.py --- a/swh/deposit/tests/api/test_deposit_update_atom.py +++ b/swh/deposit/tests/api/test_deposit_update_atom.py @@ -23,12 +23,11 @@ from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom, put_atom from swh.model.hashutil import hash_to_bytes -from swh.model.identifiers import parse_swhid, swhid +from swh.model.identifiers import CoreSWHID, ExtendedSWHID, ObjectType from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, - MetadataTargetType, RawExtrinsicMetadata, ) from swh.storage.interface import PagedResult @@ -319,8 +318,8 @@ Response: 204 """ - deposit_swhid = parse_swhid(complete_deposit.swhid) - assert deposit_swhid.object_type == "directory" + deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid) + assert deposit_swhid.object_type == ObjectType.DIRECTORY directory_id = hash_to_bytes(deposit_swhid.object_id) # directory targeted by the complete_deposit does not exist in the storage @@ -333,7 +332,7 @@ assert list(swh_storage.directory_missing([existing_directory.id])) == [] # and patch one complete deposit swhid so it targets said reference - complete_deposit.swhid = swhid("directory", existing_directory.id) + complete_deposit.swhid = str(existing_directory.swhid()) complete_deposit.save() actual_existing_requests_archive = DepositRequest.objects.filter( @@ -394,14 +393,13 @@ ) assert actual_fetcher == metadata_fetcher - directory_swhid = parse_swhid(complete_deposit.swhid) + directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid) page_results = swh_storage.raw_extrinsic_metadata_get( - MetadataTargetType.DIRECTORY, directory_swhid, metadata_authority + directory_swhid, metadata_authority ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( - type=MetadataTargetType.DIRECTORY, target=directory_swhid, discovery_date=request_meta1.date, authority=attr.evolve(metadata_authority, metadata=None), diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -38,7 +38,8 @@ post_archive, post_atom, ) -from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler # mypy is asked to ignore the import statement above because setup_databases @@ -457,18 +458,18 @@ ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" - revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10" - snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0" - deposit.swhid = swhid(DIRECTORY, directory_id) - deposit.swhid_context = swhid( - DIRECTORY, - directory_id, - metadata={ - "origin": origin, - "visit": swhid(SNAPSHOT, snapshot_id), - "anchor": swhid(REVISION, revision_id), - "path": "/", - }, + revision_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10") + snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0") + deposit.swhid = f"swh:1:dir:{directory_id}" + deposit.swhid_context = str( + QualifiedSWHID( + object_type=ObjectType.DIRECTORY, + object_id=hash_to_bytes(directory_id), + origin=origin, + visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), + anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=revision_id), + path=b"/", + ) ) deposit.save() return deposit diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Union from unittest.mock import patch import pytest @@ -11,8 +10,7 @@ from swh.deposit import utils from swh.deposit.parsers import parse_xml from swh.model.exceptions import ValidationError -from swh.model.identifiers import SWHID, parse_swhid -from swh.model.model import MetadataTargetType +from swh.model.identifiers import CoreSWHID, QualifiedSWHID @pytest.fixture @@ -163,59 +161,43 @@ @pytest.mark.parametrize( - "swhid_or_origin,expected_type,expected_metadata_context", + "swhid,expected_metadata_context", [ - ("https://something", MetadataTargetType.ORIGIN, {"origin": None}), - ( - "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", - MetadataTargetType.CONTENT, - {"origin": None}, - ), + ("swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", {"origin": None},), ( "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah", - MetadataTargetType.SNAPSHOT, {"origin": "http://blah", "path": None}, ), ( "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path", - MetadataTargetType.DIRECTORY, {"origin": None, "path": b"/path"}, ), ( "swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa - MetadataTargetType.REVISION, { "origin": None, "path": None, - "snapshot": parse_swhid( + "snapshot": CoreSWHID.from_string( "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ( "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa - MetadataTargetType.RELEASE, { "origin": None, "path": None, - "directory": parse_swhid( + "directory": CoreSWHID.from_string( "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ], ) -def test_compute_metadata_context( - swhid_or_origin: Union[str, SWHID], expected_type, expected_metadata_context -): - if expected_type != MetadataTargetType.ORIGIN: - assert isinstance(swhid_or_origin, str) - swhid_or_origin = parse_swhid(swhid_or_origin) - - object_type, metadata_context = utils.compute_metadata_context(swhid_or_origin) - - assert object_type == expected_type - assert metadata_context == expected_metadata_context +def test_compute_metadata_context(swhid: str, expected_metadata_context): + assert expected_metadata_context == utils.compute_metadata_context( + QualifiedSWHID.from_string(swhid) + ) def test_parse_swh_reference_origin(xml_with_origin_reference): @@ -278,7 +260,7 @@ actual_swhid = utils.parse_swh_reference(metadata) assert actual_swhid is not None - expected_swhid = parse_swhid(swhid) + expected_swhid = QualifiedSWHID.from_string(swhid) assert actual_swhid == expected_swhid @@ -291,7 +273,7 @@ "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa # anchor qualifier should be a core SWHID with type one of "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa - "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04" # noqa + "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa ], ) def test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid): diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -5,22 +5,18 @@ import logging from types import GeneratorType -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Union import iso8601 import xmltodict from swh.model.exceptions import ValidationError from swh.model.identifiers import ( - DIRECTORY, - RELEASE, - REVISION, - SNAPSHOT, - SWHID, + ExtendedSWHID, + ObjectType, + QualifiedSWHID, normalize_timestamp, - parse_swhid, ) -from swh.model.model import MetadataTargetType logger = logging.getLogger(__name__) @@ -120,44 +116,36 @@ return normalize_timestamp(date) -def compute_metadata_context( - swhid_reference: Union[SWHID, str] -) -> Tuple[MetadataTargetType, Dict[str, Any]]: +def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]: """Given a SWHID object, determine the context as a dict. - The parse_swhid calls within are not expected to raise (because they should have - been caught early on). - """ metadata_context: Dict[str, Any] = {"origin": None} - if isinstance(swhid_reference, SWHID): - object_type = MetadataTargetType(swhid_reference.object_type) - assert object_type != MetadataTargetType.ORIGIN - - if swhid_reference.metadata: - path = swhid_reference.metadata.get("path") - metadata_context = { - "origin": swhid_reference.metadata.get("origin"), - "path": path.encode() if path else None, - } - snapshot = swhid_reference.metadata.get("visit") - if snapshot: - metadata_context["snapshot"] = parse_swhid(snapshot) - - anchor = swhid_reference.metadata.get("anchor") - if anchor: - anchor_swhid = parse_swhid(anchor) - metadata_context[anchor_swhid.object_type] = anchor_swhid - else: - object_type = MetadataTargetType.ORIGIN + if swhid_reference.qualifiers(): + metadata_context = { + "origin": swhid_reference.origin, + "path": swhid_reference.path, + } + snapshot = swhid_reference.visit + if snapshot: + metadata_context["snapshot"] = snapshot + + anchor = swhid_reference.anchor + if anchor: + metadata_context[anchor.object_type.name.lower()] = anchor - return object_type, metadata_context + return metadata_context -ALLOWED_QUALIFIERS_NODE_TYPE = (SNAPSHOT, REVISION, RELEASE, DIRECTORY) +ALLOWED_QUALIFIERS_NODE_TYPE = ( + ObjectType.SNAPSHOT, + ObjectType.REVISION, + ObjectType.RELEASE, + ObjectType.DIRECTORY, +) -def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]: +def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh reference within the metadata dict (or origin) reference if found, None otherwise. @@ -182,9 +170,6 @@ Either swhid or origin reference if any. None otherwise. """ # noqa - visit_swhid = None - anchor_swhid = None - swh_deposit = metadata.get("swh:deposit") if not swh_deposit: return None @@ -206,32 +191,31 @@ swhid = swh_object.get("@swhid") if not swhid: return None - swhid_reference = parse_swhid(swhid) + swhid_reference = QualifiedSWHID.from_string(swhid) - if swhid_reference.metadata: - anchor = swhid_reference.metadata.get("anchor") + if swhid_reference.qualifiers(): + anchor = swhid_reference.anchor if anchor: - anchor_swhid = parse_swhid(anchor) - if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: + if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: error_msg = ( "anchor qualifier should be a core SWHID with type one of " - f" {', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}" + f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}" ) raise ValidationError(error_msg) - visit = swhid_reference.metadata.get("visit") + visit = swhid_reference.visit if visit: - visit_swhid = parse_swhid(visit) - if visit_swhid.object_type != SNAPSHOT: + if visit.object_type != ObjectType.SNAPSHOT: raise ValidationError( - f"visit qualifier should be a core SWHID with type {SNAPSHOT}" + f"visit qualifier should be a core SWHID with type snp, " + f"not {visit.object_type.value}" ) if ( - visit_swhid - and anchor_swhid - and visit_swhid.object_type == SNAPSHOT - and anchor_swhid.object_type == SNAPSHOT + visit + and anchor + and visit.object_type == ObjectType.SNAPSHOT + and anchor.object_type == ObjectType.SNAPSHOT ): logger.warn( "SWHID use of both anchor and visit targeting " @@ -242,3 +226,9 @@ ) return swhid_reference + + +def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID: + """Used to get the target of a metadata object from a , + as the latter uses a QualifiedSWHID.""" + return ExtendedSWHID.from_string(str(swhid).split(";")[0])