diff --git a/requirements-swh.txt b/requirements-swh.txt index 726dd6ea..29db978c 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ swh.core[http] >= 0.4 -swh.model >= 1.0.0 +swh.model >= 3.0.0 diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 56bb69a0..17720100 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,1268 +1,1268 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union import uuid import attr from django.core.files.uploadedfile import UploadedFile from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.template.loader import render_to_string from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView from swh.deposit.api.checks import check_metadata from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.models import Deposit from swh.deposit.utils import compute_metadata_context from swh.model import hashutil -from swh.model.identifiers import ( - ExtendedObjectType, - ExtendedSWHID, - QualifiedSWHID, - ValidationError, -) from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, Origin, RawExtrinsicMetadata, ) +from swh.model.swhids import ( + ExtendedObjectType, + ExtendedSWHID, + QualifiedSWHID, + ValidationError, +) from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_IRI, EM_IRI, METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, SE_IRI, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, DepositError, ParserError, ) from ..models import DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml from ..utils import extended_swhid_from_qualified, parse_swh_reference ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] @attr.s class ParsedRequestHeaders: content_type = attr.ib(type=str) content_length = attr.ib(type=Optional[int]) in_progress = attr.ib(type=bool) content_disposition = attr.ib(type=Optional[str]) content_md5sum = attr.ib(type=Optional[bytes]) packaging = attr.ib(type=Optional[str]) slug = attr.ib(type=Optional[str]) on_behalf_of = attr.ib(type=Optional[str]) metadata_relevant = attr.ib(type=Optional[str]) swhid = attr.ib(type=Optional[str]) @attr.s class Receipt: """Data computed while handling the request body that will be served in the Deposit Receipt.""" deposit_id = attr.ib(type=int) deposit_date = attr.ib(type=datetime.datetime) status = attr.ib(type=str) archive = attr.ib(type=Optional[str]) def 
_compute_md5(filehandler: UploadedFile) -> bytes: h = hashlib.md5() for chunk in filehandler: h.update(chunk) # type: ignore return h.digest() def get_deposit_by_id( deposit_id: int, collection_name: Optional[str] = None ) -> Deposit: """Gets an existing Deposit object if it exists, or raises `DepositError`. If `collection` is not None, also checks the deposit belongs to the collection.""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: raise DepositError(NOT_FOUND, f"Deposit {deposit_id} does not exist") if collection_name and deposit.collection.name != collection_name: get_collection_by_name(collection_name) # raises if does not exist raise DepositError( NOT_FOUND, f"Deposit {deposit_id} does not belong to collection {collection_name}", ) return deposit def get_collection_by_name(collection_name: str): """Gets an existing DepositCollection object if it exists, or raises `DepositError`.""" try: collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown collection name {collection_name}") assert collection is not None return collection def guess_deposit_origin_url(deposit: Deposit): """Guesses an origin url for the given deposit.""" external_id = deposit.external_id if not external_id: # The client provided neither an origin_url nor a slug. That's inconvenient, # but SWORD requires we support it. So let's generate a random slug. external_id = str(uuid.uuid4()) return "%s/%s" % (deposit.client.provider_url.rstrip("/"), external_id) def check_client_origin(client: DepositClient, origin_url: str): provider_url = client.provider_url.rstrip("/") + "/" if not origin_url.startswith(provider_url): raise DepositError( FORBIDDEN, f"Cannot create origin {origin_url}, it must start with {provider_url}", ) class APIBase(APIConfig, APIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ _client: Optional[DepositClient] = None def __init__(self): super().__init__() auth_provider = self.config.get("authentication_provider") if auth_provider == "basic": self.authentication_classes: Sequence[Type[BaseAuthentication]] = ( BasicAuthentication, ) self.permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) elif auth_provider == "keycloak": self.authentication_classes: Sequence[Type[BaseAuthentication]] = ( KeycloakBasicAuthentication, ) self.permission_classes: Sequence[Type[BasePermission]] = ( IsAuthenticated, HasDepositPermission, ) else: raise ValueError( "Configuration key 'authentication_provider' should be provided with " f"either 'basic' or 'keycloak' value, not {auth_provider!r}." ) def _read_headers(self, request: Request) -> ParsedRequestHeaders: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted).
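Side note on the core change of this diff: the SWHID classes moved from `swh.model.identifiers` to `swh.model.swhids` in swh.model 3.0, which is why the requirement is bumped and every import site is touched. A minimal sketch of a compatibility guard for code that has to run against both versions; this shim is an illustration, not part of the diff:

```python
# Hypothetical shim, assuming only the module rename between swh.model
# < 3.0 and >= 3.0: try the new location first, fall back to the old one.
try:
    from swh.model.swhids import CoreSWHID, ExtendedSWHID, QualifiedSWHID
except ImportError:  # swh.model < 3.0 kept these in swh.model.identifiers
    from swh.model.identifiers import CoreSWHID, ExtendedSWHID, QualifiedSWHID
```

The diff itself takes the simpler route and requires swh.model >= 3.0.0 outright, so no fallback is needed in the package.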
Args: request: Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) return ParsedRequestHeaders( content_type=request.content_type, content_length=content_length, in_progress=in_progress, content_disposition=meta.get("HTTP_CONTENT_DISPOSITION"), content_md5sum=content_md5sum, packaging=meta.get("HTTP_PACKAGING"), slug=meta.get("HTTP_SLUG"), on_behalf_of=meta.get("HTTP_ON_BEHALF_OF"), metadata_relevant=meta.get("HTTP_METADATA_RELEVANT"), swhid=meta.get("HTTP_X_CHECK_SWHID"), ) def _deposit_put(self, deposit: Deposit, in_progress: bool = False) -> None: """Save/Update a deposit in db. Args: deposit: deposit being updated/created in_progress: deposit status """ if in_progress is False: self._complete_deposit(deposit) else: deposit.status = DEPOSIT_STATUS_PARTIAL deposit.save() def _complete_deposit(self, deposit: Deposit) -> None: """Marks the deposit as 'deposited', then schedule a check task if configured to do so.""" deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() if not deposit.origin_url: deposit.origin_url = guess_deposit_origin_url(deposit) if self.config["checks"]: scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, retries_left=3, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. 
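`_read_headers` above expects `Content-MD5` as a hex digest (converted back with `bytes.fromhex`) and `In-Progress` as the literal string "true"/"false". A hypothetical client-side helper producing headers that round-trip through that parsing; the helper name and shape are illustrative, not part of the API:

```python
import hashlib

def deposit_headers(archive_bytes: bytes, in_progress: bool) -> dict:
    """Build upload headers matching what _read_headers() parses."""
    return {
        "Content-Length": str(len(archive_bytes)),
        # hex digest: the server converts it back with bytes.fromhex()
        "Content-MD5": hashlib.md5(archive_bytes).hexdigest(),
        # the server lowercases the value and compares it to "true"
        "In-Progress": "true" if in_progress else "false",
    }
```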
Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None return deposit_request def _delete_archives(self, collection_name: str, deposit: Deposit) -> Dict: """Delete archive references from the deposit id. """ DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit: Deposit) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit.id, collection_name, ) raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_file_length( self, filehandler: UploadedFile, content_length: Optional[int] = None, ) -> None: """Check the filehandler passed as argument has exactly the expected content_length Args: filehandler: The file to check content_length: the expected length if provided. Raises: DepositError if the actual length does not match """ max_upload_size = self.config["max_upload_size"] if content_length: length = filehandler.size if length != content_length: raise DepositError(status.HTTP_412_PRECONDITION_FAILED, "Wrong length") if filehandler.size > max_upload_size: raise DepositError( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes). " "Please consider sending the archive in multiple steps.", ) def _check_file_md5sum( self, filehandler: UploadedFile, md5sum: Optional[bytes], ) -> None: """Check the filehandler passed as argument has the expected md5sum Args: filehandler: The file to check md5sum: md5 hash expected from the file's content Raises: DepositError if the md5sum does not match """ if md5sum: _md5sum = _compute_md5(filehandler) if _md5sum != md5sum: raise DepositError( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} do not match.", ) def _binary_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Binary upload routine. Other than such a request, a 415 response is returned.
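The two checks above (`_check_file_length`, `_check_file_md5sum`) are the integrity gate for every archive upload. A standalone sketch of the same validation logic, with `ValueError` standing in for `DepositError`:

```python
import hashlib

# Sketch only: mirrors _check_file_length/_check_file_md5sum on raw bytes.
def validate_upload(
    data: bytes, content_length: int, md5sum: bytes, max_upload_size: int
) -> None:
    if len(data) != content_length:
        raise ValueError("Wrong length")  # served as HTTP 412 precondition failed
    if len(data) > max_upload_size:
        raise ValueError("Upload size limit exceeded")  # MAX_UPLOAD_SIZE_EXCEEDED
    if hashlib.md5(data).digest() != md5sum:
        raise ValueError("Wrong md5 hash")  # CHECKSUM_MISMATCH
```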
Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. Raises: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided does not match the actual archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers.content_length if not content_length: raise DepositError( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers.content_disposition if not content_disposition: raise DepositError( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers.packaging if packaging and packaging not in ACCEPT_PACKAGINGS: raise DepositError( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler, content_length) self._check_file_md5sum(filehandler, headers.content_md5sum) # actual storage of data archive_metadata = filehandler self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=filehandler.name, ) def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives.
Raises: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the md5 hash provided, if any, does not match the actual archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ content_types_present = set() data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value content_type = fh.content_type if content_type in content_types_present: raise DepositError( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(content_type) assert content_type is not None data[content_type] = fh if len(content_types_present) != 2: raise DepositError( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler) self._check_file_md5sum(filehandler, headers.content_md5sum) try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: raise DepositError( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) self._set_deposit_origin_from_metadata(deposit, metadata, headers) # actual storage of data self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) assert filehandler is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, archive=filehandler.name, status=deposit.status, ) def _store_metadata_deposit( self, deposit: Deposit, swhid_reference: Union[str, QualifiedSWHID], metadata: Dict, raw_metadata: bytes, deposit_origin: Optional[str] = None, ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]: """When all user inputs pass the checks, this associates the raw_metadata to the swhid_reference in the raw extrinsic metadata storage. In case of any issues, a bad request response is returned to the user with the details. Checks: - metadata are technically parsable - metadata pass the functional checks - SWHID (if any) is technically valid Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to metadata: Full dict of metadata to check for validity (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata deposit_origin: Optional deposit origin url to use if any (e.g. deposit update scenario provides one) Raises: DepositError in case of incorrect inputs from the deposit client (e.g. functionally invalid metadata, ...)
Returns: Tuple of target swhid, deposit, and deposit request """ metadata_ok, error_details = check_metadata(metadata) if not metadata_ok: assert error_details, "Details should be set when a failure occurs" raise DepositError( BAD_REQUEST, "Functional metadata checks failure", convert_status_detail(error_details), ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit.client.provider_url, ) metadata_fetcher = self.swh_deposit_fetcher() # replace metadata within the deposit backend deposit_request_data = { METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } # actually add the metadata to the completed deposit deposit_request = self._deposit_request_put(deposit, deposit_request_data) target_swhid: ExtendedSWHID # origin URL or CoreSWHID if isinstance(swhid_reference, str): target_swhid = Origin(swhid_reference).swhid() metadata_context = {} else: metadata_context = compute_metadata_context(swhid_reference) if deposit_origin: # metadata deposit update on completed deposit metadata_context["origin"] = deposit_origin target_swhid = extended_swhid_from_qualified(swhid_reference) self._check_swhid_in_archive(target_swhid) # metadata deposited by the client metadata_object = RawExtrinsicMetadata( target=target_swhid, # core swhid or origin discovery_date=deposit_request.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata, **metadata_context, ) # metadata on the metadata object swh_deposit_authority = self.swh_deposit_authority() swh_deposit_fetcher = self.swh_deposit_fetcher() metametadata_object = RawExtrinsicMetadata( target=metadata_object.swhid(), discovery_date=deposit_request.date, authority=swh_deposit_authority, fetcher=swh_deposit_fetcher, format="xml-deposit-info", metadata=render_to_string( "deposit/deposit_info.xml", context={"deposit": deposit} ).encode(), ) # write to metadata storage self.storage_metadata.metadata_authority_add( [metadata_authority, swh_deposit_authority] ) self.storage_metadata.metadata_fetcher_add( [metadata_fetcher, swh_deposit_fetcher] ) self.storage_metadata.raw_extrinsic_metadata_add( [metadata_object, metametadata_object] ) return (target_swhid, deposit, deposit_request) def _check_swhid_in_archive(self, target_swhid: ExtendedSWHID) -> None: """Check the target object already exists in the archive, and raises a BAD_REQUEST if it does not.""" if target_swhid.object_type in (ExtendedObjectType.CONTENT,): if list( self.storage.content_missing_per_sha1_git([target_swhid.object_id]) ): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this content " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in ( ExtendedObjectType.DIRECTORY, ExtendedObjectType.REVISION, ExtendedObjectType.RELEASE, ExtendedObjectType.SNAPSHOT, ): target_type_name = target_swhid.object_type.name.lower() method = getattr(self.storage, target_type_name + "_missing") if list(method([target_swhid.object_id])): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this {target_type_name} " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in (ExtendedObjectType.ORIGIN,): if None in list(self.storage.origin_get_by_sha1([target_swhid.object_id])): raise DepositError( BAD_REQUEST, "Cannot load metadata on origin, it is not (yet?) known to the " "archive.", ) else: # This should not happen, because target_swhid is generated from either # a core swhid or an origin URL. 
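For context, the two ways `target_swhid` is built above resolve like this; a sketch using the swh.model >= 3.0 APIs this diff switches to, with sample URL and hash values:

```python
from swh.model.model import Origin
from swh.model.swhids import QualifiedSWHID

# Case 1: a plain origin URL becomes an origin ExtendedSWHID.
target_swhid = Origin("https://example.org/user/repo").swhid()

# Case 2: a qualified SWHID is narrowed to its core part, while the
# qualifiers feed the metadata context (cf. compute_metadata_context and
# extended_swhid_from_qualified in swh.deposit.utils).
qualified = QualifiedSWHID.from_string(
    "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d"
    ";origin=https://example.org/user/repo"
)
```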
# Let's just check it again so the "switch" is exhaustive. raise ValueError( f"_check_swhid_in_archive expected core SWHID or origin SWHID, " f"but got {target_swhid}." ) def _atom_entry( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Atom entry deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. Raises: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if metadata is None: raise DepositError( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) self._set_deposit_origin_from_metadata(deposit, metadata, headers) # Determine if we are in the metadata-only deposit case try: swhid_ref = parse_swh_reference(metadata) except ValidationError as e: raise DepositError( PARSING_ERROR, "Invalid SWHID reference", str(e), ) if swhid_ref is not None and ( deposit.origin_url or deposit.parent or deposit.external_id ): raise DepositError( BAD_REQUEST, "<swh:reference> is for metadata-only deposits and " "<swh:create_origin> / <swh:add_to_origin> / Slug are for " "code deposits, only one may be used on a given deposit.", ) if swhid_ref is not None: deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( deposit, swhid_ref, metadata, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS if isinstance(swhid_ref, QualifiedSWHID): deposit.swhid = str(extended_swhid_from_qualified(swhid_ref)) deposit.swhid_context = str(swhid_ref) deposit.complete_date = depo_request.date deposit.reception_date = depo_request.date deposit.save() return Receipt( deposit_id=deposit.id, deposit_date=depo_request.date, status=deposit.status, archive=None, ) self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=None, ) def _set_deposit_origin_from_metadata(self, deposit, metadata, headers): create_origin = metadata.get("swh:deposit", {}).get("swh:create_origin") add_to_origin = metadata.get("swh:deposit", {}).get("swh:add_to_origin") if create_origin and add_to_origin: raise DepositError( BAD_REQUEST, "<swh:create_origin> and <swh:add_to_origin> are mutually exclusive, " "as they respectively create a new origin and add to an existing " "origin.", ) if create_origin: origin_url = create_origin["swh:origin"]["@url"] check_client_origin(deposit.client, origin_url) deposit.origin_url = origin_url if add_to_origin: origin_url = 
add_to_origin["swh:origin"]["@url"] check_client_origin(deposit.client, origin_url) deposit.parent = ( Deposit.objects.filter( client=deposit.client, origin_url=origin_url, status=DEPOSIT_STATUS_LOAD_SUCCESS, ) .order_by("-id")[0:1] .get() ) deposit.origin_url = origin_url if "atom:external_identifier" in metadata: # Deprecated tag. # When clients stopped using it, this should raise an error # unconditionally if deposit.origin_url: raise DepositError( BAD_REQUEST, " is deprecated, you should only use " " and from now on.", ) if headers.slug and metadata["atom:external_identifier"] != headers.slug: raise DepositError( BAD_REQUEST, "The tag and Slug header are deprecated, " " or " "should be used instead.", ) def _empty_post( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> Receipt: """Empty post to finalize a deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be finalized """ self._complete_deposit(deposit) assert deposit.complete_date is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.complete_date, status=deposit.status, archive=None, ) def additional_checks( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit], ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def get_client(self, request) -> DepositClient: # This class depends on AuthenticatedAPIView, so request.user.username # is always set username = request.user.username assert username is not None if self._client is None: try: self._client = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown client name {username}") assert self._client.username == username return self._client def checks( self, request: Request, collection_name: str, deposit: Optional[Deposit] = None ) -> ParsedRequestHeaders: if deposit is None: collection = get_collection_by_name(collection_name) else: assert collection_name == deposit.collection.name collection = deposit.collection client = self.get_client(request) collection_id = collection.id collections = client.collections assert collections is not None if collection_id not in collections: raise DepositError( FORBIDDEN, f"Client {client.username} cannot access collection {collection_name}", ) headers = self._read_headers(request) if deposit is not None: self.restrict_access(request, headers, deposit) if headers.on_behalf_of: raise DepositError(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") self.additional_checks(request, headers, collection_name, deposit) return headers def restrict_access( self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit ) -> None: """Allow modifications on deposit with status 'partial' only, reject the rest. 
""" if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) def _basic_not_allowed_method(self, request: Request, method: str): raise DepositError( METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) def get( self, request: Request, *args, **kwargs ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) r = self.process_get(request, collection_name, deposit) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/tar" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support POST method. """ def post( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) status, iri_key, receipt = self.process_post( request, headers, collection_name, deposit ) return self._make_deposit_receipt( request, collection_name, status, iri_key, receipt, ) def _make_deposit_receipt( self, request, collection_name: str, status: int, iri_key: str, receipt: Receipt, ) -> HttpResponse: """Returns an HttpResponse with a SWORD Deposit receipt as content.""" # Build the IRIs in the receipt args = [collection_name, receipt.deposit_id] iris = { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_IRI, CONT_FILE_IRI, SE_IRI, STATE_IRI] } context = { **attr.asdict(receipt), **iris, "packagings": ACCEPT_PACKAGINGS, } response = render( request, "deposit/deposit_receipt.xml", context=context, content_type="application/xml", status=status, ) response["Location"] = iris[iri_key] return response @abstractmethod def process_post( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_IRI, etc...) - Receipt """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) self.process_put(request, headers, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ assert deposit_id is not None deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) self.process_delete(request, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete( self, request: Request, collection_name: str, deposit: Deposit ) -> None: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. 
.api.deposit_update.APIUpdateArchive) """ pass diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py index 3d0d6574..71d9b132 100644 --- a/swh/deposit/api/edit.py +++ b/swh/deposit/api/edit.py @@ -1,142 +1,142 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.request import Request from swh.deposit.models import Deposit -from swh.model.identifiers import QualifiedSWHID +from swh.model.swhids import QualifiedSWHID from ..config import DEPOSIT_STATUS_LOAD_SUCCESS from ..errors import BAD_REQUEST, DepositError, ParserError from ..parsers import SWHAtomEntryParser, SWHMultiPartParser from .common import APIDelete, APIPut, ParsedRequestHeaders class EditAPI(APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit-IRI' in the sword specification. HTTP verbs supported: PUT, DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def restrict_access( self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit ) -> None: """Relax the access restriction to allow metadata update on deposit with status "done" when a swhid is provided. """ if ( request.method == "PUT" and headers.swhid is not None and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS ): # Allow metadata update on deposit with status "done" when swhid provided return # otherwise, let the standard access restriction check occur super().restrict_access(request, headers, deposit) def process_put( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """This allows the following scenarios: - multipart: replace all the deposit (status partial) metadata and archive with the provided ones. - atom: replace all the deposit (status partial) metadata with the provided ones. - with swhid, atom: Add new metadata to deposit (status done) with provided ones and push such metadata to the metadata storage directly. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart Raises: 400 if any of the following occur: - the swhid provided and the deposit swhid do not match - the provided metadata xml file is malformed - the provided xml atom entry is empty - the provided swhid does not exist in the archive """ # noqa swhid = headers.swhid if swhid is None: if request.content_type.startswith("multipart/"): self._multipart_upload( request, headers, collection_name, deposit=deposit, replace_archives=True, replace_metadata=True, ) else: # standard metadata update (replace all metadata already provided to the # deposit by the new ones) self._atom_entry( request, headers, collection_name, deposit=deposit, replace_metadata=True, ) return # Update metadata on a deposit already ingested # Write to the metadata storage (and the deposit backend) # no ingestion triggered assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS if swhid != deposit.swhid: raise DepositError( BAD_REQUEST, f"Mismatched provided SWHID {swhid} with deposit's {deposit.swhid}.", "The provided SWHID does not match the deposit to update. 
" "Please ensure you send the correct deposit SWHID.", ) try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: raise DepositError( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) _, deposit, deposit_request = self._store_metadata_deposit( deposit, QualifiedSWHID.from_string(swhid), metadata, raw_metadata, deposit.origin_url, ) def process_delete(self, req, collection_name: str, deposit: Deposit) -> None: """Delete the container (deposit). source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ self._delete_deposit(collection_name, deposit) diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index df059261..8850064d 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,118 +1,118 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser from swh.model.hashutil import hash_to_bytes -from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID +from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from . import APIPrivateView from ...errors import BAD_REQUEST, DepositError from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit from ..common import APIPut, ParsedRequestHeaders MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] class APIUpdateStatus(APIPrivateView, APIPut): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks( self, request, headers: ParsedRequestHeaders, collection_name, deposit=None ): """Enrich existing checks to the default ones. 
New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) raise DepositError(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) raise DepositError(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) raise DepositError(BAD_REQUEST, msg) return {} def process_put( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] dir_id = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id) ) snp_id = CoreSWHID( object_type=ObjectType.SNAPSHOT, object_id=hash_to_bytes(data["snapshot_id"]), ) rev_id = CoreSWHID( object_type=ObjectType.REVISION, object_id=hash_to_bytes(revision_id) ) deposit.swhid = str(dir_id) # new id with contextual information deposit.swhid_context = str( QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), origin=origin_url, visit=snp_id, anchor=rev_id, path="/", ) ) else: # rejected deposit.status = status if "status_detail" in data: deposit.status_detail = data["status_detail"] deposit.save() diff --git a/swh/deposit/migrations/0018_migrate_swhids.py b/swh/deposit/migrations/0018_migrate_swhids.py index 40cec883..d5835824 100644 --- a/swh/deposit/migrations/0018_migrate_swhids.py +++ b/swh/deposit/migrations/0018_migrate_swhids.py @@ -1,348 +1,348 @@ # -*- coding: utf-8 -*- # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import unicode_literals import logging import os from typing import Any, Dict, Optional, Tuple from django.db import migrations from swh.core import config from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID +from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.storage import get_storage as get_storage_client from swh.storage.algos.snapshot import snapshot_id_get_from_revision SWH_PROVIDER_URL = "https://www.softwareheritage.org" logger = logging.getLogger(__name__) swh_storage = None def get_storage() -> Optional[Any]: """Instantiate a storage client """ settings = os.environ.get("DJANGO_SETTINGS_MODULE") if settings != "swh.deposit.settings.production": # Bypass for now return None global swh_storage if not swh_storage: config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( "Production: SWH_CONFIG_FILENAME must be set to the" " configuration file needed!" 
) if not os.path.exists(config_file): raise ValueError( "Production: configuration file %s does not exist!" % (config_file,) ) conf = config.load_named_config(config_file) if not conf: raise ValueError( "Production: configuration %s does not exist." % (config_file,) ) storage_config = conf.get("storage") if not storage_config: raise ValueError( "Production: invalid configuration; missing 'storage' config entry." ) swh_storage = get_storage_client(**storage_config) return swh_storage def migrate_deposit_swhid_context_not_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit SWHIDs to the new format. Only deposit with status done and swh_id_context not null are concerned. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): obj_dir = QualifiedSWHID.from_string(deposit.swh_id_context) assert obj_dir.object_type == ObjectType.DIRECTORY obj_rev = CoreSWHID.from_string(deposit.swh_anchor_id) assert obj_rev.object_type == ObjectType.REVISION if set(obj_dir.qualifiers()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Starting migration dir_id = obj_dir.object_id origin = obj_dir.origin assert origin check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue rev_id = obj_rev.object_id # Find the snapshot targeting the revision snp_id = snapshot_id_get_from_revision(storage, origin, hash_to_bytes(rev_id)) if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update deposit.swh_id_context = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=dir_id, origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), anchor=CoreSWHID( object_type=ObjectType.REVISION, object_id=hash_to_bytes(rev_id) ), path=b"/", ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id == deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert old_swh_anchor_id == deposit.swh_anchor_id logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert old_swh_anchor_id_context == deposit.swh_anchor_id_context # Commit deposit.save() def resolve_origin(deposit_id: int, provider_url: str, external_id: str) -> str: """Resolve the origin from provider-url and external-id For some edge case, only the external_id is used as there is some old inconsistency from testing which exists. 
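The rewrite in `migrate_deposit_swhid_context_not_null` above upgrades a SWHID carrying only an `origin` qualifier to the full new format (`visit`, `anchor`, `path` added). A standalone sketch of that transformation; the snapshot and revision ids below are placeholders, where the migration resolves the real ones from storage:

```python
from swh.model.hashutil import hash_to_bytes
from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID

old = QualifiedSWHID.from_string(
    "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d"
    ";origin=https://hal.archives-ouvertes.fr/hal-01243573"
)
assert set(old.qualifiers()) == {"origin"}  # the pre-migration marker checked above

new = QualifiedSWHID(
    object_type=ObjectType.DIRECTORY,
    object_id=old.object_id,
    origin=old.origin,
    visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=hash_to_bytes("00" * 20)),
    anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=hash_to_bytes("11" * 20)),
    path=b"/",
)
```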
""" map_edge_case_origin: Dict[Tuple[int, str], str] = { ( 76, "hal-01588782", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782", ( 87, "hal-01588927", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927", (89, "hal-01588935"): "https://hal-preprod.archives-ouvertes.fr/hal-01588935", ( 88, "hal-01588928", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588928", ( 90, "hal-01588942", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942", (143, "hal-01592430"): "https://hal-preprod.archives-ouvertes.fr/hal-01592430", ( 75, "hal-01588781", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781", } origin = map_edge_case_origin.get((deposit_id, external_id)) if origin: return origin # Some simpler origin edge cases (mostly around the initial deposits) map_origin = { ( SWH_PROVIDER_URL, "je-suis-gpl", ): "https://forge.softwareheritage.org/source/jesuisgpl/", ( SWH_PROVIDER_URL, "external-id", ): "https://hal.archives-ouvertes.fr/external-id", } key = (provider_url, external_id) return map_origin.get(key, f"{provider_url.rstrip('/')}/{external_id}") def migrate_deposit_swhid_context_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit whose swh_id_context is not set (initial deposits not migrated at the time). Only deposit with status done and swh_id_context null are concerned. Note: Those deposits have their swh_id being the SWHPIDs of the revision! So we can align them as well. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True ): obj_rev = CoreSWHID.from_string(deposit.swh_id) if obj_rev.object_type == ObjectType.DIRECTORY: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Ensuring Migration not done assert obj_rev.object_type == ObjectType.REVISION assert deposit.swh_id is not None assert deposit.swh_id_context is None assert deposit.swh_anchor_id is None assert deposit.swh_anchor_id_context is None rev_id = obj_rev.object_id rev_id_bytes = hash_to_bytes(rev_id) revision = storage.revision_get([rev_id_bytes])[0] if not revision: logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id) continue provider_url = deposit.client.provider_url external_id = deposit.external_id origin = resolve_origin(deposit.id, provider_url, external_id) check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue dir_id = hash_to_hex(revision["directory"]) # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # retrieve the snapshot from the archive snp_id = snapshot_id_get_from_revision(storage, origin, rev_id_bytes) if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # New SWHIDs ids deposit.swh_id = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(dir_id) ) deposit.swh_id_context = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=dir_id, origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), 
anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes), path=b"/", ) # Realign the remaining deposit SWHIDs fields deposit.swh_anchor_id = str( CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes) ) deposit.swh_anchor_id_context = str( QualifiedSWHID( object_type=ObjectType.REVISION, object_id=rev_id_bytes, origin=origin ) ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id != deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context assert deposit.swh_id_context is not None logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert deposit.swh_anchor_id == old_swh_id assert deposit.swh_anchor_id is not None logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert deposit.swh_anchor_id_context is not None deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0017_auto_20190925_0906"), ] operations = [ # Migrate and make the operations possibly reversible # https://docs.djangoproject.com/en/3.0/ref/migration-operations/#django.db.migrations.operations.RunPython.noop # noqa migrations.RunPython( migrate_deposit_swhid_context_not_null, reverse_code=migrations.RunPython.noop, ), migrations.RunPython( migrate_deposit_swhid_context_null, reverse_code=migrations.RunPython.noop ), ] diff --git a/swh/deposit/tests/api/test_collection_post_atom.py b/swh/deposit/tests/api/test_collection_post_atom.py index da0f0cdf..77b8c715 100644 --- a/swh/deposit/tests/api/test_collection_post_atom.py +++ b/swh/deposit/tests/api/test_collection_post_atom.py @@ -1,775 +1,775 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests the handling of the Atom content when doing a POST Col-IRI.""" from io import BytesIO import textwrap import uuid import warnings import attr from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom from swh.deposit.utils import compute_metadata_context, extended_swhid_from_qualified from swh.model.hypothesis_strategies import ( directories, present_contents, releases, revisions, snapshots, ) -from swh.model.identifiers import ObjectType, QualifiedSWHID from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, RawExtrinsicMetadata, ) +from swh.model.swhids import ObjectType, QualifiedSWHID from swh.storage.interface import PagedResult def _insert_object(swh_storage, swhid): """Insert an object with the given swhid in the archive""" if swhid.object_type == ObjectType.CONTENT: with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = present_contents().example() swh_storage.content_add([attr.evolve(obj, sha1_git=swhid.object_id)]) else: object_type_name = 
swhid.object_type.name.lower() strategy = { "directory": directories, "revision": revisions, "release": releases, "snapshot": snapshots, }[object_type_name] method = getattr(swh_storage, object_type_name + "_add") with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = strategy().example() method([attr.evolve(obj, id=swhid.object_id)]) def _assert_deposit_info_on_metadata( swh_storage, metadata_swhid, deposit, metadata_fetcher ): swh_authority = MetadataAuthority( MetadataAuthorityType.REGISTRY, "http://deposit.softwareheritage.example/", ) page_results = swh_storage.raw_extrinsic_metadata_get(metadata_swhid, swh_authority) assert len(page_results.results) == 1 assert page_results.next_page_token is None expected_xml_data = textwrap.dedent( f"""\ {deposit.id} https://hal-test.archives-ouvertes.fr/ test """ ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( target=metadata_swhid, discovery_date=deposit.complete_date, authority=swh_authority, fetcher=metadata_fetcher, format="xml-deposit-info", metadata=expected_xml_data.encode(), ) ], next_page_token=None, ) def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ atom_error_with_decimal = atom_dataset["error-with-decimal"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.metadata is not None sw_version = dr.metadata.get("codemeta:softwareVersion") assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): """Posting empty body request should return a 400 response """ atom_content = atom_dataset["entry-data-empty-body"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_content, HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-badly-formatted"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): """Posting parsing error prone atom should return 400 """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-parsing-error-prone"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_400_both_create_origin_and_add_to_origin( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted 
atom should return a 400 response """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-create-origin-and-add-to-origin"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( b"<swh:create_origin> and <swh:add_to_origin> " b"are mutually exclusive" ) in response.content def test_post_deposit_atom_403_create_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Creating an origin for a prefix not owned by the client is forbidden """ origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN expected_msg = ( f"Cannot create origin {origin_url}, " f"it must start with {deposit_user.provider_url}" ) assert expected_msg in response.content.decode() def test_post_deposit_atom_use_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Posting an atom entry with a slug header but no origin url generates an origin url from the slug """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_no_origin_url_nor_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Posting an atom entry without an origin url or a slug header should generate one """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) mocker.patch("uuid.uuid4", return_value=slug) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_with_slug_and_external_identifier( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Even though is deprecated, it should still be allowed when it matches the slug, so that we don't break existing clients """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % slug, HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def 
test_post_deposit_atom_with_mismatched_slug_and_external_identifier( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry with mismatched slug header and external_identifier should return a 400 """ external_id = "foobar" url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % external_id, HTTP_IN_PROGRESS="false", HTTP_SLUG="something", ) assert ( b"The <external_identifier> tag and Slug header are deprecated" in response.content ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_external_identifier( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ was deprecated before was introduced, clients should get an error when trying to use both """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-external-identifier-and-create-origin"].format( external_id=external_id, url=origin_url, ) # when response = post_atom( authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", ) assert b"<external_identifier> is deprecated" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_reference( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ and are mutually exclusive """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-reference-and-create-origin"].format( external_id=external_id, url=origin_url, ) # when response = post_atom( authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", ) assert b"only one may be used on a given deposit" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): """Posting an atom entry to an unknown collection should return a 404 """ unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = post_atom( authenticated_client, reverse(COL_IRI, args=[unknown_collection]), data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND assert b"Unknown collection" in response.content def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Posting an initial atom entry should return 201 with deposit receipt """ # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["entry-data0"] % origin_url # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit 
deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Posting an initial atom entry should return 201 with deposit receipt """ # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["codemeta-sample"] % origin_url # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_deposit_metadata_invalid( authenticated_client, deposit_collection, atom_dataset ): """Posting invalid swhid reference is bad request returned to client """ invalid_swhid = "swh:1:dir :31b5c8cc985d190b5a7ef4878128ebfdc2358f49" xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=invalid_swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Invalid SWHID reference" in response.content def test_deposit_metadata_fails_functional_checks( authenticated_client, deposit_collection, atom_dataset ): """Posting functionally invalid metadata swhid is bad request returned to client """ swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" invalid_xml_data = atom_dataset[ "entry-data-with-swhid-fail-metadata-functional-checks" ].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=invalid_xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Functional metadata checks failure" in response.content @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw 
extrinsic metadata storage """ swhid_reference = QualifiedSWHID.from_string(swhid) swhid_target = extended_swhid_from_qualified(swhid_reference) xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) deposit_client = authenticated_client.deposit_client _insert_object(swh_storage, swhid_reference) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) # Ensure the deposit is finalized deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.swhid == str(swhid_target) assert deposit.swhid_context == str(swhid_reference) assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( name=config.tool["name"], version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( swhid_target, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata_context = compute_metadata_context(swhid_reference) metadata = RawExtrinsicMetadata( target=swhid_target, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), **metadata_context, ) assert page_results == PagedResult(results=[metadata], next_page_token=None,) # Get metadata about the deposited metadata object and check it: _assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( "url", ["https://gitlab.org/user/repo", "https://whatever.else/repo",] ) def test_deposit_metadata_origin( url, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw extrinsic metadata storage """ xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) origin_swhid = Origin(url).swhid() deposit_client = authenticated_client.deposit_client swh_storage.origin_add([Origin(url)]) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) # Ensure the deposit is finalized deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) # we got not swhid as input so we cannot have those assert deposit.swhid is None assert deposit.swhid_context is None assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( 
type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( name=config.tool["name"], version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( origin_swhid, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata = RawExtrinsicMetadata( target=origin_swhid, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), ) assert page_results == PagedResult(results=[metadata], next_page_token=None,) # Get metadata about the deposited metadata object and check it: _assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_unknown_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is rejected if the referenced object is unknown """ xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = parse_xml(BytesIO(response.content)) assert "object does not exist" in response_content["sword:error"]["atom:summary"] @pytest.mark.parametrize( "swhid", [ "swh:1:ori:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:emd:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_deposit_metadata_extended_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is rejected if the referenced SWHID is for an extended object type """ xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = parse_xml(BytesIO(response.content)) assert "Invalid SWHID reference" in response_content["sword:error"]["atom:summary"] def test_deposit_metadata_unknown_origin( authenticated_client, deposit_collection, atom_dataset, swh_storage, ): 
"""Posting a swhid reference is stored on raw extrinsic metadata storage """ url = "https://gitlab.org/user/repo" xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = parse_xml(BytesIO(response.content)) assert "known to the archive" in response_content["sword:error"]["atom:summary"] diff --git a/swh/deposit/tests/api/test_deposit_update_atom.py b/swh/deposit/tests/api/test_deposit_update_atom.py index c49b967c..afc83041 100644 --- a/swh/deposit/tests/api/test_deposit_update_atom.py +++ b/swh/deposit/tests/api/test_deposit_update_atom.py @@ -1,602 +1,602 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.api.common import ACCEPT_ARCHIVE_CONTENT_TYPES from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, EDIT_IRI, EM_IRI, SE_IRI, APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom, put_atom from swh.model.hashutil import hash_to_bytes -from swh.model.identifiers import CoreSWHID, ExtendedSWHID, ObjectType from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) +from swh.model.swhids import CoreSWHID, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult def test_post_deposit_atom_entry_multiple_steps( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """After initial deposit, updating a deposit should return a 201 """ # given origin_url = deposit_user.provider_url + "2225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): deposit = Deposit.objects.get(origin_url=origin_url) # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data1"], HTTP_IN_PROGRESS="True", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url is None # not provided yet assert deposit.status == "partial" # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 1 atom_entry_data = atom_dataset["entry-only-create-origin"] % (origin_url) for link in response_content["atom:link"]: if link["@rel"] == "http://purl.org/net/sword/terms/add": se_iri = link["@href"] break else: assert False, f"missing SE-IRI from {response_content['link']}" # when updating the first deposit post response = post_atom( authenticated_client, se_iri, data=atom_entry_data, HTTP_IN_PROGRESS="False", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert 
deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert len(Deposit.objects.all()) == 1 # now 2 associated requests to a same deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id") assert len(deposit_requests) == 2 atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1}, {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data}, ] for i, deposit_request in enumerate(deposit_requests): actual_metadata = deposit_request.metadata assert actual_metadata == expected_meta[i]["metadata"] assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"] assert bool(deposit_request.archive) is False def test_replace_metadata_to_deposit_is_possible( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, deposit_user, ): """Replace all metadata with another one should return a 204 response """ # given deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id raw_metadata0 = atom_dataset["entry-data0"] % origin_url requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_204_NO_CONTENT requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_deposit_is_possible( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, deposit_user, ): """Add metadata with another one should return a 204 response """ deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(SE_IRI, args=[deposit_collection.name, deposit.id]) atom_entry = atom_dataset["entry-data1"] response = post_atom(authenticated_client, update_uri, data=atom_entry) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 2 expected_raw_meta0 = atom_dataset["entry-data0"] % origin_url # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert 
set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_unknown_deposit( deposit_collection, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 1000 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(SE_IRI, args=[deposit_collection, unknown_deposit_id]) response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit 1000 does not exist" in response_content["sword:error"]["atom:summary"] ) def test_add_metadata_to_unknown_collection( partial_deposit, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ deposit = partial_deposit unknown_collection_name = "unknown-collection" try: DepositCollection.objects.get(name=unknown_collection_name) except DepositCollection.DoesNotExist: assert True url = reverse(SE_IRI, args=[unknown_collection_name, deposit.id]) response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["atom:summary"] def test_replace_metadata_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 998 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = put_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit %s does not exist" % unknown_deposit_id == response_content["sword:error"]["atom:summary"] ) def test_post_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (POST) archive with wrong content type should return 400 """ deposit = partial_deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/x-gtar-compressed", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Packaging format supported is restricted" in response.content for supported_format in ACCEPT_ARCHIVE_CONTENT_TYPES: assert supported_format.encode() in response.content def test_put_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (PUT) archive with wrong content type should return 400 """ # given deposit = partial_deposit # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Packaging format supported is restricted" in response.content for supported_format in ACCEPT_ARCHIVE_CONTENT_TYPES: assert supported_format.encode() in response.content def test_put_update_metadata_done_deposit_nominal( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, sample_data, swh_storage, ): """Nominal scenario, client send an update of 
metadata on a deposit with status "done" with an existing swhid. Such swhid has its metadata updated accordingly both in the deposit backend and in the metadata storage. Response: 204 """ deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid) assert deposit_swhid.object_type == ObjectType.DIRECTORY directory_id = hash_to_bytes(deposit_swhid.object_id) # directory targeted by the complete_deposit does not exist in the storage assert list(swh_storage.directory_missing([directory_id])) == [directory_id] # so let's create a directory reference in the storage (current deposit targets an # unknown swhid) existing_directory = sample_data.directory swh_storage.directory_add([existing_directory]) assert list(swh_storage.directory_missing([existing_directory.id])) == [] # and patch one complete deposit swhid so it targets said reference complete_deposit.swhid = str(existing_directory.swhid()) complete_deposit.save() actual_existing_requests_archive = DepositRequest.objects.filter( deposit=complete_deposit, type="archive" ) nb_archives = len(actual_existing_requests_archive) actual_existing_requests_metadata = DepositRequest.objects.filter( deposit=complete_deposit, type="metadata" ) nb_metadata = len(actual_existing_requests_metadata) update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_204_NO_CONTENT new_requests_meta = DepositRequest.objects.filter( deposit=complete_deposit, type="metadata" ) assert len(new_requests_meta) == nb_metadata + 1 request_meta1 = new_requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter( deposit=complete_deposit, type="archive" ) assert len(requests_archive1) == nb_archives assert set(actual_existing_requests_archive) == set(requests_archive1) # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=complete_deposit.client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=complete_deposit.client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( name=config.tool["name"], version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid) page_results = swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( target=directory_swhid, discovery_date=request_meta1.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata1.encode(), origin=complete_deposit.origin_url, ) ], next_page_token=None, ) def test_put_update_metadata_done_deposit_failure_mismatched_swhid( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit with SWHID not matching the deposit's. 
Response: 400 """ incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" assert complete_deposit.swhid != incorrect_swhid update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], HTTP_X_CHECK_SWHID=incorrect_swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Mismatched provided SWHID" in response.content def test_put_update_metadata_done_deposit_failure_malformed_xml( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done with a malformed xml Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data-ko"], HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_put_update_metadata_done_deposit_failure_empty_xml( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done with an empty xml. Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) atom_content = atom_dataset["entry-data-empty-body"] response = put_atom( authenticated_client, update_uri, data=atom_content, HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Empty body request is not supported" in response.content def test_put_update_metadata_done_deposit_failure_functional_checks( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done without required incomplete metadata Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, # no title, nor author, nor name fields data=atom_dataset["entry-data-fail-metadata-functional-checks"], HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Functional metadata checks failure" in response.content # detail on the errors msg = ( b"- Mandatory fields are missing (" b"atom:name or atom:title or codemeta:name, " b"atom:author or codemeta:author)" ) assert msg in response.content def test_put_atom_with_create_origin_and_external_identifier( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ was deprecated before was introduced, clients should get an error when trying to use both """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) response = post_atom( authenticated_client, url, data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) for link in response_content["atom:link"]: if link["@rel"] == "edit": edit_iri = link["@href"] break else: assert False, response_content # when response = put_atom( authenticated_client, edit_iri, data=atom_dataset["error-with-external-identifier"] % external_id, HTTP_IN_PROGRESS="false", ) assert b"<external_identifier> is deprecated" in response.content assert response.status_code == 
status.HTTP_400_BAD_REQUEST def test_put_atom_with_create_origin_and_reference( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ and are mutually exclusive """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) response = post_atom( authenticated_client, url, data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) for link in response_content["atom:link"]: if link["@rel"] == "edit": edit_iri = link["@href"] break else: assert False, response_content # when response = put_atom( authenticated_client, edit_iri, data=atom_dataset["entry-data-with-origin-reference"].format(url=origin_url), HTTP_IN_PROGRESS="false", ) assert b"only one may be used on a given deposit" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 93ddf572..344b2b26 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,608 +1,608 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from copy import deepcopy from functools import partial from io import BytesIO import os import re from typing import TYPE_CHECKING, Dict, Mapping from django.test.utils import setup_databases # type: ignore from django.urls import reverse_lazy as reverse import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import pytest from rest_framework import status from rest_framework.test import APIClient import yaml from swh.auth.pytest_plugin import keycloak_mock_factory from swh.core.config import read from swh.core.pytest_plugin import get_response_cb from swh.deposit.auth import DEPOSIT_PERMISSION from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, SE_IRI, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, post_atom, ) from swh.model.hashutil import hash_to_bytes -from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID +from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler if TYPE_CHECKING: from swh.deposit.models import Deposit, DepositClient, DepositCollection # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. 
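# Note: newer swh.model releases expose CoreSWHID, ObjectType and
# QualifiedSWHID from swh.model.swhids; only the module path changed, the
# names are identical. A minimal compatibility sketch (an illustrative
# assumption, not part of this patch): code that must run against both
# layouts can guard the import and fall back to the legacy module.
try:
    from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID  # noqa: F811
except ImportError:  # older swh.model only has swh.model.identifiers
    from swh.model.identifiers import (  # noqa: F401,F811
        CoreSWHID,
        ObjectType,
        QualifiedSWHID,
    )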
USERNAME = "test" EMAIL = "test@example.org" COLLECTION = "test" TEST_USER = { "username": USERNAME, "password": "pass", "email": EMAIL, "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": COLLECTION}, } USER_INFO = { "name": USERNAME, "email": EMAIL, "email_verified": False, "family_name": "", "given_name": "", "groups": [], "preferred_username": USERNAME, "sub": "ffffffff-bbbb-4444-aaaa-14f61e6b7200", } USERNAME2 = "test2" EMAIL2 = "test@example.org" COLLECTION2 = "another-collection" TEST_USER2 = { "username": USERNAME2, "password": "", "email": EMAIL2, "provider_url": "https://hal-test.archives-ouvertes.example/", "domain": "archives-ouvertes.example/", "collection": {"name": COLLECTION2}, } KEYCLOAK_SERVER_URL = "https://auth.swh.org/SWHTest" KEYCLOAK_REALM_NAME = "SWHTest" CLIENT_ID = "swh-deposit" keycloak_mock_auth_success = keycloak_mock_factory( server_url=KEYCLOAK_SERVER_URL, realm_name=KEYCLOAK_REALM_NAME, client_id=CLIENT_ID, auth_success=True, user_info=USER_INFO, client_permissions=[DEPOSIT_PERMISSION], ) keycloak_mock_auth_failure = keycloak_mock_factory( server_url=KEYCLOAK_SERVER_URL, realm_name=KEYCLOAK_REALM_NAME, client_id=CLIENT_ID, auth_success=False, ) def pytest_configure(): setup_django_for("testing") @pytest.fixture def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with put/post methods """ cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.put(re.compile("https://"), body=cb) requests_mock_datadir.post(re.compile("https://"), body=cb) return requests_mock_datadir @pytest.fixture def common_deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, "scheduler": {"cls": "local", **swh_scheduler_config,}, "storage": swh_storage_backend_config, "storage_metadata": swh_storage_backend_config, "swh_authority_url": "http://deposit.softwareheritage.example/", } @pytest.fixture() def deposit_config(common_deposit_config): return { **common_deposit_config, "authentication_provider": "keycloak", "keycloak": { "server_url": KEYCLOAK_SERVER_URL, "realm_name": KEYCLOAK_REALM_NAME, }, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path): """Enforce config for deposit classes inherited from APIConfig.""" cfg = read(deposit_config_path) if "scheduler" in cfg: # scheduler setup: require the check-deposit and load-deposit tasks scheduler = get_scheduler(**cfg["scheduler"]) task_types = [ { "type": "check-deposit", "backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk", "description": "Check deposit metadata/archive before loading", "num_retries": 3, }, { "type": "load-deposit", "backend_name": "swh.loader.package.deposit.tasks.LoadDeposit", "description": "Loading deposit archive into swh archive", "num_retries": 3, }, ] for task_type in task_types: scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # 
noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory(COLLECTION) deposit_another_collection = deposit_collection_factory(COLLECTION2) def _create_deposit_user( collection: "DepositCollection", user_data: Dict ) -> "DepositClient": """Create/Return the test_user "test" For basic authentication, this will save a password. This is not required for keycloak authentication scheme. """ from swh.deposit.models import DepositClient user_data_d = deepcopy(user_data) user_data_d.pop("collection", None) passwd = user_data_d.pop("password", None) user, _ = DepositClient.objects.get_or_create( # type: ignore username=user_data_d["username"], defaults={**user_data_d, "collections": [collection.id]}, ) if passwd: user.set_password(passwd) user.save() return user @pytest.fixture def deposit_user(db, deposit_collection): return _create_deposit_user(deposit_collection, TEST_USER) @pytest.fixture def deposit_another_user(db, deposit_another_collection): return _create_deposit_user(deposit_another_collection, TEST_USER2) @pytest.fixture def anonymous_client(): """Create an anonymous client (no credentials during queries to the deposit) """ return APIClient() # <- drf's client def mock_keycloakopenidconnect(mocker, keycloak_mock): """Mock swh.deposit.auth.KeycloakOpenIDConnect to return the keycloak_mock """ mock = mocker.patch("swh.deposit.auth.KeycloakOpenIDConnect") mock.from_configfile.return_value = keycloak_mock return mock @pytest.fixture def mock_keycloakopenidconnect_ok(mocker, keycloak_mock_auth_success): """Mock keycloak so it always accepts connection for user with the right permissions """ return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) @pytest.fixture def mock_keycloakopenidconnect_ko(mocker, keycloak_mock_auth_failure): """Mock keycloak so it always refuses connections.""" return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_failure) def _create_authenticated_client(client, user, password=None): """Return a client whose credentials will be proposed to the deposit server. This also patched the client instance to keep a reference on the associated deposit_user. 
""" if not password: password = "irrelevant-if-not-set" _token = "%s:%s" % (user.username, password) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) client.deposit_client = user yield client client.logout() @pytest.fixture def basic_authenticated_client(anonymous_client, deposit_user): yield from _create_authenticated_client( anonymous_client, deposit_user, password=TEST_USER["password"] ) @pytest.fixture def authenticated_client(mock_keycloakopenidconnect_ok, anonymous_client, deposit_user): yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def unauthorized_client(mock_keycloakopenidconnect_ko, anonymous_client, deposit_user): """Create an unauthorized client (will see their authentication fail) """ yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def insufficient_perm_client( mocker, keycloak_mock_auth_success, anonymous_client, deposit_user ): """keycloak accepts connection but client returned has no deposit permission, so access is not allowed. """ keycloak_mock_auth_success.client_permissions = [] mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def internal_create_deposit( client: "DepositClient", collection: "DepositCollection", external_id: str, status: str, ) -> "Deposit": """Create a deposit for a given collection with internal tool """ from swh.deposit.models import Deposit deposit = Deposit( client=client, external_id=external_id, status=status, collection=collection ) deposit.save() return deposit def create_deposit( client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = post_archive( client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS=str(in_progress).lower(), ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() from swh.deposit.models import Deposit response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit._default_manager.get(id=deposit_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, **kwargs, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. 
""" deposit = create_deposit( authenticated_client, collection_name, deposit_status=DEPOSIT_STATUS_PARTIAL, **kwargs, ) origin_url = deposit.client.provider_url + deposit.external_id response = post_atom( authenticated_client, reverse(SE_IRI, args=[collection_name, deposit.id]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, in_progress=in_progress, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory( deposit_status=DEPOSIT_STATUS_PARTIAL, in_progress=True ) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive=sample_archive, external_id="external-id-partial", in_progress=True, deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["swh:deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10") snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0") deposit.swhid = f"swh:1:dir:{directory_id}" deposit.swhid_context = str( QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=revision_id), path=b"/", ) ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return 
str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py index b907f480..a06e02ce 100644 --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,276 +1,276 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.deposit import utils from swh.deposit.parsers import parse_xml from swh.model.exceptions import ValidationError -from swh.model.identifiers import CoreSWHID, QualifiedSWHID +from swh.model.swhids import CoreSWHID, QualifiedSWHID @pytest.fixture def xml_with_origin_reference(): xml_data = """ """ return xml_data.strip() def test_merge(): """Calling utils.merge on dicts should merge without losing information """ d0 = {"author": "someone", "license": [["gpl2"]], "a": 1} d1 = { "author": ["author0", {"name": "author1"}], "license": [["gpl3"]], "b": {"1": "2"}, } d2 = {"author": map(lambda x: x, ["else"]), "license": "mit", "b": {"2": "3",}} d3 = { "author": (v for v in ["no one"]), } actual_merge = utils.merge(d0, d1, d2, d3) expected_merge = { "a": 1, "license": [["gpl2"], ["gpl3"], "mit"], "author": ["someone", "author0", {"name": "author1"}, "else", "no one"], "b": {"1": "2", "2": "3",}, } assert actual_merge == expected_merge def test_merge_2(): d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}} d1 = {"license": "gpl3", "runtime": "GNU/Linux"} expected = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } actual = utils.merge(d0, d1) assert actual == expected def test_merge_edge_cases(): input_dict = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } # against empty dict actual = utils.merge(input_dict, {}) assert actual == input_dict # against oneself actual = utils.merge(input_dict, input_dict, input_dict) assert actual == input_dict def test_merge_one_dict(): """Merge one dict should result in the same dict value """ input_and_expected = {"anything": "really"} actual = utils.merge(input_and_expected) assert actual == input_and_expected def test_merge_raise(): """Calling utils.merge with any no dict argument should raise """ d0 = {"author": "someone", "a": 1} d1 = ["not a dict"] with pytest.raises(ValueError): utils.merge(d0, d1) with pytest.raises(ValueError): utils.merge(d1, d0) with pytest.raises(ValueError): utils.merge(d1) assert utils.merge(d0) == d0 def test_normalize_date_0(): """When date is a list, choose the first date and normalize it """ actual_date = utils.normalize_date(["2017-10-12", "date1"]) assert actual_date == { "timestamp": {"microseconds": 0, "seconds": 1507766400}, "negative_utc": False, "offset": 0, } def test_normalize_date_1(): """Providing a date in a reasonable format, everything is fine """ actual_date = utils.normalize_date("2018-06-11 17:02:02") assert actual_date == { "timestamp": {"microseconds": 0, "seconds": 1528736522}, "negative_utc": False, "offset": 0, } def test_normalize_date_doing_irrelevant_stuff(): """Providing a date with only the year results in a reasonable date """ actual_date = utils.normalize_date("2017") assert actual_date == { "timestamp": {"seconds": 1483228800, "microseconds": 0}, "offset": 0, "negative_utc": False, } @pytest.mark.parametrize( "swhid,expected_metadata_context", [ 
("swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", {"origin": None},), ( "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah", {"origin": "http://blah", "path": None}, ), ( "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path", {"origin": None, "path": b"/path"}, ), ( "swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "snapshot": CoreSWHID.from_string( "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ( "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "directory": CoreSWHID.from_string( "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ], ) def test_compute_metadata_context(swhid: str, expected_metadata_context): assert expected_metadata_context == utils.compute_metadata_context( QualifiedSWHID.from_string(swhid) ) def test_parse_swh_reference_origin(xml_with_origin_reference): url = "https://url" xml_data = xml_with_origin_reference.format(url=url) metadata = parse_xml(xml_data) actual_origin = utils.parse_swh_reference(metadata) assert actual_origin == url @pytest.fixture def xml_with_empty_reference(): xml_data = """ {swh_reference} """ return xml_data.strip() @pytest.mark.parametrize( "xml_ref", [ "", "", "", """""", ], ) def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref): xml_body = xml_with_empty_reference.format(swh_reference=xml_ref) metadata = utils.parse_xml(xml_body) assert utils.parse_swh_reference(metadata) is None @pytest.fixture def xml_with_swhid(atom_dataset): return atom_dataset["entry-data-with-swhid"] @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_parse_swh_reference_swhid(swhid, xml_with_swhid): xml_data = xml_with_swhid.format(swhid=swhid) metadata = utils.parse_xml(xml_data) actual_swhid = utils.parse_swh_reference(metadata) assert actual_swhid is not None expected_swhid = QualifiedSWHID.from_string(swhid) assert actual_swhid == expected_swhid @pytest.mark.parametrize( "invalid_swhid", [ # incorrect length "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235" # noqa # visit qualifier should be a core SWHID with type, "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa # anchor qualifier should be a core SWHID with type one of "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa ], ) def 
@pytest.mark.parametrize(
    "invalid_swhid",
    [
        # incorrect length
        "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235",  # noqa
        # visit qualifier should be a core SWHID with type snp
        "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a",  # noqa
        # anchor qualifier should be a core SWHID with type one of snp, rev, rel, dir
        "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/",  # noqa
        # anchor targeting a snapshot is rejected when visit is also provided
        "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04",  # noqa
    ],
)
def test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid):
    """Unparsable swhid should raise"""
    xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
    metadata = utils.parse_xml(xml_invalid_swhid)

    with pytest.raises(ValidationError):
        utils.parse_swh_reference(metadata)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index 2e01de8c..59176936 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,253 +1,249 @@
# Copyright (C) 2018-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
from types import GeneratorType
from typing import Any, Dict, Optional, Union

import iso8601
import xmltodict

from swh.model.exceptions import ValidationError
-from swh.model.identifiers import (
-    ExtendedSWHID,
-    ObjectType,
-    QualifiedSWHID,
-    normalize_timestamp,
-)
+from swh.model.model import TimestampWithTimezone
+from swh.model.swhids import ExtendedSWHID, ObjectType, QualifiedSWHID

logger = logging.getLogger(__name__)


def parse_xml(stream, encoding="utf-8"):
    namespaces = {
        "http://www.w3.org/2005/Atom": "atom",
        "http://www.w3.org/2007/app": "app",
        "http://purl.org/dc/terms/": "dc",
        "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta",
        "http://purl.org/net/sword/terms/": "sword",
        "https://www.softwareheritage.org/schema/2018/deposit": "swh",
    }

    data = xmltodict.parse(
        stream,
        encoding=encoding,
        namespaces=namespaces,
        process_namespaces=True,
        dict_constructor=dict,
    )
    if "atom:entry" in data:
        data = data["atom:entry"]

    return data


def merge(*dicts):
    """Given an iterator of dicts, merge them losing no information.

    Args:
        *dicts: arguments are all supposed to be dicts to merge into one

    Returns:
        dict merged without losing information

    """

    def _extend(existing_val, value):
        """Given an existing value and a value (as potential lists), merge
        them together without repetition.

        """
        if isinstance(value, (list, map, GeneratorType)):
            vals = value
        else:
            vals = [value]
        for v in vals:
            if v in existing_val:
                continue
            existing_val.append(v)
        return existing_val

    d = {}
    for data in dicts:
        if not isinstance(data, dict):
            raise ValueError("all arguments of merge must be dicts")
        for key, value in data.items():
            existing_val = d.get(key)
            if not existing_val:
                d[key] = value
                continue
            if isinstance(existing_val, (list, map, GeneratorType)):
                new_val = _extend(existing_val, value)
            elif isinstance(existing_val, dict):
                if isinstance(value, dict):
                    new_val = merge(existing_val, value)
                else:
                    new_val = _extend([existing_val], value)
            else:
                new_val = _extend([existing_val], value)
            d[key] = new_val
    return d


def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list, elect arbitrarily the first element of that list.

    If date is (then) a string, parse it through iso8601.parse_date to
    extract a datetime. Then normalize it through
-    swh.model.identifiers.normalize_timestamp.
+    :class:`swh.model.model.TimestampWithTimezone`.

    Returns:
        The swh date object

    """
    if isinstance(date, list):
        date = date[0]
    if isinstance(date, str):
        date = iso8601.parse_date(date)

-    return normalize_timestamp(date)
+    return TimestampWithTimezone.from_dict(date).to_dict()
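To make the namespace handling of the parse_xml helper earlier in this hunk
concrete, a small usage sketch (the Atom snippet is invented for illustration):

    from swh.deposit.utils import parse_xml

    atom_doc = """<?xml version="1.0"?>
    <entry xmlns="http://www.w3.org/2005/Atom">
        <title>my deposit</title>
    </entry>"""

    # Namespaced element names are rewritten to short prefixes ("atom:",
    # "swh:", ...) and the top-level "atom:entry" wrapper is unwrapped.
    data = parse_xml(atom_doc)
    assert data["atom:title"] == "my deposit"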
""" metadata_context: Dict[str, Any] = {"origin": None} if swhid_reference.qualifiers(): metadata_context = { "origin": swhid_reference.origin, "path": swhid_reference.path, } snapshot = swhid_reference.visit if snapshot: metadata_context["snapshot"] = snapshot anchor = swhid_reference.anchor if anchor: metadata_context[anchor.object_type.name.lower()] = anchor return metadata_context ALLOWED_QUALIFIERS_NODE_TYPE = ( ObjectType.SNAPSHOT, ObjectType.REVISION, ObjectType.RELEASE, ObjectType.DIRECTORY, ) def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]: """Parse swh reference within the metadata dict (or origin) reference if found, None otherwise. .. code-block:: xml or: .. code-block:: xml Args: metadata: result of parsing an Atom document with :func:`parse_xml` Raises: ValidationError in case the swhid referenced (if any) is invalid Returns: Either swhid or origin reference if any. None otherwise. """ # noqa swh_deposit = metadata.get("swh:deposit") if not swh_deposit: return None swh_reference = swh_deposit.get("swh:reference") if not swh_reference: return None swh_origin = swh_reference.get("swh:origin") if swh_origin: url = swh_origin.get("@url") if url: return url swh_object = swh_reference.get("swh:object") if not swh_object: return None swhid = swh_object.get("@swhid") if not swhid: return None swhid_reference = QualifiedSWHID.from_string(swhid) if swhid_reference.qualifiers(): anchor = swhid_reference.anchor if anchor: if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: error_msg = ( "anchor qualifier should be a core SWHID with type one of " f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}" ) raise ValidationError(error_msg) visit = swhid_reference.visit if visit: if visit.object_type != ObjectType.SNAPSHOT: raise ValidationError( f"visit qualifier should be a core SWHID with type snp, " f"not {visit.object_type.value}" ) if ( visit and anchor and visit.object_type == ObjectType.SNAPSHOT and anchor.object_type == ObjectType.SNAPSHOT ): logger.warn( "SWHID use of both anchor and visit targeting " f"a snapshot: {swhid_reference}" ) raise ValidationError( "'anchor=swh:1:snp:' is not supported when 'visit' is also provided." ) return swhid_reference def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID: """Used to get the target of a metadata object from a , as the latter uses a QualifiedSWHID.""" return ExtendedSWHID.from_string(str(swhid).split(";")[0]) def to_header_link(link: str, link_name: str) -> str: """Build a single header link. >>> link_next = to_header_link("next-url", "next") >>> link_next '; rel="next"' >>> ','.join([link_next, to_header_link("prev-url", "prev")]) '; rel="next",; rel="prev"' """ return f'<{link}>; rel="{link_name}"'