diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index b0945fd1..9dc8f74e 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,1284 +1,1288 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union import uuid from xml.etree import ElementTree import attr from django.core.files.uploadedfile import UploadedFile from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.template.loader import render_to_string from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView from swh.deposit.api.checks import check_metadata from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication -from swh.deposit.models import Deposit, DEPOSIT_METADATA_ONLY +from swh.deposit.models import DEPOSIT_METADATA_ONLY, Deposit from swh.deposit.parsers import parse_xml -from swh.deposit.utils import NAMESPACES, compute_metadata_context +from swh.deposit.utils import ( + NAMESPACES, + check_url_match_provider, + compute_metadata_context, + parse_swh_metadata_provenance, +) from swh.model import hashutil from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, Origin, RawExtrinsicMetadata, ) from swh.model.swhids import ( ExtendedObjectType, ExtendedSWHID, QualifiedSWHID, ValidationError, ) from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_IRI, EM_IRI, METADATA_TYPE, RAW_METADATA_KEY, SE_IRI, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, DepositError, ParserError, ) from ..models import DepositClient, DepositCollection, DepositRequest from ..utils import ( extended_swhid_from_qualified, parse_swh_deposit_origin, parse_swh_reference, ) ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] @attr.s class ParsedRequestHeaders: content_type = attr.ib(type=str) content_length = attr.ib(type=Optional[int]) in_progress = attr.ib(type=bool) content_disposition = attr.ib(type=Optional[str]) content_md5sum = attr.ib(type=Optional[bytes]) packaging = attr.ib(type=Optional[str]) slug = attr.ib(type=Optional[str]) on_behalf_of = attr.ib(type=Optional[str]) metadata_relevant = attr.ib(type=Optional[str]) swhid = attr.ib(type=Optional[str]) @attr.s class Receipt: """Data computed while handling the request body that will be served in the Deposit Receipt.""" deposit_id = attr.ib(type=int) deposit_date = attr.ib(type=datetime.datetime) status = attr.ib(type=str) archive = attr.ib(type=Optional[str]) def _compute_md5(filehandler: UploadedFile) -> bytes: h = hashlib.md5() for chunk in filehandler: h.update(chunk) # type: ignore return h.digest() def get_deposit_by_id( deposit_id: int, collection_name: Optional[str] = None ) -> Deposit: """Gets an existing Deposit object if it exists, or raises `DepositError`. If `collection` is not None, also checks the deposit belongs to the collection.""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: raise DepositError(NOT_FOUND, f"Deposit {deposit_id} does not exist") if collection_name and deposit.collection.name != collection_name: get_collection_by_name(collection_name) # raises if does not exist raise DepositError( NOT_FOUND, f"Deposit {deposit_id} does not belong to collection {collection_name}", ) return deposit def get_collection_by_name(collection_name: str): """Gets an existing Deposit object if it exists, or raises `DepositError`.""" try: collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown collection name {collection_name}") assert collection is not None return collection def guess_deposit_origin_url(deposit: Deposit): """Guesses an origin url for the given deposit.""" external_id = deposit.external_id if not external_id: # The client provided neither an origin_url nor a slug. That's inconvenient, # but SWORD requires we support it. So let's generate a random slug. external_id = str(uuid.uuid4()) return "%s/%s" % (deposit.client.provider_url.rstrip("/"), external_id) -def check_client_origin(client: DepositClient, origin_url: str): - provider_url = client.provider_url.rstrip("/") + "/" - if not origin_url.startswith(provider_url): - raise DepositError( - FORBIDDEN, - f"Cannot create origin {origin_url}, it must start with {provider_url}", - ) - - class APIBase(APIConfig, APIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ _client: Optional[DepositClient] = None def __init__(self): super().__init__() auth_provider = self.config.get("authentication_provider") if auth_provider == "basic": self.authentication_classes: Sequence[Type[BaseAuthentication]] = ( BasicAuthentication, ) self.permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) elif auth_provider == "keycloak": self.authentication_classes: Sequence[Type[BaseAuthentication]] = ( KeycloakBasicAuthentication, ) self.permission_classes: Sequence[Type[BasePermission]] = ( IsAuthenticated, HasDepositPermission, ) else: raise ValueError( "Configuration key 'authentication_provider' should be provided with" f"either 'basic' or 'keycloak' value not {auth_provider!r}." ) def _read_headers(self, request: Request) -> ParsedRequestHeaders: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). Args: request: Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) return ParsedRequestHeaders( content_type=request.content_type, content_length=content_length, in_progress=in_progress, content_disposition=meta.get("HTTP_CONTENT_DISPOSITION"), content_md5sum=content_md5sum, packaging=meta.get("HTTP_PACKAGING"), slug=meta.get("HTTP_SLUG"), on_behalf_of=meta.get("HTTP_ON_BEHALF_OF"), metadata_relevant=meta.get("HTTP_METADATA_RELEVANT"), swhid=meta.get("HTTP_X_CHECK_SWHID"), ) def _deposit_put(self, deposit: Deposit, in_progress: bool = False) -> None: """Save/Update a deposit in db. Args: deposit: deposit being updated/created in_progress: deposit status """ if in_progress is False: self._complete_deposit(deposit) else: deposit.status = DEPOSIT_STATUS_PARTIAL deposit.save() def _complete_deposit(self, deposit: Deposit) -> None: """Marks the deposit as 'deposited', then schedule a check task if configured to do so.""" deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() if not deposit.origin_url: deposit.origin_url = guess_deposit_origin_url(deposit) if self.config["checks"]: scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, retries_left=3, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) if raw_metadata: deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None return deposit_request def _delete_archives(self, collection_name: str, deposit: Deposit) -> Dict: """Delete archive references from the deposit id. """ DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit: Deposit) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit.id, collection_name, ) raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_file_length( self, filehandler: UploadedFile, content_length: Optional[int] = None, ) -> None: """Check the filehandler passed as argument has exactly the expected content_length Args: filehandler: The file to check content_length: the expected length if provided. Raises: DepositError if the actual length does not match """ max_upload_size = self.config["max_upload_size"] if content_length: length = filehandler.size if length != content_length: raise DepositError(status.HTTP_412_PRECONDITION_FAILED, "Wrong length") if filehandler.size > max_upload_size: raise DepositError( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes)." "Please consider sending the archive in multiple steps.", ) def _check_file_md5sum( self, filehandler: UploadedFile, md5sum: Optional[bytes], ) -> None: """Check the filehandler passed as argument has the expected md5sum Args: filehandler: The file to check md5sum: md5 hash expected from the file's content Raises: DepositError if the md5sum does not match """ if md5sum: _md5sum = _compute_md5(filehandler) if _md5sum != md5sum: raise DepositError( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.", ) def _binary_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Binary upload routine. Other than such a request, a 415 response is returned. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers.content_length if not content_length: raise DepositError( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers.content_disposition if not content_disposition: raise DepositError( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers.packaging if packaging and packaging not in ACCEPT_PACKAGINGS: raise DepositError( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler, content_length) self._check_file_md5sum(filehandler, headers.content_md5sum) # actual storage of data archive_metadata = filehandler self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=filehandler.name, ) def _read_metadata(self, metadata_stream) -> Tuple[bytes, ElementTree.Element]: """ Given a metadata stream, reads the metadata and returns the metadata in three forms: * verbatim (as raw bytes), for archival in long-term storage * parsed as a Python dict, for archival in postgresql's jsonb type * parsed as ElementTree, to extract information immediately """ raw_metadata = metadata_stream.read() metadata_tree = parse_xml(raw_metadata) return raw_metadata, metadata_tree def _multipart_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ content_types_present = set() data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value content_type = fh.content_type if content_type in content_types_present: raise DepositError( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(content_type) assert content_type is not None data[content_type] = fh if len(content_types_present) != 2: raise DepositError( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler) self._check_file_md5sum(filehandler, headers.content_md5sum) try: raw_metadata, metadata_tree = self._read_metadata( data["application/atom+xml"] ) except ParserError: raise DepositError( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers) # actual storage of data self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) deposit_request_data = { ARCHIVE_KEY: filehandler, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) assert filehandler is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, archive=filehandler.name, status=deposit.status, ) def _store_metadata_deposit( self, deposit: Deposit, swhid_reference: Union[str, QualifiedSWHID], metadata_tree: ElementTree.Element, raw_metadata: bytes, deposit_origin: Optional[str] = None, ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]: """When all user inputs pass the checks, this associates the raw_metadata to the swhid_reference in the raw extrinsic metadata storage. In case of any issues, a bad request response is returned to the user with the details. Checks: - metadata are technically parsable - metadata pass the functional checks - SWHID (if any) is technically valid Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to metadata_tree: Full element tree of metadata to check for validity (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata deposit_origin: Optional deposit origin url to use if any (e.g. deposit update scenario provides one) Raises: DepositError in case of incorrect inputs from the deposit client (e.g. functionally invalid metadata, ...) Returns: Tuple of target swhid, deposit, and deposit request """ metadata_ok, error_details = check_metadata(metadata_tree) if not metadata_ok: assert error_details, "Details should be set when a failure occurs" raise DepositError( BAD_REQUEST, "Functional metadata checks failure", convert_status_detail(error_details), ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit.client.provider_url, ) metadata_fetcher = self.swh_deposit_fetcher() # replace metadata within the deposit backend deposit_request_data = { RAW_METADATA_KEY: raw_metadata, } # actually add the metadata to the completed deposit deposit_request = self._deposit_request_put(deposit, deposit_request_data) target_swhid: ExtendedSWHID # origin URL or CoreSWHID if isinstance(swhid_reference, str): target_swhid = Origin(swhid_reference).swhid() metadata_context = {} else: metadata_context = compute_metadata_context(swhid_reference) if deposit_origin: # metadata deposit update on completed deposit metadata_context["origin"] = deposit_origin target_swhid = extended_swhid_from_qualified(swhid_reference) self._check_swhid_in_archive(target_swhid) # metadata deposited by the client metadata_object = RawExtrinsicMetadata( target=target_swhid, # core swhid or origin discovery_date=deposit_request.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata, **metadata_context, ) # metadata on the metadata object swh_deposit_authority = self.swh_deposit_authority() swh_deposit_fetcher = self.swh_deposit_fetcher() metametadata_object = RawExtrinsicMetadata( target=metadata_object.swhid(), discovery_date=deposit_request.date, authority=swh_deposit_authority, fetcher=swh_deposit_fetcher, format="xml-deposit-info", metadata=render_to_string( "deposit/deposit_info.xml", context={"deposit": deposit} ).encode(), ) # write to metadata storage self.storage_metadata.metadata_authority_add( [metadata_authority, swh_deposit_authority] ) self.storage_metadata.metadata_fetcher_add( [metadata_fetcher, swh_deposit_fetcher] ) self.storage_metadata.raw_extrinsic_metadata_add( [metadata_object, metametadata_object] ) return (target_swhid, deposit, deposit_request) def _check_swhid_in_archive(self, target_swhid: ExtendedSWHID) -> None: """Check the target object already exists in the archive, and raises a BAD_REQUEST if it does not.""" if target_swhid.object_type in (ExtendedObjectType.CONTENT,): if list( self.storage.content_missing_per_sha1_git([target_swhid.object_id]) ): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this content " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in ( ExtendedObjectType.DIRECTORY, ExtendedObjectType.REVISION, ExtendedObjectType.RELEASE, ExtendedObjectType.SNAPSHOT, ): target_type_name = target_swhid.object_type.name.lower() method = getattr(self.storage, target_type_name + "_missing") if list(method([target_swhid.object_id])): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this {target_type_name} " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in (ExtendedObjectType.ORIGIN,): if None in list(self.storage.origin_get_by_sha1([target_swhid.object_id])): raise DepositError( BAD_REQUEST, "Cannot load metadata on origin, it is not (yet?) known to the " "archive.", ) else: # This should not happen, because target_swhid is generated from either # a core swhid or an origin URL. # Let's just check it again so the "switch" is exhaustive. raise ValueError( f"_check_swhid_in_archive expected core SWHID or origin SWHID, " f"but got {target_swhid}." ) def _atom_entry( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Atom entry deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ metadata_stream = request.data empty_atom_entry_summary = "Empty body request is not supported." empty_atom_entry_desc = ( "Atom entry request is about non-empty metadata deposit." ) if not metadata_stream: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) try: raw_metadata, metadata_tree = self._read_metadata(metadata_stream) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if len(metadata_tree) == 0: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers) # Determine if we are in the metadata-only deposit case try: swhid_ref = parse_swh_reference(metadata_tree) except ValidationError as e: raise DepositError( PARSING_ERROR, "Invalid SWHID reference", str(e), ) if swhid_ref is not None and ( deposit.origin_url or deposit.parent or deposit.external_id ): raise DepositError( BAD_REQUEST, " is for metadata-only deposits and " " / / Slug are for " "code deposits, only one may be used on a given deposit.", ) if swhid_ref is not None: + # It's suggested to user to provide it + metadata_provenance_url = parse_swh_metadata_provenance(metadata_tree) + if metadata_provenance_url: + # If the provenance is provided, ensure it matches client provider url + check_url_match_provider( + metadata_provenance_url, deposit.client.provider_url + ) + deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( deposit, swhid_ref, metadata_tree, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS if isinstance(swhid_ref, QualifiedSWHID): deposit.swhid = str(extended_swhid_from_qualified(swhid_ref)) deposit.swhid_context = str(swhid_ref) deposit.type = DEPOSIT_METADATA_ONLY deposit.complete_date = depo_request.date deposit.reception_date = depo_request.date deposit.save() return Receipt( deposit_id=deposit.id, deposit_date=depo_request.date, status=deposit.status, archive=None, ) self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=None, ) def _set_deposit_origin_from_metadata(self, deposit, metadata, headers): (create_origin, add_to_origin) = parse_swh_deposit_origin(metadata) if create_origin and add_to_origin: raise DepositError( BAD_REQUEST, " and are mutually exclusive, " "as they respectively create a new origin and add to an existing " "origin.", ) if create_origin: origin_url = create_origin - check_client_origin(deposit.client, origin_url) + check_url_match_provider(origin_url, deposit.client.provider_url) deposit.origin_url = origin_url if add_to_origin: origin_url = add_to_origin - check_client_origin(deposit.client, origin_url) + check_url_match_provider(origin_url, deposit.client.provider_url) deposit.parent = ( Deposit.objects.filter( client=deposit.client, origin_url=origin_url, status=DEPOSIT_STATUS_LOAD_SUCCESS, ) .order_by("-id")[0:1] .get() ) deposit.origin_url = origin_url external_identifier_element = metadata.find( "atom:external_identifier", namespaces=NAMESPACES ) if external_identifier_element is not None: # Deprecated tag. # When clients stopped using it, this should raise an error # unconditionally if deposit.origin_url: raise DepositError( BAD_REQUEST, " is deprecated, you should only use " " and from now on.", ) if headers.slug and external_identifier_element.text != headers.slug: raise DepositError( BAD_REQUEST, "The tag and Slug header are deprecated, " " or " "should be used instead.", ) def _empty_post( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> Receipt: """Empty post to finalize a deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be finalized """ self._complete_deposit(deposit) assert deposit.complete_date is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.complete_date, status=deposit.status, archive=None, ) def additional_checks( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit], ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def get_client(self, request) -> DepositClient: # This class depends on AuthenticatedAPIView, so request.user.username # is always set username = request.user.username assert username is not None if self._client is None: try: self._client = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown client name {username}") assert self._client.username == username return self._client def checks( self, request: Request, collection_name: str, deposit: Optional[Deposit] = None ) -> ParsedRequestHeaders: if deposit is None: collection = get_collection_by_name(collection_name) else: assert collection_name == deposit.collection.name collection = deposit.collection client = self.get_client(request) collection_id = collection.id collections = client.collections assert collections is not None if collection_id not in collections: raise DepositError( FORBIDDEN, f"Client {client.username} cannot access collection {collection_name}", ) headers = self._read_headers(request) if deposit is not None: self.restrict_access(request, headers, deposit) if headers.on_behalf_of: raise DepositError(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") self.additional_checks(request, headers, collection_name, deposit) return headers def restrict_access( self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit ) -> None: """Allow modifications on deposit with status 'partial' only, reject the rest. """ if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) def _basic_not_allowed_method(self, request: Request, method: str): raise DepositError( METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) def get( self, request: Request, *args, **kwargs ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) r = self.process_get(request, collection_name, deposit) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/tar" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support POST method. """ def post( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) status, iri_key, receipt = self.process_post( request, headers, collection_name, deposit ) return self._make_deposit_receipt( request, collection_name, status, iri_key, receipt, ) def _make_deposit_receipt( self, request, collection_name: str, status: int, iri_key: str, receipt: Receipt, ) -> HttpResponse: """Returns an HttpResponse with a SWORD Deposit receipt as content.""" # Build the IRIs in the receipt args = [collection_name, receipt.deposit_id] iris = { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_IRI, CONT_FILE_IRI, SE_IRI, STATE_IRI] } context = { **attr.asdict(receipt), **iris, "packagings": ACCEPT_PACKAGINGS, } response = render( request, "deposit/deposit_receipt.xml", context=context, content_type="application/xml", status=status, ) response["Location"] = iris[iri_key] return response @abstractmethod def process_post( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_IRI, etc...) - Receipt """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) self.process_put(request, headers, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ assert deposit_id is not None deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) self.process_delete(request, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete( self, request: Request, collection_name: str, deposit: Deposit ) -> None: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. .api.deposit_update.APIUpdateArchive) """ pass diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py index 4c165815..8f3f02e3 100644 --- a/swh/deposit/tests/api/test_collection_add_to_origin.py +++ b/swh/deposit/tests/api/test_collection_add_to_origin.py @@ -1,156 +1,152 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom from swh.deposit.utils import NAMESPACES from ..conftest import internal_create_deposit def test_add_deposit_with_add_to_origin( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit with creates a new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_add_to_origin_conflict( authenticated_client, deposit_collection, deposit_another_collection, atom_dataset, sample_archive, deposit_user, deposit_another_user, ): """Posting a deposit with an referencing an origin owned by a different client raises an error """ external_id = "foobar" origin_url = deposit_another_user.provider_url + external_id # create a deposit for that other user, with the same slug internal_create_deposit( deposit_another_user, deposit_another_collection, external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"must start with" in response.content def test_add_deposit_add_to_wrong_origin( authenticated_client, deposit_collection, atom_dataset, sample_archive, ): """Posting a deposit with an referencing an origin not starting with the provider_url raises an error """ origin_url = "http://example.org/foo" # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN, response.content.decode() assert b"must start with" in response.content def test_add_deposit_with_add_to_origin_and_external_identifier( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit with creates a new deposit with parent """ # given multiple deposit already loaded origin_url = deposit_user.provider_url + completed_deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-add-to-origin-and-external-id"] % origin_url, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"<external_identifier> is deprecated" in response.content def test_post_deposit_atom_403_add_to_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Creating an origin for a prefix not owned by the client is forbidden """ origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN - expected_msg = ( - f"Cannot create origin {origin_url}, " - f"it must start with {deposit_user.provider_url}" - ) - assert expected_msg in response.content.decode() + assert "URL mismatch" in response.content.decode() diff --git a/swh/deposit/tests/api/test_collection_post_atom.py b/swh/deposit/tests/api/test_collection_post_atom.py index 4356f033..1ad4ae78 100644 --- a/swh/deposit/tests/api/test_collection_post_atom.py +++ b/swh/deposit/tests/api/test_collection_post_atom.py @@ -1,803 +1,825 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests the handling of the Atom content when doing a POST Col-IRI.""" import textwrap import uuid import warnings from xml.etree import ElementTree import attr from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.tests.common import post_atom from swh.deposit.utils import ( NAMESPACES, compute_metadata_context, extended_swhid_from_qualified, ) from swh.model.hypothesis_strategies import ( directories, present_contents, releases, revisions, snapshots, ) from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, RawExtrinsicMetadata, ) from swh.model.swhids import ObjectType, QualifiedSWHID from swh.storage.interface import PagedResult def _insert_object(swh_storage, swhid): """Insert an object with the given swhid in the archive""" if swhid.object_type == ObjectType.CONTENT: with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = present_contents().example() swh_storage.content_add([attr.evolve(obj, sha1_git=swhid.object_id)]) else: object_type_name = swhid.object_type.name.lower() strategy = { "directory": directories, "revision": revisions, "release": releases, "snapshot": snapshots, }[object_type_name] method = getattr(swh_storage, object_type_name + "_add") with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = strategy().example() method([attr.evolve(obj, id=swhid.object_id)]) def _assert_deposit_info_on_metadata( swh_storage, metadata_swhid, deposit, metadata_fetcher ): swh_authority = MetadataAuthority( MetadataAuthorityType.REGISTRY, "http://deposit.softwareheritage.example/", ) page_results = swh_storage.raw_extrinsic_metadata_get(metadata_swhid, swh_authority) assert len(page_results.results) == 1 assert page_results.next_page_token is None expected_xml_data = textwrap.dedent( f"""\ {deposit.id} https://hal-test.archives-ouvertes.fr/ test """ ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( target=metadata_swhid, discovery_date=deposit.complete_date, authority=swh_authority, fetcher=metadata_fetcher, format="xml-deposit-info", metadata=expected_xml_data.encode(), ) ], next_page_token=None, ) def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ atom_error_with_decimal = atom_dataset["error-with-decimal"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.raw_metadata is not None sw_version = ElementTree.fromstring(dr.raw_metadata).findtext( "codemeta:softwareVersion", namespaces=NAMESPACES ) assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): """Posting empty body request should return a 400 response """ atom_content = atom_dataset["entry-data-empty-body"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_content, HTTP_SLUG="external-id", ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_with_empty_request( authenticated_client, deposit_collection ): """Posting empty request should return a 400 response """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data={}, HTTP_SLUG="external-id", CONTENT_LENGTH=0, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-badly-formatted"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): """Posting parsing error prone atom should return 400 """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-parsing-error-prone"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_400_both_create_origin_and_add_to_origin( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-create-origin-and-add-to-origin"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( b"<swh:create_origin> and <swh:add_to_origin> " b"are mutually exclusive" ) in response.content def test_post_deposit_atom_403_create_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Creating an origin for a prefix not owned by the client is forbidden """ origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN - expected_msg = ( - f"Cannot create origin {origin_url}, " - f"it must start with {deposit_user.provider_url}" - ) - assert expected_msg in response.content.decode() + assert "URL mismatch" in response.content.decode() def test_post_deposit_atom_use_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Posting an atom entry with a slug header but no origin url generates an origin url from the slug """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_no_origin_url_nor_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Posting an atom entry without an origin url or a slug header should generate one """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) mocker.patch("uuid.uuid4", return_value=slug) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_with_slug_and_external_identifier( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Even though is deprecated, it should still be allowed when it matches the slug, so that we don't break existing clients """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % slug, HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_with_mismatched_slug_and_external_identifier( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry with mismatched slug header and external_identifier should return a 400 """ external_id = "foobar" url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % external_id, HTTP_IN_PROGRESS="false", HTTP_SLUG="something", ) assert ( b"The <external_identifier> tag and Slug header are deprecated" in response.content ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_external_identifier( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ was deprecated before was introduced, clients should get an error when trying to use both """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-external-identifier-and-create-origin"].format( external_id=external_id, url=origin_url, ) # when response = post_atom( authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", ) assert b"<external_identifier> is deprecated" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_reference( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ and are mutually exclusive """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-reference-and-create-origin"].format( external_id=external_id, url=origin_url, ) # when response = post_atom( authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", ) assert b"only one may be used on a given deposit" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): """Posting an atom entry to an unknown collection should return a 404 """ unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = post_atom( authenticated_client, reverse(COL_IRI, args=[unknown_collection]), data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND assert b"Unknown collection" in response.content def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Posting an initial atom entry should return 201 with deposit receipt """ # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["entry-data0"] % origin_url # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Posting an initial atom entry should return 201 with deposit receipt """ # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["codemeta-sample"] % origin_url # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_deposit_metadata_invalid( authenticated_client, deposit_collection, atom_dataset ): """Posting invalid swhid reference is bad request returned to client """ invalid_swhid = "swh:1:dir :31b5c8cc985d190b5a7ef4878128ebfdc2358f49" - xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=invalid_swhid) + xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=invalid_swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Invalid SWHID reference" in response.content +def test_deposit_metadata_invalid_metadata_provenance( + authenticated_client, deposit_collection, atom_dataset +): + """Posting invalid metadata provenance is bad request returned to client + + """ + invalid_swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" + xml_data = atom_dataset["entry-data-with-swhid"].format( + swhid=invalid_swhid, + metadata_provenance_url=( + "https://inria.halpreprod.archives-ouvertes.fr/hal-abcdefgh" + ), + ) + + response = post_atom( + authenticated_client, + reverse(COL_IRI, args=[deposit_collection.name]), + data=xml_data, + ) + assert response.status_code == status.HTTP_403_FORBIDDEN + assert b"URL mismatch" in response.content + + def test_deposit_metadata_fails_functional_checks( authenticated_client, deposit_collection, atom_dataset ): """Posting functionally invalid metadata swhid is bad request returned to client """ swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" invalid_xml_data = atom_dataset[ "entry-data-with-swhid-fail-metadata-functional-checks" ].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=invalid_xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Functional metadata checks failure" in response.content @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw extrinsic metadata storage """ swhid_reference = QualifiedSWHID.from_string(swhid) swhid_target = extended_swhid_from_qualified(swhid_reference) - xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) + xml_data = atom_dataset["entry-data-with-swhid"].format( + swhid=swhid, + metadata_provenance_url="https://hal-test.archives-ouvertes.fr/hal-abcdefgh", + ) deposit_client = authenticated_client.deposit_client _insert_object(swh_storage, swhid_reference) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.swhid == str(swhid_target) assert deposit.swhid_context == str(swhid_reference) assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( name=config.tool["name"], version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( swhid_target, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata_context = compute_metadata_context(swhid_reference) metadata = RawExtrinsicMetadata( target=swhid_target, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), **metadata_context, ) assert page_results == PagedResult(results=[metadata], next_page_token=None,) # Get metadata about the deposited metadata object and check it: _assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( "url", ["https://gitlab.org/user/repo", "https://whatever.else/repo",] ) def test_deposit_metadata_origin( url, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw extrinsic metadata storage """ xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) origin_swhid = Origin(url).swhid() deposit_client = authenticated_client.deposit_client swh_storage.origin_add([Origin(url)]) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) # we got not swhid as input so we cannot have those assert deposit.swhid is None assert deposit.swhid_context is None assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( name=config.tool["name"], version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( origin_swhid, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata = RawExtrinsicMetadata( target=origin_swhid, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), ) assert page_results == PagedResult(results=[metadata], next_page_token=None,) # Get metadata about the deposited metadata object and check it: _assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_unknown_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is rejected if the referenced object is unknown """ - xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) + xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "object does not exist" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) @pytest.mark.parametrize( "swhid", [ "swh:1:ori:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:emd:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_deposit_metadata_extended_swhid( swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is rejected if the referenced SWHID is for an extended object type """ - xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) + xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "Invalid SWHID reference" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) def test_deposit_metadata_unknown_origin( authenticated_client, deposit_collection, atom_dataset, swh_storage, ): """Posting a swhid reference is stored on raw extrinsic metadata storage """ url = "https://gitlab.org/user/repo" xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "known to the archive" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index c2f54a72..569ffb2b 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,1167 +1,1176 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import ast import contextlib import json import logging import os from typing import Optional from unittest.mock import MagicMock from xml.etree import ElementTree import pytest import yaml from swh.deposit.api.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, ) from swh.deposit.cli import deposit as cli from swh.deposit.cli.client import InputError, _collection, _url, generate_metadata from swh.deposit.client import ( BaseDepositClient, MaintenanceError, PublicApiDepositClient, ServiceDocumentDepositClient, ) from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES from swh.model.exceptions import ValidationError from ..conftest import TEST_USER def generate_slug() -> str: """Generate a slug (sample purposes). """ import uuid return str(uuid.uuid4()) @pytest.fixture def datadir(request): """Override default datadir to target main test datadir""" return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def slug(): return generate_slug() @pytest.fixture def patched_tmp_path(tmp_path, mocker): mocker.patch( "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) return tmp_path @pytest.fixture def client_mock_api_down(mocker, slug): """A mock client whose connection with api fails due to maintenance issue """ mock_client = MagicMock() mocker.patch("swh.deposit.client.PublicApiDepositClient", return_value=mock_client) mock_client.service_document.side_effect = MaintenanceError( "Database backend maintenance: Temporarily unavailable, try again later." ) return mock_client def test_cli_url(): assert _url("http://deposit") == "http://deposit/1" assert _url("https://other/1") == "https://other/1" def test_cli_collection_error(): mock_client = MagicMock() mock_client.service_document.return_value = {"error": "something went wrong"} with pytest.raises(InputError) as e: _collection(mock_client) assert "Service document retrieval: something went wrong" == str(e.value) def test_cli_collection_ok(requests_mock_datadir): client = PublicApiDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) collection_name = _collection(client) assert collection_name == "test" def test_cli_collection_ko_because_downtime(): mock_client = MagicMock() mock_client.service_document.side_effect = MaintenanceError("downtime") with pytest.raises(MaintenanceError, match="downtime"): _collection(mock_client) def test_cli_upload_conflictual_flags( datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, ): """Post metadata-only deposit through cli with invalid swhid raises """ api_url_basename = "deposit.test.metadataonly" metadata = atom_dataset["entry-data-minimal"] metadata_path = os.path.join(tmp_path, "entry-data-minimal.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(InputError, match="both with different values"): # fmt: off cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--slug", "some-slug", # deprecated flag "--create-origin", "some-other-slug", # conflictual value, so raise "--format", "json", ], catch_exceptions=False, ) # fmt: on def test_cli_deposit_with_server_down_for_maintenance( sample_archive, caplog, client_mock_api_down, slug, patched_tmp_path, cli_runner ): """ Deposit failure due to maintenance down time should be explicit """ # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" down_for_maintenance_log_record = ( "swh.deposit.cli.client", logging.ERROR, "Database backend maintenance: Temporarily unavailable, try again later.", ) assert down_for_maintenance_log_record in caplog.record_tuples client_mock_api_down.service_document.assert_called_once_with() def test_cli_client_generate_metadata_ok(slug): """Generated metadata is well formed and pass service side metadata checks """ actual_metadata_xml = generate_metadata( "deposit-client", "project-name", authors=["some", "authors"], external_id="external-id", create_origin="origin-url", metadata_provenance_url="meta-prov-url", ) actual_metadata = parse_xml(actual_metadata_xml) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == "deposit-client" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" ) assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "project-name" ) assert ( actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) == "external-id" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 2 assert authors[0].text == "some" assert authors[1].text == "authors" assert ( actual_metadata.find( "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES ).attrib["url"] == "origin-url" ) assert ( actual_metadata.findtext( "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES ) == "meta-prov-url" ) checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail is None def test_cli_client_generate_metadata_ok2(slug): """Generated metadata is well formed and pass service side metadata checks """ actual_metadata_xml = generate_metadata( "deposit-client", "project-name", authors=["some", "authors"], ) actual_metadata = parse_xml(actual_metadata_xml) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == "deposit-client" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" ) assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "project-name" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 2 assert authors[0].text == "some" assert authors[1].text == "authors" assert actual_metadata.find("codemeta:identifier", namespaces=NAMESPACES) is None assert actual_metadata.find("swh:deposit", namespaces=NAMESPACES) is None checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail == { "metadata": [ {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]} ] } def test_cli_single_minimal_deposit_with_slug( sample_archive, slug, patched_tmp_path, requests_mock_datadir, cli_runner, caplog, ): """ This ensure a single deposit upload through the cli is fine, cf. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--metadata-provenance-url", "meta-prov-url", "--author", "Jane Doe", "--slug", slug, "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } with open(metadata_path) as fd: actual_metadata = parse_xml(fd.read()) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == TEST_USER["username"] ) assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None ) assert ( actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) == slug ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 1 assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: count_warnings += 1 if log_level == logging.WARNING else 0 assert ( count_warnings == 1 ), "We should have 1 warning as we are using slug instead of create_origin" def test_cli_single_minimal_deposit_with_create_origin( sample_archive, slug, patched_tmp_path, requests_mock_datadir, cli_runner, caplog, ): """ This ensure a single deposit upload through the cli is fine, cf. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") origin = slug # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--create-origin", origin, "--metadata-provenance-url", "meta-prov-url", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } with open(metadata_path) as fd: actual_metadata = parse_xml(fd.read()) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == TEST_USER["username"] ) assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None ) assert ( actual_metadata.find( "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES ).attrib["url"] == origin ) assert ( actual_metadata.findtext( "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES ) == "meta-prov-url" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 1 assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: count_warnings += 1 if log_level == logging.WARNING else 0 assert ( count_warnings == 0 ), "We should have no warning as we are using create_origin" def test_cli_validation_metadata( sample_archive, caplog, patched_tmp_path, cli_runner, slug ): """Multiple metadata flags scenario (missing, conflicts) properly fails the calls """ metadata_path = os.path.join(patched_tmp_path, "metadata.xml") with open(metadata_path, "a"): pass # creates the file for flag_title_or_name, author_or_name in [ ("--author", "no one"), ("--name", "test-project"), ]: # Test missing author then missing name # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--slug", slug, flag_title_or_name, author_or_name, ], ) # fmt: on assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided. " ), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags: Test both --metadata and --author, then --metadata and # --name # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--deposit-id", 666, "--archive", sample_archive["path"], "--slug", slug, ], ) # fmt: on assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided." ), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags check (Test both --metadata and --author, # then --metadata and --name) # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--metadata", metadata_path, "--author", "Jane Doe", "--slug", slug, ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Using --metadata flag is incompatible with --author " "and --name and --create-origin (those are used to generate " "one metadata file)." ), ) assert expected_error_log_record in caplog.record_tuples caplog.clear() def test_cli_validation_no_actionable_command(caplog, cli_runner): """Multiple metadata flags scenario (missing, conflicts) properly fails the calls """ # no actionable command # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--partial", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Please provide an actionable command. See --help for more information" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_validation_replace_with_no_deposit_id_fails( sample_archive, caplog, patched_tmp_path, requests_mock_datadir, datadir, cli_runner ): """--replace flags require --deposit-id otherwise fails """ metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--archive", sample_archive["path"], "--replace", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "To update an existing deposit, you must provide its id" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_single_deposit_slug_generation( sample_archive, patched_tmp_path, requests_mock_datadir, cli_runner ): """Single deposit scenario without providing the slug, it should not be generated. """ metadata_path = os.path.join(patched_tmp_path, "metadata.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } with open(metadata_path) as fd: metadata_xml = fd.read() actual_metadata = parse_xml(metadata_xml) assert "codemeta:identifier" not in actual_metadata def test_cli_multisteps_deposit( sample_archive, datadir, slug, requests_mock_datadir, cli_runner ): """ First deposit a partial deposit (no metadata, only archive), then update the metadata part. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit """ # noqa api_url = "https://deposit.test.metadata/1" deposit_id = 666 # Create a partial deposit with only 1 archive # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--slug", slug, "--format", "json", "--partial", ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" actual_deposit = json.loads(result.output) assert actual_deposit == { "deposit_id": str(deposit_id), "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } # Update the partial deposit with only 1 archive # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--deposit-id", deposit_id, "--slug", slug, "--format", "json", "--partial", # in-progress: True, because remains the metadata to upload ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) assert actual_deposit["deposit_status"] == "partial" # Update the partial deposit with only some metadata (and then finalize it) # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") # Update deposit with metadata # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--deposit-id", deposit_id, "--slug", slug, "--format", "json", ], # this time, ^ we no longer flag it to partial, so the status changes to # in-progress false ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) # FIXME: should be "deposited" but current limitation in the # requests_mock_datadir_visits use, cannot find a way to make it work right now assert actual_deposit["deposit_status"] == "partial" @pytest.mark.parametrize( "output_format,parser_fn", [ ("json", json.loads), ("yaml", yaml.safe_load), ( "logging", ast.literal_eval, ), # not enough though, the caplog fixture is needed ], ) def test_cli_deposit_status_with_output_format( output_format, parser_fn, datadir, slug, requests_mock_datadir, caplog, cli_runner ): """Check deposit status cli with all possible output formats (json, yaml, logging). """ api_url_basename = "deposit.test.status" deposit_id = 1033 expected_deposit_status = { "deposit_id": str(deposit_id), "deposit_status": "done", "deposit_status_detail": ( "The deposit has been successfully loaded into the " "Software Heritage archive" ), "deposit_swh_id": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "deposit_swh_id_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa "deposit_external_id": "check-deposit-2020-10-08T13:52:34.509655", } # fmt: off result = cli_runner.invoke( cli, [ "status", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--deposit-id", deposit_id, "--format", output_format, ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" if output_format == "logging": assert len(caplog.record_tuples) == 1 # format: (, , ) _, _, result_output = caplog.record_tuples[0] else: result_output = result.output actual_deposit = parser_fn(result_output) assert actual_deposit == expected_deposit_status def test_cli_update_metadata_with_swhid_on_completed_deposit( datadir, requests_mock_datadir, cli_runner ): """Update new metadata on a completed deposit (status done) is ok """ api_url_basename = "deposit.test.updateswhid" deposit_id = 123 expected_deposit_status = { "deposit_external_id": "check-deposit-2020-10-08T13:52:34.509655", "deposit_id": str(deposit_id), "deposit_status": "done", "deposit_status_detail": ( "The deposit has been successfully loaded into the " "Software Heritage archive" ), "deposit_swh_id": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "deposit_swh_id_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa } assert expected_deposit_status["deposit_status"] == "done" assert expected_deposit_status["deposit_swh_id"] is not None # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--author", "John Doe", "--deposit-id", deposit_id, "--swhid", expected_deposit_status["deposit_swh_id"], "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_deposit_status = json.loads(result.output) assert "error" not in actual_deposit_status assert actual_deposit_status == expected_deposit_status def test_cli_update_metadata_with_swhid_on_other_status_deposit( datadir, requests_mock_datadir, cli_runner ): """Update new metadata with swhid on other deposit status is not possible """ api_url_basename = "deposit.test.updateswhid" deposit_id = "321" # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--author", "John Doe", "--deposit-id", deposit_id, "--swhid", "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_result = json.loads(result.output) assert "error" in actual_result assert actual_result == { "error": "You can only update metadata on deposit with status 'done'", "detail": f"The deposit {deposit_id} has status 'partial'", "deposit_status": "partial", "deposit_id": deposit_id, } @pytest.mark.parametrize( "metadata_entry_key", ["entry-data-with-swhid", "entry-data-with-swhid-no-prov"] ) def test_cli_metadata_only_deposit_full_metadata_file( datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, metadata_entry_key, caplog, ): """Post metadata-only deposit through cli The metadata file posted by the client already contains the swhid """ api_url_basename = "deposit.test.metadataonly" swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" - metadata = atom_dataset[metadata_entry_key].format(swhid=swhid) + atom_data = atom_dataset[metadata_entry_key] + if metadata_entry_key == "entry-data-with-swhid": + metadata = atom_data.format( + swhid=swhid, + metadata_provenance_url=( + "https://inria.halpreprod.archives-ouvertes.fr/hal-abcdefgh" + ), + ) + else: + metadata = atom_data.format(swhid=swhid) metadata_path = os.path.join(tmp_path, "entry-data-with-swhid.xml") with open(metadata_path, "w") as m: m.write(metadata) expected_deposit_status = { "deposit_id": "100", "deposit_status": "done", "deposit_date": "2020-10-08T13:52:34.509655", } assert expected_deposit_status["deposit_status"] == "done" # fmt: off result = cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_deposit_status = json.loads(result.output) assert "error" not in actual_deposit_status assert actual_deposit_status == expected_deposit_status count_warnings = 0 warning_record: Optional[str] = None for (_, log_level, msg) in caplog.record_tuples: if log_level == logging.WARNING: count_warnings += 1 warning_record = msg if "no-prov" in metadata_entry_key: assert count_warnings == 1 assert "metadata-provenance>' should be provided" in warning_record else: assert count_warnings == 0 def test_cli_metadata_only_deposit_invalid_swhid( datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, ): """Post metadata-only deposit through cli with invalid swhid raises """ api_url_basename = "deposit.test.metadataonly" invalid_swhid = "ssh:2:sth:xxx" - metadata = atom_dataset["entry-data-with-swhid"].format(swhid=invalid_swhid) + metadata = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=invalid_swhid) metadata_path = os.path.join(tmp_path, "entry-data-with-swhid.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(ValidationError, match="Invalid"): # fmt: off cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], catch_exceptions=False, ) # fmt: on def test_cli_metadata_only_deposit_no_swhid( datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, ): """Post metadata-only deposit through cli with invalid swhid raises """ api_url_basename = "deposit.test.metadataonly" metadata = atom_dataset["entry-data-minimal"] metadata_path = os.path.join(tmp_path, "entry-data-minimal.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(InputError, match="SWHID must be provided"): # fmt: off cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], catch_exceptions=False, ) # fmt: on @pytest.mark.parametrize( "metadata_entry_key", ["entry-data-with-add-to-origin", "entry-only-create-origin"] ) def test_cli_deposit_warning_missing_origin( metadata_entry_key, tmp_path, atom_dataset, caplog, cli_runner, requests_mock_datadir, ): """Deposit cli should warn when provided metadata xml is missing 'origins' tags """ # For the next deposit, no warning should be logged as either or # are provided, and is always # provided. metadata_raw = atom_dataset[metadata_entry_key] % "some-url" metadata_path = os.path.join(tmp_path, "metadata-with-origin-tag-to-deposit.xml") with open(metadata_path, "w") as f: f.write(metadata_raw) # fmt: off cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) # fmt: on for (_, log_level, _) in caplog.record_tuples: # all messages are info or below messages so everything is fine assert log_level < logging.WARNING def test_cli_deposit_warning_missing_provenance_url( tmp_path, atom_dataset, caplog, cli_runner, requests_mock_datadir, ): """Deposit cli should warn when no metadata provenance is provided """ atom_template = atom_dataset["entry-data-with-add-to-origin-no-prov"] metadata_raw = atom_template % "some-url" metadata_path = os.path.join(tmp_path, "metadata-with-missing-prov-url.xml") with open(metadata_path, "w") as f: f.write(metadata_raw) # fmt: off cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) # fmt: on count_warnings = sum( 1 for (_, log_level, _) in caplog.record_tuples if log_level == logging.WARNING ) assert count_warnings == 1 def test_cli_failure_should_be_parseable(atom_dataset, mocker): summary = "Cannot load metadata" verbose_description = ( "Cannot load metadata on swh:1:dir:0eda267e7d3c2e37b3f6a78e542b16190ac4574e, " "this directory object does not exist in the archive (yet?)." ) error_xml = atom_dataset["error-cli"].format( summary=summary, verboseDescription=verbose_description ) api_call = BaseDepositClient(url="https://somewhere.org/") actual_error = api_call.parse_result_error(error_xml) assert actual_error == { "summary": summary, "detail": "", "sword:verboseDescription": verbose_description, } def test_cli_service_document_failure(atom_dataset, mocker): """Ensure service document failures are properly served """ summary = "Invalid user credentials" error_xml = atom_dataset["error-cli"].format(summary=summary, verboseDescription="") api_call = ServiceDocumentDepositClient(url="https://somewhere.org/") actual_error = api_call.parse_result_error(error_xml) assert actual_error == {"error": summary} @pytest.mark.parametrize( "output_format,parser_fn", [ ("json", json.loads), ("yaml", yaml.safe_load), ( "logging", ast.literal_eval, ), # not enough though, the caplog fixture is needed ], ) def test_cli_deposit_collection_list( output_format, parser_fn, datadir, slug, requests_mock_datadir, caplog, cli_runner ): """Check deposit status cli with all possible output formats (json, yaml, logging). """ api_url_basename = "deposit.test.list" expected_deposits = { "count": "3", "deposits": [ { "external_id": "check-deposit-2020-10-09T13:10:00.000000", "id": "1031", "status": "rejected", "status_detail": "Deposit without archive", }, { "external_id": "check-deposit-2020-10-10T13:20:00.000000", "id": "1032", "status": "rejected", "status_detail": "Deposit without archive", }, { "complete_date": "2020-10-08T13:52:34.509655", "external_id": "check-deposit-2020-10-08T13:52:34.509655", "id": "1033", "reception_date": "2020-10-08T13:50:30", "status": "done", "status_detail": "The deposit has been successfully loaded into " "the Software Heritage archive", "swhid": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "swhid_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa }, ], } # fmt: off result = cli_runner.invoke( cli, [ "list", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--page", 1, "--page-size", 10, "--format", output_format, ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" if output_format == "logging": assert len(caplog.record_tuples) == 1 # format: (, , ) _, _, result_output = caplog.record_tuples[0] else: result_output = result.output actual_deposit = parser_fn(result_output) assert actual_deposit == expected_deposits diff --git a/swh/deposit/tests/data/atom/entry-data-with-swhid.xml b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml index 56a3683d..f9348526 100644 --- a/swh/deposit/tests/data/atom/entry-data-with-swhid.xml +++ b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml @@ -1,17 +1,17 @@ Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a dudess - https://inria.halpreprod.archives-ouvertes.fr/hal-abcdefgh + {metadata_provenance_url} diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py index 352cf782..91ff6be2 100644 --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,218 +1,218 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from xml.etree import ElementTree import pytest from swh.deposit import utils from swh.model.exceptions import ValidationError from swh.model.swhids import CoreSWHID, QualifiedSWHID @pytest.fixture def xml_with_origin_reference(): xml_data = """ """ return xml_data.strip() def test_normalize_date_0(): """When date is a list, choose the first date and normalize it """ actual_date = utils.normalize_date(["2017-10-12", "date1"]) assert actual_date == { "timestamp": {"microseconds": 0, "seconds": 1507766400}, "offset": 0, } def test_normalize_date_1(): """Providing a date in a reasonable format, everything is fine """ actual_date = utils.normalize_date("2018-06-11 17:02:02") assert actual_date == { "timestamp": {"microseconds": 0, "seconds": 1528736522}, "offset": 0, } def test_normalize_date_doing_irrelevant_stuff(): """Providing a date with only the year results in a reasonable date """ actual_date = utils.normalize_date("2017") assert actual_date == { "timestamp": {"seconds": 1483228800, "microseconds": 0}, "offset": 0, } @pytest.mark.parametrize( "swhid,expected_metadata_context", [ ("swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", {"origin": None},), ( "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah", {"origin": "http://blah", "path": None}, ), ( "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path", {"origin": None, "path": b"/path"}, ), ( "swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "snapshot": CoreSWHID.from_string( "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ( "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa { "origin": None, "path": None, "directory": CoreSWHID.from_string( "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" ), }, ), ], ) def test_compute_metadata_context(swhid: str, expected_metadata_context): assert expected_metadata_context == utils.compute_metadata_context( QualifiedSWHID.from_string(swhid) ) def test_parse_swh_reference_origin(xml_with_origin_reference): url = "https://url" xml_data = xml_with_origin_reference.format(url=url) metadata = ElementTree.fromstring(xml_data) actual_origin = utils.parse_swh_reference(metadata) assert actual_origin == url @pytest.fixture def xml_swh_deposit_template(): xml_data = """ {swh_deposit} """ return xml_data.strip() @pytest.mark.parametrize( "xml_ref", [ "", "", "", """""", ], ) def test_parse_swh_reference_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) metadata = ElementTree.fromstring(xml_body) assert utils.parse_swh_reference(metadata) is None @pytest.fixture def xml_with_swhid(atom_dataset): - return atom_dataset["entry-data-with-swhid"] + return atom_dataset["entry-data-with-swhid-no-prov"] @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_parse_swh_reference_swhid(swhid, xml_with_swhid): - xml_data = xml_with_swhid.format(swhid=swhid) + xml_data = xml_with_swhid.format(swhid=swhid,) metadata = ElementTree.fromstring(xml_data) actual_swhid = utils.parse_swh_reference(metadata) assert actual_swhid is not None expected_swhid = QualifiedSWHID.from_string(swhid) assert actual_swhid == expected_swhid @pytest.mark.parametrize( "invalid_swhid", [ # incorrect length "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235" # noqa # visit qualifier should be a core SWHID with type, "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa # anchor qualifier should be a core SWHID with type one of "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa ], ) def test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid): """Unparsable swhid should raise """ xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) metadata = ElementTree.fromstring(xml_invalid_swhid) with pytest.raises(ValidationError): utils.parse_swh_reference(metadata) @pytest.mark.parametrize( "xml_ref", [ "", "", "", ], ) def test_parse_swh_metatada_provenance_empty(xml_swh_deposit_template, xml_ref): xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref) metadata = ElementTree.fromstring(xml_body) assert utils.parse_swh_metadata_provenance(metadata) is None @pytest.fixture def xml_with_metadata_provenance(atom_dataset): return atom_dataset["entry-data-with-metadata-provenance"] def test_parse_swh_metadata_provenance2(xml_with_metadata_provenance): xml_data = xml_with_metadata_provenance.format(url="https://url.org/metadata/url") metadata = ElementTree.fromstring(xml_data) actual_url = utils.parse_swh_metadata_provenance(metadata) assert actual_url == "https://url.org/metadata/url" diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py index 63704d5d..d17996d6 100644 --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -1,259 +1,271 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Optional, Tuple, Union from xml.etree import ElementTree import iso8601 +from swh.deposit.errors import FORBIDDEN, DepositError from swh.model.exceptions import ValidationError from swh.model.model import TimestampWithTimezone from swh.model.swhids import ExtendedSWHID, ObjectType, QualifiedSWHID logger = logging.getLogger(__name__) NAMESPACES = { "atom": "http://www.w3.org/2005/Atom", "app": "http://www.w3.org/2007/app", "dc": "http://purl.org/dc/terms/", "codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "sword": "http://purl.org/net/sword/terms/", "swh": "https://www.softwareheritage.org/schema/2018/deposit", "schema": "http://schema.org/", } def normalize_date(date): """Normalize date fields as expected by swh workers. If date is a list, elect arbitrarily the first element of that list If date is (then) a string, parse it through dateutil.parser.parse to extract a datetime. Then normalize it through :class:`swh.model.model.TimestampWithTimezone` Returns The swh date object """ if isinstance(date, list): date = date[0] if isinstance(date, str): date = iso8601.parse_date(date) tstz = TimestampWithTimezone.from_dict(date) return { "timestamp": tstz.timestamp.to_dict(), "offset": tstz.offset_minutes(), } def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]: """Given a SWHID object, determine the context as a dict. """ metadata_context: Dict[str, Any] = {"origin": None} if swhid_reference.qualifiers(): metadata_context = { "origin": swhid_reference.origin, "path": swhid_reference.path, } snapshot = swhid_reference.visit if snapshot: metadata_context["snapshot"] = snapshot anchor = swhid_reference.anchor if anchor: metadata_context[anchor.object_type.name.lower()] = anchor return metadata_context ALLOWED_QUALIFIERS_NODE_TYPE = ( ObjectType.SNAPSHOT, ObjectType.REVISION, ObjectType.RELEASE, ObjectType.DIRECTORY, ) -def parse_swh_metadata_provenance( - metadata: ElementTree.Element, -) -> Optional[Union[QualifiedSWHID, str]]: +def parse_swh_metadata_provenance(metadata: ElementTree.Element,) -> Optional[str]: """Parse swh metadata-provenance within the metadata dict reference if found, None otherwise. .. code-block:: xml https://example.org/metadata/url Args: metadata: result of parsing an Atom document with :func:`parse_xml` Raises: ValidationError in case of invalid xml Returns: Either the metadata provenance url if any or None otherwise """ url_element = metadata.find( "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES ) if url_element is not None: return url_element.text return None def parse_swh_deposit_origin( metadata: ElementTree.Element, ) -> Tuple[Optional[str], Optional[str]]: """Parses and from metadata document, if any. .. code-block:: xml .. code-block:: xml Returns: tuple of (origin_to_create, origin_to_add). If both are non-None, this should typically be an error raised to the user. """ create_origin = metadata.find( "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES ) add_to_origin = metadata.find( "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES ) return ( None if create_origin is None else create_origin.attrib["url"], None if add_to_origin is None else add_to_origin.attrib["url"], ) def parse_swh_reference( metadata: ElementTree.Element, ) -> Optional[Union[QualifiedSWHID, str]]: """Parse within the metadata document, if any. .. code-block:: xml or: .. code-block:: xml Args: metadata: result of parsing an Atom document Raises: ValidationError in case the swhid referenced (if any) is invalid Returns: Either swhid or origin reference if any. None otherwise. """ # noqa ref_origin = metadata.find( "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES ) if ref_origin is not None: return ref_origin.attrib["url"] ref_object = metadata.find( "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES ) if ref_object is None: return None swhid = ref_object.attrib["swhid"] if not swhid: return None swhid_reference = QualifiedSWHID.from_string(swhid) if swhid_reference.qualifiers(): anchor = swhid_reference.anchor if anchor: if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE: error_msg = ( "anchor qualifier should be a core SWHID with type one of " f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}" ) raise ValidationError(error_msg) visit = swhid_reference.visit if visit: if visit.object_type != ObjectType.SNAPSHOT: raise ValidationError( f"visit qualifier should be a core SWHID with type snp, " f"not {visit.object_type.value}" ) if ( visit and anchor and visit.object_type == ObjectType.SNAPSHOT and anchor.object_type == ObjectType.SNAPSHOT ): logger.warn( "SWHID use of both anchor and visit targeting " f"a snapshot: {swhid_reference}" ) raise ValidationError( "'anchor=swh:1:snp:' is not supported when 'visit' is also provided." ) return swhid_reference def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID: """Used to get the target of a metadata object from a , as the latter uses a QualifiedSWHID.""" return ExtendedSWHID.from_string(str(swhid).split(";")[0]) def to_header_link(link: str, link_name: str) -> str: """Build a single header link. >>> link_next = to_header_link("next-url", "next") >>> link_next '; rel="next"' >>> ','.join([link_next, to_header_link("prev-url", "prev")]) '; rel="next",; rel="prev"' """ return f'<{link}>; rel="{link_name}"' + + +def check_url_match_provider(url: str, provider_url: str) -> None: + """Check url matches the provider url. + + Raises DepositError in case of mismatch + + """ + provider_url = provider_url.rstrip("/") + "/" + if not url.startswith(provider_url): + raise DepositError( + FORBIDDEN, f"URL mismatch: {url} must start with {provider_url}", + )