diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py index e69de29b..65c78f3d 100644 --- a/swh/deposit/__init__.py +++ b/swh/deposit/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.deposit").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py index 65c78f3d..245db776 100644 --- a/swh/deposit/api/__init__.py +++ b/swh/deposit/api/__init__.py @@ -1,11 +1,4 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -import pkg_resources - -try: - __version__ = pkg_resources.get_distribution("swh.deposit").version -except pkg_resources.DistributionNotFound: - __version__ = "devel" diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 6bed49c5..38afddcf 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,1033 +1,1041 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView from swh.model import hashutil from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, ParserError, make_error_dict, make_error_response, make_error_response_from_dict, ) from ..models import Deposit, DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ def _read_headers(self, request: Request) -> Dict[str, Any]: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") + swhid = meta.get("HTTP_X_CHECK_SWHID") + return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, + "swhid": swhid, } def _compute_md5(self, filehandler) -> bytes: """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( self, request: Request, deposit_id: Optional[int] = None, in_progress: bool = False, external_id: Optional[str] = None, ) -> Deposit: """Save/Update a deposit in db. Args: request: request data deposit_id: deposit identifier in_progress: deposit status external_id: external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ complete_date: Optional[datetime.datetime] = None deposit_parent: Optional[Deposit] = None if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: # find a deposit parent (same external id, status load to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: # then no parent for that deposit, deposit_parent already None pass assert external_id is not None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, - ) -> None: + ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: - None + the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None + return deposit_request def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict: """Delete archive references from the deposit id. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit_id: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_preconditions_on( self, filehandler, md5sum: str, content_length: Optional[int] = None ) -> Optional[Dict]: """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check md5sum: md5 hash expected from the file's content content_length: the expected length if provided. Returns: Either none if no error or a dictionary with a key error detailing the problem. """ max_upload_size = self.config["max_upload_size"] if content_length: if content_length > max_upload_size: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes)." "Please consider sending the archive in multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.", ) return None def _binary_upload( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict[str, Any]: """Binary upload routine. Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"], content_length ) if precondition_status_response: return precondition_status_response external_id = headers["slug"] # actual storage of data archive_metadata = filehandler deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "status": deposit.status, "archive": filehandler.name, } def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier if provided replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers["slug"] content_types_present = set() data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value content_type = fh.content_type if content_type in content_types_present: return make_error_dict( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(content_type) assert content_type is not None data[content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"] ) if precondition_status_response: return precondition_status_response try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: return make_error_dict( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) # actual storage of data deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) assert filehandler is not None return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": filehandler.name, "status": deposit.status, } def _atom_entry( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict[str, Any]: """Atom entry deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier if provided replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated to the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) external_id = metadata.get("external_identifier", headers["slug"]) # TODO: Determine if we are in the metadata-only deposit case. If it is, then # save deposit and deposit request typed 'metadata' and send metadata to the # metadata storage. Otherwise, do as existing deposit. deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": None, "status": deposit.status, } def _empty_post( self, request: Request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Empty post to finalize an empty deposit. Args: request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. """ deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { "deposit_id": deposit_id, "deposit_date": deposit.complete_date, "status": deposit.status, "archive": None, } def _make_iris( self, request: Request, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Define the IRI endpoints Args: request (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary of keys with the iris' urls. """ args = [collection_name, deposit_id] return { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } def additional_checks( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def checks( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> Dict[str, Any]: try: self._collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( NOT_FOUND, f"Unknown collection name {collection_name}" ) assert self._collection is not None username = request.user.username if username: # unauthenticated request can have the username empty try: self._client: DepositClient = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: return make_error_dict(NOT_FOUND, f"Unknown client name {username}") collection_id = self._collection.id collections = self._client.collections assert collections is not None if collection_id not in collections: return make_error_dict( FORBIDDEN, f"Client {username} cannot access collection {collection_name}", ) + headers = self._read_headers(request) + if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) - checks = self.restrict_access(request, deposit) + assert deposit is not None + checks = self.restrict_access(request, headers, deposit) if checks: return checks - headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") checks = self.additional_checks(request, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def restrict_access( - self, request: Request, deposit: Optional[Deposit] = None + self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: - if deposit: - if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: - summary = "You can only act on deposit with status '%s'" % ( - DEPOSIT_STATUS_PARTIAL, - ) - description = f"This deposit has status '{deposit.status}'" - return make_error_dict( - BAD_REQUEST, summary=summary, verbose_description=description - ) + """Allow modifications on deposit with status 'partial' only, reject the rest. + + """ + if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: + summary = "You can only act on deposit with status '%s'" % ( + DEPOSIT_STATUS_PARTIAL, + ) + description = f"This deposit has status '{deposit.status}'" + return make_error_dict( + BAD_REQUEST, summary=summary, verbose_description=description + ) return {} def _basic_not_allowed_method(self, request: Request, method: str): return make_error_response( request, METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) r = self.process_get(request, collection_name, deposit_id) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/zip" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit_id: int ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] _status, _iri_key, data = self.process_post( request, headers, collection_name, deposit_id ) error = data.get("error") if error: return make_error_response_from_dict(request, error) data["packagings"] = ACCEPT_PACKAGINGS iris = self._make_iris(request, collection_name, data["deposit_id"]) data.update(iris) response = render( request, "deposit/deposit_receipt.xml", context=data, content_type="application/xml", status=_status, ) response._headers["location"] = "Location", data[_iri_key] # type: ignore return response @abstractmethod def process_post( self, request, headers: Dict, collection_name: str, deposit_id: Optional[int] = None, ) -> Tuple[int, str, Dict]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] data = self.process_put(request, headers, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put( self, request: Request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) assert deposit_id is not None data = self.process_delete(request, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete( self, request: Request, collection_name: str, deposit_id: int ) -> Dict: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. .api.deposit_update.APIUpdateArchive) """ return {} diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index 07d00e92..c2e54dc7 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,183 +1,345 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Optional, Tuple from rest_framework import status - -from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import BAD_REQUEST, make_error_dict +from rest_framework.request import Request + +from swh.deposit.models import Deposit +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage import get_storage +from swh.storage.interface import StorageInterface + +from ..config import ( + CONT_FILE_IRI, + DEPOSIT_STATUS_LOAD_SUCCESS, + EDIT_SE_IRI, + EM_IRI, + METADATA_KEY, + RAW_METADATA_KEY, +) +from ..errors import BAD_REQUEST, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, SWHFileUploadZipParser, SWHMultiPartParser, ) from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put( self, req, headers, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) def process_post( self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None ) -> Tuple[int, str, Dict]: """Add new content to the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) unused = 0 return unused, "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) + def __init__(self): + super().__init__() + self.storage_metadata: StorageInterface = get_storage( + **self.config["storage_metadata"] + ) + + def restrict_access( + self, request: Request, headers: Dict, deposit: Deposit + ) -> Dict[str, Any]: + """Relax restriction access to allow metadata update on deposit with status "done" when + a swhid is provided. + + """ + if ( + request.method == "PUT" + and headers["swhid"] is not None + and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + ): + # Allow metadata update on deposit with status "done" when swhid provided + return {} + # otherwise, let the standard access restriction check occur + return super().restrict_access(request, headers, deposit) + def process_put( - self, req, headers: Dict, collection_name: str, deposit_id: int + self, request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: - """Replace existing deposit's metadata/archive with new ones. + """This allows the following scenarios: + + - multipart: replace all the deposit (status partial) metadata and archive + with the provided ones. + - atom: replace all the deposit (status partial) metadata with the + provided ones. + - with swhid, atom: Add new metatada to deposit (status done) with provided ones + and push such metadata to the metadata storage directly. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart + Raises: + 400 if any of the following occur: + - the swhid provided and the deposit swhid do not match + - the provided metadata xml file is malformed + - the provided xml atom entry is empty + - the provided swhid does not exist in the archive + Returns: 204 No content """ # noqa - if req.content_type.startswith("multipart/"): - return self._multipart_upload( - req, + swhid = headers.get("swhid") + if swhid is None: + if request.content_type.startswith("multipart/"): + return self._multipart_upload( + request, + headers, + collection_name, + deposit_id=deposit_id, + replace_archives=True, + replace_metadata=True, + ) + # standard metadata update (replace all metadata already provided to the + # deposit by the new ones) + return self._atom_entry( + request, headers, collection_name, deposit_id=deposit_id, - replace_archives=True, replace_metadata=True, ) - return self._atom_entry( - req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True + + # Update metadata on a deposit already ingested + # Write to the metadata storage (and the deposit backend) + # no ingestion triggered + + deposit = Deposit.objects.get(pk=deposit_id) + assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + + if swhid != deposit.swh_id: + return make_error_dict( + BAD_REQUEST, + f"Mismatched provided SWHID {swhid} with deposit's {deposit.swh_id}.", + "The provided SWHID does not match the deposit to update. " + "Please ensure you send the correct deposit SWHID.", + ) + + try: + raw_metadata, metadata = self._read_metadata(request.data) + except ParserError: + return make_error_dict( + BAD_REQUEST, + "Malformed xml metadata", + "The xml received is malformed. " + "Please ensure your metadata file is correctly formatted.", + ) + + if not metadata: + return make_error_dict( + BAD_REQUEST, + "Empty body request is not supported", + "Atom entry deposit is supposed to send for metadata. " + "If the body is empty, there is no metadata.", + ) + + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit.client.provider_url, + metadata={"name": deposit.client.last_name}, ) + metadata_fetcher = MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + metadata=self.tool["configuration"], + ) + + deposit_swhid = parse_swhid(swhid) + + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # check the swhid exists in the archive + directories_missing = list( + self.storage_metadata.directory_missing([directory_id]) + ) + if len(directories_missing) > 0: + return make_error_dict( + BAD_REQUEST, + f"Unknown directory SWHID {swhid} reference", + "The SWHID provided is not a known directory SWHID in SWH archive. " + "Please provide an existing SWHID.", + ) + + # replace metadata within the deposit backend + deposit_request_data = { + METADATA_KEY: metadata, + RAW_METADATA_KEY: raw_metadata, + } + + # actually add the metadata to the completed deposit + deposit_request = self._deposit_request_put(deposit, deposit_request_data) + # store that metadata to the metadata storage + metadata_object = RawExtrinsicMetadata( + type=MetadataTargetType.DIRECTORY, + id=deposit_swhid, + discovery_date=deposit_request.date, + authority=metadata_authority, + fetcher=metadata_fetcher, + format="sword-v2-atom-codemeta", + metadata=raw_metadata, + ) + + # write to metadata storage + self.storage_metadata.metadata_authority_add([metadata_authority]) + self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) + self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) + + return { + "deposit_id": deposit_id, + "deposit_date": deposit_request.date, + "status": deposit.status, + "archive": None, + } + def process_post( self, request, headers: Dict, collection_name: str, deposit_id: Optional[int] = None, ) -> Tuple[int, str, Dict]: """Add new metadata/archive to existing deposit. + This allows the following scenarios to occur: + + - multipart: Add new metadata and archive to a deposit in status partial with + the provided ones. + + - empty atom: Allows to finalize a deposit in status partial (transition to + deposited). + source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart - - This also deals with an empty post corner case to finalize a - deposit. + - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#continueddeposit_complete Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ # noqa assert deposit_id is not None if request.content_type.startswith("multipart/"): data = self._multipart_upload( request, headers, collection_name, deposit_id=deposit_id ) return (status.HTTP_201_CREATED, EM_IRI, data) content_length = headers["content-length"] or 0 if content_length == 0 and headers["in-progress"] is False: # check for final empty post - # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html - # #continueddeposit_complete data = self._empty_post(request, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) data = self._atom_entry( request, headers, collection_name, deposit_id=deposit_id ) return (status.HTTP_201_CREATED, EM_IRI, data) def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete the container (deposit). source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index 51b6636e..36124045 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,195 +1,188 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager import os import shutil import tempfile from typing import Any, Dict, Tuple from rest_framework import status from swh.core import tarball -from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date from swh.model import identifiers +from swh.model.model import MetadataAuthorityType from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get( self, request, collection_name: str, deposit_id: int ) -> Tuple[int, Any, str]: """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): collection_name: Collection owning the deposit deposit_id: Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] return ( status.HTTP_200_OK, aggregate_tarballs(self.extraction_dir, archive_paths), "swh/generator", ) class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. """ - def __init__(self): - super().__init__() - self.provider = self.config["provider"] - self.tool = { - "name": "swh-deposit", - "version": __version__, - "configuration": {"sword_version": "2"}, - } - def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. """ metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} - # metadata provider - self.provider["provider_name"] = deposit.client.last_name - self.provider["provider_url"] = deposit.client.provider_url - author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id swhid = identifiers.parse_swhid(swh_persistent_id) parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { - "provider": self.provider, + # metadata provider + "provider": { + "provider_name": deposit.client.last_name, + "provider_url": deposit.client.provider_url, + "provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value, + "metadata": {}, + }, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data def process_get( self, request, collection_name: str, deposit_id: int ) -> Tuple[int, Dict, str]: deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) return status.HTTP_200_OK, data if data else {}, "application/json" diff --git a/swh/deposit/config.py b/swh/deposit/config.py index 16221dfd..e67dd030 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,99 +1,105 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config +from swh.deposit import __version__ from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = "edit_se_iri" EM_IRI = "em_iri" CONT_FILE_IRI = "cont_file_iri" SD_IRI = "servicedocument" COL_IRI = "upload" STATE_IRI = "state_iri" PRIVATE_GET_RAW_CONTENT = "private-download" PRIVATE_CHECK_DEPOSIT = "check-deposit" PRIVATE_PUT_DEPOSIT = "private-update" PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" ARCHIVE_KEY = "archive" METADATA_KEY = "metadata" RAW_METADATA_KEY = "raw-metadata" ARCHIVE_TYPE = "archive" METADATA_TYPE = "metadata" AUTHORIZED_PLATFORMS = ["development", "production", "testing"] DEPOSIT_STATUS_REJECTED = "rejected" DEPOSIT_STATUS_PARTIAL = "partial" DEPOSIT_STATUS_DEPOSITED = "deposited" DEPOSIT_STATUS_VERIFIED = "verified" DEPOSIT_STATUS_LOAD_SUCCESS = "done" DEPOSIT_STATUS_LOAD_FAILURE = "failed" # Revision author for deposit SWH_PERSON = { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org", } DEFAULT_CONFIG = { "max_upload_size": 209715200, "checks": True, } def setup_django_for(platform=None, config_file=None): """Setup function for command line tools (swh.deposit.create_user) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running config_file (str): Extra configuration file (typically for the production platform) Raises: ValueError in case of wrong platform inputs. """ if platform is not None: if platform not in AUTHORIZED_PLATFORMS: raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS) if "DJANGO_SETTINGS_MODULE" not in os.environ: os.environ["DJANGO_SETTINGS_MODULE"] = "swh.deposit.settings.%s" % platform if config_file: os.environ.setdefault("SWH_CONFIG_FILENAME", config_file) import django django.setup() class APIConfig: """API Configuration centralized class. This loads explicitly the configuration file out of the SWH_CONFIG_FILENAME environment variable. """ def __init__(self): config_file = os.environ["SWH_CONFIG_FILENAME"] conf = config.read_raw_config(config.config_basepath(config_file)) self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) + self.tool = { + "name": "swh-deposit", + "version": __version__, + "configuration": {"sword_version": "2"}, + } diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py index 2214b551..f551a317 100644 --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -1,328 +1,326 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO from django.urls import reverse import pytest from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ atom_error_with_decimal = atom_dataset["error-with-decimal"] response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.metadata is not None sw_version = dr.metadata.get("codemeta:softwareVersion") assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): """Posting empty body request should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-empty-body"], - HTTP_SLUG="something", ) assert response.status_code == status.HTTP_400_BAD_REQUEST - assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-badly-formatted"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): """Posting parsing error prone atom should return 400 """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-parsing-error-prone"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_no_slug_header( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry without a slug header should return a 400 """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], # + headers HTTP_IN_PROGRESS="false", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): """Posting an atom entry to an unknown collection should return a 404 """ unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = authenticated_client.post( reverse(COL_IRI, args=[unknown_collection]), # <- unknown collection content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["entry-data0"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["codemeta-sample"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_tei( authenticated_client, deposit_collection, atom_dataset ): """Posting initial atom entry as TEI should return 201 with receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["tei-sample"] # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_multiple_steps( authenticated_client, deposit_collection, atom_dataset ): """After initial deposit, updating a deposit should return a 201 """ # given external_id = "urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): deposit = Deposit.objects.get(external_id=external_id) # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_IN_PROGRESS="True", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == "partial" # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 1 atom_entry_data = atom_dataset["entry-data-minimal"] % external_id.encode( "utf-8" ) # noqa update_uri = response._headers["location"][1] # when updating the first deposit post response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_IN_PROGRESS="False", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert len(Deposit.objects.all()) == 1 # now 2 associated requests to a same deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id") assert len(deposit_requests) == 2 atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1}, {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data}, ] for i, deposit_request in enumerate(deposit_requests): actual_metadata = deposit_request.metadata assert actual_metadata == expected_meta[i]["metadata"] assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"] assert bool(deposit_request.archive) is False diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py index ec62dc73..76d1efde 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -1,551 +1,551 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status -from swh.deposit.api import __version__ +from swh.deposit import __version__ from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" deposit_id = deposit if isinstance(deposit, int) else deposit.id return [ reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]), reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]), ] def update_deposit(authenticated_client, collection, deposit, atom_dataset): for atom_data in ["entry-data2", "entry-data3"]: update_deposit_with_metadata( authenticated_client, collection, deposit, atom_dataset[atom_data] ) return deposit def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata): # update deposit's metadata response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=metadata, HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED return deposit def test_read_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private metadata read api to existing deposit should return metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_revision_with_parent( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private read metadata to a deposit (with parent) returns metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa" swh_id = "swh:1:rev:%s" % rev_id fake_parent = Deposit( swh_id=swh_id, client=deposit.client, collection=deposit.collection ) fake_parent.save() deposit.parent = fake_parent deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], }, } assert data == expected_meta def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """date(Created|Published) provided, uses author/committer date """ deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) # add metadata to the deposit with datePublished and dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 """ ) update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": ["http://www.w3.org/2005/Atom"], "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": [ "some awesome author", "another one", "no one", {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, ], "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2017-10-07T15:17:08Z", "2015-04-06T17:08:47+02:00", ], "codemeta:datePublished": "2017-05-03T16:08:47+02:00", "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ {"codemeta:name": "GNU General Public License v3.0 only"}, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": ["some-external-id", "hal-01243065"], "id": "hal-01243065", "title": "Composing a Web of Audio Applications", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", } expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "origin_metadata": { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_4( authenticated_client, deposit_collection, atom_dataset, partial_deposit ): """dateCreated/datePublished not provided, revision uses complete_date """ deposit = partial_deposit codemeta_entry_data = atom_dataset["metadata"] % "" deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) # will use the deposit completed date as fallback date deposit.complete_date = "2016-04-06" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio Applications", } expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id), } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_read_metadata_5( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """dateCreated/datePublished provided, revision uses author/committer date If multiple dateCreated provided, the first occurrence (of dateCreated) is selected. If multiple datePublished provided, the first occurrence (of datePublished) is selected. """ deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", } metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2015-04-06T17:08:47+02:00", "2016-04-06T17:08:47+02:00", ], "codemeta:datePublished": [ "2017-05-03T16:08:47+02:00", "2018-05-03T16:08:47+02:00", ], "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio Applications", } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1428332927}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_access_to_nonexisting_deposit_returns_404_response( authenticated_client, deposit_collection, ): """Read unknown collection should return a 404 response """ unknown_id = 999 try: Deposit.objects.get(pk=unknown_id) except Deposit.DoesNotExist: assert True for url in private_get_raw_url_endpoints(deposit_collection, unknown_id): response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND msg = "Deposit with id %s does not exist" % unknown_id assert msg in response.content.decode("utf-8") diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py index fd934d24..88402f07 100644 --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,563 +1,761 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse from rest_framework import status from swh.deposit.config import ( DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, create_arborescence_archive +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid, swhid def test_replace_archive_to_deposit_is_possible( tmp_path, partial_deposit, deposit_collection, authenticated_client, sample_archive, atom_dataset, ): """Replace all archive with another one should return a 204 response """ tmp_path = str(tmp_path) # given deposit = partial_deposit requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(sample_archive["name"], requests[0].archive.name) # we have no metadata for that deposit requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 0 response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.put( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_204_NO_CONTENT requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(archive2["name"], requests[0].archive.name) # check we did not touch the other parts requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 def test_replace_metadata_to_deposit_is_possible( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, ): """Replace all metadata with another one should return a 204 response """ # given deposit = partial_deposit_with_metadata raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_204_NO_CONTENT requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, deposit_collection, partial_deposit_with_metadata, sample_archive, ): """Add another archive to a deposit return a 201 response """ tmp_path = str(tmp_path) deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests) == 1 check_archive(sample_archive["name"], requests[0].archive.name) requests_meta0 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta0) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="archive").order_by( "id" ) assert len(requests) == 2 # first archive still exists check_archive(sample_archive["name"], requests[0].archive.name) # a new one was added check_archive(archive2["name"], requests[1].archive.name) # check we did not touch the other parts requests_meta1 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta1) == 1 assert set(requests_meta0) == set(requests_meta1) def test_add_metadata_to_deposit_is_possible( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, ): """Add metadata with another one should return a 204 response """ deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) atom_entry = atom_dataset["entry-data1"] response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 2 expected_raw_meta0 = atom_dataset["entry-data0"] % ( deposit.external_id.encode("utf-8") ) # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_both_archive_and_metadata_to_deposit( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, sample_archive, ): """Scenario: Add both a new archive and new metadata to a partial deposit is ok Response: 201 """ deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/x-tar", size=sample_archive["length"], charset=None, ) data_atom_entry = atom_dataset["entry-data1"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 1 + 1, "New deposit request archive got added" expected_raw_meta0 = atom_dataset["entry-data0"] % ( deposit.external_id.encode("utf-8") ) # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == data_atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 + 1, "New deposit request metadata got added" def test_post_metadata_empty_post_finalize_deposit_ok( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, ): """Empty atom post entry with header in-progress to false transitions deposit to 'deposited' status Response: 200 """ deposit = partial_deposit_with_metadata assert deposit.status == DEPOSIT_STATUS_PARTIAL update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data="", size=0, HTTP_IN_PROGRESS=False, ) assert response.status_code == status.HTTP_200_OK deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_add_metadata_to_unknown_deposit( deposit_collection, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 1000 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_add_metadata_to_unknown_collection( partial_deposit, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ deposit = partial_deposit unknown_collection_name = "unknown-collection" try: DepositCollection.objects.get(name=unknown_collection_name) except DepositCollection.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[unknown_collection_name, deposit.id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_replace_metadata_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 998 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_add_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 997 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_replace_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Replacing archive to unknown deposit should return a 404 response """ unknown_deposit_id = 996 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_post_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (POST) archive with wrong content type should return 400 """ deposit = partial_deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/x-gtar-compressed", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] def test_put_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (PUT) archive with wrong content type should return 400 """ # given deposit = partial_deposit # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] def test_put_update_metadata_and_archive_deposit_partial_nominal( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, sample_archive, ): """Scenario: Replace metadata and archive(s) with new ones should be ok Response: 204 """ # given deposit = partial_deposit_with_metadata raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/x-tar", size=sample_archive["length"], charset=None, ) data_atom_entry = atom_dataset["entry-data1"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert response.status_code == status.HTTP_204_NO_CONTENT # check we updated the metadata part requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == data_atom_entry assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # and the archive part requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) != set(requests_archive1) + + +def test_put_update_metadata_done_deposit_nominal( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + sample_data, + swh_storage, +): + """Nominal scenario, client send an update of metadata on a deposit with status "done" + with an existing swhid. Such swhid has its metadata updated accordingly both in + the deposit backend and in the metadata storage. + + Response: 204 + + """ + deposit_swhid = parse_swhid(complete_deposit.swh_id) + assert deposit_swhid.object_type == "directory" + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id])) == [directory_id] + + # so let's create a directory reference in the storage (current deposit targets an + # unknown swhid) + existing_directory = sample_data.directory + swh_storage.directory_add([existing_directory]) + assert list(swh_storage.directory_missing([existing_directory.id])) == [] + + # and patch one complete deposit swh_id so it targets said reference + complete_deposit.swh_id = swhid("directory", existing_directory.id) + complete_deposit.save() + + actual_existing_requests_archive = DepositRequest.objects.filter( + deposit=complete_deposit, type="archive" + ) + nb_archives = len(actual_existing_requests_archive) + actual_existing_requests_metadata = DepositRequest.objects.filter( + deposit=complete_deposit, type="metadata" + ) + nb_metadata = len(actual_existing_requests_metadata) + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_204_NO_CONTENT + + new_requests_meta = DepositRequest.objects.filter( + deposit=complete_deposit, type="metadata" + ) + assert len(new_requests_meta) == nb_metadata + 1 + request_meta1 = new_requests_meta[0] + raw_metadata1 = request_meta1.raw_metadata + assert raw_metadata1 == atom_dataset["entry-data1"] + + # check we did not touch the other parts + requests_archive1 = DepositRequest.objects.filter( + deposit=complete_deposit, type="archive" + ) + assert len(requests_archive1) == nb_archives + assert set(actual_existing_requests_archive) == set(requests_archive1) + + # FIXME: Check the metadata storage information created is consistent + pass + + +def test_put_update_metadata_done_deposit_failure_swhid_unknown( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """Failure: client updates metadata with a SWHID matching the deposit's. Said SWHID does + not exist in the archive somehow. + + This should not happen though, it is still technically possible so it's + covered... + + Response: 400 + + """ + # directory targeted by the complete_deposit does not exist in the storage + missing_directory_id = hash_to_bytes(parse_swhid(complete_deposit.swh_id).object_id) + assert list(swh_storage.directory_missing([missing_directory_id])) == [ + missing_directory_id + ] + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Unknown directory SWHID" in response.content + + +def test_put_update_metadata_done_deposit_failure_mismatched_swhid( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit with SWHID not matching the deposit's. + + Response: 400 + + """ + incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" + assert complete_deposit.swh_id != incorrect_swhid + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=incorrect_swhid, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Mismatched provided SWHID" in response.content + + +def test_put_update_metadata_done_deposit_failure_malformed_xml( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit done with a malformed xml + + Response: 400 + + """ + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data-ko"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Malformed xml metadata" in response.content + + +def test_put_update_metadata_done_deposit_failure_empty_xml( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit done with an empty xml. + + Response: 400 + + """ + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data-empty-body"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Empty body request is not supported" in response.content diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index c92a6916..7ac74e94 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,418 +1,418 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import os from typing import Mapping from django.test.utils import setup_databases # type: ignore from django.urls import reverse import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import pytest from rest_framework import status from rest_framework.test import APIClient import yaml +from swh.core.config import read from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, EDIT_SE_IRI, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from swh.scheduler import get_scheduler # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. TEST_USER = { "username": "test", "password": "password", "email": "test@example.org", "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": "test"}, } def pytest_configure(): setup_django_for("testing") @pytest.fixture() -def deposit_config(swh_scheduler_config): +def deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, - "provider": { - "provider_name": "", - "provider_type": "deposit_client", - "provider_url": "", - "metadata": {}, - }, "scheduler": {"cls": "local", "args": swh_scheduler_config,}, + "storage_metadata": swh_storage_backend_config, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path, swh_scheduler_config): """Enforce config for deposit classes inherited from APIConfig.""" - - scheduler = get_scheduler("local", swh_scheduler_config) - task_type = { - "type": "load-deposit", - "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", - "description": "Load deposit task", - } - scheduler.create_task_type(task_type) + cfg = read(deposit_config_path) + + if "scheduler" in cfg: + # scheduler setup: require the load-deposit tasks (already existing in + # production) + scheduler = get_scheduler(**cfg["scheduler"]) + task_type = { + "type": "load-deposit", + "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", + "description": "Load deposit task", + } + scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory() deposit_another_collection = deposit_collection_factory("another-collection") @pytest.fixture def deposit_user(db, deposit_collection): """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient try: user = DepositClient._default_manager.get(username=TEST_USER["username"]) except DepositClient.DoesNotExist: user = DepositClient._default_manager.create_user( username=TEST_USER["username"], email=TEST_USER["email"], password=TEST_USER["password"], provider_url=TEST_USER["provider_url"], domain=TEST_USER["domain"], ) user.collections = [deposit_collection.id] user.save() return user @pytest.fixture def client(): """Override pytest-django one which does not work for djangorestframework. """ return APIClient() # <- drf's client @pytest.yield_fixture def authenticated_client(client, deposit_user): """Returned a logged client """ _token = "%s:%s" % (deposit_user.username, TEST_USER["password"]) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) yield client client.logout() @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def create_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) # then assert response.status_code == status.HTTP_201_CREATED from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(external_id=external_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. """ deposit = create_deposit( authenticated_client, collection_name, sample_archive, external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL, ) response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"), HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-partial", deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10" snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0" deposit.swh_id = swhid(DIRECTORY, directory_id) deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin, "visit": swhid(SNAPSHOT, snapshot_id), "anchor": swhid(REVISION, revision_id), "path": "/", }, ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py index 88fca573..59a145a1 100644 --- a/swh/deposit/tests/test_init.py +++ b/swh/deposit/tests/test_init.py @@ -1,10 +1,10 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def test_version(): - from swh.deposit.api import __version__ + from swh.deposit import __version__ assert __version__ is not None