diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index c8e5248a..24b22a10 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,962 +1,962 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from typing import Sequence, Type from abc import ABCMeta, abstractmethod from django.urls import reverse from django.http import HttpResponse from django.shortcuts import render from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.views import APIView from swh.model import hashutil from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( - SWHDefaultConfig, + APIConfig, EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI, ARCHIVE_KEY, METADATA_KEY, RAW_METADATA_KEY, STATE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS, ARCHIVE_TYPE, METADATA_TYPE, ) from ..errors import ( MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, ERROR_CONTENT, CHECKSUM_MISMATCH, make_error_dict, MEDIATION_NOT_ALLOWED, make_error_response_from_dict, FORBIDDEN, NOT_FOUND, make_error_response, METHOD_NOT_ALLOWED, ParserError, PARSING_ERROR, ) from ..models import Deposit, DepositRequest, DepositCollection, DepositClient from ..parsers import parse_xml ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] -class SWHAPIView(APIView): +class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes: 
Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) -class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta): +class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ def _read_headers(self, request): """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, } def _compute_md5(self, filehandler): """Compute uploaded file's md5 sum. 
Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( self, request, deposit_id=None, in_progress=False, external_id=None ): """Save/Update a deposit in db. Args: deposit_id (int): deposit identifier in_progress (dict): The deposit's status external_id (str): The external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: complete_date = None status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: # find a deposit parent (same external id, status load # to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: deposit_parent = None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, deposit, deposit_request_data, replace_metadata=False, replace_archives=False, ): """Save a deposit request with metadata attached to a deposit. 
Args: deposit (Deposit): The deposit concerned by the request deposit_request_data (dict): The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata (bool): Flag defining if we add or update existing metadata to the deposit replace_archives (bool): Flag defining if we add or update archives to existing deposit Returns: None """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None def _delete_archives(self, collection_name, deposit_id): """Delete archives reference from the deposit id. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "The deposit %s does not exist" % deposit_id ) DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name, deposit_id): """Delete deposit reference. Args: collection_name (str): Client's name deposit_id (id): The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. 
""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "The deposit %s does not exist" % deposit_id ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_preconditions_on(self, filehandler, md5sum, content_length=None): """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check md5sum (hex str): md5 hash expected from the file's content content_length (int): the expected length if provided. Returns: Either none if no error or a dictionary with a key error detailing the problem. """ if content_length: if content_length > self.config["max_upload_size"]: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, "Upload size limit exceeded (max %s bytes)." % self.config["max_upload_size"], "Please consider sending the archive in " "multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", "The checksum sent %s and the actual checksum " "%s does not match." % (hashutil.hash_to_hex(md5sum), hashutil.hash_to_hex(_md5sum)), ) return None def _binary_upload( self, request, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False, ): """Binary upload routine. Other than such a request, a 415 response is returned. 
Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the " "CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the " "CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, "Only packaging %s is supported" % ACCEPT_PACKAGINGS, "The packaging provided %s is not supported" % packaging, ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, 
headers["content-md5sum"], content_length ) if precondition_status_response: return precondition_status_response external_id = headers["slug"] # actual storage of data archive_metadata = filehandler deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "status": deposit.status, "archive": filehandler.name, } def _read_metadata(self, metadata_stream): """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, request, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False, ): """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. 
Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers["slug"] content_types_present = set() data = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value if fh.content_type in content_types_present: return make_error_dict( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(fh.content_type) data[fh.content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"] ) if precondition_status_response: return precondition_status_response try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: return make_error_dict( 
PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) # actual storage of data deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": filehandler.name, "status": deposit.status, } def _atom_entry( self, request, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False, ): """Atom entry deposit. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. 
Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated to the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) external_id = metadata.get("external_identifier", headers["slug"]) deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": None, "status": deposit.status, } def _empty_post(self, request, headers, collection_name, deposit_id): """Empty post to finalize an empty deposit. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. 
""" deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { "deposit_id": deposit_id, "deposit_date": deposit.complete_date, "status": deposit.status, "archive": None, } def _make_iris(self, request, collection_name, deposit_id): """Define the IRI endpoints Args: request (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary of keys with the iris' urls. """ args = [collection_name, deposit_id] return { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } def additional_checks(self, request, headers, collection_name, deposit_id=None): """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def checks(self, request, collection_name, deposit_id=None): try: self._collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( NOT_FOUND, "Unknown collection name %s" % collection_name ) username = request.user.username if username: # unauthenticated request can have the username empty try: self._client = DepositClient.objects.get(username=username) except DepositClient.DoesNotExist: return make_error_dict(NOT_FOUND, "Unknown client name %s" % username) if self._collection.id not in self._client.collections: return make_error_dict( FORBIDDEN, "Client %s cannot access collection %s" % (username, collection_name), ) if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "Deposit with id %s does not exist" % deposit_id ) checks = self.restrict_access(request, deposit) if checks: return checks headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") checks = 
self.additional_checks(request, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def restrict_access(self, request, deposit=None): if deposit: if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = "This deposit has status '%s'" % deposit.status return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) def _basic_not_allowed_method(self, request, method): return make_error_response( request, METHOD_NOT_ALLOWED, "%s method is not supported on this endpoint" % method, ) def get(self, request, *args, **kwargs): return self._basic_not_allowed_method(request, "GET") def post(self, request, *args, **kwargs): return self._basic_not_allowed_method(request, "POST") def put(self, request, *args, **kwargs): return self._basic_not_allowed_method(request, "PUT") def delete(self, request, *args, **kwargs): return self._basic_not_allowed_method(request, "DELETE") -class SWHGetDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get(self, request, collection_name, deposit_id, format=None): """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) r = self.process_get(request, collection_name, deposit_id) if isinstance(r, tuple): status, content, content_type = r return HttpResponse(content, status=status, content_type=content_type) return r @abstractmethod def process_get(self, request, collection_name, deposit_id): """Routine to deal with the deposit's get processing. 
Returns: Tuple status, stream of content, content-type """ pass -class SWHPostDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def post(self, request, collection_name, deposit_id=None, format=None): """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] _status, _iri_key, data = self.process_post( request, headers, collection_name, deposit_id ) error = data.get("error") if error: return make_error_response_from_dict(request, error) data["packagings"] = ACCEPT_PACKAGINGS iris = self._make_iris(request, collection_name, data["deposit_id"]) data.update(iris) response = render( request, "deposit/deposit_receipt.xml", context=data, content_type="application/xml", status=_status, ) response._headers["location"] = "Location", data[_iri_key] return response @abstractmethod def process_post(self, request, headers, collection_name, deposit_id=None): """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass -class SWHPutDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put(self, request, collection_name, deposit_id, format=None): """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] data = self.process_put(request, headers, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put(self, request, headers, collection_name, deposit_id): """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass -class SWHDeleteDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete(self, request, collection_name, deposit_id): """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) data = self.process_delete(request, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete(self, request, collection_name, deposit_id): """Routine to delete a resource. This is mostly not allowed except for the - EM_IRI (cf. .api.deposit_update.SWHUpdateArchiveDeposit) + EM_IRI (cf. 
.api.deposit_update.APIUpdateArchive) """ pass diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py index c5258081..d6dba82d 100644 --- a/swh/deposit/api/deposit.py +++ b/swh/deposit/api/deposit.py @@ -1,95 +1,95 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework import status -from .common import SWHPostDepositAPI, ACCEPT_ARCHIVE_CONTENT_TYPES +from .common import APIPost, ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import EDIT_SE_IRI from ..errors import make_error_dict, BAD_REQUEST from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser from ..parsers import SWHAtomEntryParser from ..parsers import SWHMultiPartParser -class SWHDeposit(SWHPostDepositAPI): +class APIPostDeposit(APIPost): """Deposit request class defining api endpoints for sword deposit. What's known as 'Col IRI' in the sword specification. HTTP verbs supported: POST """ parser_classes = ( SWHMultiPartParser, SWHFileUploadZipParser, SWHFileUploadTarParser, SWHAtomEntryParser, ) def additional_checks(self, req, headers, collection_name, deposit_id=None): slug = headers["slug"] if not slug: msg = "Missing SLUG header in request" verbose_description = "Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing." 
# noqa return make_error_dict(BAD_REQUEST, msg, verbose_description) return {} def process_post(self, req, headers, collection_name, deposit_id=None): """Create a first deposit as: - archive deposit (1 zip) - multipart (1 zip + 1 atom entry) - atom entry Args: req (Request): the request holding the information to parse and inject in db collection_name (str): the associated client Returns: An http response (HttpResponse) according to the situation. If everything is ok, a 201 response (created) with a deposit receipt. Otherwise, depending on the upload, the following errors can be returned: - archive deposit: - 400 (bad request) if the request is not providing an external identifier - 403 (forbidden) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or hash provided mismatch the reality of the archive. - 415 (unsupported media type) if a wrong media type is provided - multipart deposit: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided - Atom entry deposit: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ assert deposit_id is None if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES: data = self._binary_upload(req, headers, collection_name) elif req.content_type.startswith("multipart/"): data = self._multipart_upload(req, headers, collection_name) else: data = self._atom_entry(req, headers, collection_name) return status.HTTP_201_CREATED, EDIT_SE_IRI, data diff --git a/swh/deposit/api/deposit_content.py b/swh/deposit/api/deposit_content.py index a5065ae6..cee091dd 100644 --- a/swh/deposit/api/deposit_content.py +++ b/swh/deposit/api/deposit_content.py @@ -1,47 +1,47 @@ -# Copyright (C) 
2017 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from rest_framework import status -from .common import SWHBaseDeposit +from .common import APIBase from ..errors import NOT_FOUND, make_error_response from ..errors import make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit, DepositRequest -class SWHDepositContent(SWHBaseDeposit): +class APIContent(APIBase): def get(self, req, collection_name, deposit_id, format=None): checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) requests = DepositRequest.objects.filter(deposit=deposit) context = { "deposit_id": deposit.id, "status": deposit.status, "status_detail": DEPOSIT_STATUS_DETAIL[deposit.status], "requests": requests, } return render( req, "deposit/content.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py index 18c26556..c163882c 100644 --- a/swh/deposit/api/deposit_status.py +++ b/swh/deposit/api/deposit_status.py @@ -1,65 +1,65 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from rest_framework import status 
-from .common import SWHBaseDeposit +from .common import APIBase from .converters import convert_status_detail from ..errors import NOT_FOUND, make_error_response from ..errors import make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit -class SWHDepositStatus(SWHBaseDeposit): +class APIStatus(APIBase): """Deposit status. What's known as 'State IRI' in the sword specification. HTTP verbs supported: GET """ def get(self, req, collection_name, deposit_id, format=None): checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) status_detail = convert_status_detail(deposit.status_detail) if not status_detail: status_detail = DEPOSIT_STATUS_DETAIL[deposit.status] context = { "deposit_id": deposit.id, "status_detail": status_detail, } keys = ( "status", "swh_id", "swh_id_context", "external_id", ) for k in keys: context[k] = getattr(deposit, k, None) return render( req, "deposit/status.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index 21707415..0c5f6303 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,169 +1,167 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework import status -from .common import SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI 
+from .common import APIPost, APIPut, APIDelete from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI from ..errors import make_error_dict, BAD_REQUEST from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser from ..parsers import SWHAtomEntryParser from ..parsers import SWHMultiPartParser -class SWHUpdateArchiveDeposit(SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI): +class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) def process_post(self, req, headers, collection_name, deposit_id): """Add new content to the existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return "unused", "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) def process_delete(self, req, collection_name, deposit_id): """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) -class SWHUpdateMetadataDeposit( - SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI -): +class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing deposit's metadata/archive with new ones. 
source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa Returns: 204 No content """ if req.content_type.startswith("multipart/"): return self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) return self._atom_entry( req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True ) def process_post(self, req, headers, collection_name, deposit_id): """Add new metadata/archive to existing deposit. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa This also deals with an empty post corner case to finalize a deposit. Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ if req.content_type.startswith("multipart/"): return ( status.HTTP_201_CREATED, EM_IRI, self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id ), ) # check for final empty post # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html # #continueddeposit_complete if headers["content-length"] == 0 and headers["in-progress"] is False: data = self._empty_post(req, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) return ( status.HTTP_201_CREATED, EM_IRI, self._atom_entry(req, headers, collection_name, deposit_id=deposit_id), ) def process_delete(self, req, collection_name, deposit_id): """Delete the container (deposit). 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py index db3e2f5a..c52363d8 100644 --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -1,109 +1,109 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.deposit import utils -from ...config import METADATA_TYPE, SWHDefaultConfig +from ...config import METADATA_TYPE, APIConfig from ...models import DepositRequest, Deposit from rest_framework.permissions import AllowAny -from swh.deposit.api.common import SWHAPIView +from swh.deposit.api.common import AuthenticatedAPIView from swh.deposit.errors import make_error_dict, NOT_FOUND class DepositReadMixin: """Deposit Read mixin """ def _deposit_requests(self, deposit, request_type): """Given a deposit, yields its associated deposit_request Args: deposit (Deposit): Deposit to list requests for request_type (str): 'archive' or 'metadata' Yields: deposit requests of type request_type associated to the deposit """ if isinstance(deposit, int): deposit = Deposit.objects.get(pk=deposit) deposit_requests = DepositRequest.objects.filter( type=request_type, deposit=deposit ).order_by("id") for deposit_request in deposit_requests: yield deposit_request def _metadata_get(self, deposit): """Given a deposit, aggregate all metadata requests. Args: deposit (Deposit): The deposit instance to extract metadata from. Returns: metadata dict from the deposit. 
""" metadata = ( m.metadata for m in self._deposit_requests(deposit, request_type=METADATA_TYPE) ) return utils.merge(*metadata) -class SWHPrivateAPIView(SWHDefaultConfig, SWHAPIView): +class SWHPrivateAPIView(APIConfig, AuthenticatedAPIView): """Mixin intended as private api (so no authentication) based API view (for the private ones). """ authentication_classes = () permission_classes = (AllowAny,) def checks(self, req, collection_name, deposit_id=None): """Override default checks implementation to allow empty collection. """ if deposit_id: try: Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "Deposit with id %s does not exist" % deposit_id ) headers = self._read_headers(req) checks = self.additional_checks(req, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def get( self, request, collection_name=None, deposit_id=None, format=None, *args, **kwargs, ): return super().get(request, collection_name, deposit_id, format) def put( self, request, collection_name=None, deposit_id=None, format=None, *args, **kwargs, ): return super().put(request, collection_name, deposit_id, format) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index 76924560..7e654045 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,230 +1,230 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import re import tarfile import zipfile from itertools import chain from shutil import get_unpack_formats from rest_framework import status from swh.scheduler.utils import create_oneshot_task_dict from . 
import DepositReadMixin, SWHPrivateAPIView -from ..common import SWHGetDepositAPI +from ..common import APIGet from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED from ...config import ARCHIVE_TYPE from ...models import Deposit MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) -class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class SWHChecksDeposit(SWHPrivateAPIView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def _check_deposit_archives(self, deposit): """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns tuple (status, error_detail): True, None if all archives are ok, (False, ) otherwise. 
""" requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} def _check_archive(self, archive_request): """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. """ archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as f: files = f.namelist() elif tarfile.is_tarfile(archive_path): with tarfile.open(archive_path) as f: files = f.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None def _check_metadata(self, metadata): """Check to execute on all metadata for mandatory field presence. Args: metadata (dict): Metadata dictionary to check for mandatory fields Returns: tuple (status, error_detail): True, None if metadata are ok (False, ) otherwise. 
""" required_fields = { "author": False, } alternate_fields = { ("name", "title"): False, # alternate field, at least one # of them must be present } for field, value in metadata.items(): for name in required_fields: if name in field: required_fields[name] = True for possible_names in alternate_fields: for possible_name in possible_names: if possible_name in field: alternate_fields[possible_names] = True continue mandatory_result = [k for k, v in required_fields.items() if not v] optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v] if mandatory_result == [] and optional_result == []: return True, None detail = [] if mandatory_result != []: detail.append( {"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} ) if optional_result != []: detail.append( {"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} ) return False, {"metadata": detail} def process_get(self, req, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. 
Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ deposit = Deposit.objects.get(pk=deposit_id) metadata = self._metadata_get(deposit) problems = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status, error_detail = self._check_deposit_archives(deposit) if not archives_status: problems.update(error_detail) metadata_status, error_detail = self._check_metadata(metadata) if not metadata_status: problems.update(error_detail) deposit_status = archives_status and metadata_status # if any problems arose, the deposit is rejected if not deposit_status: deposit.status = DEPOSIT_STATUS_REJECTED deposit.status_detail = problems response = { "status": deposit.status, "details": deposit.status_detail, } else: deposit.status = DEPOSIT_STATUS_VERIFIED response = { "status": deposit.status, } if not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() return status.HTTP_200_OK, json.dumps(response), "application/json" diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index 91ec2e3d..6a647871 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,216 +1,216 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import shutil import tempfile from contextlib import contextmanager from django.http import FileResponse from rest_framework import status from 
swh.core import tarball from swh.model import identifiers from swh.deposit.utils import normalize_date from . import DepositReadMixin, SWHPrivateAPIView from ...config import SWH_PERSON, ARCHIVE_TYPE -from ..common import SWHGetDepositAPI +from ..common import APIGet from ...models import Deposit @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) -class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class SWHDepositReadArchives(SWHPrivateAPIView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. 
""" ADDITIONAL_CONFIG = { "extraction_dir": ("str", "/tmp/swh-deposit/archive/"), } def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get(self, request, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] with aggregate_tarballs(self.extraction_dir, archive_paths) as path: return FileResponse( open(path, "rb"), status=status.HTTP_200_OK, content_type="application/zip", ) -class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class SWHDepositReadMetadata(SWHPrivateAPIView, APIGet, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. - """ + """ ADDITIONAL_CONFIG = { "provider": ( "dict", { # 'provider_name': '', # those are not set since read from the # 'provider_url': '', # deposit's client "provider_type": "deposit_client", "metadata": {}, }, ), "tool": ( "dict", { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, }, ), } def __init__(self): super().__init__() self.provider = self.config["provider"] self.tool = self.config["tool"] def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. 
""" commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. """ metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} # metadata provider self.provider["provider_name"] = deposit.client.last_name self.provider["provider_url"] = deposit.client.provider_url author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id swhid = identifiers.parse_swhid(swh_persistent_id) parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { "provider": self.provider, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data def process_get(self, request, collection_name, deposit_id): deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) d = {} if data: d = json.dumps(data) return status.HTTP_200_OK, d, "application/json" diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index c7b7e63a..24678a6c 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ 
b/swh/deposit/api/private/deposit_update_status.py @@ -1,105 +1,105 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT from . import SWHPrivateAPIView -from ..common import SWHPutDepositAPI +from ..common import APIPut from ...errors import make_error_dict, BAD_REQUEST from ...models import Deposit, DEPOSIT_STATUS_DETAIL from ...models import DEPOSIT_STATUS_LOAD_SUCCESS MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] -class SWHUpdateStatusDeposit(SWHPrivateAPIView, SWHPutDepositAPI): +class SWHUpdateStatusDeposit(SWHPrivateAPIView, APIPut): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks(self, request, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. 
New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) return make_error_dict(BAD_REQUEST, msg) return {} def process_put(self, request, headers, collection_name, deposit_id): """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data deposit = Deposit.objects.get(pk=deposit_id) status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] dir_id = swhid(DIRECTORY, directory_id) snp_id = swhid(SNAPSHOT, data["snapshot_id"]) rev_id = swhid(REVISION, revision_id) deposit.swh_id = dir_id # new id with contextual information deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin_url, "visit": snp_id, "anchor": rev_id, "path": "/", }, ) else: # rejected deposit.status = status deposit.save() return {} diff --git a/swh/deposit/api/service_document.py b/swh/deposit/api/service_document.py index 6aa3899c..15a68eba 100644 --- a/swh/deposit/api/service_document.py +++ b/swh/deposit/api/service_document.py @@ -1,34 +1,34 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from django.urls import reverse -from .common import SWHBaseDeposit, ACCEPT_PACKAGINGS +from .common import APIBase, ACCEPT_PACKAGINGS from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import COL_IRI from ..models import DepositClient, DepositCollection -class SWHServiceDocument(SWHBaseDeposit): +class APIServiceDocument(APIBase): def get(self, req, *args, **kwargs): client = DepositClient.objects.get(username=req.user) collections = {} for col_id in client.collections: col = DepositCollection.objects.get(pk=col_id) col_uri = req.build_absolute_uri(reverse(COL_IRI, args=[col.name])) collections[col.name] = col_uri context = { "max_upload_size": self.config["max_upload_size"], "accept_packagings": ACCEPT_PACKAGINGS, "accept_content_types": ACCEPT_ARCHIVE_CONTENT_TYPES, "collections": collections, } return render( req, "deposit/service_document.xml", context, content_type="application/xml" ) diff --git a/swh/deposit/api/urls.py b/swh/deposit/api/urls.py index 236db5aa..f7450d77 100644 --- a/swh/deposit/api/urls.py +++ b/swh/deposit/api/urls.py @@ -1,70 +1,70 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """SWH's deposit api URL Configuration """ from django.conf.urls import url from django.shortcuts import render from ..config import EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI from ..config import SD_IRI, COL_IRI, STATE_IRI -from .deposit import SWHDeposit -from .deposit_status import SWHDepositStatus -from .deposit_update import SWHUpdateMetadataDeposit -from .deposit_update import SWHUpdateArchiveDeposit -from 
.deposit_content import SWHDepositContent -from .service_document import SWHServiceDocument +from .deposit import APIPostDeposit +from .deposit_status import APIStatus +from .deposit_update import APIUpdateMetadata +from .deposit_update import APIUpdateArchive +from .deposit_content import APIContent +from .service_document import APIServiceDocument def api_view(req): return render(req, "api.html") # PUBLIC API urlpatterns = [ # simple view on the api url(r"^$", api_view, name="api"), # SD IRI - Service Document IRI # -> GET - url(r"^servicedocument/", SWHServiceDocument.as_view(), name=SD_IRI), + url(r"^servicedocument/", APIServiceDocument.as_view(), name=SD_IRI), # Col IRI - Collection IRI # -> POST - url(r"^(?P[^/]+)/$", SWHDeposit.as_view(), name=COL_IRI), + url(r"^(?P[^/]+)/$", APIPostDeposit.as_view(), name=COL_IRI), # EM IRI - Atom Edit Media IRI (update archive IRI) # -> PUT (update-in-place existing archive) # -> POST (add new archive) url( r"^(?P[^/]+)/(?P[^/]+)/media/$", - SWHUpdateArchiveDeposit.as_view(), + APIUpdateArchive.as_view(), name=EM_IRI, ), # Edit IRI - Atom Entry Edit IRI (update metadata IRI) # SE IRI - Sword Edit IRI ;; possibly same as Edit IRI # -> PUT (update in place) # -> POST (add new metadata) url( r"^(?P[^/]+)/(?P[^/]+)/metadata/$", - SWHUpdateMetadataDeposit.as_view(), + APIUpdateMetadata.as_view(), name=EDIT_SE_IRI, ), # State IRI # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/status/$", - SWHDepositStatus.as_view(), + APIStatus.as_view(), name=STATE_IRI, ), # Cont/File IRI # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/content/$", - SWHDepositContent.as_view(), + APIContent.as_view(), name=CONT_FILE_IRI, ), # specification is not clear about # FILE-IRI, we assume it's the same as # the CONT-IRI one ] diff --git a/swh/deposit/cli/admin.py b/swh/deposit/cli/admin.py index 125b1821..fbaac89a 100644 --- a/swh/deposit/cli/admin.py +++ b/swh/deposit/cli/admin.py @@ -1,274 +1,274 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the 
AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.deposit.cli import deposit @deposit.group("admin") @click.option( "--config-file", "-C", default=None, type=click.Path(exists=True, dir_okay=False,), help="Optional extra configuration file.", ) @click.option( "--platform", default="development", type=click.Choice(["development", "production"]), help="development or production platform", ) @click.pass_context def admin(ctx, config_file, platform): """Server administration tasks (manipulate user or collections)""" from swh.deposit.config import setup_django_for # configuration happens here setup_django_for(platform, config_file=config_file) @admin.group("user") @click.pass_context def user(ctx): """Manipulate user.""" # configuration happens here pass def _create_collection(name): """Create the collection with name if it does not exist. Args: name (str): collection's name Returns: collection (DepositCollection): the existing collection object (created or not) """ # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection try: collection = DepositCollection.objects.get(name=name) click.echo("Collection %s exists, nothing to do." 
% name) except DepositCollection.DoesNotExist: click.echo("Create new collection %s" % name) collection = DepositCollection.objects.create(name=name) click.echo("Collection %s created" % name) return collection @user.command("create") @click.option("--username", required=True, help="User's name") @click.option("--password", required=True, help="Desired user's password (plain).") @click.option("--firstname", default="", help="User's first name") @click.option("--lastname", default="", help="User's last name") @click.option("--email", default="", help="User's email") @click.option("--collection", help="User's collection") @click.option("--provider-url", default="", help="Provider URL") @click.option("--domain", default="", help="The domain") @click.pass_context def user_create( ctx, username, password, firstname, lastname, email, collection, provider_url, domain, ): """Create a user with some needed information (password, collection) If the collection does not exist, the collection is then created alongside. The password is stored encrypted using django's utilities. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient # If collection is not provided, fallback to username if not collection: collection = username click.echo("collection: %s" % collection) # create the collection if it does not exist collection = _create_collection(collection) # user create/update try: user = DepositClient.objects.get(username=username) click.echo("User %s exists, updating information." 
% user) user.set_password(password) except DepositClient.DoesNotExist: click.echo("Create new user %s" % username) user = DepositClient.objects.create_user(username=username, password=password) user.collections = [collection.id] user.first_name = firstname user.last_name = lastname user.email = email user.is_active = True user.provider_url = provider_url user.domain = domain user.save() click.echo("Information registered for user %s" % user) @user.command("list") @click.pass_context def user_list(ctx): """List existing users. This entrypoint is not paginated yet as there is not a lot of entry. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient users = DepositClient.objects.all() if not users: output = "Empty user list" else: output = "\n".join((user.username for user in users)) click.echo(output) @user.command("exists") @click.argument("username", required=True) @click.pass_context def user_exists(ctx, username): """Check if user exists. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient try: DepositClient.objects.get(username=username) click.echo("User %s exists." % username) ctx.exit(0) except DepositClient.DoesNotExist: click.echo("User %s does not exist." % username) ctx.exit(1) @admin.group("collection") @click.pass_context def collection(ctx): """Manipulate collections.""" pass @collection.command("create") @click.option("--name", required=True, help="Collection's name") @click.pass_context def collection_create(ctx, name): _create_collection(name) @collection.command("list") @click.pass_context def collection_list(ctx): """List existing collections. This entrypoint is not paginated yet as there is not a lot of entry. 
@collection.command("list")
@click.pass_context
def collection_list(ctx):
    """List existing collections.

    This entrypoint is not paginated yet as there is not a lot of
    entry.

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositCollection

    names = [col.name for col in DepositCollection.objects.all()]
    click.echo("\n".join(names) if names else "Empty collection list")


@admin.group("deposit")
@click.pass_context
def adm_deposit(ctx):
    """Manipulate deposit."""


@adm_deposit.command("reschedule")
@click.option("--deposit-id", required=True, help="Deposit identifier")
@click.pass_context
def adm_deposit_reschedule(ctx, deposit_id):
    """Reschedule the deposit loading

    This will:

    - check the deposit's status to something reasonable (failed or done). That
      means that the checks have passed alright but something went wrong during
      the loading (failed: loading failed, done: loading ok, still for some
      reasons as in bugs, we need to reschedule it)
    - reset the deposit's status to 'verified' (prior to any loading but after
      the checks which are fine) and removes the different archives'
      identifiers (swh-id, ...)
    - trigger back the loading task through the scheduler

    """
    # to avoid loading too early django namespaces
    from datetime import datetime, timezone

    from swh.deposit.models import Deposit
    from swh.deposit.config import (
        DEPOSIT_STATUS_LOAD_SUCCESS,
        DEPOSIT_STATUS_LOAD_FAILURE,
        DEPOSIT_STATUS_VERIFIED,
        APIConfig,
    )

    try:
        deposit = Deposit.objects.get(pk=deposit_id)
    except Deposit.DoesNotExist:
        click.echo("Deposit %s does not exist." % deposit_id)
        ctx.exit(1)

    # Check the deposit is in a reasonable state
    accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE]
    if deposit.status == DEPOSIT_STATUS_VERIFIED:
        # Nothing to reset: the deposit is already waiting for a loading.
        click.echo("Deposit %s's status already set for rescheduling." % (deposit_id))
        ctx.exit(0)

    if deposit.status not in accepted_statuses:
        click.echo(
            "Deposit %s's status be one of %s."
            % (deposit_id, ", ".join(accepted_statuses))
        )
        ctx.exit(1)

    # Without a recorded loading task there is nothing to hand back to the
    # scheduler.
    task_id = deposit.load_task_id
    if not task_id:
        click.echo(
            "Deposit %s cannot be rescheduled. It misses the "
            "associated task." % deposit_id
        )
        ctx.exit(1)

    # Reset the deposit's state
    deposit.swh_id = None
    deposit.swh_id_context = None
    deposit.status = DEPOSIT_STATUS_VERIFIED
    deposit.save()

    # Trigger back the deposit. The timestamp is timezone-aware (UTC) so that
    # "now" does not depend on the local timezone of the host running the CLI.
    scheduler = APIConfig().scheduler
    scheduler.set_status_tasks(
        [task_id],
        status="next_run_not_scheduled",
        next_run=datetime.now(tz=timezone.utc),
    )
def setup_django_for(platform=None, config_file=None):
    """Initialize django so that command line tools can access the db.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running. When given,
            it must be one of AUTHORIZED_PLATFORMS; it selects the
            "swh.deposit.settings.<platform>" settings module unless
            DJANGO_SETTINGS_MODULE is already set in the environment.
        config_file (str): Extra configuration file (typically for the
            production platform). Exported as SWH_CONFIG_FILENAME unless
            that variable is already set.

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform is not None:
        if platform not in AUTHORIZED_PLATFORMS:
            raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)
        # An explicit DJANGO_SETTINGS_MODULE in the environment wins.
        os.environ.setdefault(
            "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % platform
        )

    if config_file:
        os.environ.setdefault("SWH_CONFIG_FILENAME", config_file)

    # Imported only now: the environment variables above must be set first.
    import django

    django.setup()


class APIConfig(SWHConfig):
    """Configuration mixin for the deposit views.

    On instantiation, the configuration is parsed from the
    "deposit/server" configuration file (see CONFIG_BASE_FILENAME), completed
    with ADDITIONAL_CONFIG, then overridden by the keyword arguments given to
    the constructor. A scheduler client is attached when a "scheduler" entry
    is configured.

    """

    CONFIG_BASE_FILENAME = "deposit/server"

    DEFAULT_CONFIG = {
        "max_upload_size": ("int", 209715200),
        "checks": ("bool", True),
        "scheduler": (
            "dict",
            {"cls": "remote", "args": {"url": "http://localhost:5008/"}},
        ),
    }

    # Hook for subclasses needing extra configuration entries.
    ADDITIONAL_CONFIG = {}  # type: Dict[str, Tuple[str, Any]]

    def __init__(self, **config):
        super().__init__()
        parsed = self.parse_config_file(additional_configs=[self.ADDITIONAL_CONFIG])
        # Explicit keyword arguments take precedence over the parsed file.
        parsed.update(config)
        self.config = parsed
        self.log = logging.getLogger("swh.deposit")
        scheduler_config = self.config.get("scheduler")
        if scheduler_config:
            self.scheduler = get_scheduler(**scheduler_config)
class SWHFileUploadZipParser(FileUploadParser):
    """File upload parser accepting the "application/zip" media type only.

    """

    media_type = "application/zip"


class SWHFileUploadTarParser(FileUploadParser):
    """File upload parser accepting the "application/x-tar" media type only
    (tar, tar.gz, tar.* archives).

    """

    media_type = "application/x-tar"


class SWHXMLParser(BaseParser):
    """Parser turning an "application/xml" body into a dict.

    """

    media_type = "application/xml"

    def parse(self, stream, media_type=None, parser_context=None):
        """Parse the incoming bytestream as XML and return the resulting data.

        The encoding is read from parser_context (key "encoding"), defaulting
        to django's DEFAULT_CHARSET setting. When the parsed document has a
        top-level "entry" key, only the value under that key is returned.

        """
        context = parser_context or {}
        encoding = context.get("encoding", settings.DEFAULT_CHARSET)
        document = xmltodict.parse(stream, encoding=encoding, process_namespaces=False)
        return document["entry"] if "entry" in document else document


class SWHAtomEntryParser(SWHXMLParser):
    """Atom entry parser limited to the "application/atom+xml;type=entry"
    media type.

    """

    media_type = "application/atom+xml;type=entry"

    def parse(self, stream, media_type=None, parser_context=None):
        """Return the stream untouched.

        The stream is deliberately not parsed here because the raw data must
        be kept as well; the actual parsing happens later, in the atom entry
        call (cf. swh.deposit.api.common.APIBase._atom_entry).

        """
        return stream


class SWHMultiPartParser(MultiPartParser):
    """Multipart parser limited to a subset of mediatypes.

    """

    media_type = "multipart/*; *"


def parse_xml(raw_content):
    """Parse xml body.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError in case of a malformed xml

    Returns:
        content parsed as dict.

    """
    xml_parser = SWHXMLParser()
    try:
        return xml_parser.parse(raw_content)
    except ExpatError as e:
        # Expat errors are reported through the deposit's own error type.
        raise ParserError(str(e))
def pytest_configure():
    """Set django up for the "testing" platform."""
    setup_django_for("testing")


@pytest.fixture()
def deposit_config():
    """Configuration served to the deposit classes during tests."""
    return TEST_CONFIG


@pytest.fixture(autouse=True)
def deposit_autoconfig(monkeypatch, deposit_config, swh_scheduler_config):
    """Enforce config for deposit classes inherited from APIConfig."""

    def mock_parse_config(*args, **kw):
        # Work on a copy so the shared deposit_config dict is not altered.
        config = deposit_config.copy()
        config["scheduler"] = {
            "cls": "local",
            "args": swh_scheduler_config,
        }
        return config

    monkeypatch.setattr(APIConfig, "parse_config_file", mock_parse_config)

    # Register the task type in the local scheduler.
    scheduler = get_scheduler("local", swh_scheduler_config)
    task_type = {
        "type": "load-deposit",
        "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
        "description": "why does this have not-null constraint?",
    }
    scheduler.create_task_type(task_type)


@pytest.fixture(scope="session")
def django_db_setup(request, django_db_blocker, postgresql_proc):
    """Point django's default database at the postgresql_proc instance and
    set the test databases up.

    """
    from django.conf import settings

    # A plain dict: the previous set of (key, value) tuples only worked
    # because dict.update accepts any iterable of pairs.
    settings.DATABASES["default"].update(
        {
            "ENGINE": "django.db.backends.postgresql",
            "NAME": "tests",
            "USER": postgresql_proc.user,
            "HOST": postgresql_proc.host,
            "PORT": postgresql_proc.port,
        }
    )
    with django_db_blocker.unblock():
        setup_databases(
            verbosity=request.config.option.verbose, interactive=False, keepdb=False
        )


def execute_sql(sql):
    """Execute sql to postgres db

    The statement runs in autocommit mode on a dedicated connection to the
    "postgres" database.

    """
    conn = psycopg2.connect(database="postgres")
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        with conn.cursor() as cur:
            cur.execute(sql)
    finally:
        # Closed explicitly: psycopg2's connection context manager only ends
        # the transaction, it leaves the connection open.
        conn.close()
@pytest.fixture(autouse=True, scope="session")
def swh_proxy():
    """Automatically inject this fixture in all tests to ensure no outside
    connection takes place.

    """
    # Both proxy variables are pointed at the same localhost address.
    for variable in ("http_proxy", "https_proxy"):
        os.environ[variable] = "http://localhost:999"


def create_deposit_collection(collection_name: str):
    """Return the deposit collection named collection_name, creating and
    saving it first when it does not exist.

    """
    from swh.deposit.models import DepositCollection

    manager = DepositCollection._default_manager
    try:
        return manager.get(name=collection_name)
    except DepositCollection.DoesNotExist:
        new_collection = DepositCollection(name=collection_name)
        new_collection.save()
        return new_collection


def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]):
    """Build a fixture providing the collection named collection_name."""

    @pytest.fixture
    def _deposit_collection(db, collection_name=collection_name):
        return create_deposit_collection(collection_name)

    return _deposit_collection


deposit_collection = deposit_collection_factory()
deposit_another_collection = deposit_collection_factory("another-collection")


@pytest.fixture
def deposit_user(db, deposit_collection):
    """Create/Return the test_user "test"

    """
    from swh.deposit.models import DepositClient

    manager = DepositClient._default_manager
    try:
        return manager.get(username=TEST_USER["username"])
    except DepositClient.DoesNotExist:
        pass

    # NOTE(review): indentation is lost in the reviewed text; attaching the
    # collection is assumed to belong to the creation branch only -- confirm.
    test_user = manager.create_user(
        username=TEST_USER["username"],
        email=TEST_USER["email"],
        password=TEST_USER["password"],
        provider_url=TEST_USER["provider_url"],
        domain=TEST_USER["domain"],
    )
    test_user.collections = [deposit_collection.id]
    test_user.save()
    return test_user
@pytest.fixture
def client():
    """Override pytest-django one which does not work for
    djangorestframework.

    """
    return APIClient()  # <- drf's client


# Plain pytest.fixture: it supports yield-style teardown, and
# pytest.yield_fixture is deprecated.
@pytest.fixture
def authenticated_client(client, deposit_user):
    """Yield a client carrying basic-auth credentials, then log it out.

    The credentials are deposit_user's username and the TEST_USER password.

    """
    credentials = "%s:%s" % (deposit_user.username, TEST_USER["password"])
    token = base64.b64encode(credentials.encode("utf-8")).decode("utf-8")
    client.credentials(HTTP_AUTHORIZATION="Basic %s" % token)
    yield client
    client.logout()


@pytest.fixture
def sample_archive(tmp_path):
    """Returns a sample archive

    """
    tmp_path = str(tmp_path)  # pytest version limitation in previous version
    archive = create_arborescence_archive(
        tmp_path, "archive1", "file1", b"some content in file"
    )
    return archive


@pytest.fixture
def atom_dataset(datadir) -> Mapping[str, str]:
    """Load the atom sample files found in the "atom" folder of datadir.

    Returns:
        Dict mapping each atom name (the filename up to its first dot) to
        the file's content, decoded as utf-8 (str)

    """
    atom_path = os.path.join(datadir, "atom")
    data = {}
    for filename in os.listdir(atom_path):
        filepath = os.path.join(atom_path, filename)
        with open(filepath, "rb") as f:
            raw_content = f.read().decode("utf-8")

        # Keep the filename without extension
        atom_name = filename.split(".")[0]
        data[atom_name] = raw_content

    return data
def create_deposit(
    authenticated_client,
    collection_name: str,
    sample_archive,
    external_id: str,
    deposit_status=DEPOSIT_STATUS_DEPOSITED,
):
    """Create a skeleton shell deposit

    The archive is posted as a final (not in-progress) zip deposit on the
    collection; the resulting deposit is then forced to deposit_status when
    it is not already in that status.

    """
    # when
    response = authenticated_client.post(
        reverse(COL_IRI, args=[collection_name]),
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]),
    )

    # then
    assert response.status_code == status.HTTP_201_CREATED
    from swh.deposit.models import Deposit

    created = Deposit._default_manager.get(external_id=external_id)

    if created.status != deposit_status:
        created.status = deposit_status
        created.save()
    assert created.status == deposit_status
    return created


def create_binary_deposit(
    authenticated_client,
    collection_name: str,
    sample_archive,
    external_id: str,
    deposit_status: str = DEPOSIT_STATUS_DEPOSITED,
    atom_dataset: Mapping[str, bytes] = {},
):
    """Create a deposit with both metadata and archive set. Then alters its
    status to `deposit_status`.

    """
    # First the archive, left in 'partial' status...
    partial = create_deposit(
        authenticated_client,
        collection_name,
        sample_archive,
        external_id=external_id,
        deposit_status=DEPOSIT_STATUS_PARTIAL,
    )

    # ...then the "entry-data0" atom metadata, still in progress.
    edit_url = reverse(EDIT_SE_IRI, args=[collection_name, partial.id])
    response = authenticated_client.post(
        edit_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % partial.external_id.encode("utf-8"),
        HTTP_SLUG=partial.external_id,
        HTTP_IN_PROGRESS="true",
    )

    assert response.status_code == status.HTTP_201_CREATED
    assert partial.status == DEPOSIT_STATUS_PARTIAL

    from swh.deposit.models import Deposit

    # Read the deposit back from the db before forcing the final status.
    refreshed = Deposit._default_manager.get(pk=partial.id)
    if refreshed.status != deposit_status:
        refreshed.status = deposit_status
        refreshed.save()

    assert refreshed.status == deposit_status
    return refreshed


def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED):
    """Build deposit with a specific status

    """

    @pytest.fixture()
    def _deposit(
        sample_archive,
        deposit_collection,
        authenticated_client,
        deposit_status=deposit_status,
    ):
        # The external id embeds the status.
        return create_deposit(
            authenticated_client,
            deposit_collection.name,
            sample_archive,
            external_id="external-id-%s" % deposit_status,
            deposit_status=deposit_status,
        )

    return _deposit


deposited_deposit = deposit_factory()
rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED)
partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL)
verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED)
completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS)
failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE)
@pytest.fixture
def partial_deposit_with_metadata(
    sample_archive, deposit_collection, authenticated_client, atom_dataset
):
    """Returns deposit with archive and metadata provided, status 'partial'

    """
    return create_binary_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-partial",
        deposit_status=DEPOSIT_STATUS_PARTIAL,
        atom_dataset=atom_dataset,
    )


@pytest.fixture
def partial_deposit_only_metadata(
    deposit_collection, authenticated_client, atom_dataset
):
    """Returns a deposit created from the "entry-data1" atom entry only (no
    archive), status 'partial'

    """
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.post(
        collection_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data1"],
        HTTP_SLUG="external-id-partial",
        HTTP_IN_PROGRESS=True,
    )

    assert response.status_code == status.HTTP_201_CREATED

    # The deposit id is read from the xml response body.
    deposit_id = parse_xml(response.content)["deposit_id"]
    from swh.deposit.models import Deposit

    partial = Deposit._default_manager.get(pk=deposit_id)
    assert partial.status == DEPOSIT_STATUS_PARTIAL
    return partial


@pytest.fixture
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Returns a completed deposit (load success)

    The deposit's swh_id and swh_id_context are filled in with fixed sample
    identifiers.

    """
    loaded = create_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-complete",
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )
    origin = "https://hal.archives-ouvertes.fr/hal-01727745"
    directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
    revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10"
    snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0"

    context = {
        "origin": origin,
        "visit": swhid(SNAPSHOT, snapshot_id),
        "anchor": swhid(REVISION, revision_id),
        "path": "/",
    }
    loaded.swh_id = swhid(DIRECTORY, directory_id)
    loaded.swh_id_context = swhid(DIRECTORY, directory_id, metadata=context)
    loaded.save()
    return loaded


@pytest.fixture()
def tmp_path(tmp_path):
    """Serve the tmp_path fixture being overridden as a plain string."""
    return str(tmp_path)  # issue with oldstable's pytest version