diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 6f696078..6bed49c5 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,958 +1,1033 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod +import datetime import hashlib -from typing import Sequence, Type +import json +from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union -from django.http import HttpResponse +from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated +from rest_framework.request import Request from rest_framework.views import APIView from swh.model import hashutil from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, ParserError, make_error_dict, make_error_response, make_error_response_from_dict, ) from ..models import Deposit, DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ - def _read_headers(self, request): + def _read_headers(self, request: Request) -> Dict[str, Any]: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, } - def _compute_md5(self, filehandler): + def _compute_md5(self, filehandler) -> bytes: """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( - self, request, deposit_id=None, in_progress=False, external_id=None - ): + self, + request: Request, + deposit_id: Optional[int] = None, + in_progress: bool = False, + external_id: Optional[str] = None, + ) -> Deposit: """Save/Update a deposit in db. Args: - deposit_id (int): deposit identifier - in_progress (dict): The deposit's status - external_id (str): The external identifier to associate to - the deposit + request: request data + deposit_id: deposit identifier + in_progress: deposit status + external_id: external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ + complete_date: Optional[datetime.datetime] = None + deposit_parent: Optional[Deposit] = None + if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: - complete_date = None status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: - # find a deposit parent (same external id, status load - # to success) + # find a deposit parent (same external id, status load to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: - deposit_parent = None + # then no parent for that deposit, deposit_parent already None + pass + assert external_id is not None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, - deposit, - deposit_request_data, - replace_metadata=False, - replace_archives=False, - ): + deposit: Deposit, + deposit_request_data: Dict[str, Any], + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> None: """Save a deposit request with metadata attached to a deposit. Args: - deposit (Deposit): The deposit concerned by the request - deposit_request_data (dict): The dictionary with at most 2 deposit - request types (archive, metadata) to associate to the deposit - replace_metadata (bool): Flag defining if we add or update + deposit: The deposit concerned by the request + deposit_request_data: The dictionary with at most 2 deposit + request types (archive, metadata) to associate to the deposit + replace_metadata: Flag defining if we add or update existing metadata to the deposit - replace_archives (bool): Flag defining if we add or update + replace_archives: Flag defining if we add or update archives to existing deposit Returns: None """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: - raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) + raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None - def _delete_archives(self, collection_name, deposit_id): - """Delete archives reference from the deposit id. + def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict: + """Delete archive references from the deposit id. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} - def _delete_deposit(self, collection_name, deposit_id): + def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict: """Delete deposit reference. Args: - collection_name (str): Client's name - deposit_id (id): The deposit to delete + collection_name: Client's collection + deposit_id: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} - def _check_preconditions_on(self, filehandler, md5sum, content_length=None): + def _check_preconditions_on( + self, filehandler, md5sum: str, content_length: Optional[int] = None + ) -> Optional[Dict]: """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check - md5sum (hex str): md5 hash expected from the file's content - content_length (int): the expected length if provided. + md5sum: md5 hash expected from the file's content + content_length: the expected length if provided. Returns: Either none if no error or a dictionary with a key error detailing the problem. """ max_upload_size = self.config["max_upload_size"] if content_length: if content_length > max_upload_size: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes)." "Please consider sending the archive in multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.", ) return None def _binary_upload( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict[str, Any]: """Binary upload routine. Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"], content_length ) if precondition_status_response: return precondition_status_response external_id = headers["slug"] # actual storage of data archive_metadata = filehandler deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "status": deposit.status, "archive": filehandler.name, } - def _read_metadata(self, metadata_stream): + def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier if provided - replace_metadata (bool): 'Update or add' request to existing + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier if provided + replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. - replace_archives (bool): 'Update or add' request to existing + replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers["slug"] content_types_present = set() - data = { + data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value - if fh.content_type in content_types_present: + content_type = fh.content_type + if content_type in content_types_present: return make_error_dict( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) - content_types_present.add(fh.content_type) - data[fh.content_type] = fh + content_types_present.add(content_type) + assert content_type is not None + data[content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"] ) if precondition_status_response: return precondition_status_response try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: return make_error_dict( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) # actual storage of data deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) + assert filehandler is not None return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": filehandler.name, "status": deposit.status, } def _atom_entry( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict[str, Any]: """Atom entry deposit. Args: request (Request): the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier if provided - replace_metadata (bool): 'Update or add' request to existing + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier if provided + replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. - replace_archives (bool): 'Update or add' request to existing + replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated to the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) external_id = metadata.get("external_identifier", headers["slug"]) + # TODO: Determine if we are in the metadata-only deposit case. If it is, then + # save deposit and deposit request typed 'metadata' and send metadata to the + # metadata storage. Otherwise, do as existing deposit. + deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": None, "status": deposit.status, } - def _empty_post(self, request, headers, collection_name, deposit_id): + def _empty_post( + self, request: Request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Empty post to finalize an empty deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. """ deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { "deposit_id": deposit_id, "deposit_date": deposit.complete_date, "status": deposit.status, "archive": None, } - def _make_iris(self, request, collection_name, deposit_id): + def _make_iris( + self, request: Request, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Define the IRI endpoints Args: request (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary of keys with the iris' urls. """ args = [collection_name, deposit_id] return { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } - def additional_checks(self, request, headers, collection_name, deposit_id=None): + def additional_checks( + self, + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} - def checks(self, request, collection_name, deposit_id=None): + def checks( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> Dict[str, Any]: try: self._collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( NOT_FOUND, f"Unknown collection name {collection_name}" ) + assert self._collection is not None username = request.user.username if username: # unauthenticated request can have the username empty try: - self._client = DepositClient.objects.get(username=username) + self._client: DepositClient = DepositClient.objects.get( # type: ignore + username=username + ) except DepositClient.DoesNotExist: return make_error_dict(NOT_FOUND, f"Unknown client name {username}") - if self._collection.id not in self._client.collections: + collection_id = self._collection.id + collections = self._client.collections + assert collections is not None + if collection_id not in collections: return make_error_dict( FORBIDDEN, f"Client {username} cannot access collection {collection_name}", ) if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) checks = self.restrict_access(request, deposit) if checks: return checks headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") checks = self.additional_checks(request, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} - def restrict_access(self, request, deposit=None): + def restrict_access( + self, request: Request, deposit: Optional[Deposit] = None + ) -> Dict[str, Any]: if deposit: if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) + return {} - def _basic_not_allowed_method(self, request, method): + def _basic_not_allowed_method(self, request: Request, method: str): return make_error_response( request, METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) - def get(self, request, *args, **kwargs): + def get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") - def post(self, request, *args, **kwargs): + def post( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") - def put(self, request, *args, **kwargs): + def put( + self, request: Request, collection_name: str, deposit_id: int + ) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") - def delete(self, request, *args, **kwargs): + def delete( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ - def get(self, request, collection_name, deposit_id, format=None): + def get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) r = self.process_get(request, collection_name, deposit_id) - if isinstance(r, tuple): - status, content, content_type = r - return HttpResponse(content, status=status, content_type=content_type) - - return r + status, content, content_type = r + if content_type == "swh/generator": + with content as path: + return FileResponse( + open(path, "rb"), status=status, content_type="application/zip" + ) + if content_type == "application/json": + return HttpResponse( + json.dumps(content), status=status, content_type=content_type + ) + return HttpResponse(content, status=status, content_type=content_type) @abstractmethod - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ - def post(self, request, collection_name, deposit_id=None, format=None): + def post( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] _status, _iri_key, data = self.process_post( request, headers, collection_name, deposit_id ) error = data.get("error") if error: return make_error_response_from_dict(request, error) data["packagings"] = ACCEPT_PACKAGINGS iris = self._make_iris(request, collection_name, data["deposit_id"]) data.update(iris) response = render( request, "deposit/deposit_receipt.xml", context=data, content_type="application/xml", status=_status, ) - response._headers["location"] = "Location", data[_iri_key] + response._headers["location"] = "Location", data[_iri_key] # type: ignore return response @abstractmethod - def process_post(self, request, headers, collection_name, deposit_id=None): + def process_post( + self, + request, + headers: Dict, + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ - def put(self, request, collection_name, deposit_id, format=None): + def put( + self, request: Request, collection_name: str, deposit_id: int + ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] data = self.process_put(request, headers, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod - def process_put(self, request, headers, collection_name, deposit_id): + def process_put( + self, request: Request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ - def delete(self, request, collection_name, deposit_id): + def delete( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) + assert deposit_id is not None data = self.process_delete(request, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod - def process_delete(self, request, collection_name, deposit_id): + def process_delete( + self, request: Request, collection_name: str, deposit_id: int + ) -> Dict: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. .api.deposit_update.APIUpdateArchive) """ - pass + return {} diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py index b426b180..8cc4455c 100644 --- a/swh/deposit/api/deposit.py +++ b/swh/deposit/api/deposit.py @@ -1,98 +1,112 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict, Optional, Tuple + from rest_framework import status from ..config import EDIT_SE_IRI from ..errors import BAD_REQUEST, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, SWHFileUploadZipParser, SWHMultiPartParser, ) from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIPost class APIPostDeposit(APIPost): """Deposit request class defining api endpoints for sword deposit. What's known as 'Col IRI' in the sword specification. HTTP verbs supported: POST """ parser_classes = ( SWHMultiPartParser, SWHFileUploadZipParser, SWHFileUploadTarParser, SWHAtomEntryParser, ) - def additional_checks(self, req, headers, collection_name, deposit_id=None): + def additional_checks( + self, + req, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Dict[str, Any]: slug = headers["slug"] if not slug: msg = "Missing SLUG header in request" verbose_description = "Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing." # noqa return make_error_dict(BAD_REQUEST, msg, verbose_description) return {} - def process_post(self, req, headers, collection_name, deposit_id=None): + def process_post( + self, + req, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict[str, Any]]: """Create a first deposit as: - archive deposit (1 zip) - multipart (1 zip + 1 atom entry) - atom entry Args: req (Request): the request holding the information to parse and inject in db collection_name (str): the associated client Returns: An http response (HttpResponse) according to the situation. If everything is ok, a 201 response (created) with a deposit receipt. Otherwise, depending on the upload, the following errors can be returned: - archive deposit: - 400 (bad request) if the request is not providing an external identifier - 403 (forbidden) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or hash provided mismatch the reality of the archive. - 415 (unsupported media type) if a wrong media type is provided - multipart deposit: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided - Atom entry deposit: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ assert deposit_id is None if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES: data = self._binary_upload(req, headers, collection_name) elif req.content_type.startswith("multipart/"): data = self._multipart_upload(req, headers, collection_name) else: data = self._atom_entry(req, headers, collection_name) return status.HTTP_201_CREATED, EDIT_SE_IRI, data diff --git a/swh/deposit/api/deposit_content.py b/swh/deposit/api/deposit_content.py index a7f861f4..fbab2fe4 100644 --- a/swh/deposit/api/deposit_content.py +++ b/swh/deposit/api/deposit_content.py @@ -1,46 +1,47 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from django.http import HttpResponse from django.shortcuts import render from rest_framework import status from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit, DepositRequest from .common import APIBase class APIContent(APIBase): - def get(self, req, collection_name, deposit_id, format=None): + def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse: checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) requests = DepositRequest.objects.filter(deposit=deposit) context = { "deposit_id": deposit.id, "status": deposit.status, "status_detail": DEPOSIT_STATUS_DETAIL[deposit.status], "requests": requests, } return render( req, "deposit/content.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py index fa89276e..9c87db9c 100644 --- a/swh/deposit/api/deposit_status.py +++ b/swh/deposit/api/deposit_status.py @@ -1,64 +1,65 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from django.http import HttpResponse from django.shortcuts import render from rest_framework import status from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit from .common import APIBase from .converters import convert_status_detail class APIStatus(APIBase): """Deposit status. What's known as 'State IRI' in the sword specification. HTTP verbs supported: GET """ - def get(self, req, collection_name, deposit_id, format=None): + def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse: checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) status_detail = convert_status_detail(deposit.status_detail) if not status_detail: status_detail = DEPOSIT_STATUS_DETAIL[deposit.status] context = { "deposit_id": deposit.id, "status_detail": status_detail, } keys = ( "status", "swh_id", "swh_id_context", "external_id", ) for k in keys: context[k] = getattr(deposit, k, None) return render( req, "deposit/status.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index 749edd37..ded1bf5f 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,169 +1,185 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict, Optional, Tuple + from rest_framework import status from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI from ..errors import BAD_REQUEST, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, SWHFileUploadZipParser, SWHMultiPartParser, ) from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) - def process_put(self, req, headers, collection_name, deposit_id): + def process_put( + self, req, headers, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) - def process_post(self, req, headers, collection_name, deposit_id): + def process_post( + self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None + ) -> Tuple[int, str, Dict]: """Add new content to the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) - return "unused", "unused", make_error_dict(BAD_REQUEST, msg) + unused = 0 + return unused, "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) - def process_delete(self, req, collection_name, deposit_id): + def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) - def process_put(self, req, headers, collection_name, deposit_id): + def process_put( + self, req, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Replace existing deposit's metadata/archive with new ones. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa Returns: 204 No content """ if req.content_type.startswith("multipart/"): return self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) return self._atom_entry( req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True ) - def process_post(self, req, headers, collection_name, deposit_id): + def process_post( + self, + request, + headers: Dict, + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict]: """Add new metadata/archive to existing deposit. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa This also deals with an empty post corner case to finalize a deposit. Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ - if req.content_type.startswith("multipart/"): + assert deposit_id is not None + if request.content_type.startswith("multipart/"): return ( status.HTTP_201_CREATED, EM_IRI, self._multipart_upload( - req, headers, collection_name, deposit_id=deposit_id + request, headers, collection_name, deposit_id=deposit_id ), ) # check for final empty post # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html # #continueddeposit_complete if headers["content-length"] == 0 and headers["in-progress"] is False: - data = self._empty_post(req, headers, collection_name, deposit_id) + data = self._empty_post(request, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) return ( status.HTTP_201_CREATED, EM_IRI, - self._atom_entry(req, headers, collection_name, deposit_id=deposit_id), + self._atom_entry(request, headers, collection_name, deposit_id=deposit_id), ) - def process_delete(self, req, collection_name, deposit_id): + def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete the container (deposit). source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py index e9b98ee3..4a9aaaa8 100644 --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -1,108 +1,96 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.permissions import AllowAny from swh.deposit import utils from swh.deposit.api.common import AuthenticatedAPIView from swh.deposit.errors import NOT_FOUND, make_error_dict from ...config import METADATA_TYPE, APIConfig from ...models import Deposit, DepositRequest class DepositReadMixin: """Deposit Read mixin """ def _deposit_requests(self, deposit, request_type): """Given a deposit, yields its associated deposit_request Args: deposit (Deposit): Deposit to list requests for request_type (str): 'archive' or 'metadata' Yields: deposit requests of type request_type associated to the deposit """ if isinstance(deposit, int): deposit = Deposit.objects.get(pk=deposit) deposit_requests = DepositRequest.objects.filter( type=request_type, deposit=deposit ).order_by("id") for deposit_request in deposit_requests: yield deposit_request def _metadata_get(self, deposit): """Given a deposit, aggregate all metadata requests. Args: deposit (Deposit): The deposit instance to extract metadata from. Returns: metadata dict from the deposit. """ metadata = ( m.metadata for m in self._deposit_requests(deposit, request_type=METADATA_TYPE) ) return utils.merge(*metadata) class APIPrivateView(APIConfig, AuthenticatedAPIView): """Mixin intended as private api (so no authentication) based API view (for the private ones). """ authentication_classes = () permission_classes = (AllowAny,) def checks(self, req, collection_name, deposit_id=None): """Override default checks implementation to allow empty collection. """ if deposit_id: try: Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "Deposit with id %s does not exist" % deposit_id ) headers = self._read_headers(req) checks = self.additional_checks(req, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def get( - self, - request, - collection_name=None, - deposit_id=None, - format=None, - *args, - **kwargs, + self, request, collection_name=None, deposit_id=None, *args, **kwargs, ): - return super().get(request, collection_name, deposit_id, format) + return super().get(request, collection_name, deposit_id) def put( - self, - request, - collection_name=None, - deposit_id=None, - format=None, - *args, - **kwargs, + self, request, collection_name=None, deposit_id=None, *args, **kwargs, ): - return super().put(request, collection_name, deposit_id, format) + return super().put(request, collection_name, deposit_id) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index 680ec83c..d2afd5e7 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,228 +1,234 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from itertools import chain -import json import re from shutil import get_unpack_formats import tarfile +from typing import Dict, Optional, Tuple import zipfile from rest_framework import status from swh.scheduler.utils import create_oneshot_task_dict from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED -from ...models import Deposit +from ...models import Deposit, DepositRequest from ..common import APIGet MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) class APIChecks(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ - def _check_deposit_archives(self, deposit): + def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns tuple (status, error_detail): True, None if all archives are ok, (False, ) otherwise. """ requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} - def _check_archive(self, archive_request): + def _check_archive( + self, archive_request: DepositRequest + ) -> Tuple[bool, Optional[str]]: """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. """ archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): - with zipfile.ZipFile(archive_path) as f: - files = f.namelist() + with zipfile.ZipFile(archive_path) as zipfile_: + files = zipfile_.namelist() elif tarfile.is_tarfile(archive_path): - with tarfile.open(archive_path) as f: - files = f.getnames() + with tarfile.open(archive_path) as tarfile_: + files = tarfile_.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None - def _check_metadata(self, metadata): + def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]: """Check to execute on all metadata for mandatory field presence. Args: metadata (dict): Metadata dictionary to check for mandatory fields Returns: tuple (status, error_detail): True, None if metadata are ok (False, ) otherwise. """ required_fields = { "author": False, } alternate_fields = { ("name", "title"): False, # alternate field, at least one # of them must be present } for field, value in metadata.items(): for name in required_fields: if name in field: required_fields[name] = True for possible_names in alternate_fields: for possible_name in possible_names: if possible_name in field: alternate_fields[possible_names] = True continue mandatory_result = [k for k, v in required_fields.items() if not v] optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v] if mandatory_result == [] and optional_result == []: return True, None detail = [] if mandatory_result != []: detail.append( {"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} ) if optional_result != []: detail.append( {"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} ) return False, {"metadata": detail} - def process_get(self, req, collection_name, deposit_id): + def process_get( + self, req, collection_name: str, deposit_id: int + ) -> Tuple[int, Dict, str]: """Build a unique tarball from the multiple received and stream that content to the client. Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ deposit = Deposit.objects.get(pk=deposit_id) metadata = self._metadata_get(deposit) - problems = {} + problems: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status, error_detail = self._check_deposit_archives(deposit) if not archives_status: + assert error_detail is not None problems.update(error_detail) metadata_status, error_detail = self._check_metadata(metadata) if not metadata_status: + assert error_detail is not None problems.update(error_detail) deposit_status = archives_status and metadata_status # if any problems arose, the deposit is rejected if not deposit_status: deposit.status = DEPOSIT_STATUS_REJECTED deposit.status_detail = problems response = { "status": deposit.status, "details": deposit.status_detail, } else: deposit.status = DEPOSIT_STATUS_VERIFIED response = { "status": deposit.status, } if not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() - return status.HTTP_200_OK, json.dumps(response), "application/json" + return status.HTTP_200_OK, response, "application/json" diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index 4a5f388a..51b6636e 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,197 +1,195 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager -import json import os import shutil import tempfile +from typing import Any, Dict, Tuple -from django.http import FileResponse from rest_framework import status from swh.core import tarball from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date from swh.model import identifiers from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request, collection_name: str, deposit_id: int + ) -> Tuple[int, Any, str]: """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): - collection_name (str): Collection owning the deposit - deposit_id (id): Deposit concerned by the reading + collection_name: Collection owning the deposit + deposit_id: Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] - with aggregate_tarballs(self.extraction_dir, archive_paths) as path: - return FileResponse( - open(path, "rb"), - status=status.HTTP_200_OK, - content_type="application/zip", - ) + return ( + status.HTTP_200_OK, + aggregate_tarballs(self.extraction_dir, archive_paths), + "swh/generator", + ) class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. """ def __init__(self): super().__init__() self.provider = self.config["provider"] self.tool = { "name": "swh-deposit", "version": __version__, "configuration": {"sword_version": "2"}, } def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. """ metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} # metadata provider self.provider["provider_name"] = deposit.client.last_name self.provider["provider_url"] = deposit.client.provider_url author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id swhid = identifiers.parse_swhid(swh_persistent_id) parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { "provider": self.provider, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request, collection_name: str, deposit_id: int + ) -> Tuple[int, Dict, str]: deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) - d = {} - if data: - d = json.dumps(data) - - return status.HTTP_200_OK, d, "application/json" + return status.HTTP_200_OK, data if data else {}, "application/json" diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index 9df47390..af6bcb6c 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,103 +1,107 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Dict + from rest_framework.parsers import JSONParser from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from . import APIPrivateView from ...errors import BAD_REQUEST, make_error_dict from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit from ..common import APIPut MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] class APIUpdateStatus(APIPrivateView, APIPut): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks(self, request, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) return make_error_dict(BAD_REQUEST, msg) return {} - def process_put(self, request, headers, collection_name, deposit_id): + def process_put( + self, request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict: """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data deposit = Deposit.objects.get(pk=deposit_id) status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] dir_id = swhid(DIRECTORY, directory_id) snp_id = swhid(SNAPSHOT, data["snapshot_id"]) rev_id = swhid(REVISION, revision_id) deposit.swh_id = dir_id # new id with contextual information deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin_url, "visit": snp_id, "anchor": rev_id, "path": "/", }, ) else: # rejected deposit.status = status deposit.save() return {}