Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index 6f696078..6bed49c5 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -1,958 +1,1033 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from abc import ABCMeta, abstractmethod
+import datetime
import hashlib
-from typing import Sequence, Type
+import json
+from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
-from django.http import HttpResponse
+from django.http import FileResponse, HttpResponse
from django.shortcuts import render
from django.urls import reverse
from django.utils import timezone
from rest_framework import status
from rest_framework.authentication import BaseAuthentication, BasicAuthentication
from rest_framework.permissions import BasePermission, IsAuthenticated
+from rest_framework.request import Request
from rest_framework.views import APIView
from swh.model import hashutil
from swh.scheduler.utils import create_oneshot_task_dict
from ..config import (
ARCHIVE_KEY,
ARCHIVE_TYPE,
CONT_FILE_IRI,
DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_PARTIAL,
EDIT_SE_IRI,
EM_IRI,
METADATA_KEY,
METADATA_TYPE,
RAW_METADATA_KEY,
STATE_IRI,
APIConfig,
)
from ..errors import (
BAD_REQUEST,
CHECKSUM_MISMATCH,
ERROR_CONTENT,
FORBIDDEN,
MAX_UPLOAD_SIZE_EXCEEDED,
MEDIATION_NOT_ALLOWED,
METHOD_NOT_ALLOWED,
NOT_FOUND,
PARSING_ERROR,
ParserError,
make_error_dict,
make_error_response,
make_error_response_from_dict,
)
from ..models import Deposit, DepositClient, DepositCollection, DepositRequest
from ..parsers import parse_xml
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"]
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"]
class AuthenticatedAPIView(APIView):
"""Mixin intended as a base API view to enforce the basic
authentication check
"""
authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,)
permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,)
class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta):
"""Base deposit request class sharing multiple common behaviors.
"""
- def _read_headers(self, request):
+ def _read_headers(self, request: Request) -> Dict[str, Any]:
"""Read and unify the necessary headers from the request (those are
not stored in the same location or not properly formatted).
Args:
request (Request): Input request
Returns:
Dictionary with the following keys (some associated values may be
None):
- content-type
- content-length
- in-progress
- content-disposition
- packaging
- slug
- on-behalf-of
"""
meta = request._request.META
content_type = request.content_type
content_length = meta.get("CONTENT_LENGTH")
if content_length and isinstance(content_length, str):
content_length = int(content_length)
# final deposit if not provided
in_progress = meta.get("HTTP_IN_PROGRESS", False)
content_disposition = meta.get("HTTP_CONTENT_DISPOSITION")
if isinstance(in_progress, str):
in_progress = in_progress.lower() == "true"
content_md5sum = meta.get("HTTP_CONTENT_MD5")
if content_md5sum:
content_md5sum = bytes.fromhex(content_md5sum)
packaging = meta.get("HTTP_PACKAGING")
slug = meta.get("HTTP_SLUG")
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
return {
"content-type": content_type,
"content-length": content_length,
"in-progress": in_progress,
"content-disposition": content_disposition,
"content-md5sum": content_md5sum,
"packaging": packaging,
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
}
- def _compute_md5(self, filehandler):
+ def _compute_md5(self, filehandler) -> bytes:
"""Compute uploaded file's md5 sum.
Args:
filehandler (InMemoryUploadedFile): the file to compute the md5
hash
Returns:
the md5 checksum as raw bytes (the hash digest)
"""
h = hashlib.md5()
for chunk in filehandler:
h.update(chunk)
return h.digest()
def _deposit_put(
- self, request, deposit_id=None, in_progress=False, external_id=None
- ):
+ self,
+ request: Request,
+ deposit_id: Optional[int] = None,
+ in_progress: bool = False,
+ external_id: Optional[str] = None,
+ ) -> Deposit:
"""Save/Update a deposit in db.
Args:
- deposit_id (int): deposit identifier
- in_progress (dict): The deposit's status
- external_id (str): The external identifier to associate to
- the deposit
+ request: request data
+ deposit_id: deposit identifier
+ in_progress: deposit status
+ external_id: external identifier to associate to the deposit
Returns:
The Deposit instance saved or updated.
"""
+ complete_date: Optional[datetime.datetime] = None
+ deposit_parent: Optional[Deposit] = None
+
if in_progress is False:
complete_date = timezone.now()
status_type = DEPOSIT_STATUS_DEPOSITED
else:
- complete_date = None
status_type = DEPOSIT_STATUS_PARTIAL
if not deposit_id:
try:
- # find a deposit parent (same external id, status load
- # to success)
+ # find a deposit parent (same external id, status load to success)
deposit_parent = (
Deposit.objects.filter(
external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS
)
.order_by("-id")[0:1]
.get()
) # noqa
except Deposit.DoesNotExist:
- deposit_parent = None
+ # then no parent for that deposit, deposit_parent already None
+ pass
+ assert external_id is not None
deposit = Deposit(
collection=self._collection,
external_id=external_id,
complete_date=complete_date,
status=status_type,
client=self._client,
parent=deposit_parent,
)
else:
deposit = Deposit.objects.get(pk=deposit_id)
# update metadata
deposit.complete_date = complete_date
deposit.status = status_type
if self.config["checks"]:
deposit.save() # needed to have a deposit id
scheduler = self.scheduler
if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id:
task = create_oneshot_task_dict(
"check-deposit",
collection=deposit.collection.name,
deposit_id=deposit.id,
)
check_task_id = scheduler.create_tasks([task])[0]["id"]
deposit.check_task_id = check_task_id
deposit.save()
return deposit
def _deposit_request_put(
self,
- deposit,
- deposit_request_data,
- replace_metadata=False,
- replace_archives=False,
- ):
+ deposit: Deposit,
+ deposit_request_data: Dict[str, Any],
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> None:
"""Save a deposit request with metadata attached to a deposit.
Args:
- deposit (Deposit): The deposit concerned by the request
- deposit_request_data (dict): The dictionary with at most 2 deposit
- request types (archive, metadata) to associate to the deposit
- replace_metadata (bool): Flag defining if we add or update
+ deposit: The deposit concerned by the request
+ deposit_request_data: The dictionary with at most 2 deposit
+ request types (archive, metadata) to associate to the deposit
+ replace_metadata: Flag defining if we add or update
existing metadata to the deposit
- replace_archives (bool): Flag defining if we add or update
+ replace_archives: Flag defining if we add or update
archives to existing deposit
Returns:
None
"""
if replace_metadata:
DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete()
if replace_archives:
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
deposit_request = None
archive_file = deposit_request_data.get(ARCHIVE_KEY)
if archive_file:
deposit_request = DepositRequest(
type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file
)
deposit_request.save()
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
- raw_metadata = deposit_request_data.get(RAW_METADATA_KEY)
+ raw_metadata = deposit_request_data[RAW_METADATA_KEY]
deposit_request = DepositRequest(
type=METADATA_TYPE,
deposit=deposit,
metadata=metadata,
raw_metadata=raw_metadata.decode("utf-8"),
)
deposit_request.save()
assert deposit_request is not None
- def _delete_archives(self, collection_name, deposit_id):
- """Delete archives reference from the deposit id.
+ def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict:
+ """Delete archive references from the deposit id.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
return {}
- def _delete_deposit(self, collection_name, deposit_id):
+ def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete deposit reference.
Args:
- collection_name (str): Client's name
- deposit_id (id): The deposit to delete
+ collection_name: Client's collection
+ deposit_id: The deposit to delete
Returns
Empty dict when ok.
Dict with error key to describe the failure.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
if deposit.collection.name != collection_name:
summary = "Cannot delete a deposit from another collection"
description = "Deposit %s does not belong to the collection %s" % (
deposit_id,
collection_name,
)
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
DepositRequest.objects.filter(deposit=deposit).delete()
deposit.delete()
return {}
- def _check_preconditions_on(self, filehandler, md5sum, content_length=None):
+ def _check_preconditions_on(
+ self, filehandler, md5sum: str, content_length: Optional[int] = None
+ ) -> Optional[Dict]:
"""Check preconditions on provided file are respected. That is the
length and/or the md5sum hash match the file's content.
Args:
filehandler (InMemoryUploadedFile): The file to check
- md5sum (hex str): md5 hash expected from the file's content
- content_length (int): the expected length if provided.
+ md5sum: md5 hash expected from the file's content
+ content_length: the expected length if provided.
Returns:
Either none if no error or a dictionary with a key error
detailing the problem.
"""
max_upload_size = self.config["max_upload_size"]
if content_length:
if content_length > max_upload_size:
return make_error_dict(
MAX_UPLOAD_SIZE_EXCEEDED,
f"Upload size limit exceeded (max {max_upload_size} bytes)."
"Please consider sending the archive in multiple steps.",
)
length = filehandler.size
if length != content_length:
return make_error_dict(
status.HTTP_412_PRECONDITION_FAILED, "Wrong length"
)
if md5sum:
_md5sum = self._compute_md5(filehandler)
if _md5sum != md5sum:
return make_error_dict(
CHECKSUM_MISMATCH,
"Wrong md5 hash",
f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual "
f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.",
)
return None
def _binary_upload(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict[str, Any]:
"""Binary upload routine.
Other than such a request, a 415 response is returned.
Args:
request (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
replace_metadata (bool): 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives (bool): 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- status: Deposit status
- archive: Name of the uploaded archive file
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 412 (precondition failed) if the length or md5 hash provided
mismatch the reality of the archive
- 415 (unsupported media type) if a wrong media type is provided
"""
content_length = headers["content-length"]
if not content_length:
return make_error_dict(
BAD_REQUEST,
"CONTENT_LENGTH header is mandatory",
"For archive deposit, the CONTENT_LENGTH header must be sent.",
)
content_disposition = headers["content-disposition"]
if not content_disposition:
return make_error_dict(
BAD_REQUEST,
"CONTENT_DISPOSITION header is mandatory",
"For archive deposit, the CONTENT_DISPOSITION header must be sent.",
)
packaging = headers["packaging"]
if packaging and packaging not in ACCEPT_PACKAGINGS:
return make_error_dict(
BAD_REQUEST,
f"Only packaging {ACCEPT_PACKAGINGS} is supported",
f"The packaging provided {packaging} is not supported",
)
filehandler = request.FILES["file"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"], content_length
)
if precondition_status_response:
return precondition_status_response
external_id = headers["slug"]
# actual storage of data
archive_metadata = filehandler
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
self._deposit_request_put(
deposit,
{ARCHIVE_KEY: archive_metadata},
replace_metadata=replace_metadata,
replace_archives=replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"status": deposit.status,
"archive": filehandler.name,
}
- def _read_metadata(self, metadata_stream):
+ def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]:
"""Given a metadata stream, reads the metadata and returns both the
parsed and the raw metadata.
"""
raw_metadata = metadata_stream.read()
metadata = parse_xml(raw_metadata)
return raw_metadata, metadata
def _multipart_upload(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict:
"""Multipart upload supported with exactly:
- 1 archive (zip)
- 1 atom entry
Other than such a request, a 415 response is returned.
Args:
request (Request): the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier if provided
- replace_metadata (bool): 'Update or add' request to existing
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier if provided
+ replace_metadata: 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
- replace_archives (bool): 'Update or add' request to existing
+ replace_archives: 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- archive: Name of the uploaded archive file
- status: Deposit status
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 412 (precondition failed) if the md5 hash provided (if any)
does not match the actual content of the archive
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 415 (unsupported media type) if a wrong media type is provided
"""
external_id = headers["slug"]
content_types_present = set()
- data = {
+ data: Dict[str, Optional[Any]] = {
"application/zip": None, # expected either zip
"application/x-tar": None, # or x-tar
"application/atom+xml": None,
}
for key, value in request.FILES.items():
fh = value
- if fh.content_type in content_types_present:
+ content_type = fh.content_type
+ if content_type in content_types_present:
return make_error_dict(
ERROR_CONTENT,
"Only 1 application/zip (or application/x-tar) archive "
"and 1 atom+xml entry is supported (as per sword2.0 "
"specification)",
"You provided more than 1 application/(zip|x-tar) "
"or more than 1 application/atom+xml content-disposition "
"header in the multipart deposit",
)
- content_types_present.add(fh.content_type)
- data[fh.content_type] = fh
+ content_types_present.add(content_type)
+ assert content_type is not None
+ data[content_type] = fh
if len(content_types_present) != 2:
return make_error_dict(
ERROR_CONTENT,
"You must provide both 1 application/zip (or "
"application/x-tar) and 1 atom+xml entry for multipart "
"deposit",
"You need to provide only 1 application/(zip|x-tar) "
"and 1 application/atom+xml content-disposition header "
"in the multipart deposit",
)
filehandler = data["application/zip"]
if not filehandler:
filehandler = data["application/x-tar"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"]
)
if precondition_status_response:
return precondition_status_response
try:
raw_metadata, metadata = self._read_metadata(data["application/atom+xml"])
except ParserError:
return make_error_dict(
PARSING_ERROR,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
# actual storage of data
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
METADATA_KEY: metadata,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
deposit, deposit_request_data, replace_metadata, replace_archives
)
+ assert filehandler is not None
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": filehandler.name,
"status": deposit.status,
}
def _atom_entry(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict[str, Any]:
"""Atom entry deposit.
Args:
request (Request): the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier if provided
- replace_metadata (bool): 'Update or add' request to existing
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier if provided
+ replace_metadata: 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
- replace_archives (bool): 'Update or add' request to existing
+ replace_archives: 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id: deposit id associated to the deposit
- deposit_date: date of the deposit
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is provided
"""
try:
raw_metadata, metadata = self._read_metadata(request.data)
except ParserError:
return make_error_dict(
BAD_REQUEST,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
if not metadata:
return make_error_dict(
BAD_REQUEST,
"Empty body request is not supported",
"Atom entry deposit is supposed to send for metadata. "
"If the body is empty, there is no metadata.",
)
external_id = metadata.get("external_identifier", headers["slug"])
+ # TODO: Determine if we are in the metadata-only deposit case. If it is, then
+ # save deposit and deposit request typed 'metadata' and send metadata to the
+ # metadata storage. Otherwise, do as existing deposit.
+
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
self._deposit_request_put(
deposit,
{METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": None,
"status": deposit.status,
}
- def _empty_post(self, request, headers, collection_name, deposit_id):
+ def _empty_post(
+ self, request: Request, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Empty post to finalize an empty deposit.
Args:
- request (Request): the request holding information to parse
+ request: the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier
Returns:
Dictionary of result with the deposit's id, the date
it was completed and no archive.
"""
deposit = Deposit.objects.get(pk=deposit_id)
deposit.complete_date = timezone.now()
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
return {
"deposit_id": deposit_id,
"deposit_date": deposit.complete_date,
"status": deposit.status,
"archive": None,
}
- def _make_iris(self, request, collection_name, deposit_id):
+ def _make_iris(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Define the IRI endpoints
Args:
request (Request): The initial request
collection_name (str): client/collection's name
deposit_id (id): Deposit identifier
Returns:
Dictionary of keys with the iris' urls.
"""
args = [collection_name, deposit_id]
return {
iri: request.build_absolute_uri(reverse(iri, args=args))
for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI]
}
- def additional_checks(self, request, headers, collection_name, deposit_id=None):
+ def additional_checks(
+ self,
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Dict[str, Any]:
"""Permit the child class to enrich additional checks.
Returns:
dict with 'error' detailing the problem.
"""
return {}
- def checks(self, request, collection_name, deposit_id=None):
+ def checks(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> Dict[str, Any]:
try:
self._collection = DepositCollection.objects.get(name=collection_name)
except DepositCollection.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"Unknown collection name {collection_name}"
)
+ assert self._collection is not None
username = request.user.username
if username: # unauthenticated request can have the username empty
try:
- self._client = DepositClient.objects.get(username=username)
+ self._client: DepositClient = DepositClient.objects.get( # type: ignore
+ username=username
+ )
except DepositClient.DoesNotExist:
return make_error_dict(NOT_FOUND, f"Unknown client name {username}")
- if self._collection.id not in self._client.collections:
+ collection_id = self._collection.id
+ collections = self._client.collections
+ assert collections is not None
+ if collection_id not in collections:
return make_error_dict(
FORBIDDEN,
f"Client {username} cannot access collection {collection_name}",
)
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
checks = self.restrict_access(request, deposit)
if checks:
return checks
headers = self._read_headers(request)
if headers["on-behalf-of"]:
return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")
checks = self.additional_checks(request, headers, collection_name, deposit_id)
if "error" in checks:
return checks
return {"headers": headers}
- def restrict_access(self, request, deposit=None):
+ def restrict_access(
+ self, request: Request, deposit: Optional[Deposit] = None
+ ) -> Dict[str, Any]:
if deposit:
if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
summary = "You can only act on deposit with status '%s'" % (
DEPOSIT_STATUS_PARTIAL,
)
description = f"This deposit has status '{deposit.status}'"
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
+ return {}
- def _basic_not_allowed_method(self, request, method):
+ def _basic_not_allowed_method(self, request: Request, method: str):
return make_error_response(
request,
METHOD_NOT_ALLOWED,
f"{method} method is not supported on this endpoint",
)
- def get(self, request, *args, **kwargs):
+ def get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Union[HttpResponse, FileResponse]:
return self._basic_not_allowed_method(request, "GET")
- def post(self, request, *args, **kwargs):
+ def post(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "POST")
- def put(self, request, *args, **kwargs):
+ def put(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "PUT")
- def delete(self, request, *args, **kwargs):
+ def delete(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "DELETE")
class APIGet(APIBase, metaclass=ABCMeta):
"""Mixin for class to support GET method.
"""
- def get(self, request, collection_name, deposit_id, format=None):
+ def get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Union[HttpResponse, FileResponse]:
"""Endpoint handling GET requests on a deposit (see process_get).
Returns:
200 response when no error during routine occurred
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
r = self.process_get(request, collection_name, deposit_id)
- if isinstance(r, tuple):
- status, content, content_type = r
- return HttpResponse(content, status=status, content_type=content_type)
-
- return r
+ status, content, content_type = r
+ if content_type == "swh/generator":
+ with content as path:
+ return FileResponse(
+ open(path, "rb"), status=status, content_type="application/zip"
+ )
+ if content_type == "application/json":
+ return HttpResponse(
+ json.dumps(content), status=status, content_type=content_type
+ )
+ return HttpResponse(content, status=status, content_type=content_type)
@abstractmethod
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Any, str]:
"""Routine to deal with the deposit's get processing.
Returns:
Tuple status, stream of content, content-type
"""
pass
class APIPost(APIBase, metaclass=ABCMeta):
"""Mixin for class to support POST method.
"""
- def post(self, request, collection_name, deposit_id=None, format=None):
+ def post(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
"""Endpoint to create/add resources to deposit.
Returns:
process_post's status code (200, 201, etc...) when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
_status, _iri_key, data = self.process_post(
request, headers, collection_name, deposit_id
)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
data["packagings"] = ACCEPT_PACKAGINGS
iris = self._make_iris(request, collection_name, data["deposit_id"])
data.update(iris)
response = render(
request,
"deposit/deposit_receipt.xml",
context=data,
content_type="application/xml",
status=_status,
)
- response._headers["location"] = "Location", data[_iri_key]
+ response._headers["location"] = "Location", data[_iri_key] # type: ignore
return response
@abstractmethod
- def process_post(self, request, headers, collection_name, deposit_id=None):
+ def process_post(
+ self,
+ request,
+ headers: Dict,
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict]:
"""Routine to deal with the deposit's processing.
Returns
Tuple of:
- response status code (200, 201, etc...)
- key iri (EM_IRI, EDIT_SE_IRI, etc...)
- dictionary of the processing result
"""
pass
class APIPut(APIBase, metaclass=ABCMeta):
"""Mixin for class to support PUT method.
"""
- def put(self, request, collection_name, deposit_id, format=None):
+ def put(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> HttpResponse:
"""Endpoint to update deposit resources.
Returns:
204 response when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
data = self.process_put(request, headers, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
- def process_put(self, request, headers, collection_name, deposit_id):
+ def process_put(
+ self, request: Request, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Routine to deal with updating a deposit in some way.
Returns
dictionary of the processing result
"""
pass
class APIDelete(APIBase, metaclass=ABCMeta):
"""Mixin for class to support DELETE method.
"""
- def delete(self, request, collection_name, deposit_id):
+ def delete(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
"""Endpoint to delete some deposit's resources (archives, deposit).
Returns:
204 response when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
+ assert deposit_id is not None
data = self.process_delete(request, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
- def process_delete(self, request, collection_name, deposit_id):
+ def process_delete(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Dict:
"""Routine to delete a resource.
This is mostly not allowed except for the
EM_IRI (cf. .api.deposit_update.APIUpdateArchive)
"""
- pass
+ return {}
diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py
index b426b180..8cc4455c 100644
--- a/swh/deposit/api/deposit.py
+++ b/swh/deposit/api/deposit.py
@@ -1,98 +1,112 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Any, Dict, Optional, Tuple
+
from rest_framework import status
from ..config import EDIT_SE_IRI
from ..errors import BAD_REQUEST, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
SWHFileUploadZipParser,
SWHMultiPartParser,
)
from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIPost
class APIPostDeposit(APIPost):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'Col IRI' in the sword specification.
HTTP verbs supported: POST
"""
parser_classes = (
SWHMultiPartParser,
SWHFileUploadZipParser,
SWHFileUploadTarParser,
SWHAtomEntryParser,
)
- def additional_checks(self, req, headers, collection_name, deposit_id=None):
+ def additional_checks(
+ self,
+ req,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Dict[str, Any]:
slug = headers["slug"]
if not slug:
msg = "Missing SLUG header in request"
verbose_description = "Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing." # noqa
return make_error_dict(BAD_REQUEST, msg, verbose_description)
return {}
- def process_post(self, req, headers, collection_name, deposit_id=None):
+ def process_post(
+ self,
+ req,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict[str, Any]]:
"""Create a first deposit as:
- archive deposit (1 zip)
- multipart (1 zip + 1 atom entry)
- atom entry
Args:
req (Request): the request holding the information to parse
and inject in db
collection_name (str): the associated client
Returns:
An http response (HttpResponse) according to the situation.
If everything is ok, a 201 response (created) with a
deposit receipt.
Otherwise, depending on the upload, the following errors
can be returned:
- archive deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 403 (forbidden) if the length of the archive exceeds the
max size configured
- 412 (precondition failed) if the length or hash provided
mismatch the reality of the archive.
- 415 (unsupported media type) if a wrong media type is
provided
- multipart deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 412 (precondition failed) if the potentially md5 hash
provided mismatch the reality of the archive
- 415 (unsupported media type) if a wrong media type is
provided
- Atom entry deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is
provided
"""
assert deposit_id is None
if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES:
data = self._binary_upload(req, headers, collection_name)
elif req.content_type.startswith("multipart/"):
data = self._multipart_upload(req, headers, collection_name)
else:
data = self._atom_entry(req, headers, collection_name)
return status.HTTP_201_CREATED, EDIT_SE_IRI, data
diff --git a/swh/deposit/api/deposit_content.py b/swh/deposit/api/deposit_content.py
index a7f861f4..fbab2fe4 100644
--- a/swh/deposit/api/deposit_content.py
+++ b/swh/deposit/api/deposit_content.py
@@ -1,46 +1,47 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from django.http import HttpResponse
from django.shortcuts import render
from rest_framework import status
from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict
from ..models import DEPOSIT_STATUS_DETAIL, Deposit, DepositRequest
from .common import APIBase
class APIContent(APIBase):
- def get(self, req, collection_name, deposit_id, format=None):
+ def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse:
checks = self.checks(req, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(req, checks["error"])
try:
deposit = Deposit.objects.get(pk=deposit_id)
if deposit.collection.name != collection_name:
raise Deposit.DoesNotExist
except Deposit.DoesNotExist:
return make_error_response(
req,
NOT_FOUND,
"deposit %s does not belong to collection %s"
% (deposit_id, collection_name),
)
requests = DepositRequest.objects.filter(deposit=deposit)
context = {
"deposit_id": deposit.id,
"status": deposit.status,
"status_detail": DEPOSIT_STATUS_DETAIL[deposit.status],
"requests": requests,
}
return render(
req,
"deposit/content.xml",
context=context,
content_type="application/xml",
status=status.HTTP_200_OK,
)
diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py
index fa89276e..9c87db9c 100644
--- a/swh/deposit/api/deposit_status.py
+++ b/swh/deposit/api/deposit_status.py
@@ -1,64 +1,65 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from django.http import HttpResponse
from django.shortcuts import render
from rest_framework import status
from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict
from ..models import DEPOSIT_STATUS_DETAIL, Deposit
from .common import APIBase
from .converters import convert_status_detail
class APIStatus(APIBase):
"""Deposit status.
What's known as 'State IRI' in the sword specification.
HTTP verbs supported: GET
"""
- def get(self, req, collection_name, deposit_id, format=None):
+ def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse:
checks = self.checks(req, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(req, checks["error"])
try:
deposit = Deposit.objects.get(pk=deposit_id)
if deposit.collection.name != collection_name:
raise Deposit.DoesNotExist
except Deposit.DoesNotExist:
return make_error_response(
req,
NOT_FOUND,
"deposit %s does not belong to collection %s"
% (deposit_id, collection_name),
)
status_detail = convert_status_detail(deposit.status_detail)
if not status_detail:
status_detail = DEPOSIT_STATUS_DETAIL[deposit.status]
context = {
"deposit_id": deposit.id,
"status_detail": status_detail,
}
keys = (
"status",
"swh_id",
"swh_id_context",
"external_id",
)
for k in keys:
context[k] = getattr(deposit, k, None)
return render(
req,
"deposit/status.xml",
context=context,
content_type="application/xml",
status=status.HTTP_200_OK,
)
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
index 749edd37..ded1bf5f 100644
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -1,169 +1,185 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Any, Dict, Optional, Tuple
+
from rest_framework import status
from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI
from ..errors import BAD_REQUEST, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
SWHFileUploadZipParser,
SWHMultiPartParser,
)
from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut
class APIUpdateArchive(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'EM IRI' in the sword specification.
HTTP verbs supported: PUT, POST, DELETE
"""
parser_classes = (
SWHFileUploadZipParser,
SWHFileUploadTarParser,
)
- def process_put(self, req, headers, collection_name, deposit_id):
+ def process_put(
+ self, req, headers, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Replace existing content for the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa
Returns:
204 No content
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
return make_error_dict(BAD_REQUEST, msg)
return self._binary_upload(
req, headers, collection_name, deposit_id=deposit_id, replace_archives=True
)
- def process_post(self, req, headers, collection_name, deposit_id):
+ def process_post(
+ self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None
+ ) -> Tuple[int, str, Dict]:
"""Add new content to the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa
Returns:
201 Created
Headers: Location: [Cont-File-IRI]
Body: [optional Deposit Receipt]
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
- return "unused", "unused", make_error_dict(BAD_REQUEST, msg)
+ unused = 0
+ return unused, "unused", make_error_dict(BAD_REQUEST, msg)
return (
status.HTTP_201_CREATED,
CONT_FILE_IRI,
self._binary_upload(req, headers, collection_name, deposit_id),
)
- def process_delete(self, req, collection_name, deposit_id):
+ def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete content (archives) from existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa
Returns:
204 No content
"""
return self._delete_archives(collection_name, deposit_id)
class APIUpdateMetadata(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'Edit IRI' (and SE IRI) in the sword specification.
HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE
"""
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
- def process_put(self, req, headers, collection_name, deposit_id):
+ def process_put(
+ self, req, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Replace existing deposit's metadata/archive with new ones.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa
Returns:
204 No content
"""
if req.content_type.startswith("multipart/"):
return self._multipart_upload(
req,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
return self._atom_entry(
req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True
)
- def process_post(self, req, headers, collection_name, deposit_id):
+ def process_post(
+ self,
+ request,
+ headers: Dict,
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict]:
"""Add new metadata/archive to existing deposit.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa
This also deals with an empty post corner case to finalize a
deposit.
Returns:
In optimal case for a multipart and atom-entry update, a
201 Created response. The body response will hold a
deposit. And the response headers will contain an entry
'Location' with the EM-IRI.
For the empty post case, this returns a 200.
"""
- if req.content_type.startswith("multipart/"):
+ assert deposit_id is not None
+ if request.content_type.startswith("multipart/"):
return (
status.HTTP_201_CREATED,
EM_IRI,
self._multipart_upload(
- req, headers, collection_name, deposit_id=deposit_id
+ request, headers, collection_name, deposit_id=deposit_id
),
)
# check for final empty post
# source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html
# #continueddeposit_complete
if headers["content-length"] == 0 and headers["in-progress"] is False:
- data = self._empty_post(req, headers, collection_name, deposit_id)
+ data = self._empty_post(request, headers, collection_name, deposit_id)
return (status.HTTP_200_OK, EDIT_SE_IRI, data)
return (
status.HTTP_201_CREATED,
EM_IRI,
- self._atom_entry(req, headers, collection_name, deposit_id=deposit_id),
+ self._atom_entry(request, headers, collection_name, deposit_id=deposit_id),
)
- def process_delete(self, req, collection_name, deposit_id):
+ def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete the container (deposit).
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa
"""
return self._delete_deposit(collection_name, deposit_id)
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
index e9b98ee3..4a9aaaa8 100644
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -1,108 +1,96 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from rest_framework.permissions import AllowAny
from swh.deposit import utils
from swh.deposit.api.common import AuthenticatedAPIView
from swh.deposit.errors import NOT_FOUND, make_error_dict
from ...config import METADATA_TYPE, APIConfig
from ...models import Deposit, DepositRequest
class DepositReadMixin:
"""Deposit Read mixin
"""
def _deposit_requests(self, deposit, request_type):
"""Given a deposit, yields its associated deposit_request
Args:
deposit (Deposit): Deposit to list requests for
request_type (str): 'archive' or 'metadata'
Yields:
deposit requests of type request_type associated to the deposit
"""
if isinstance(deposit, int):
deposit = Deposit.objects.get(pk=deposit)
deposit_requests = DepositRequest.objects.filter(
type=request_type, deposit=deposit
).order_by("id")
for deposit_request in deposit_requests:
yield deposit_request
def _metadata_get(self, deposit):
"""Given a deposit, aggregate all metadata requests.
Args:
deposit (Deposit): The deposit instance to extract
metadata from.
Returns:
metadata dict from the deposit.
"""
metadata = (
m.metadata
for m in self._deposit_requests(deposit, request_type=METADATA_TYPE)
)
return utils.merge(*metadata)
class APIPrivateView(APIConfig, AuthenticatedAPIView):
"""Base API view for the private api: no authentication is required and
any caller is allowed.
"""
authentication_classes = ()
permission_classes = (AllowAny,)
def checks(self, req, collection_name, deposit_id=None):
"""Override default checks implementation to allow empty collection.
"""
if deposit_id:
try:
Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, "Deposit with id %s does not exist" % deposit_id
)
headers = self._read_headers(req)
checks = self.additional_checks(req, headers, collection_name, deposit_id)
if "error" in checks:
return checks
return {"headers": headers}
def get(
- self,
- request,
- collection_name=None,
- deposit_id=None,
- format=None,
- *args,
- **kwargs,
+ self, request, collection_name=None, deposit_id=None, *args, **kwargs,
):
- return super().get(request, collection_name, deposit_id, format)
+ return super().get(request, collection_name, deposit_id)
def put(
- self,
- request,
- collection_name=None,
- deposit_id=None,
- format=None,
- *args,
- **kwargs,
+ self, request, collection_name=None, deposit_id=None, *args, **kwargs,
):
- return super().put(request, collection_name, deposit_id, format)
+ return super().put(request, collection_name, deposit_id)
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index 680ec83c..d2afd5e7 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -1,228 +1,234 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from itertools import chain
-import json
import re
from shutil import get_unpack_formats
import tarfile
+from typing import Dict, Optional, Tuple
import zipfile
from rest_framework import status
from swh.scheduler.utils import create_oneshot_task_dict
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED
-from ...models import Deposit
+from ...models import Deposit, DepositRequest
from ..common import APIGet
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing"
MANDATORY_ARCHIVE_UNREADABLE = (
"At least one of its associated archives is not readable" # noqa
)
MANDATORY_ARCHIVE_INVALID = (
"Mandatory archive is invalid (i.e contains only one archive)" # noqa
)
MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported"
MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected"
ARCHIVE_EXTENSIONS = [
"zip",
"tar",
"tar.gz",
"xz",
"tar.xz",
"bz2",
"tar.bz2",
"Z",
"tar.Z",
"tgz",
"7z",
]
PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS))
def known_archive_format(filename):
return any(
filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats()))
)
class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
"""Dedicated class to check a deposit's archives and metadata.
Only GET is supported.
"""
- def _check_deposit_archives(self, deposit):
+ def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]:
"""Given a deposit, check each deposit request of type archive.
Args:
The deposit to check archives for
Returns
tuple (status, error_detail): True, None if all archives
are ok, (False, <detailed-error>) otherwise.
"""
requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE))
if len(requests) == 0: # no associated archive is refused
return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]}
errors = []
for archive_request in requests:
check, error_message = self._check_archive(archive_request)
if not check:
errors.append(
{"summary": error_message, "fields": [archive_request.id]}
)
if not errors:
return True, None
return False, {"archive": errors}
- def _check_archive(self, archive_request):
+ def _check_archive(
+ self, archive_request: DepositRequest
+ ) -> Tuple[bool, Optional[str]]:
"""Check that a deposit associated archive is ok:
- readable
- supported archive format
- valid content: the archive does not contain a single archive file
If any of those checks are not ok, return the corresponding
failing check.
Args:
archive_request (DepositRequest): Archive to check
Returns:
(True, None) if archive is check compliant, (False,
<detail-error>) otherwise.
"""
archive_path = archive_request.archive.path
if not known_archive_format(archive_path):
return False, MANDATORY_ARCHIVE_UNSUPPORTED
try:
if zipfile.is_zipfile(archive_path):
- with zipfile.ZipFile(archive_path) as f:
- files = f.namelist()
+ with zipfile.ZipFile(archive_path) as zipfile_:
+ files = zipfile_.namelist()
elif tarfile.is_tarfile(archive_path):
- with tarfile.open(archive_path) as f:
- files = f.getnames()
+ with tarfile.open(archive_path) as tarfile_:
+ files = tarfile_.getnames()
else:
return False, MANDATORY_ARCHIVE_UNSUPPORTED
except Exception:
return False, MANDATORY_ARCHIVE_UNREADABLE
if len(files) > 1:
return True, None
element = files[0]
if PATTERN_ARCHIVE_EXTENSION.match(element):
# archive in archive!
return False, MANDATORY_ARCHIVE_INVALID
return True, None
- def _check_metadata(self, metadata):
+ def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]:
"""Check to execute on all metadata for mandatory field presence.
Args:
metadata (dict): Metadata dictionary to check for mandatory fields
Returns:
tuple (status, error_detail): True, None if metadata are
ok (False, <detailed-error>) otherwise.
"""
required_fields = {
"author": False,
}
alternate_fields = {
("name", "title"): False, # alternate field, at least one
# of them must be present
}
for field, value in metadata.items():
for name in required_fields:
if name in field:
required_fields[name] = True
for possible_names in alternate_fields:
for possible_name in possible_names:
if possible_name in field:
alternate_fields[possible_names] = True
continue
mandatory_result = [k for k, v in required_fields.items() if not v]
optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v]
if mandatory_result == [] and optional_result == []:
return True, None
detail = []
if mandatory_result != []:
detail.append(
{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}
)
if optional_result != []:
detail.append(
{"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,}
)
return False, {"metadata": detail}
- def process_get(self, req, collection_name, deposit_id):
+ def process_get(
+ self, req, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Dict, str]:
"""Check the deposit's archives and metadata, update its status
accordingly and report the outcome to the client.
Args:
req (Request):
collection_name (str): Collection owning the deposit
deposit_id (id): Deposit concerned by the reading
Returns:
Tuple status, check result (status and possible details), content-type
"""
deposit = Deposit.objects.get(pk=deposit_id)
metadata = self._metadata_get(deposit)
- problems = {}
+ problems: Dict = {}
# will check each deposit's associated request (both of type
# archive and metadata) for errors
archives_status, error_detail = self._check_deposit_archives(deposit)
if not archives_status:
+ assert error_detail is not None
problems.update(error_detail)
metadata_status, error_detail = self._check_metadata(metadata)
if not metadata_status:
+ assert error_detail is not None
problems.update(error_detail)
deposit_status = archives_status and metadata_status
# if any problems arose, the deposit is rejected
if not deposit_status:
deposit.status = DEPOSIT_STATUS_REJECTED
deposit.status_detail = problems
response = {
"status": deposit.status,
"details": deposit.status_detail,
}
else:
deposit.status = DEPOSIT_STATUS_VERIFIED
response = {
"status": deposit.status,
}
if not deposit.load_task_id and self.config["checks"]:
url = deposit.origin_url
task = create_oneshot_task_dict(
"load-deposit", url=url, deposit_id=deposit.id, retries_left=3
)
load_task_id = self.scheduler.create_tasks([task])[0]["id"]
deposit.load_task_id = load_task_id
deposit.save()
- return status.HTTP_200_OK, json.dumps(response), "application/json"
+ return status.HTTP_200_OK, response, "application/json"
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index 4a5f388a..51b6636e 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,197 +1,195 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from contextlib import contextmanager
-import json
import os
import shutil
import tempfile
+from typing import Any, Dict, Tuple
-from django.http import FileResponse
from rest_framework import status
from swh.core import tarball
from swh.deposit.api import __version__
from swh.deposit.utils import normalize_date
from swh.model import identifiers
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
from ..common import APIGet
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
"""Aggregate multiple tarballs into one and yield this new archive's
path.
Args:
extraction_dir (path): Path to use for the tarballs computation
archive_paths ([str]): Deposit's archive paths
Yields:
Path to the aggregated archive (its temporary directory is removed on exit)
"""
# rebuild one zip archive from (possibly) multiple ones
os.makedirs(extraction_dir, 0o755, exist_ok=True)
dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
# root folder to build an aggregated tarball
aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
# uncompress in a temporary location all archives
for archive_path in archive_paths:
tarball.uncompress(archive_path, aggregated_tarball_rootdir)
# Aggregate into one big tarball the multiple smaller ones
temp_tarpath = shutil.make_archive(
aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir
)
# can already clean up temporary directory
shutil.rmtree(aggregated_tarball_rootdir)
try:
yield temp_tarpath
finally:
shutil.rmtree(dir_path)
class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
def __init__(self):
super().__init__()
self.extraction_dir = self.config["extraction_dir"]
if not os.path.exists(self.extraction_dir):
os.makedirs(self.extraction_dir)
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Any, str]:
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
request (Request):
- collection_name (str): Collection owning the deposit
- deposit_id (id): Deposit concerned by the reading
+ collection_name: Collection owning the deposit
+ deposit_id: Deposit concerned by the reading
Returns:
Tuple status, stream of content, content-type
"""
archive_paths = [
r.archive.path
for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE)
]
- with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
- return FileResponse(
- open(path, "rb"),
- status=status.HTTP_200_OK,
- content_type="application/zip",
- )
+ return (
+ status.HTTP_200_OK,
+ aggregate_tarballs(self.extraction_dir, archive_paths),
+ "swh/generator",
+ )
class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
"""Class in charge of aggregating metadata on a deposit.
"""
def __init__(self):
super().__init__()
self.provider = self.config["provider"]
self.tool = {
"name": "swh-deposit",
"version": __version__,
"configuration": {"sword_version": "2"},
}
def _normalize_dates(self, deposit, metadata):
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
Args:
deposit (Deposit): Deposit model representation
metadata (Dict): Metadata dict representation
Returns:
Tuple of author date, committer date. Those dates are
swh normalized.
"""
commit_date = metadata.get("codemeta:datePublished")
author_date = metadata.get("codemeta:dateCreated")
if author_date and commit_date:
pass
elif commit_date:
author_date = commit_date
elif author_date:
commit_date = author_date
else:
author_date = deposit.complete_date
commit_date = deposit.complete_date
return (normalize_date(author_date), normalize_date(commit_date))
def metadata_read(self, deposit):
"""Read and aggregate multiple data on deposit into one unified data
dictionary.
Args:
deposit (Deposit): Deposit concerned by the data aggregation.
Returns:
Dictionary of data representing the deposit to inject in swh.
"""
metadata = self._metadata_get(deposit)
# Read information metadata
data = {"origin": {"type": "deposit", "url": deposit.origin_url,}}
# metadata provider
self.provider["provider_name"] = deposit.client.last_name
self.provider["provider_url"] = deposit.client.provider_url
author_date, commit_date = self._normalize_dates(deposit, metadata)
if deposit.parent:
swh_persistent_id = deposit.parent.swh_id
swhid = identifiers.parse_swhid(swh_persistent_id)
parent_revision = swhid.object_id
parents = [parent_revision]
else:
parents = []
data["origin_metadata"] = {
"provider": self.provider,
"tool": self.tool,
"metadata": metadata,
}
data["deposit"] = {
"id": deposit.id,
"client": deposit.client.username,
"collection": deposit.collection.name,
"author": SWH_PERSON,
"author_date": author_date,
"committer": SWH_PERSON,
"committer_date": commit_date,
"revision_parents": parents,
}
return data
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Dict, str]:
deposit = Deposit.objects.get(pk=deposit_id)
data = self.metadata_read(deposit)
- d = {}
- if data:
- d = json.dumps(data)
-
- return status.HTTP_200_OK, d, "application/json"
+ return status.HTTP_200_OK, data if data else {}, "application/json"
diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py
index 9df47390..af6bcb6c 100644
--- a/swh/deposit/api/private/deposit_update_status.py
+++ b/swh/deposit/api/private/deposit_update_status.py
@@ -1,103 +1,107 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Dict
+
from rest_framework.parsers import JSONParser
from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
from . import APIPrivateView
from ...errors import BAD_REQUEST, make_error_dict
from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit
from ..common import APIPut
MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"]
class APIUpdateStatus(APIPrivateView, APIPut):
    """Deposit request class to update the deposit's status.

    HTTP verbs supported: PUT

    """

    parser_classes = (JSONParser,)

    def additional_checks(self, request, headers, collection_name, deposit_id=None):
        """Validate the status update payload, on top of the default checks.

        Checks performed on ``request.data``:
        - the ``status`` key is provided (and not empty)
        - its value is one of the keys of ``DEPOSIT_STATUS_DETAIL``
        - when the status is ``DEPOSIT_STATUS_LOAD_SUCCESS``, every key of
          ``MANDATORY_KEYS`` is present with a non-null value

        Returns:
            The ``BAD_REQUEST`` error dict built by ``make_error_dict`` for
            the first failing check, an empty dict when all checks pass.

        """
        data = request.data
        status = data.get("status")
        if not status:
            msg = "The status key is mandatory with possible values %s" % list(
                DEPOSIT_STATUS_DETAIL.keys()
            )
            return make_error_dict(BAD_REQUEST, msg)

        if status not in DEPOSIT_STATUS_DETAIL:
            msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys())
            return make_error_dict(BAD_REQUEST, msg)

        if status == DEPOSIT_STATUS_LOAD_SUCCESS:
            # A key counts as missing when absent or explicitly null.
            missing_keys = [key for key in MANDATORY_KEYS if data.get(key) is None]
            if missing_keys:
                msg = (
                    f"Updating deposit status to {status}"
                    f" requires information {','.join(missing_keys)}"
                )
                return make_error_dict(BAD_REQUEST, msg)

        return {}

    def process_put(
        self, request, headers: Dict, collection_name: str, deposit_id: int
    ) -> Dict:
        """Update the deposit with status and, on load success, its SWHIDs.

        The deposit identified by ``deposit_id`` always gets the status read
        from ``request.data``. When that status is
        ``DEPOSIT_STATUS_LOAD_SUCCESS``, ``swh_id`` is set to the directory
        SWHID and ``swh_id_context`` to the same directory SWHID qualified
        with the origin, the snapshot (visit), the revision (anchor) and the
        root path.

        Returns:
            An empty dict.
            NOTE(review): the HTTP response (204 No content on success,
            400 Bad request if checks fail) is presumably built by the
            calling APIPut machinery -- confirm.

        """
        data = request.data
        deposit = Deposit.objects.get(pk=deposit_id)

        status = data["status"]
        # Set once for every status (rejected, load success, ...).
        deposit.status = status

        if status == DEPOSIT_STATUS_LOAD_SUCCESS:
            origin_url = data["origin_url"]
            directory_id = data["directory_id"]
            revision_id = data["revision_id"]

            dir_id = swhid(DIRECTORY, directory_id)
            snp_id = swhid(SNAPSHOT, data["snapshot_id"])
            rev_id = swhid(REVISION, revision_id)

            deposit.swh_id = dir_id
            # new id with contextual information
            deposit.swh_id_context = swhid(
                DIRECTORY,
                directory_id,
                metadata={
                    "origin": origin_url,
                    "visit": snp_id,
                    "anchor": rev_id,
                    "path": "/",
                },
            )

        deposit.save()

        return {}

File Metadata

Mime Type
text/x-diff
Expires
Mon, Aug 18, 11:14 PM (2 w, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3298293

Event Timeline