diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index ff7606c7..55c1b084 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -1,1042 +1,1172 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from abc import ABCMeta, abstractmethod
import datetime
import hashlib
import json
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
+import attr
from django.http import FileResponse, HttpResponse
from django.shortcuts import render
from django.urls import reverse
from django.utils import timezone
from rest_framework import status
from rest_framework.authentication import BaseAuthentication, BasicAuthentication
from rest_framework.permissions import BasePermission, IsAuthenticated
from rest_framework.request import Request
from rest_framework.views import APIView
+from swh.deposit.api.checks import check_metadata
+from swh.deposit.api.converters import convert_status_detail
+from swh.deposit.models import Deposit
+from swh.deposit.utils import compute_metadata_context
from swh.model import hashutil
+from swh.model.identifiers import SWHID, ValidationError
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ RawExtrinsicMetadata,
+)
from swh.scheduler.utils import create_oneshot_task_dict
from ..config import (
ARCHIVE_KEY,
ARCHIVE_TYPE,
CONT_FILE_IRI,
DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_PARTIAL,
EDIT_SE_IRI,
EM_IRI,
METADATA_KEY,
METADATA_TYPE,
RAW_METADATA_KEY,
STATE_IRI,
APIConfig,
)
from ..errors import (
BAD_REQUEST,
CHECKSUM_MISMATCH,
ERROR_CONTENT,
FORBIDDEN,
MAX_UPLOAD_SIZE_EXCEEDED,
MEDIATION_NOT_ALLOWED,
METHOD_NOT_ALLOWED,
NOT_FOUND,
PARSING_ERROR,
+ BadRequestError,
ParserError,
make_error_dict,
make_error_response,
make_error_response_from_dict,
)
-from ..models import Deposit, DepositClient, DepositCollection, DepositRequest
-from ..parsers import parse_xml
+from ..models import DepositClient, DepositCollection, DepositRequest
+from ..parsers import parse_swh_reference, parse_xml
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"]
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"]
class AuthenticatedAPIView(APIView):
"""Mixin intended as a based API view to enforce the basic
authentication check
"""
authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,)
permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,)
class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta):
"""Base deposit request class sharing multiple common behaviors.
"""
def _read_headers(self, request: Request) -> Dict[str, Any]:
"""Read and unify the necessary headers from the request (those are
not stored in the same location or not properly formatted).
Args:
request (Request): Input request
Returns:
Dictionary with the following keys (some associated values may be
None):
                - content-type
                - content-length
                - in-progress
                - content-disposition
                - content-md5sum
                - packaging
                - slug
                - on-behalf-of
                - metadata-relevant
                - swhid
"""
meta = request._request.META
content_type = request.content_type
content_length = meta.get("CONTENT_LENGTH")
if content_length and isinstance(content_length, str):
content_length = int(content_length)
        # the deposit is considered final when the In-Progress header is not provided
in_progress = meta.get("HTTP_IN_PROGRESS", False)
content_disposition = meta.get("HTTP_CONTENT_DISPOSITION")
if isinstance(in_progress, str):
in_progress = in_progress.lower() == "true"
content_md5sum = meta.get("HTTP_CONTENT_MD5")
if content_md5sum:
content_md5sum = bytes.fromhex(content_md5sum)
packaging = meta.get("HTTP_PACKAGING")
slug = meta.get("HTTP_SLUG")
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
swhid = meta.get("HTTP_X_CHECK_SWHID")
return {
"content-type": content_type,
"content-length": content_length,
"in-progress": in_progress,
"content-disposition": content_disposition,
"content-md5sum": content_md5sum,
"packaging": packaging,
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
"swhid": swhid,
}
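    # Illustrative example (editor's note; header values are hypothetical): SWORD
    # headers such as "In-Progress: true" or "Slug: my-external-id" reach Django as
    # META["HTTP_IN_PROGRESS"] and META["HTTP_SLUG"]; _read_headers normalizes them
    # into a plain dict, e.g.:
    #
    #   {
    #       "content-type": "application/atom+xml;type=entry",
    #       "content-length": 428,
    #       "in-progress": True,
    #       "content-disposition": None,
    #       "content-md5sum": None,
    #       "packaging": None,
    #       "slug": "my-external-id",
    #       "on-behalf-of": None,
    #       "metadata-relevant": None,
    #       "swhid": None,
    #   }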
def _compute_md5(self, filehandler) -> bytes:
"""Compute uploaded file's md5 sum.
Args:
filehandler (InMemoryUploadedFile): the file to compute the md5
hash
Returns:
            the md5 digest (bytes)
"""
h = hashlib.md5()
for chunk in filehandler:
h.update(chunk)
return h.digest()
def _deposit_put(
self,
request: Request,
deposit_id: Optional[int] = None,
in_progress: bool = False,
external_id: Optional[str] = None,
) -> Deposit:
"""Save/Update a deposit in db.
Args:
request: request data
deposit_id: deposit identifier
in_progress: deposit status
external_id: external identifier to associate to the deposit
Returns:
The Deposit instance saved or updated.
"""
complete_date: Optional[datetime.datetime] = None
deposit_parent: Optional[Deposit] = None
if in_progress is False:
complete_date = timezone.now()
status_type = DEPOSIT_STATUS_DEPOSITED
else:
status_type = DEPOSIT_STATUS_PARTIAL
if not deposit_id:
try:
                # find a parent deposit (same external id, last one successfully loaded)
deposit_parent = (
Deposit.objects.filter(
external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS
)
.order_by("-id")[0:1]
.get()
) # noqa
except Deposit.DoesNotExist:
# then no parent for that deposit, deposit_parent already None
pass
assert external_id is not None
deposit = Deposit(
collection=self._collection,
external_id=external_id,
complete_date=complete_date,
status=status_type,
client=self._client,
parent=deposit_parent,
)
else:
deposit = Deposit.objects.get(pk=deposit_id)
# update metadata
deposit.complete_date = complete_date
deposit.status = status_type
if self.config["checks"]:
deposit.save() # needed to have a deposit id
scheduler = self.scheduler
if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id:
task = create_oneshot_task_dict(
"check-deposit",
collection=deposit.collection.name,
deposit_id=deposit.id,
retries_left=3,
)
check_task_id = scheduler.create_tasks([task])[0]["id"]
deposit.check_task_id = check_task_id
deposit.save()
return deposit
def _deposit_request_put(
self,
deposit: Deposit,
deposit_request_data: Dict[str, Any],
replace_metadata: bool = False,
replace_archives: bool = False,
) -> DepositRequest:
"""Save a deposit request with metadata attached to a deposit.
Args:
deposit: The deposit concerned by the request
deposit_request_data: The dictionary with at most 2 deposit
request types (archive, metadata) to associate to the deposit
            replace_metadata: If False (default), add the new metadata request to
              the existing ones; if True, replace the existing metadata
            replace_archives: If False (default), add the new archive request to
              the existing ones; if True, replace the existing archives
Returns:
the DepositRequest object stored in the backend
"""
if replace_metadata:
DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete()
if replace_archives:
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
deposit_request = None
archive_file = deposit_request_data.get(ARCHIVE_KEY)
if archive_file:
deposit_request = DepositRequest(
type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file
)
deposit_request.save()
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
raw_metadata = deposit_request_data[RAW_METADATA_KEY]
deposit_request = DepositRequest(
type=METADATA_TYPE,
deposit=deposit,
metadata=metadata,
raw_metadata=raw_metadata.decode("utf-8"),
)
deposit_request.save()
assert deposit_request is not None
return deposit_request
def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete archive references from the deposit id.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
return {}
def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete deposit reference.
Args:
collection_name: Client's collection
deposit_id: The deposit to delete
        Returns:
            Empty dict when ok, or a dict with an 'error' key describing the
            failure.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
if deposit.collection.name != collection_name:
summary = "Cannot delete a deposit from another collection"
description = "Deposit %s does not belong to the collection %s" % (
deposit_id,
collection_name,
)
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
DepositRequest.objects.filter(deposit=deposit).delete()
deposit.delete()
return {}
def _check_preconditions_on(
        self, filehandler, md5sum: bytes, content_length: Optional[int] = None
) -> Optional[Dict]:
"""Check preconditions on provided file are respected. That is the
length and/or the md5sum hash match the file's content.
Args:
filehandler (InMemoryUploadedFile): The file to check
md5sum: md5 hash expected from the file's content
content_length: the expected length if provided.
Returns:
            None if no error, or a dictionary with an 'error' key detailing
            the problem.
"""
max_upload_size = self.config["max_upload_size"]
if content_length:
if content_length > max_upload_size:
return make_error_dict(
MAX_UPLOAD_SIZE_EXCEEDED,
f"Upload size limit exceeded (max {max_upload_size} bytes)."
"Please consider sending the archive in multiple steps.",
)
length = filehandler.size
if length != content_length:
return make_error_dict(
status.HTTP_412_PRECONDITION_FAILED, "Wrong length"
)
if md5sum:
_md5sum = self._compute_md5(filehandler)
if _md5sum != md5sum:
return make_error_dict(
CHECKSUM_MISMATCH,
"Wrong md5 hash",
f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual "
f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.",
)
return None
def _binary_upload(
self,
request: Request,
headers: Dict[str, Any],
collection_name: str,
deposit_id: Optional[int] = None,
replace_metadata: bool = False,
replace_archives: bool = False,
) -> Dict[str, Any]:
"""Binary upload routine.
        Requests with an unsupported media type receive a 415 response.
Args:
request (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
            replace_metadata (bool): 'Update or add' request to an existing
              deposit. If False (default), this adds the new metadata request to
              the existing ones. Otherwise, this replaces the existing metadata.
            replace_archives (bool): 'Update or add' request to an existing
              deposit. If False (default), this adds the new archive request to
              the existing ones. Otherwise, this replaces the existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
               - archive: the archive file name
            Otherwise, a dictionary with an 'error' key and the associated
            failure, one of:
- 400 (bad request) if the request is not providing an external
identifier
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
            - 412 (precondition failed) if the provided length or md5 hash does
              not match the archive's actual content
- 415 (unsupported media type) if a wrong media type is provided
"""
content_length = headers["content-length"]
if not content_length:
return make_error_dict(
BAD_REQUEST,
"CONTENT_LENGTH header is mandatory",
"For archive deposit, the CONTENT_LENGTH header must be sent.",
)
content_disposition = headers["content-disposition"]
if not content_disposition:
return make_error_dict(
BAD_REQUEST,
"CONTENT_DISPOSITION header is mandatory",
"For archive deposit, the CONTENT_DISPOSITION header must be sent.",
)
packaging = headers["packaging"]
if packaging and packaging not in ACCEPT_PACKAGINGS:
return make_error_dict(
BAD_REQUEST,
f"Only packaging {ACCEPT_PACKAGINGS} is supported",
f"The packaging provided {packaging} is not supported",
)
filehandler = request.FILES["file"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"], content_length
)
if precondition_status_response:
return precondition_status_response
external_id = headers["slug"]
# actual storage of data
archive_metadata = filehandler
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
self._deposit_request_put(
deposit,
{ARCHIVE_KEY: archive_metadata},
replace_metadata=replace_metadata,
replace_archives=replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"status": deposit.status,
"archive": filehandler.name,
}
def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]:
"""Given a metadata stream, reads the metadata and returns both the
parsed and the raw metadata.
"""
raw_metadata = metadata_stream.read()
metadata = parse_xml(raw_metadata)
return raw_metadata, metadata
def _multipart_upload(
self,
request: Request,
headers: Dict[str, Any],
collection_name: str,
deposit_id: Optional[int] = None,
replace_metadata: bool = False,
replace_archives: bool = False,
) -> Dict:
"""Multipart upload supported with exactly:
- 1 archive (zip)
- 1 atom entry
Other than such a request, a 415 response is returned.
Args:
request (Request): the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
deposit_id: deposit identifier if provided
            replace_metadata: 'Update or add' request to an existing
              deposit. If False (default), this adds the new metadata request to
              the existing ones. Otherwise, this replaces the existing metadata.
            replace_archives: 'Update or add' request to an existing
              deposit. If False (default), this adds the new archive request to
              the existing ones. Otherwise, this replaces the existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
               - archive: the archive file name
            Otherwise, a dictionary with an 'error' key and the associated
            failure, one of:
- 400 (bad request) if the request is not providing an external
identifier
            - 412 (precondition failed) if the md5 hash provided (if any) does
              not match the archive's actual content
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 415 (unsupported media type) if a wrong media type is provided
"""
external_id = headers["slug"]
content_types_present = set()
data: Dict[str, Optional[Any]] = {
"application/zip": None, # expected either zip
"application/x-tar": None, # or x-tar
"application/atom+xml": None,
}
for key, value in request.FILES.items():
fh = value
content_type = fh.content_type
if content_type in content_types_present:
return make_error_dict(
ERROR_CONTENT,
"Only 1 application/zip (or application/x-tar) archive "
"and 1 atom+xml entry is supported (as per sword2.0 "
"specification)",
"You provided more than 1 application/(zip|x-tar) "
"or more than 1 application/atom+xml content-disposition "
"header in the multipart deposit",
)
content_types_present.add(content_type)
assert content_type is not None
data[content_type] = fh
if len(content_types_present) != 2:
return make_error_dict(
ERROR_CONTENT,
"You must provide both 1 application/zip (or "
"application/x-tar) and 1 atom+xml entry for multipart "
"deposit",
"You need to provide only 1 application/(zip|x-tar) "
"and 1 application/atom+xml content-disposition header "
"in the multipart deposit",
)
filehandler = data["application/zip"]
if not filehandler:
filehandler = data["application/x-tar"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"]
)
if precondition_status_response:
return precondition_status_response
try:
raw_metadata, metadata = self._read_metadata(data["application/atom+xml"])
except ParserError:
return make_error_dict(
PARSING_ERROR,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
# actual storage of data
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
METADATA_KEY: metadata,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
deposit, deposit_request_data, replace_metadata, replace_archives
)
assert filehandler is not None
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": filehandler.name,
"status": deposit.status,
}
+ def _store_metadata_deposit(
+ self,
+ deposit: Deposit,
+ swhid_reference: Union[str, SWHID],
+ metadata: Dict,
+ raw_metadata: bytes,
+ deposit_origin: Optional[str] = None,
+ ) -> Tuple[Union[SWHID, str], Union[SWHID, str], Deposit, DepositRequest]:
+ """When all user inputs pass the checks, this associates the raw_metadata to the
+ swhid_reference in the raw extrinsic metadata storage. In case of any issues,
+ a bad request response is returned to the user with the details.
+
+ Checks:
+ - metadata are technically parsable
+ - metadata pass the functional checks
+ - SWHID (if any) is technically valid
+
+ Args:
+ deposit: Deposit reference
+ swhid_reference: The swhid or the origin to attach metadata information to
+ metadata: Full dict of metadata to check for validity (parsed out of
+ raw_metadata)
+ raw_metadata: The actual raw metadata to send in the storage metadata
+ deposit_origin: Optional deposit origin url to use if any (e.g. deposit
+ update scenario provides one)
+
+ Raises:
+ BadRequestError in case of incorrect inputs from the deposit client
+ (e.g. functionally invalid metadata, ...)
+
+ Returns:
+ Tuple of core swhid, swhid context, deposit and deposit request
+
+ """
+ metadata_ok, error_details = check_metadata(metadata)
+ if not metadata_ok:
+ assert error_details, "Details should be set when a failure occurs"
+ raise BadRequestError(
+ "Functional metadata checks failure",
+ convert_status_detail(error_details),
+ )
+
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=deposit.client.provider_url,
+ metadata={"name": deposit.client.last_name},
+ )
+
+ metadata_fetcher = MetadataFetcher(
+ name=self.tool["name"],
+ version=self.tool["version"],
+ metadata=self.tool["configuration"],
+ )
+
+ # replace metadata within the deposit backend
+ deposit_request_data = {
+ METADATA_KEY: metadata,
+ RAW_METADATA_KEY: raw_metadata,
+ }
+
+ # actually add the metadata to the completed deposit
+ deposit_request = self._deposit_request_put(deposit, deposit_request_data)
+
+ object_type, metadata_context = compute_metadata_context(swhid_reference)
+ if deposit_origin: # metadata deposit update on completed deposit
+ metadata_context["origin"] = deposit_origin
+
+ swhid_core: Union[str, SWHID]
+ if isinstance(swhid_reference, str):
+ swhid_core = swhid_reference
+ else:
+ swhid_core = attr.evolve(swhid_reference, metadata={})
+
+ # store that metadata to the metadata storage
+ metadata_object = RawExtrinsicMetadata(
+ type=object_type,
+ target=swhid_core, # core swhid or origin
+ discovery_date=deposit_request.date,
+ authority=metadata_authority,
+ fetcher=metadata_fetcher,
+ format="sword-v2-atom-codemeta",
+ metadata=raw_metadata,
+ **metadata_context,
+ )
+
+ # write to metadata storage
+ self.storage_metadata.metadata_authority_add([metadata_authority])
+ self.storage_metadata.metadata_fetcher_add([metadata_fetcher])
+ self.storage_metadata.raw_extrinsic_metadata_add([metadata_object])
+
+ return (swhid_core, swhid_reference, deposit, deposit_request)
+
def _atom_entry(
self,
request: Request,
headers: Dict[str, Any],
collection_name: str,
deposit_id: Optional[int] = None,
replace_metadata: bool = False,
replace_archives: bool = False,
) -> Dict[str, Any]:
"""Atom entry deposit.
Args:
request: the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
deposit_id: deposit identifier if provided
            replace_metadata: 'Update or add' request to an existing
              deposit. If False (default), this adds the new metadata request to
              the existing ones. Otherwise, this replaces the existing metadata.
            replace_archives: 'Update or add' request to an existing
              deposit. If False (default), this adds the new archive request to
              the existing ones. Otherwise, this replaces the existing archives.
Returns:
In the optimal case a dict with the following keys:
- deposit_id: deposit id associated to the deposit
- deposit_date: date of the deposit
- archive: None (no archive is provided here)
            Otherwise, a dictionary with an 'error' key and the associated
            failure, one of:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is provided
"""
try:
raw_metadata, metadata = self._read_metadata(request.data)
except ParserError:
return make_error_dict(
BAD_REQUEST,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
if not metadata:
return make_error_dict(
BAD_REQUEST,
"Empty body request is not supported",
"Atom entry deposit is supposed to send for metadata. "
"If the body is empty, there is no metadata.",
)
- external_id = metadata.get("external_identifier", headers["slug"])
+ # Determine if we are in the metadata-only deposit case
+ try:
+ swhid = parse_swh_reference(metadata)
+ except ValidationError as e:
+ return make_error_dict(PARSING_ERROR, "Invalid SWHID reference", str(e),)
- # TODO: Determine if we are in the metadata-only deposit case. If it is, then
- # save deposit and deposit request typed 'metadata' and send metadata to the
- # metadata storage. Otherwise, do as existing deposit.
+ external_id = metadata.get("external_identifier", headers["slug"])
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
+ if swhid is not None:
+ try:
+ swhid, swhid_ref, depo, depo_request = self._store_metadata_deposit(
+ deposit, swhid, metadata, raw_metadata
+ )
+ except BadRequestError as bad_request_error:
+ return bad_request_error.to_dict()
+
+ deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS
+ if isinstance(swhid_ref, SWHID):
+ deposit.swhid = str(swhid)
+ deposit.swhid_context = str(swhid_ref)
+ deposit.complete_date = depo_request.date
+ deposit.reception_date = depo_request.date
+ deposit.save()
+
+ return {
+ "deposit_id": deposit.id,
+ "deposit_date": depo_request.date,
+ "status": deposit.status,
+ "archive": None,
+ }
+
self._deposit_request_put(
deposit,
{METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": None,
"status": deposit.status,
}
def _empty_post(
self, request: Request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""Empty post to finalize an empty deposit.
Args:
request: the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
deposit_id: deposit identifier
Returns:
Dictionary of result with the deposit's id, the date
it was completed and no archive.
"""
deposit = Deposit.objects.get(pk=deposit_id)
deposit.complete_date = timezone.now()
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
return {
"deposit_id": deposit_id,
"deposit_date": deposit.complete_date,
"status": deposit.status,
"archive": None,
}
def _make_iris(
self, request: Request, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""Define the IRI endpoints
Args:
request (Request): The initial request
collection_name (str): client/collection's name
deposit_id (id): Deposit identifier
Returns:
            Dictionary mapping each IRI name to its URL.
"""
args = [collection_name, deposit_id]
return {
iri: request.build_absolute_uri(reverse(iri, args=args))
for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI]
}
def additional_checks(
self,
request: Request,
headers: Dict[str, Any],
collection_name: str,
deposit_id: Optional[int] = None,
) -> Dict[str, Any]:
"""Permit the child class to enrich additional checks.
Returns:
dict with 'error' detailing the problem.
"""
return {}
def checks(
self, request: Request, collection_name: str, deposit_id: Optional[int] = None
) -> Dict[str, Any]:
try:
self._collection = DepositCollection.objects.get(name=collection_name)
except DepositCollection.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"Unknown collection name {collection_name}"
)
assert self._collection is not None
username = request.user.username
if username: # unauthenticated request can have the username empty
try:
self._client: DepositClient = DepositClient.objects.get( # type: ignore
username=username
)
except DepositClient.DoesNotExist:
return make_error_dict(NOT_FOUND, f"Unknown client name {username}")
collection_id = self._collection.id
collections = self._client.collections
assert collections is not None
if collection_id not in collections:
return make_error_dict(
FORBIDDEN,
f"Client {username} cannot access collection {collection_name}",
)
headers = self._read_headers(request)
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
assert deposit is not None
checks = self.restrict_access(request, headers, deposit)
if checks:
return checks
if headers["on-behalf-of"]:
return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")
checks = self.additional_checks(request, headers, collection_name, deposit_id)
if "error" in checks:
return checks
return {"headers": headers}
def restrict_access(
self, request: Request, headers: Dict, deposit: Deposit
) -> Dict[str, Any]:
"""Allow modifications on deposit with status 'partial' only, reject the rest.
"""
if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
summary = "You can only act on deposit with status '%s'" % (
DEPOSIT_STATUS_PARTIAL,
)
description = f"This deposit has status '{deposit.status}'"
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
return {}
def _basic_not_allowed_method(self, request: Request, method: str):
return make_error_response(
request,
METHOD_NOT_ALLOWED,
f"{method} method is not supported on this endpoint",
)
def get(
self, request: Request, collection_name: str, deposit_id: int
) -> Union[HttpResponse, FileResponse]:
return self._basic_not_allowed_method(request, "GET")
def post(
self, request: Request, collection_name: str, deposit_id: Optional[int] = None
) -> HttpResponse:
return self._basic_not_allowed_method(request, "POST")
def put(
self, request: Request, collection_name: str, deposit_id: int
) -> HttpResponse:
return self._basic_not_allowed_method(request, "PUT")
def delete(
self, request: Request, collection_name: str, deposit_id: Optional[int] = None
) -> HttpResponse:
return self._basic_not_allowed_method(request, "DELETE")
class APIGet(APIBase, metaclass=ABCMeta):
"""Mixin for class to support GET method.
"""
def get(
self, request: Request, collection_name: str, deposit_id: int
) -> Union[HttpResponse, FileResponse]:
"""Endpoint to create/add resources to deposit.
Returns:
200 response when no error during routine occurred
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
r = self.process_get(request, collection_name, deposit_id)
status, content, content_type = r
if content_type == "swh/generator":
with content as path:
return FileResponse(
open(path, "rb"), status=status, content_type="application/zip"
)
if content_type == "application/json":
return HttpResponse(
json.dumps(content), status=status, content_type=content_type
)
return HttpResponse(content, status=status, content_type=content_type)
@abstractmethod
def process_get(
self, request: Request, collection_name: str, deposit_id: int
) -> Tuple[int, Any, str]:
"""Routine to deal with the deposit's get processing.
Returns:
Tuple status, stream of content, content-type
"""
pass
class APIPost(APIBase, metaclass=ABCMeta):
"""Mixin for class to support POST method.
"""
def post(
self, request: Request, collection_name: str, deposit_id: Optional[int] = None
) -> HttpResponse:
"""Endpoint to create/add resources to deposit.
Returns:
            200 or 201 response when the routine completed without error.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
_status, _iri_key, data = self.process_post(
request, headers, collection_name, deposit_id
)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
data["packagings"] = ACCEPT_PACKAGINGS
iris = self._make_iris(request, collection_name, data["deposit_id"])
data.update(iris)
response = render(
request,
"deposit/deposit_receipt.xml",
context=data,
content_type="application/xml",
status=_status,
)
response._headers["location"] = "Location", data[_iri_key] # type: ignore
return response
@abstractmethod
def process_post(
self,
request,
headers: Dict,
collection_name: str,
deposit_id: Optional[int] = None,
) -> Tuple[int, str, Dict]:
"""Routine to deal with the deposit's processing.
Returns
Tuple of:
- response status code (200, 201, etc...)
- key iri (EM_IRI, EDIT_SE_IRI, etc...)
- dictionary of the processing result
"""
pass
class APIPut(APIBase, metaclass=ABCMeta):
"""Mixin for class to support PUT method.
"""
def put(
self, request: Request, collection_name: str, deposit_id: int
) -> HttpResponse:
"""Endpoint to update deposit resources.
Returns:
            204 response when the routine completed without error.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
data = self.process_put(request, headers, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
def process_put(
self, request: Request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""Routine to deal with updating a deposit in some way.
Returns
dictionary of the processing result
"""
pass
class APIDelete(APIBase, metaclass=ABCMeta):
"""Mixin for class to support DELETE method.
"""
def delete(
self, request: Request, collection_name: str, deposit_id: Optional[int] = None
) -> HttpResponse:
"""Endpoint to delete some deposit's resources (archives, deposit).
Returns:
            204 response when the routine completed without error.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
assert deposit_id is not None
data = self.process_delete(request, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
def process_delete(
self, request: Request, collection_name: str, deposit_id: int
) -> Dict:
"""Routine to delete a resource.
This is mostly not allowed except for the
EM_IRI (cf. .api.deposit_update.APIUpdateArchive)
"""
return {}
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
index 068c80eb..d4fcfba0 100644
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -1,342 +1,277 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Dict, Optional, Tuple
from rest_framework import status
from rest_framework.request import Request
-from swh.deposit.api.checks import check_metadata
-from swh.deposit.api.converters import convert_status_detail
from swh.deposit.models import Deposit
from swh.model.identifiers import parse_swhid
-from swh.model.model import (
- MetadataAuthority,
- MetadataAuthorityType,
- MetadataFetcher,
- MetadataTargetType,
- RawExtrinsicMetadata,
-)
-from swh.storage import get_storage
-from swh.storage.interface import StorageInterface
-
-from ..config import (
- CONT_FILE_IRI,
- DEPOSIT_STATUS_LOAD_SUCCESS,
- EDIT_SE_IRI,
- EM_IRI,
- METADATA_KEY,
- RAW_METADATA_KEY,
-)
-from ..errors import BAD_REQUEST, ParserError, make_error_dict
+
+from ..config import CONT_FILE_IRI, DEPOSIT_STATUS_LOAD_SUCCESS, EDIT_SE_IRI, EM_IRI
+from ..errors import BAD_REQUEST, BadRequestError, ParserError, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
SWHFileUploadZipParser,
SWHMultiPartParser,
)
from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut
class APIUpdateArchive(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'EM IRI' in the sword specification.
HTTP verbs supported: PUT, POST, DELETE
"""
parser_classes = (
SWHFileUploadZipParser,
SWHFileUploadTarParser,
)
def process_put(
self, req, headers, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""Replace existing content for the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa
Returns:
204 No content
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
return make_error_dict(BAD_REQUEST, msg)
return self._binary_upload(
req, headers, collection_name, deposit_id=deposit_id, replace_archives=True
)
def process_post(
self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None
) -> Tuple[int, str, Dict]:
"""Add new content to the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa
Returns:
201 Created
Headers: Location: [Cont-File-IRI]
Body: [optional Deposit Receipt]
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
unused = 0
return unused, "unused", make_error_dict(BAD_REQUEST, msg)
return (
status.HTTP_201_CREATED,
CONT_FILE_IRI,
self._binary_upload(req, headers, collection_name, deposit_id),
)
def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete content (archives) from existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa
Returns:
            204 No Content
"""
return self._delete_archives(collection_name, deposit_id)
class APIUpdateMetadata(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'Edit IRI' (and SE IRI) in the sword specification.
HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE
"""
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
- def __init__(self):
- super().__init__()
- self.storage_metadata: StorageInterface = get_storage(
- **self.config["storage_metadata"]
- )
-
def restrict_access(
self, request: Request, headers: Dict, deposit: Deposit
) -> Dict[str, Any]:
"""Relax restriction access to allow metadata update on deposit with status "done" when
a swhid is provided.
"""
if (
request.method == "PUT"
and headers["swhid"] is not None
and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
):
# Allow metadata update on deposit with status "done" when swhid provided
return {}
# otherwise, let the standard access restriction check occur
return super().restrict_access(request, headers, deposit)
def process_put(
self, request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""This allows the following scenarios:
- multipart: replace all the deposit (status partial) metadata and archive
with the provided ones.
- atom: replace all the deposit (status partial) metadata with the
provided ones.
        - with swhid, atom: add the provided metadata to a deposit (status done)
          and push that metadata directly to the metadata storage.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart
Raises:
400 if any of the following occur:
- the swhid provided and the deposit swhid do not match
- the provided metadata xml file is malformed
- the provided xml atom entry is empty
- the provided swhid does not exist in the archive
Returns:
204 No content
""" # noqa
swhid = headers.get("swhid")
if swhid is None:
if request.content_type.startswith("multipart/"):
return self._multipart_upload(
request,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
# standard metadata update (replace all metadata already provided to the
# deposit by the new ones)
return self._atom_entry(
request,
headers,
collection_name,
deposit_id=deposit_id,
replace_metadata=True,
)
# Update metadata on a deposit already ingested
# Write to the metadata storage (and the deposit backend)
# no ingestion triggered
deposit = Deposit.objects.get(pk=deposit_id)
assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
if swhid != deposit.swhid:
return make_error_dict(
BAD_REQUEST,
f"Mismatched provided SWHID {swhid} with deposit's {deposit.swhid}.",
"The provided SWHID does not match the deposit to update. "
"Please ensure you send the correct deposit SWHID.",
)
try:
raw_metadata, metadata = self._read_metadata(request.data)
except ParserError:
return make_error_dict(
BAD_REQUEST,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
if not metadata:
return make_error_dict(
BAD_REQUEST,
"Empty body request is not supported",
"Atom entry deposit is supposed to send for metadata. "
"If the body is empty, there is no metadata.",
)
- metadata_ok, error_details = check_metadata(metadata)
- if not metadata_ok:
- assert error_details, "Details should be set when a failure occurs"
- return make_error_dict(
- BAD_REQUEST,
- "Functional metadata checks failure",
- convert_status_detail(error_details),
+ try:
+ _, _, deposit, deposit_request = self._store_metadata_deposit(
+ deposit, parse_swhid(swhid), metadata, raw_metadata, deposit.origin_url,
)
-
- metadata_authority = MetadataAuthority(
- type=MetadataAuthorityType.DEPOSIT_CLIENT,
- url=deposit.client.provider_url,
- metadata={"name": deposit.client.last_name},
- )
-
- metadata_fetcher = MetadataFetcher(
- name=self.tool["name"],
- version=self.tool["version"],
- metadata=self.tool["configuration"],
- )
-
- deposit_swhid = parse_swhid(swhid)
-
- # replace metadata within the deposit backend
- deposit_request_data = {
- METADATA_KEY: metadata,
- RAW_METADATA_KEY: raw_metadata,
- }
-
- # actually add the metadata to the completed deposit
- deposit_request = self._deposit_request_put(deposit, deposit_request_data)
- # store that metadata to the metadata storage
- metadata_object = RawExtrinsicMetadata(
- type=MetadataTargetType.DIRECTORY,
- target=deposit_swhid,
- discovery_date=deposit_request.date,
- authority=metadata_authority,
- fetcher=metadata_fetcher,
- format="sword-v2-atom-codemeta",
- metadata=raw_metadata,
- origin=deposit.origin_url,
- )
-
- # write to metadata storage
- self.storage_metadata.metadata_authority_add([metadata_authority])
- self.storage_metadata.metadata_fetcher_add([metadata_fetcher])
- self.storage_metadata.raw_extrinsic_metadata_add([metadata_object])
+ except BadRequestError as bad_request_error:
+ return bad_request_error.to_dict()
return {
- "deposit_id": deposit_id,
+ "deposit_id": deposit.id,
"deposit_date": deposit_request.date,
"status": deposit.status,
"archive": None,
}
def process_post(
self,
request,
headers: Dict,
collection_name: str,
deposit_id: Optional[int] = None,
) -> Tuple[int, str, Dict]:
"""Add new metadata/archive to existing deposit.
This allows the following scenarios to occur:
        - multipart: add the provided metadata and archive to a deposit in status
          partial.
        - empty atom: finalize a deposit in status partial (transition to
          deposited).
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#continueddeposit_complete
Returns:
            In the optimal case for a multipart or atom-entry update, a
            201 Created response. The response body holds a deposit receipt
            and the response headers contain a 'Location' entry with the
            EM-IRI.
            For the empty post case, this returns a 200.
""" # noqa
assert deposit_id is not None
if request.content_type.startswith("multipart/"):
data = self._multipart_upload(
request, headers, collection_name, deposit_id=deposit_id
)
return (status.HTTP_201_CREATED, EM_IRI, data)
content_length = headers["content-length"] or 0
if content_length == 0 and headers["in-progress"] is False:
# check for final empty post
data = self._empty_post(request, headers, collection_name, deposit_id)
return (status.HTTP_200_OK, EDIT_SE_IRI, data)
data = self._atom_entry(
request, headers, collection_name, deposit_id=deposit_id
)
return (status.HTTP_201_CREATED, EM_IRI, data)
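    # Illustrative client-side sketch (editor's note; the URL, collection and
    # credentials are hypothetical): a partial deposit is finalized by an empty
    # POST on its SE-IRI with In-Progress unset or false, which routes to
    # self._empty_post (see common.py):
    #
    #   import requests
    #   requests.post(
    #       "https://deposit.example.org/1/<collection>/<deposit_id>/metadata/",
    #       headers={"In-Progress": "false", "Content-Length": "0"},
    #       auth=("user", "password"),
    #   )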
def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete the container (deposit).
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa
"""
return self._delete_deposit(collection_name, deposit_id)
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
index ec1e0248..ba6c0939 100644
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -1,103 +1,108 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from typing import Any, Dict
from swh.core import config
from swh.deposit import __version__
from swh.scheduler import get_scheduler
from swh.scheduler.interface import SchedulerInterface
+from swh.storage import get_storage
+from swh.storage.interface import StorageInterface
# IRIs (Internationalized Resource identifier) sword 2.0 specified
EDIT_SE_IRI = "edit_se_iri"
EM_IRI = "em_iri"
CONT_FILE_IRI = "cont_file_iri"
SD_IRI = "servicedocument"
COL_IRI = "upload"
STATE_IRI = "state_iri"
PRIVATE_GET_RAW_CONTENT = "private-download"
PRIVATE_CHECK_DEPOSIT = "check-deposit"
PRIVATE_PUT_DEPOSIT = "private-update"
PRIVATE_GET_DEPOSIT_METADATA = "private-read"
PRIVATE_LIST_DEPOSITS = "private-deposit-list"
ARCHIVE_KEY = "archive"
METADATA_KEY = "metadata"
RAW_METADATA_KEY = "raw-metadata"
ARCHIVE_TYPE = "archive"
METADATA_TYPE = "metadata"
AUTHORIZED_PLATFORMS = ["development", "production", "testing"]
DEPOSIT_STATUS_REJECTED = "rejected"
DEPOSIT_STATUS_PARTIAL = "partial"
DEPOSIT_STATUS_DEPOSITED = "deposited"
DEPOSIT_STATUS_VERIFIED = "verified"
DEPOSIT_STATUS_LOAD_SUCCESS = "done"
DEPOSIT_STATUS_LOAD_FAILURE = "failed"
# Revision author for deposit
SWH_PERSON = {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org",
}
DEFAULT_CONFIG = {
"max_upload_size": 209715200,
"checks": True,
}
def setup_django_for(platform=None, config_file=None):
"""Setup function for command line tools (swh.deposit.create_user) to
initialize the needed db access.
Note:
Do not import any django related module prior to this function
        call. Otherwise, this will raise a
        django.core.exceptions.ImproperlyConfigured error.
Args:
        platform (str): the platform the service is running on
config_file (str): Extra configuration file (typically for the
production platform)
Raises:
ValueError in case of wrong platform inputs.
"""
if platform is not None:
if platform not in AUTHORIZED_PLATFORMS:
raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)
if "DJANGO_SETTINGS_MODULE" not in os.environ:
os.environ["DJANGO_SETTINGS_MODULE"] = "swh.deposit.settings.%s" % platform
if config_file:
os.environ.setdefault("SWH_CONFIG_FILENAME", config_file)
import django
django.setup()
class APIConfig:
"""API Configuration centralized class. This loads explicitly the configuration file out
of the SWH_CONFIG_FILENAME environment variable.
"""
def __init__(self):
self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG)
self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
self.tool = {
"name": "swh-deposit",
"version": __version__,
"configuration": {"sword_version": "2"},
}
+ self.storage_metadata: StorageInterface = get_storage(
+ **self.config["storage_metadata"]
+ )
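# Illustrative sketch (editor's assumption, not prescribed by this change): since
# "storage_metadata" is handed to swh.storage.get_storage(), it is expected to be a
# regular swh.storage configuration block, e.g. something along the lines of:
#
#   storage_metadata:
#     cls: remote
#     url: http://storage.internal.example.org:5002/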
diff --git a/swh/deposit/errors.py b/swh/deposit/errors.py
index f41965dd..e0b7980e 100644
--- a/swh/deposit/errors.py
+++ b/swh/deposit/errors.py
@@ -1,150 +1,164 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of providing the standard sword errors
"""
from django.shortcuts import render
from rest_framework import status
FORBIDDEN = "forbidden"
UNAUTHORIZED = "unauthorized"
NOT_FOUND = "unknown"
BAD_REQUEST = "bad-request"
ERROR_CONTENT = "error-content"
CHECKSUM_MISMATCH = "checksum-mismatch"
MEDIATION_NOT_ALLOWED = "mediation-not-allowed"
METHOD_NOT_ALLOWED = "method-not-allowed"
MAX_UPLOAD_SIZE_EXCEEDED = "max_upload_size_exceeded"
PARSING_ERROR = "parsing-error"
class ParserError(ValueError):
"""Specific parsing error detected when parsing the xml metadata input
"""
pass
ERRORS = {
FORBIDDEN: {
"status": status.HTTP_403_FORBIDDEN,
"iri": "http://purl.org/net/sword/error/ErrorForbidden",
"tag": "sword:ErrorForbidden",
},
UNAUTHORIZED: {
"status": status.HTTP_401_UNAUTHORIZED,
"iri": "http://purl.org/net/sword/error/ErrorUnauthorized",
"tag": "sword:ErrorUnauthorized",
},
NOT_FOUND: {
"status": status.HTTP_404_NOT_FOUND,
"iri": "http://purl.org/net/sword/error/ErrorNotFound",
"tag": "sword:ErrorNotFound",
},
ERROR_CONTENT: {
"status": status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
"iri": "http://purl.org/net/sword/error/ErrorContent",
"tag": "sword:ErrorContent",
},
CHECKSUM_MISMATCH: {
"status": status.HTTP_412_PRECONDITION_FAILED,
"iri": "http://purl.org/net/sword/error/ErrorChecksumMismatch",
"tag": "sword:ErrorChecksumMismatch",
},
BAD_REQUEST: {
"status": status.HTTP_400_BAD_REQUEST,
"iri": "http://purl.org/net/sword/error/ErrorBadRequest",
"tag": "sword:ErrorBadRequest",
},
PARSING_ERROR: {
"status": status.HTTP_400_BAD_REQUEST,
"iri": "http://purl.org/net/sword/error/ErrorBadRequest",
"tag": "sword:ErrorBadRequest",
},
MEDIATION_NOT_ALLOWED: {
"status": status.HTTP_412_PRECONDITION_FAILED,
"iri": "http://purl.org/net/sword/error/MediationNotAllowed",
"tag": "sword:MediationNotAllowed",
},
METHOD_NOT_ALLOWED: {
"status": status.HTTP_405_METHOD_NOT_ALLOWED,
"iri": "http://purl.org/net/sword/error/MethodNotAllowed",
"tag": "sword:MethodNotAllowed",
},
MAX_UPLOAD_SIZE_EXCEEDED: {
"status": status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
"iri": "http://purl.org/net/sword/error/MaxUploadSizeExceeded",
"tag": "sword:MaxUploadSizeExceeded",
},
}
def make_error_dict(key, summary=None, verbose_description=None):
"""Utility function to factorize error message dictionary.
Args:
key (str): Error status key referenced in swh.deposit.errors module
summary (str/None): Error message clarifying the status
verbose_description (str/None): A more verbose
description or work around a potential problem.
Returns:
Dictionary with key 'error' detailing the 'status' and
associated 'message'
"""
return {
"error": {
"key": key,
"summary": summary,
"verboseDescription": verbose_description,
},
}
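# Illustrative example (editor's note): make_error_dict wraps its arguments in the
# nested structure that the API views pass to make_error_response_from_dict, e.g.:
#
#   make_error_dict(BAD_REQUEST, "Empty body request is not supported")
#   # => {
#   #        "error": {
#   #            "key": "bad-request",
#   #            "summary": "Empty body request is not supported",
#   #            "verboseDescription": None,
#   #        },
#   #    }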
def make_error_response_from_dict(req, error):
"""Utility function to return an http response with error detail.
Args:
req (Request): original request
error (dict): Error described as dict, typically generated
from the make_error_dict function.
Returns:
HttpResponse with detailed error.
"""
error_information = ERRORS[error["key"]]
context = error
context.update(error_information)
return render(
req,
"deposit/error.xml",
context=error,
content_type="application/xml",
status=error_information["status"],
)
def make_error_response(req, key, summary=None, verbose_description=None):
"""Utility function to create an http response with detailed error.
Args:
req (Request): original request
key (str): Error status key referenced in swh.deposit.errors module
summary (str): Error message clarifying the status
verbose_description (str / None): A more verbose
description or work around a potential problem.
Returns:
        HttpResponse with the detailed error.
"""
error = make_error_dict(key, summary, verbose_description)
return make_error_response_from_dict(req, error["error"])
+
+
+class BadRequestError(ValueError):
+ """Represents a bad input from the deposit client
+
+ """
+
+ def __init__(self, summary, verbose_description):
+ self.key = BAD_REQUEST
+ self.summary = summary
+ self.verbose_description = verbose_description
+
+ def to_dict(self):
+ return make_error_dict(self.key, self.summary, self.verbose_description)
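# Illustrative usage sketch (editor's note, mirroring the call sites in
# swh/deposit/api/common.py and deposit_update.py): views convert this exception
# into a SWORD error response rather than letting it propagate:
#
#   try:
#       self._store_metadata_deposit(deposit, swhid_reference, metadata, raw_metadata)
#   except BadRequestError as bad_request_error:
#       return bad_request_error.to_dict()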
diff --git a/swh/deposit/tests/api/test_deposit_metadata.py b/swh/deposit/tests/api/test_deposit_metadata.py
new file mode 100644
index 00000000..f9dcfe0f
--- /dev/null
+++ b/swh/deposit/tests/api/test_deposit_metadata.py
@@ -0,0 +1,277 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from io import BytesIO
+
+import attr
+from django.urls import reverse
+import pytest
+from rest_framework import status
+
+from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_LOAD_SUCCESS, APIConfig
+from swh.deposit.models import Deposit
+from swh.deposit.parsers import parse_xml
+from swh.deposit.utils import compute_metadata_context
+from swh.model.identifiers import SWHID, parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+from swh.storage.interface import PagedResult
+
+
+def test_deposit_metadata_invalid(
+ authenticated_client, deposit_collection, atom_dataset
+):
+ """Posting invalid swhid reference is bad request returned to client
+
+ """
+ invalid_swhid = "swh:1:dir :31b5c8cc985d190b5a7ef4878128ebfdc2358f49"
+ xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=invalid_swhid)
+
+ response = authenticated_client.post(
+ reverse(COL_IRI, args=[deposit_collection.name]),
+ content_type="application/atom+xml;type=entry",
+ data=xml_data,
+ HTTP_SLUG="external-id",
+ )
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+ assert b"Invalid SWHID reference" in response.content
+
+
+def test_deposit_metadata_fails_functional_checks(
+ authenticated_client, deposit_collection, atom_dataset
+):
+ """Posting functionally invalid metadata swhid is bad request returned to client
+
+ """
+ swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49"
+ invalid_xml_data = atom_dataset[
+ "entry-data-with-swhid-fail-metadata-functional-checks"
+ ].format(swhid=swhid)
+
+ response = authenticated_client.post(
+ reverse(COL_IRI, args=[deposit_collection.name]),
+ content_type="application/atom+xml;type=entry",
+ data=invalid_xml_data,
+ HTTP_SLUG="external-id",
+ )
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+ assert b"Functional metadata checks failure" in response.content
+
+
+@pytest.mark.parametrize(
+ "swhid,target_type",
+ [
+ (
+ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.CONTENT,
+ ),
+ (
+ "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.DIRECTORY,
+ ),
+ (
+ "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.REVISION,
+ ),
+ (
+ "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.RELEASE,
+ ),
+ (
+ "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.SNAPSHOT,
+ ),
+ (
+ "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
+ MetadataTargetType.CONTENT,
+ ),
+ (
+ "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa
+ MetadataTargetType.DIRECTORY,
+ ),
+ (
+ "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
+ MetadataTargetType.REVISION,
+ ),
+ (
+ "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
+ MetadataTargetType.RELEASE,
+ ),
+ (
+ "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
+ MetadataTargetType.SNAPSHOT,
+ ),
+ ],
+)
+def test_deposit_metadata_swhid(
+ swhid,
+ target_type,
+ authenticated_client,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Posting a swhid reference is stored on raw extrinsic metadata storage
+
+ """
+ swhid_reference = parse_swhid(swhid)
+ swhid_core = attr.evolve(swhid_reference, metadata={})
+
+ xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid)
+ deposit_client = authenticated_client.deposit_client
+
+ response = authenticated_client.post(
+ reverse(COL_IRI, args=[deposit_collection.name]),
+ content_type="application/atom+xml;type=entry",
+ data=xml_data,
+ HTTP_SLUG="external-id",
+ )
+
+ assert response.status_code == status.HTTP_201_CREATED
+ response_content = parse_xml(BytesIO(response.content))
+
+ # Ensure the deposit is finalized
+ deposit_id = int(response_content["deposit_id"])
+ deposit = Deposit.objects.get(pk=deposit_id)
+ assert isinstance(swhid_core, SWHID)
+ assert deposit.swhid == str(swhid_core)
+ assert deposit.swhid_context == str(swhid_reference)
+ assert deposit.complete_date == deposit.reception_date
+ assert deposit.complete_date is not None
+ assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+
+ # Ensure metadata stored in the metadata storage is consistent
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=deposit_client.provider_url,
+ metadata={"name": deposit_client.last_name},
+ )
+
+ actual_authority = swh_storage.metadata_authority_get(
+ MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url
+ )
+ assert actual_authority == metadata_authority
+
+ config = APIConfig()
+ metadata_fetcher = MetadataFetcher(
+ name=config.tool["name"],
+ version=config.tool["version"],
+ metadata=config.tool["configuration"],
+ )
+
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ config.tool["name"], config.tool["version"]
+ )
+ assert actual_fetcher == metadata_fetcher
+
+ page_results = swh_storage.raw_extrinsic_metadata_get(
+ target_type, swhid_core, metadata_authority
+ )
+ discovery_date = page_results.results[0].discovery_date
+
+ assert len(page_results.results) == 1
+ assert page_results.next_page_token is None
+
+ object_type, metadata_context = compute_metadata_context(swhid_reference)
+ assert page_results == PagedResult(
+ results=[
+ RawExtrinsicMetadata(
+ type=object_type,
+ target=swhid_core,
+ discovery_date=discovery_date,
+ authority=attr.evolve(metadata_authority, metadata=None),
+ fetcher=attr.evolve(metadata_fetcher, metadata=None),
+ format="sword-v2-atom-codemeta",
+ metadata=xml_data.encode(),
+ **metadata_context,
+ )
+ ],
+ next_page_token=None,
+ )
+ assert deposit.complete_date == discovery_date
+
+
+@pytest.mark.parametrize(
+ "url", ["https://gitlab.org/user/repo", "https://whatever.else/repo",]
+)
+def test_deposit_metadata_origin(
+ url, authenticated_client, deposit_collection, atom_dataset, swh_storage,
+):
+ """Posting a swhid reference is stored on raw extrinsic metadata storage
+
+ """
+ xml_data = atom_dataset["entry-data-with-origin"].format(url=url)
+ deposit_client = authenticated_client.deposit_client
+ response = authenticated_client.post(
+ reverse(COL_IRI, args=[deposit_collection.name]),
+ content_type="application/atom+xml;type=entry",
+ data=xml_data,
+ HTTP_SLUG="external-id",
+ )
+
+ assert response.status_code == status.HTTP_201_CREATED
+ response_content = parse_xml(BytesIO(response.content))
+ # Ensure the deposit is finalized
+ deposit_id = int(response_content["deposit_id"])
+ deposit = Deposit.objects.get(pk=deposit_id)
+ # no SWHID was provided as input, so those fields cannot be set
+ assert deposit.swhid is None
+ assert deposit.swhid_context is None
+ assert deposit.complete_date == deposit.reception_date
+ assert deposit.complete_date is not None
+ assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+
+ # Ensure metadata stored in the metadata storage is consistent
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=deposit_client.provider_url,
+ metadata={"name": deposit_client.last_name},
+ )
+
+ actual_authority = swh_storage.metadata_authority_get(
+ MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url
+ )
+ assert actual_authority == metadata_authority
+
+ config = APIConfig()
+ metadata_fetcher = MetadataFetcher(
+ name=config.tool["name"],
+ version=config.tool["version"],
+ metadata=config.tool["configuration"],
+ )
+
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ config.tool["name"], config.tool["version"]
+ )
+ assert actual_fetcher == metadata_fetcher
+
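+ # metadata on an origin is looked up with the origin URL as target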
+ page_results = swh_storage.raw_extrinsic_metadata_get(
+ MetadataTargetType.ORIGIN, url, metadata_authority
+ )
+ discovery_date = page_results.results[0].discovery_date
+
+ assert len(page_results.results) == 1
+ assert page_results.next_page_token is None
+
+ assert page_results == PagedResult(
+ results=[
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ target=url,
+ discovery_date=discovery_date,
+ authority=attr.evolve(metadata_authority, metadata=None),
+ fetcher=attr.evolve(metadata_fetcher, metadata=None),
+ format="sword-v2-atom-codemeta",
+ metadata=xml_data.encode(),
+ )
+ ],
+ next_page_token=None,
+ )
+ assert deposit.complete_date == discovery_date
diff --git a/swh/deposit/tests/api/test_parsers.py b/swh/deposit/tests/api/test_parsers.py
index 374b2c5f..765584ff 100644
--- a/swh/deposit/tests/api/test_parsers.py
+++ b/swh/deposit/tests/api/test_parsers.py
@@ -1,249 +1,238 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import OrderedDict
import io
import pytest
from swh.deposit.parsers import SWHXMLParser, parse_swh_reference, parse_xml
from swh.model.exceptions import ValidationError
from swh.model.identifiers import parse_swhid
def test_parsing_without_duplicates():
xml_no_duplicate = io.BytesIO(
b"""
Awesome Compiler
GPL3.0
https://opensource.org/licenses/GPL-3.0
Python3
author1
Inria
ocaml
http://issuetracker.com
"""
)
actual_result = SWHXMLParser().parse(xml_no_duplicate)
expected_dict = OrderedDict(
[
("title", "Awesome Compiler"),
(
"codemeta:license",
OrderedDict(
[
("codemeta:name", "GPL3.0"),
("codemeta:url", "https://opensource.org/licenses/GPL-3.0"),
]
),
),
("codemeta:runtimePlatform", "Python3"),
(
"codemeta:author",
OrderedDict(
[("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")]
),
),
("codemeta:programmingLanguage", "ocaml"),
("codemeta:issueTracker", "http://issuetracker.com"),
]
)
assert expected_dict == actual_result
def test_parsing_with_duplicates():
xml_with_duplicates = io.BytesIO(
b"""
Another Compiler
GNU/Linux
GPL3.0
https://opensource.org/licenses/GPL-3.0
Un*x
author1
Inria
author2
Inria
ocaml
haskell
spdx
http://spdx.org
python3
"""
)
actual_result = SWHXMLParser().parse(xml_with_duplicates)
expected_dict = OrderedDict(
[
("title", "Another Compiler"),
("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]),
(
"codemeta:license",
[
OrderedDict(
[
("codemeta:name", "GPL3.0"),
("codemeta:url", "https://opensource.org/licenses/GPL-3.0"),
]
),
OrderedDict(
[("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")]
),
],
),
(
"codemeta:author",
[
OrderedDict(
[
("codemeta:name", "author1"),
("codemeta:affiliation", "Inria"),
]
),
OrderedDict(
[
("codemeta:name", "author2"),
("codemeta:affiliation", "Inria"),
]
),
],
),
("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]),
]
)
assert expected_dict == actual_result
@pytest.fixture
def xml_with_origin_reference():
xml_data = """
"""
return xml_data.strip()
def test_parse_swh_reference_origin(xml_with_origin_reference):
url = "https://url"
xml_data = xml_with_origin_reference.format(url=url)
metadata = parse_xml(xml_data)
actual_origin = parse_swh_reference(metadata)
assert actual_origin == url
@pytest.fixture
def xml_with_empty_reference():
xml_data = """
{swh_reference}
"""
return xml_data.strip()
@pytest.mark.parametrize(
"xml_ref",
[
"",
"",
"",
"""""",
],
)
def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref):
xml_body = xml_with_empty_reference.format(swh_reference=xml_ref)
metadata = parse_xml(xml_body)
assert parse_swh_reference(metadata) is None
@pytest.fixture
-def xml_with_swhid():
- xml_data = """
-
-
-
-
-
-
-
- """
- return xml_data.strip()
+def xml_with_swhid(atom_dataset):
+ return atom_dataset["entry-data-with-swhid"]
@pytest.mark.parametrize(
"swhid",
[
"swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa
"swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
],
)
def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
xml_data = xml_with_swhid.format(swhid=swhid)
metadata = parse_xml(xml_data)
actual_swhid = parse_swh_reference(metadata)
assert actual_swhid is not None
expected_swhid = parse_swhid(swhid)
assert actual_swhid == expected_swhid
@pytest.mark.parametrize(
"invalid_swhid,error_msg",
[
("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"),
(
"swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa
"visit qualifier should be a core SWHID with type",
),
(
"swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa
"anchor qualifier should be a core SWHID with type one of",
), # noqa
],
)
def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid):
"""Unparsable swhid should raise
"""
xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
metadata = parse_xml(xml_invalid_swhid)
with pytest.raises(ValidationError, match=error_msg):
parse_swh_reference(metadata)
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
index da8d2e4b..cb2c92e3 100644
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -1,441 +1,445 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import base64
from functools import partial
import os
import re
from typing import Mapping
from django.test.utils import setup_databases # type: ignore
from django.urls import reverse
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import pytest
from rest_framework import status
from rest_framework.test import APIClient
import yaml
from swh.core.config import read
from swh.core.pytest_plugin import get_response_cb
from swh.deposit.config import (
COL_IRI,
DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_LOAD_FAILURE,
DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_PARTIAL,
DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_VERIFIED,
EDIT_SE_IRI,
setup_django_for,
)
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive
from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
from swh.scheduler import get_scheduler
# mypy is asked to ignore the import statement above because setup_databases
# is not part of the d.t.utils.__all__ variable.
TEST_USER = {
"username": "test",
"password": "password",
"email": "test@example.org",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
"domain": "archives-ouvertes.fr/",
"collection": {"name": "test"},
}
def pytest_configure():
setup_django_for("testing")
@pytest.fixture
def requests_mock_datadir(datadir, requests_mock_datadir):
"""Override default behavior to deal with put/post methods
"""
cb = partial(get_response_cb, datadir=datadir)
requests_mock_datadir.put(re.compile("https://"), body=cb)
requests_mock_datadir.post(re.compile("https://"), body=cb)
return requests_mock_datadir
@pytest.fixture()
def deposit_config(swh_scheduler_config, swh_storage_backend_config):
return {
"max_upload_size": 500,
"extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
"checks": False,
"scheduler": {"cls": "local", **swh_scheduler_config,},
"storage_metadata": swh_storage_backend_config,
}
@pytest.fixture()
def deposit_config_path(tmp_path, monkeypatch, deposit_config):
conf_path = os.path.join(tmp_path, "deposit.yml")
with open(conf_path, "w") as f:
f.write(yaml.dump(deposit_config))
monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path)
return conf_path
@pytest.fixture(autouse=True)
def deposit_autoconfig(deposit_config_path):
"""Enforce config for deposit classes inherited from APIConfig."""
cfg = read(deposit_config_path)
if "scheduler" in cfg:
# scheduler setup: require the check-deposit and load-deposit tasks
scheduler = get_scheduler(**cfg["scheduler"])
task_types = [
{
"type": "check-deposit",
"backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk",
"description": "Check deposit metadata/archive before loading",
"num_retries": 3,
},
{
"type": "load-deposit",
"backend_name": "swh.loader.package.deposit.tasks.LoadDeposit",
"description": "Loading deposit archive into swh archive",
"num_retries": 3,
},
]
for task_type in task_types:
scheduler.create_task_type(task_type)
@pytest.fixture(scope="session")
def django_db_setup(request, django_db_blocker, postgresql_proc):
from django.conf import settings
settings.DATABASES["default"].update(
{
("ENGINE", "django.db.backends.postgresql"),
("NAME", "tests"),
("USER", postgresql_proc.user), # noqa
("HOST", postgresql_proc.host), # noqa
("PORT", postgresql_proc.port), # noqa
}
)
with django_db_blocker.unblock():
setup_databases(
verbosity=request.config.option.verbose, interactive=False, keepdb=False
)
def execute_sql(sql):
"""Execute sql to postgres db"""
with psycopg2.connect(database="postgres") as conn:
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
cur.execute(sql)
@pytest.fixture(autouse=True, scope="session")
def swh_proxy():
"""Automatically inject this fixture in all tests to ensure no outside
connection takes place.
"""
os.environ["http_proxy"] = "http://localhost:999"
os.environ["https_proxy"] = "http://localhost:999"
def create_deposit_collection(collection_name: str):
"""Create a deposit collection with name collection_name
"""
from swh.deposit.models import DepositCollection
try:
collection = DepositCollection._default_manager.get(name=collection_name)
except DepositCollection.DoesNotExist:
collection = DepositCollection(name=collection_name)
collection.save()
return collection
def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]):
@pytest.fixture
def _deposit_collection(db, collection_name=collection_name):
return create_deposit_collection(collection_name)
return _deposit_collection
deposit_collection = deposit_collection_factory()
deposit_another_collection = deposit_collection_factory("another-collection")
@pytest.fixture
def deposit_user(db, deposit_collection):
"""Create/Return the test_user "test"
"""
from swh.deposit.models import DepositClient
try:
user = DepositClient._default_manager.get(username=TEST_USER["username"])
except DepositClient.DoesNotExist:
user = DepositClient._default_manager.create_user(
username=TEST_USER["username"],
email=TEST_USER["email"],
password=TEST_USER["password"],
provider_url=TEST_USER["provider_url"],
domain=TEST_USER["domain"],
)
user.collections = [deposit_collection.id]
user.save()
return user
@pytest.fixture
def client():
"""Override pytest-django one which does not work for djangorestframework.
"""
return APIClient() # <- drf's client
-@pytest.yield_fixture
+@pytest.fixture
def authenticated_client(client, deposit_user):
"""Returned a logged client
+ This also patches the client instance to keep a reference to the associated
+ deposit_user.
+
"""
_token = "%s:%s" % (deposit_user.username, TEST_USER["password"])
token = base64.b64encode(_token.encode("utf-8"))
authorization = "Basic %s" % token.decode("utf-8")
client.credentials(HTTP_AUTHORIZATION=authorization)
+ client.deposit_client = deposit_user
yield client
client.logout()
@pytest.fixture
def sample_archive(tmp_path):
"""Returns a sample archive
"""
tmp_path = str(tmp_path) # pytest version limitation in previous version
archive = create_arborescence_archive(
tmp_path, "archive1", "file1", b"some content in file"
)
return archive
@pytest.fixture
def atom_dataset(datadir) -> Mapping[str, str]:
"""Compute the paths to atom files.
Returns:
Dict of atom name per content (bytes)
"""
atom_path = os.path.join(datadir, "atom")
data = {}
for filename in os.listdir(atom_path):
filepath = os.path.join(atom_path, filename)
with open(filepath, "rb") as f:
raw_content = f.read().decode("utf-8")
# Keep the filename without extension
atom_name = filename.split(".")[0]
data[atom_name] = raw_content
return data
def create_deposit(
authenticated_client,
collection_name: str,
sample_archive,
external_id: str,
deposit_status=DEPOSIT_STATUS_DEPOSITED,
):
"""Create a skeleton shell deposit
"""
url = reverse(COL_IRI, args=[collection_name])
# when
response = authenticated_client.post(
url,
content_type="application/zip", # as zip
data=sample_archive["data"],
# + headers
CONTENT_LENGTH=sample_archive["length"],
HTTP_SLUG=external_id,
HTTP_CONTENT_MD5=sample_archive["md5sum"],
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
HTTP_IN_PROGRESS="false",
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]),
)
# then
assert response.status_code == status.HTTP_201_CREATED
from swh.deposit.models import Deposit
deposit = Deposit._default_manager.get(external_id=external_id)
if deposit.status != deposit_status:
deposit.status = deposit_status
deposit.save()
assert deposit.status == deposit_status
return deposit
def create_binary_deposit(
authenticated_client,
collection_name: str,
sample_archive,
external_id: str,
deposit_status: str = DEPOSIT_STATUS_DEPOSITED,
atom_dataset: Mapping[str, bytes] = {},
):
"""Create a deposit with both metadata and archive set. Then alters its status
to `deposit_status`.
"""
deposit = create_deposit(
authenticated_client,
collection_name,
sample_archive,
external_id=external_id,
deposit_status=DEPOSIT_STATUS_PARTIAL,
)
response = authenticated_client.post(
reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]),
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"),
HTTP_SLUG=deposit.external_id,
HTTP_IN_PROGRESS="true",
)
assert response.status_code == status.HTTP_201_CREATED
assert deposit.status == DEPOSIT_STATUS_PARTIAL
from swh.deposit.models import Deposit
deposit = Deposit._default_manager.get(pk=deposit.id)
if deposit.status != deposit_status:
deposit.status = deposit_status
deposit.save()
assert deposit.status == deposit_status
return deposit
def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED):
"""Build deposit with a specific status
"""
@pytest.fixture()
def _deposit(
sample_archive,
deposit_collection,
authenticated_client,
deposit_status=deposit_status,
):
external_id = "external-id-%s" % deposit_status
return create_deposit(
authenticated_client,
deposit_collection.name,
sample_archive,
external_id=external_id,
deposit_status=deposit_status,
)
return _deposit
deposited_deposit = deposit_factory()
rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED)
partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL)
verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED)
completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS)
failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE)
@pytest.fixture
def partial_deposit_with_metadata(
sample_archive, deposit_collection, authenticated_client, atom_dataset
):
"""Returns deposit with archive and metadata provided, status 'partial'
"""
return create_binary_deposit(
authenticated_client,
deposit_collection.name,
sample_archive,
external_id="external-id-partial",
deposit_status=DEPOSIT_STATUS_PARTIAL,
atom_dataset=atom_dataset,
)
@pytest.fixture
def partial_deposit_only_metadata(
deposit_collection, authenticated_client, atom_dataset
):
response = authenticated_client.post(
reverse(COL_IRI, args=[deposit_collection.name]),
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
HTTP_SLUG="external-id-partial",
HTTP_IN_PROGRESS=True,
)
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(response.content)
deposit_id = response_content["deposit_id"]
from swh.deposit.models import Deposit
deposit = Deposit._default_manager.get(pk=deposit_id)
assert deposit.status == DEPOSIT_STATUS_PARTIAL
return deposit
@pytest.fixture
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
"""Returns a completed deposit (load success)
"""
deposit = create_deposit(
authenticated_client,
deposit_collection.name,
sample_archive,
external_id="external-id-complete",
deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
)
origin = "https://hal.archives-ouvertes.fr/hal-01727745"
directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10"
snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0"
deposit.swhid = swhid(DIRECTORY, directory_id)
deposit.swhid_context = swhid(
DIRECTORY,
directory_id,
metadata={
"origin": origin,
"visit": swhid(SNAPSHOT, snapshot_id),
"anchor": swhid(REVISION, revision_id),
"path": "/",
},
)
deposit.save()
return deposit
@pytest.fixture()
def tmp_path(tmp_path):
return str(tmp_path) # issue with oldstable's pytest version
diff --git a/swh/deposit/tests/data/atom/entry-data-with-origin.xml b/swh/deposit/tests/data/atom/entry-data-with-origin.xml
new file mode 100644
index 00000000..0cc06a8b
--- /dev/null
+++ b/swh/deposit/tests/data/atom/entry-data-with-origin.xml
@@ -0,0 +1,13 @@
+
+
+ Awesome Compiler
+ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
+ dudess
+
+
+
+
+
+
diff --git a/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml b/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml
new file mode 100644
index 00000000..dab5b1f8
--- /dev/null
+++ b/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml
@@ -0,0 +1,12 @@
+
+
+
+ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
+ 2017-10-07T15:17:08Z
+
+
+
+
+
+
diff --git a/swh/deposit/tests/data/atom/entry-data-with-swhid.xml b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml
new file mode 100644
index 00000000..34a59474
--- /dev/null
+++ b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml
@@ -0,0 +1,13 @@
+
+
+ Awesome Compiler
+ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
+ dudess
+
+
+
+
+
+
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index 8be41c4c..430e5790 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -1,141 +1,200 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Union
from unittest.mock import patch
import pytest
from swh.deposit import utils
+from swh.model.identifiers import SWHID, parse_swhid
+from swh.model.model import MetadataTargetType
def test_merge():
"""Calling utils.merge on dicts should merge without losing information
"""
d0 = {"author": "someone", "license": [["gpl2"]], "a": 1}
d1 = {
"author": ["author0", {"name": "author1"}],
"license": [["gpl3"]],
"b": {"1": "2"},
}
d2 = {"author": map(lambda x: x, ["else"]), "license": "mit", "b": {"2": "3",}}
d3 = {
"author": (v for v in ["no one"]),
}
actual_merge = utils.merge(d0, d1, d2, d3)
expected_merge = {
"a": 1,
"license": [["gpl2"], ["gpl3"], "mit"],
"author": ["someone", "author0", {"name": "author1"}, "else", "no one"],
"b": {"1": "2", "2": "3",},
}
assert actual_merge == expected_merge
def test_merge_2():
d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}}
d1 = {"license": "gpl3", "runtime": "GNU/Linux"}
expected = {
"license": ["gpl2", "gpl3"],
"runtime": [{"os": "unix derivative"}, "GNU/Linux"],
}
actual = utils.merge(d0, d1)
assert actual == expected
def test_merge_edge_cases():
input_dict = {
"license": ["gpl2", "gpl3"],
"runtime": [{"os": "unix derivative"}, "GNU/Linux"],
}
# against empty dict
actual = utils.merge(input_dict, {})
assert actual == input_dict
# against oneself
actual = utils.merge(input_dict, input_dict, input_dict)
assert actual == input_dict
def test_merge_one_dict():
"""Merge one dict should result in the same dict value
"""
input_and_expected = {"anything": "really"}
actual = utils.merge(input_and_expected)
assert actual == input_and_expected
def test_merge_raise():
"""Calling utils.merge with any no dict argument should raise
"""
d0 = {"author": "someone", "a": 1}
d1 = ["not a dict"]
with pytest.raises(ValueError):
utils.merge(d0, d1)
with pytest.raises(ValueError):
utils.merge(d1, d0)
with pytest.raises(ValueError):
utils.merge(d1)
assert utils.merge(d0) == d0
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_0(mock_normalize):
"""When date is a list, choose the first date and normalize it
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date(["2017-10-12", "date1"])
expected_date = "2017-10-12 00:00:00+00:00"
assert str(actual_date) == expected_date
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_1(mock_normalize):
"""Providing a date in a reasonable format, everything is fine
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date("2018-06-11 17:02:02")
expected_date = "2018-06-11 17:02:02+00:00"
assert str(actual_date) == expected_date
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_doing_irrelevant_stuff(mock_normalize):
"""Providing a date with only the year results in a reasonable date
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date("2017")
expected_date = "2017-01-01 00:00:00+00:00"
assert str(actual_date) == expected_date
+
+
+@pytest.mark.parametrize(
+ "swhid_or_origin,expected_type,expected_metadata_context",
+ [
+ ("https://something", MetadataTargetType.ORIGIN, {"origin": None}),
+ (
+ "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49",
+ MetadataTargetType.CONTENT,
+ {"origin": None},
+ ),
+ (
+ "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah",
+ MetadataTargetType.SNAPSHOT,
+ {"origin": "http://blah", "path": None},
+ ),
+ (
+ "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path",
+ MetadataTargetType.DIRECTORY,
+ {"origin": None, "path": b"/path"},
+ ),
+ (
+ "swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa
+ MetadataTargetType.REVISION,
+ {
+ "origin": None,
+ "path": None,
+ "snapshot": parse_swhid(
+ "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49"
+ ),
+ },
+ ),
+ (
+ "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa
+ MetadataTargetType.RELEASE,
+ {
+ "origin": None,
+ "path": None,
+ "directory": parse_swhid(
+ "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49"
+ ),
+ },
+ ),
+ ],
+)
+def test_compute_metadata_context(
+ swhid_or_origin: Union[str, SWHID], expected_type, expected_metadata_context
+):
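+ # origins stay as plain URLs; every other target is parsed into a SWHID first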
+ if expected_type != MetadataTargetType.ORIGIN:
+ assert isinstance(swhid_or_origin, str)
+ swhid_or_origin = parse_swhid(swhid_or_origin)
+
+ object_type, metadata_context = utils.compute_metadata_context(swhid_or_origin)
+
+ assert object_type == expected_type
+ assert metadata_context == expected_metadata_context
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index 3b79293e..e306902a 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,83 +1,119 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from types import GeneratorType
+from typing import Any, Dict, Tuple, Union
import iso8601
-from swh.model.identifiers import normalize_timestamp
+from swh.model.identifiers import SWHID, normalize_timestamp, parse_swhid
+from swh.model.model import MetadataTargetType
def merge(*dicts):
"""Given an iterator of dicts, merge them losing no information.
Args:
*dicts: arguments are all supposed to be dict to merge into one
Returns:
dict merged without losing information
"""
def _extend(existing_val, value):
"""Given an existing value and a value (as potential lists), merge
them together without repetition.
"""
if isinstance(value, (list, map, GeneratorType)):
vals = value
else:
vals = [value]
for v in vals:
if v in existing_val:
continue
existing_val.append(v)
return existing_val
d = {}
for data in dicts:
if not isinstance(data, dict):
raise ValueError("dicts is supposed to be a variable arguments of dict")
for key, value in data.items():
existing_val = d.get(key)
if not existing_val:
d[key] = value
continue
if isinstance(existing_val, (list, map, GeneratorType)):
new_val = _extend(existing_val, value)
elif isinstance(existing_val, dict):
if isinstance(value, dict):
new_val = merge(existing_val, value)
else:
new_val = _extend([existing_val], value)
else:
new_val = _extend([existing_val], value)
d[key] = new_val
return d
def normalize_date(date):
"""Normalize date fields as expected by swh workers.
If date is a list, elect arbitrarily the first element of that
list
If date is (then) a string, parse it through
dateutil.parser.parse to extract a datetime.
Then normalize it through
swh.model.identifiers.normalize_timestamp.
Returns
The swh date object
"""
if isinstance(date, list):
date = date[0]
if isinstance(date, str):
date = iso8601.parse_date(date)
return normalize_timestamp(date)
+
+
+def compute_metadata_context(
+ swhid_reference: Union[SWHID, str]
+) -> Tuple[MetadataTargetType, Dict[str, Any]]:
+ """Given a SWHID object, determine the context as a dict.
+
+ The parse_swhid calls within are not expected to raise, because invalid references
+ should have been caught earlier on.
+
+ """
+ metadata_context: Dict[str, Any] = {"origin": None}
+ if isinstance(swhid_reference, SWHID):
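+ # a SWHID maps directly to the metadata target type of its object type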
+ object_type = MetadataTargetType(swhid_reference.object_type)
+ assert object_type != MetadataTargetType.ORIGIN
+
+ if swhid_reference.metadata:
+ path = swhid_reference.metadata.get("path")
+ metadata_context = {
+ "origin": swhid_reference.metadata.get("origin"),
+ "path": path.encode() if path else None,
+ }
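+ # the "visit" qualifier, when present, is exposed as the snapshot context entry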
+ snapshot = swhid_reference.metadata.get("visit")
+ if snapshot:
+ metadata_context["snapshot"] = parse_swhid(snapshot)
+
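+ # the "anchor" qualifier becomes a context entry named after its object type (revision, release, directory, ...)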
+ anchor = swhid_reference.metadata.get("anchor")
+ if anchor:
+ anchor_swhid = parse_swhid(anchor)
+ metadata_context[anchor_swhid.object_type] = anchor_swhid
+ else:
+ object_type = MetadataTargetType.ORIGIN
+
+ return object_type, metadata_context