diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index ff7606c7..55c1b084 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,1042 +1,1172 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union +import attr from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView +from swh.deposit.api.checks import check_metadata +from swh.deposit.api.converters import convert_status_detail +from swh.deposit.models import Deposit +from swh.deposit.utils import compute_metadata_context from swh.model import hashutil +from swh.model.identifiers import SWHID, ValidationError +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + RawExtrinsicMetadata, +) from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, + BadRequestError, ParserError, make_error_dict, make_error_response, make_error_response_from_dict, ) -from ..models import Deposit, DepositClient, DepositCollection, DepositRequest -from ..parsers import parse_xml +from ..models import DepositClient, DepositCollection, DepositRequest +from ..parsers import parse_swh_reference, parse_xml ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] class AuthenticatedAPIView(APIView): """Mixin intended as a base API view to enforce the basic authentication check """ authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ def _read_headers(self, request: Request) -> Dict[str, Any]: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted).
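        For instance (a sketch of the normalization performed below; the
        header values shown are illustrative)::

            In-Progress: true  ->  headers["in-progress"] is True
            Content-MD5: ab12  ->  headers["content-md5sum"] == bytes.fromhex("ab12")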
Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") swhid = meta.get("HTTP_X_CHECK_SWHID") return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, "swhid": swhid, } def _compute_md5(self, filehandler) -> bytes: """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (bytes) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( self, request: Request, deposit_id: Optional[int] = None, in_progress: bool = False, external_id: Optional[str] = None, ) -> Deposit: """Save/Update a deposit in db. Args: request: request data deposit_id: deposit identifier in_progress: deposit status external_id: external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ complete_date: Optional[datetime.datetime] = None deposit_parent: Optional[Deposit] = None if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: # find a deposit parent (same external id, status load to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: # then no parent for that deposit, deposit_parent already None pass assert external_id is not None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, retries_left=3, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit.
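        For example, the shapes passed by the callers in this module look
        like the following (variable names are illustrative)::

            {ARCHIVE_KEY: filehandler}
            {METADATA_KEY: parsed_metadata, RAW_METADATA_KEY: raw_bytes}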
Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None return deposit_request def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict: """Delete archive references from the deposit id. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit_id: The deposit to delete Returns: Empty dict when ok, or a dict with an 'error' key describing the failure. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"The deposit {deposit_id} does not exist" ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_preconditions_on( self, filehandler, md5sum: bytes, content_length: Optional[int] = None ) -> Optional[Dict]: """Check that the preconditions on the provided file are respected: the length and/or the md5sum hash must match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check md5sum: md5 hash expected from the file's content content_length: the expected length if provided. Returns: Either None if no error, or a dictionary with an 'error' key detailing the problem. """ max_upload_size = self.config["max_upload_size"] if content_length: if content_length > max_upload_size: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes)."
"Please consider sending the archive in multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} do not match.", ) return None def _binary_upload( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict[str, Any]: """Binary upload routine. Any other kind of request yields a 415 response. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: the archive file name Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided do not match the archive's actual content - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"], content_length ) if precondition_status_response: return precondition_status_response external_id = headers["slug"] # actual storage of data archive_metadata = filehandler deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "status": deposit.status, "archive": filehandler.name, } def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata.
""" raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier if provided replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers["slug"] content_types_present = set() data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value content_type = fh.content_type if content_type in content_types_present: return make_error_dict( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(content_type) assert content_type is not None data[content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"] ) if precondition_status_response: return precondition_status_response try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: return make_error_dict( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. 
" "Please ensure your metadata file is correctly formatted.", ) # actual storage of data deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) assert filehandler is not None return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": filehandler.name, "status": deposit.status, } + def _store_metadata_deposit( + self, + deposit: Deposit, + swhid_reference: Union[str, SWHID], + metadata: Dict, + raw_metadata: bytes, + deposit_origin: Optional[str] = None, + ) -> Tuple[Union[SWHID, str], Union[SWHID, str], Deposit, DepositRequest]: + """When all user inputs pass the checks, this associates the raw_metadata to the + swhid_reference in the raw extrinsic metadata storage. In case of any issues, + a bad request response is returned to the user with the details. + + Checks: + - metadata are technically parsable + - metadata pass the functional checks + - SWHID (if any) is technically valid + + Args: + deposit: Deposit reference + swhid_reference: The swhid or the origin to attach metadata information to + metadata: Full dict of metadata to check for validity (parsed out of + raw_metadata) + raw_metadata: The actual raw metadata to send in the storage metadata + deposit_origin: Optional deposit origin url to use if any (e.g. deposit + update scenario provides one) + + Raises: + BadRequestError in case of incorrect inputs from the deposit client + (e.g. functionally invalid metadata, ...) + + Returns: + Tuple of core swhid, swhid context, deposit and deposit request + + """ + metadata_ok, error_details = check_metadata(metadata) + if not metadata_ok: + assert error_details, "Details should be set when a failure occurs" + raise BadRequestError( + "Functional metadata checks failure", + convert_status_detail(error_details), + ) + + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit.client.provider_url, + metadata={"name": deposit.client.last_name}, + ) + + metadata_fetcher = MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + metadata=self.tool["configuration"], + ) + + # replace metadata within the deposit backend + deposit_request_data = { + METADATA_KEY: metadata, + RAW_METADATA_KEY: raw_metadata, + } + + # actually add the metadata to the completed deposit + deposit_request = self._deposit_request_put(deposit, deposit_request_data) + + object_type, metadata_context = compute_metadata_context(swhid_reference) + if deposit_origin: # metadata deposit update on completed deposit + metadata_context["origin"] = deposit_origin + + swhid_core: Union[str, SWHID] + if isinstance(swhid_reference, str): + swhid_core = swhid_reference + else: + swhid_core = attr.evolve(swhid_reference, metadata={}) + + # store that metadata to the metadata storage + metadata_object = RawExtrinsicMetadata( + type=object_type, + target=swhid_core, # core swhid or origin + discovery_date=deposit_request.date, + authority=metadata_authority, + fetcher=metadata_fetcher, + format="sword-v2-atom-codemeta", + metadata=raw_metadata, + **metadata_context, + ) + + # write to metadata storage + self.storage_metadata.metadata_authority_add([metadata_authority]) + self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) + 
self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) + + return (swhid_core, swhid_reference, deposit, deposit_request) + def _atom_entry( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, replace_metadata: bool = False, replace_archives: bool = False, ) -> Dict[str, Any]: """Atom entry deposit. Args: request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier if provided replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated with the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send metadata. " "If the body is empty, there is no metadata.", ) - external_id = metadata.get("external_identifier", headers["slug"]) + # Determine if we are in the metadata-only deposit case + try: + swhid = parse_swh_reference(metadata) + except ValidationError as e: + return make_error_dict(PARSING_ERROR, "Invalid SWHID reference", str(e),) - # TODO: Determine if we are in the metadata-only deposit case. If it is, then - # save deposit and deposit request typed 'metadata' and send metadata to the - # metadata storage. Otherwise, do as existing deposit. + external_id = metadata.get("external_identifier", headers["slug"]) deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) + if swhid is not None: + try: + swhid, swhid_ref, depo, depo_request = self._store_metadata_deposit( + deposit, swhid, metadata, raw_metadata + ) + except BadRequestError as bad_request_error: + return bad_request_error.to_dict() + + deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS + if isinstance(swhid_ref, SWHID): + deposit.swhid = str(swhid) + deposit.swhid_context = str(swhid_ref) + deposit.complete_date = depo_request.date + deposit.reception_date = depo_request.date + deposit.save() + + return { + "deposit_id": deposit.id, + "deposit_date": depo_request.date, + "status": deposit.status, + "archive": None, + } + self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": None, "status": deposit.status, } def _empty_post( self, request: Request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Empty post to finalize a partial deposit.
Args: request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client deposit_id: deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. """ deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { "deposit_id": deposit_id, "deposit_date": deposit.complete_date, "status": deposit.status, "archive": None, } def _make_iris( self, request: Request, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Define the IRI endpoints Args: request (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary mapping IRI keys to their URLs. """ args = [collection_name, deposit_id] return { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } def additional_checks( self, request: Request, headers: Dict[str, Any], collection_name: str, deposit_id: Optional[int] = None, ) -> Dict[str, Any]: """Hook for child classes to perform additional checks. Returns: dict with an 'error' key detailing the problem. """ return {} def checks( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> Dict[str, Any]: try: self._collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( NOT_FOUND, f"Unknown collection name {collection_name}" ) assert self._collection is not None username = request.user.username if username: # unauthenticated request can have the username empty try: self._client: DepositClient = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: return make_error_dict(NOT_FOUND, f"Unknown client name {username}") collection_id = self._collection.id collections = self._client.collections assert collections is not None if collection_id not in collections: return make_error_dict( FORBIDDEN, f"Client {username} cannot access collection {collection_name}", ) headers = self._read_headers(request) if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) assert deposit is not None checks = self.restrict_access(request, headers, deposit) if checks: return checks if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") checks = self.additional_checks(request, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def restrict_access( self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: """Allow modifications on deposit with status 'partial' only, reject the rest.
""" if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) return {} def _basic_not_allowed_method(self, request: Request, method: str): return make_error_response( request, METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to retrieve a deposit's resources. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) r = self.process_get(request, collection_name, deposit_id) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/zip" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit_id: int ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support POST method. """ def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 200 or 201 response (as computed by process_post) when no error during routine occurred.
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] _status, _iri_key, data = self.process_post( request, headers, collection_name, deposit_id ) error = data.get("error") if error: return make_error_response_from_dict(request, error) data["packagings"] = ACCEPT_PACKAGINGS iris = self._make_iris(request, collection_name, data["deposit_id"]) data.update(iris) response = render( request, "deposit/deposit_receipt.xml", context=data, content_type="application/xml", status=_status, ) response._headers["location"] = "Location", data[_iri_key] # type: ignore return response @abstractmethod def process_post( self, request, headers: Dict, collection_name: str, deposit_id: Optional[int] = None, ) -> Tuple[int, str, Dict]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] data = self.process_put(request, headers, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put( self, request: Request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) assert deposit_id is not None data = self.process_delete(request, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete( self, request: Request, collection_name: str, deposit_id: int ) -> Dict: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. 
.api.deposit_update.APIUpdateArchive) """ return {} diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index 068c80eb..d4fcfba0 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,342 +1,277 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Optional, Tuple from rest_framework import status from rest_framework.request import Request -from swh.deposit.api.checks import check_metadata -from swh.deposit.api.converters import convert_status_detail from swh.deposit.models import Deposit from swh.model.identifiers import parse_swhid -from swh.model.model import ( - MetadataAuthority, - MetadataAuthorityType, - MetadataFetcher, - MetadataTargetType, - RawExtrinsicMetadata, -) -from swh.storage import get_storage -from swh.storage.interface import StorageInterface - -from ..config import ( - CONT_FILE_IRI, - DEPOSIT_STATUS_LOAD_SUCCESS, - EDIT_SE_IRI, - EM_IRI, - METADATA_KEY, - RAW_METADATA_KEY, -) -from ..errors import BAD_REQUEST, ParserError, make_error_dict + +from ..config import CONT_FILE_IRI, DEPOSIT_STATUS_LOAD_SUCCESS, EDIT_SE_IRI, EM_IRI +from ..errors import BAD_REQUEST, BadRequestError, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, SWHFileUploadZipParser, SWHMultiPartParser, ) from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put( self, req, headers, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) def process_post( self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None ) -> Tuple[int, str, Dict]: """Add new content to the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) unused = 0 return unused, "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete content (archives) from existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 No Content """ return self._delete_archives(collection_name, deposit_id) class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) - def __init__(self): - super().__init__() - self.storage_metadata: StorageInterface = get_storage( - **self.config["storage_metadata"] - ) - def restrict_access( self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: """Relax the access restriction to allow metadata update on deposit with status "done" when a swhid is provided. """ if ( request.method == "PUT" and headers["swhid"] is not None and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS ): # Allow metadata update on deposit with status "done" when swhid provided return {} # otherwise, let the standard access restriction check occur return super().restrict_access(request, headers, deposit) def process_put( self, request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """This allows the following scenarios: - multipart: replace all the deposit (status partial) metadata and archive with the provided ones. - atom: replace all the deposit (status partial) metadata with the provided ones. - with swhid, atom: Add new metadata to the deposit (status done) and push it to the metadata storage directly. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart Raises: 400 if any of the following occur: - the swhid provided and the deposit swhid do not match - the provided metadata xml file is malformed - the provided xml atom entry is empty - the provided swhid does not exist in the archive Returns: 204 No content """ # noqa swhid = headers.get("swhid") if swhid is None: if request.content_type.startswith("multipart/"): return self._multipart_upload( request, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) # standard metadata update (replace all metadata already provided to the # deposit by the new ones) return self._atom_entry( request, headers, collection_name, deposit_id=deposit_id, replace_metadata=True, ) # Update metadata on a deposit already ingested # Write to the metadata storage (and the deposit backend) # no ingestion triggered deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS if swhid != deposit.swhid: return make_error_dict( BAD_REQUEST, f"Mismatched provided SWHID {swhid} with deposit's {deposit.swhid}.", "The provided SWHID does not match the deposit to update. " "Please ensure you send the correct deposit SWHID.", ) try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send metadata.
" "If the body is empty, there is no metadata.", ) - metadata_ok, error_details = check_metadata(metadata) - if not metadata_ok: - assert error_details, "Details should be set when a failure occurs" - return make_error_dict( - BAD_REQUEST, - "Functional metadata checks failure", - convert_status_detail(error_details), + try: + _, _, deposit, deposit_request = self._store_metadata_deposit( + deposit, parse_swhid(swhid), metadata, raw_metadata, deposit.origin_url, ) - - metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.DEPOSIT_CLIENT, - url=deposit.client.provider_url, - metadata={"name": deposit.client.last_name}, - ) - - metadata_fetcher = MetadataFetcher( - name=self.tool["name"], - version=self.tool["version"], - metadata=self.tool["configuration"], - ) - - deposit_swhid = parse_swhid(swhid) - - # replace metadata within the deposit backend - deposit_request_data = { - METADATA_KEY: metadata, - RAW_METADATA_KEY: raw_metadata, - } - - # actually add the metadata to the completed deposit - deposit_request = self._deposit_request_put(deposit, deposit_request_data) - # store that metadata to the metadata storage - metadata_object = RawExtrinsicMetadata( - type=MetadataTargetType.DIRECTORY, - target=deposit_swhid, - discovery_date=deposit_request.date, - authority=metadata_authority, - fetcher=metadata_fetcher, - format="sword-v2-atom-codemeta", - metadata=raw_metadata, - origin=deposit.origin_url, - ) - - # write to metadata storage - self.storage_metadata.metadata_authority_add([metadata_authority]) - self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) - self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) + except BadRequestError as bad_request_error: + return bad_request_error.to_dict() return { - "deposit_id": deposit_id, + "deposit_id": deposit.id, "deposit_date": deposit_request.date, "status": deposit.status, "archive": None, } def process_post( self, request, headers: Dict, collection_name: str, deposit_id: Optional[int] = None, ) -> Tuple[int, str, Dict]: """Add new metadata/archive to existing deposit. This allows the following scenarios to occur: - multipart: Add the provided metadata and archive to a deposit in status partial. - empty atom: Allows finalizing a deposit in status partial (transition to deposited). source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#continueddeposit_complete Returns: In the optimal case for a multipart or atom-entry update, a 201 Created response. The response body will hold a deposit receipt, and the response headers will contain a 'Location' entry with the EM-IRI. For the empty post case, this returns a 200.
""" # noqa assert deposit_id is not None if request.content_type.startswith("multipart/"): data = self._multipart_upload( request, headers, collection_name, deposit_id=deposit_id ) return (status.HTTP_201_CREATED, EM_IRI, data) content_length = headers["content-length"] or 0 if content_length == 0 and headers["in-progress"] is False: # check for final empty post data = self._empty_post(request, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) data = self._atom_entry( request, headers, collection_name, deposit_id=deposit_id ) return (status.HTTP_201_CREATED, EM_IRI, data) def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete the container (deposit). source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/config.py b/swh/deposit/config.py index ec1e0248..ba6c0939 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,103 +1,108 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config from swh.deposit import __version__ from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface +from swh.storage import get_storage +from swh.storage.interface import StorageInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = "edit_se_iri" EM_IRI = "em_iri" CONT_FILE_IRI = "cont_file_iri" SD_IRI = "servicedocument" COL_IRI = "upload" STATE_IRI = "state_iri" PRIVATE_GET_RAW_CONTENT = "private-download" PRIVATE_CHECK_DEPOSIT = "check-deposit" PRIVATE_PUT_DEPOSIT = "private-update" PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" ARCHIVE_KEY = "archive" METADATA_KEY = "metadata" RAW_METADATA_KEY = "raw-metadata" ARCHIVE_TYPE = "archive" METADATA_TYPE = "metadata" AUTHORIZED_PLATFORMS = ["development", "production", "testing"] DEPOSIT_STATUS_REJECTED = "rejected" DEPOSIT_STATUS_PARTIAL = "partial" DEPOSIT_STATUS_DEPOSITED = "deposited" DEPOSIT_STATUS_VERIFIED = "verified" DEPOSIT_STATUS_LOAD_SUCCESS = "done" DEPOSIT_STATUS_LOAD_FAILURE = "failed" # Revision author for deposit SWH_PERSON = { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org", } DEFAULT_CONFIG = { "max_upload_size": 209715200, "checks": True, } def setup_django_for(platform=None, config_file=None): """Setup function for command line tools (swh.deposit.create_user) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running config_file (str): Extra configuration file (typically for the production platform) Raises: ValueError in case of wrong platform inputs. 
""" if platform is not None: if platform not in AUTHORIZED_PLATFORMS: raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS) if "DJANGO_SETTINGS_MODULE" not in os.environ: os.environ["DJANGO_SETTINGS_MODULE"] = "swh.deposit.settings.%s" % platform if config_file: os.environ.setdefault("SWH_CONFIG_FILENAME", config_file) import django django.setup() class APIConfig: """API Configuration centralized class. This loads explicitly the configuration file out of the SWH_CONFIG_FILENAME environment variable. """ def __init__(self): self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) self.tool = { "name": "swh-deposit", "version": __version__, "configuration": {"sword_version": "2"}, } + self.storage_metadata: StorageInterface = get_storage( + **self.config["storage_metadata"] + ) diff --git a/swh/deposit/errors.py b/swh/deposit/errors.py index f41965dd..e0b7980e 100644 --- a/swh/deposit/errors.py +++ b/swh/deposit/errors.py @@ -1,150 +1,164 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of providing the standard sword errors """ from django.shortcuts import render from rest_framework import status FORBIDDEN = "forbidden" UNAUTHORIZED = "unauthorized" NOT_FOUND = "unknown" BAD_REQUEST = "bad-request" ERROR_CONTENT = "error-content" CHECKSUM_MISMATCH = "checksum-mismatch" MEDIATION_NOT_ALLOWED = "mediation-not-allowed" METHOD_NOT_ALLOWED = "method-not-allowed" MAX_UPLOAD_SIZE_EXCEEDED = "max_upload_size_exceeded" PARSING_ERROR = "parsing-error" class ParserError(ValueError): """Specific parsing error detected when parsing the xml metadata input """ pass ERRORS = { FORBIDDEN: { "status": status.HTTP_403_FORBIDDEN, "iri": "http://purl.org/net/sword/error/ErrorForbidden", "tag": "sword:ErrorForbidden", }, UNAUTHORIZED: { "status": status.HTTP_401_UNAUTHORIZED, "iri": "http://purl.org/net/sword/error/ErrorUnauthorized", "tag": "sword:ErrorUnauthorized", }, NOT_FOUND: { "status": status.HTTP_404_NOT_FOUND, "iri": "http://purl.org/net/sword/error/ErrorNotFound", "tag": "sword:ErrorNotFound", }, ERROR_CONTENT: { "status": status.HTTP_415_UNSUPPORTED_MEDIA_TYPE, "iri": "http://purl.org/net/sword/error/ErrorContent", "tag": "sword:ErrorContent", }, CHECKSUM_MISMATCH: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/ErrorChecksumMismatch", "tag": "sword:ErrorChecksumMismatch", }, BAD_REQUEST: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, PARSING_ERROR: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, MEDIATION_NOT_ALLOWED: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/MediationNotAllowed", "tag": "sword:MediationNotAllowed", }, METHOD_NOT_ALLOWED: { "status": status.HTTP_405_METHOD_NOT_ALLOWED, "iri": "http://purl.org/net/sword/error/MethodNotAllowed", "tag": "sword:MethodNotAllowed", }, MAX_UPLOAD_SIZE_EXCEEDED: { "status": status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, "iri": "http://purl.org/net/sword/error/MaxUploadSizeExceeded", "tag": "sword:MaxUploadSizeExceeded", }, } def make_error_dict(key, summary=None, 
verbose_description=None): """Utility function to factorize error message dictionary. Args: key (str): Error status key referenced in swh.deposit.errors module summary (str/None): Error message clarifying the status verbose_description (str/None): A more verbose description or work around a potential problem. Returns: Dictionary with key 'error' detailing the 'status' and associated 'message' """ return { "error": { "key": key, "summary": summary, "verboseDescription": verbose_description, }, } def make_error_response_from_dict(req, error): """Utility function to return an http response with error detail. Args: req (Request): original request error (dict): Error described as dict, typically generated from the make_error_dict function. Returns: HttpResponse with detailed error. """ error_information = ERRORS[error["key"]] context = error context.update(error_information) return render( req, "deposit/error.xml", context=error, content_type="application/xml", status=error_information["status"], ) def make_error_response(req, key, summary=None, verbose_description=None): """Utility function to create an http response with detailed error. Args: req (Request): original request key (str): Error status key referenced in swh.deposit.errors module summary (str): Error message clarifying the status verbose_description (str / None): A more verbose description or work around a potential problem. Returns: HttpResponse with the detailed error. """ error = make_error_dict(key, summary, verbose_description) return make_error_response_from_dict(req, error["error"]) + + +class BadRequestError(ValueError): + """Represents a bad input from the deposit client + + """ + + def __init__(self, summary, verbose_description): + self.key = BAD_REQUEST + self.summary = summary + self.verbose_description = verbose_description + + def to_dict(self): + return make_error_dict(self.key, self.summary, self.verbose_description) diff --git a/swh/deposit/tests/api/test_deposit_metadata.py b/swh/deposit/tests/api/test_deposit_metadata.py new file mode 100644 index 00000000..f9dcfe0f --- /dev/null +++ b/swh/deposit/tests/api/test_deposit_metadata.py @@ -0,0 +1,277 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from io import BytesIO + +import attr +from django.urls import reverse +import pytest +from rest_framework import status + +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_LOAD_SUCCESS, APIConfig +from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml +from swh.deposit.utils import compute_metadata_context +from swh.model.identifiers import SWHID, parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage.interface import PagedResult + + +def test_deposit_metadata_invalid( + authenticated_client, deposit_collection, atom_dataset +): + """Posting an invalid swhid reference returns a bad request to the client + + """ + invalid_swhid = "swh:1:dir :31b5c8cc985d190b5a7ef4878128ebfdc2358f49" + xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=invalid_swhid) + + response = authenticated_client.post( + reverse(COL_IRI, args=[deposit_collection.name]), + content_type="application/atom+xml;type=entry", + data=xml_data,
HTTP_SLUG="external-id", + ) + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Invalid SWHID reference" in response.content + + +def test_deposit_metadata_fails_functional_checks( + authenticated_client, deposit_collection, atom_dataset +): + """Posting functionally invalid metadata returns a bad request to the client + + """ + swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" + invalid_xml_data = atom_dataset[ + "entry-data-with-swhid-fail-metadata-functional-checks" + ].format(swhid=swhid) + + response = authenticated_client.post( + reverse(COL_IRI, args=[deposit_collection.name]), + content_type="application/atom+xml;type=entry", + data=invalid_xml_data, + HTTP_SLUG="external-id", + ) + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Functional metadata checks failure" in response.content + + +@pytest.mark.parametrize( + "swhid,target_type", + [ + ( + "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.CONTENT, + ), + ( + "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.DIRECTORY, + ), + ( + "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.REVISION, + ), + ( + "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.RELEASE, + ), + ( + "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.SNAPSHOT, + ), + ( + "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + MetadataTargetType.CONTENT, + ), + ( + "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa + MetadataTargetType.DIRECTORY, + ), + ( + "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + MetadataTargetType.REVISION, + ), + ( + "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + MetadataTargetType.RELEASE, + ), + ( + "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", + MetadataTargetType.SNAPSHOT, + ), + ], +) +def test_deposit_metadata_swhid( + swhid, + target_type, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, +): + """Posting a swhid reference stores the metadata in the raw extrinsic metadata storage + + """ + swhid_reference = parse_swhid(swhid) + swhid_core = attr.evolve(swhid_reference, metadata={}) + + xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid) + deposit_client = authenticated_client.deposit_client + + response = authenticated_client.post( + reverse(COL_IRI, args=[deposit_collection.name]), + content_type="application/atom+xml;type=entry", + data=xml_data, + HTTP_SLUG="external-id", + ) + + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(BytesIO(response.content)) + + # Ensure the deposit is finalized + deposit_id = int(response_content["deposit_id"]) + deposit = Deposit.objects.get(pk=deposit_id) + assert isinstance(swhid_core, SWHID) + assert deposit.swhid == str(swhid_core) + assert deposit.swhid_context == str(swhid_reference) + assert deposit.complete_date == deposit.reception_date + assert deposit.complete_date is not None + assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + + # Ensure metadata stored in the metadata storage is consistent + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit_client.provider_url,
metadata={"name": deposit_client.last_name}, + ) + + actual_authority = swh_storage.metadata_authority_get( + MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url + ) + assert actual_authority == metadata_authority + + config = APIConfig() + metadata_fetcher = MetadataFetcher( + name=config.tool["name"], + version=config.tool["version"], + metadata=config.tool["configuration"], + ) + + actual_fetcher = swh_storage.metadata_fetcher_get( + config.tool["name"], config.tool["version"] + ) + assert actual_fetcher == metadata_fetcher + + page_results = swh_storage.raw_extrinsic_metadata_get( + target_type, swhid_core, metadata_authority + ) + discovery_date = page_results.results[0].discovery_date + + assert len(page_results.results) == 1 + assert page_results.next_page_token is None + + object_type, metadata_context = compute_metadata_context(swhid_reference) + assert page_results == PagedResult( + results=[ + RawExtrinsicMetadata( + type=object_type, + target=swhid_core, + discovery_date=discovery_date, + authority=attr.evolve(metadata_authority, metadata=None), + fetcher=attr.evolve(metadata_fetcher, metadata=None), + format="sword-v2-atom-codemeta", + metadata=xml_data.encode(), + **metadata_context, + ) + ], + next_page_token=None, + ) + assert deposit.complete_date == discovery_date + + +@pytest.mark.parametrize( + "url", ["https://gitlab.org/user/repo", "https://whatever.else/repo",] +) +def test_deposit_metadata_origin( + url, authenticated_client, deposit_collection, atom_dataset, swh_storage, +): + """Posting an origin reference stores the metadata in the raw extrinsic metadata storage + + """ + xml_data = atom_dataset["entry-data-with-origin"].format(url=url) + deposit_client = authenticated_client.deposit_client + response = authenticated_client.post( + reverse(COL_IRI, args=[deposit_collection.name]), + content_type="application/atom+xml;type=entry", + data=xml_data, + HTTP_SLUG="external-id", + ) + + assert response.status_code == status.HTTP_201_CREATED + response_content = parse_xml(BytesIO(response.content)) + # Ensure the deposit is finalized + deposit_id = int(response_content["deposit_id"]) + deposit = Deposit.objects.get(pk=deposit_id) + # we got no swhid as input so those fields cannot be set + assert deposit.swhid is None + assert deposit.swhid_context is None + assert deposit.complete_date == deposit.reception_date + assert deposit.complete_date is not None + assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + + # Ensure metadata stored in the metadata storage is consistent + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit_client.provider_url, + metadata={"name": deposit_client.last_name}, + ) + + actual_authority = swh_storage.metadata_authority_get( + MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url + ) + assert actual_authority == metadata_authority + + config = APIConfig() + metadata_fetcher = MetadataFetcher( + name=config.tool["name"], + version=config.tool["version"], + metadata=config.tool["configuration"], + ) + + actual_fetcher = swh_storage.metadata_fetcher_get( + config.tool["name"], config.tool["version"] + ) + assert actual_fetcher == metadata_fetcher + + page_results = swh_storage.raw_extrinsic_metadata_get( + MetadataTargetType.ORIGIN, url, metadata_authority + ) + discovery_date = page_results.results[0].discovery_date + + assert len(page_results.results) == 1 + assert page_results.next_page_token is None + + assert page_results == PagedResult( + results=[ + RawExtrinsicMetadata(
type=MetadataTargetType.ORIGIN, + target=url, + discovery_date=discovery_date, + authority=attr.evolve(metadata_authority, metadata=None), + fetcher=attr.evolve(metadata_fetcher, metadata=None), + format="sword-v2-atom-codemeta", + metadata=xml_data.encode(), + ) + ], + next_page_token=None, + ) + assert deposit.complete_date == discovery_date diff --git a/swh/deposit/tests/api/test_parsers.py b/swh/deposit/tests/api/test_parsers.py index 374b2c5f..765584ff 100644 --- a/swh/deposit/tests/api/test_parsers.py +++ b/swh/deposit/tests/api/test_parsers.py @@ -1,249 +1,238 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import OrderedDict import io import pytest from swh.deposit.parsers import SWHXMLParser, parse_swh_reference, parse_xml from swh.model.exceptions import ValidationError from swh.model.identifiers import parse_swhid def test_parsing_without_duplicates(): xml_no_duplicate = io.BytesIO( b""" Awesome Compiler GPL3.0 https://opensource.org/licenses/GPL-3.0 Python3 author1 Inria ocaml http://issuetracker.com """ ) actual_result = SWHXMLParser().parse(xml_no_duplicate) expected_dict = OrderedDict( [ ("title", "Awesome Compiler"), ( "codemeta:license", OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), ), ("codemeta:runtimePlatform", "Python3"), ( "codemeta:author", OrderedDict( [("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")] ), ), ("codemeta:programmingLanguage", "ocaml"), ("codemeta:issueTracker", "http://issuetracker.com"), ] ) assert expected_dict == actual_result def test_parsing_with_duplicates(): xml_with_duplicates = io.BytesIO( b""" Another Compiler GNU/Linux GPL3.0 https://opensource.org/licenses/GPL-3.0 Un*x author1 Inria author2 Inria ocaml haskell spdx http://spdx.org python3 """ ) actual_result = SWHXMLParser().parse(xml_with_duplicates) expected_dict = OrderedDict( [ ("title", "Another Compiler"), ("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]), ( "codemeta:license", [ OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), OrderedDict( [("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")] ), ], ), ( "codemeta:author", [ OrderedDict( [ ("codemeta:name", "author1"), ("codemeta:affiliation", "Inria"), ] ), OrderedDict( [ ("codemeta:name", "author2"), ("codemeta:affiliation", "Inria"), ] ), ], ), ("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]), ] ) assert expected_dict == actual_result @pytest.fixture def xml_with_origin_reference(): xml_data = """ """ return xml_data.strip() def test_parse_swh_reference_origin(xml_with_origin_reference): url = "https://url" xml_data = xml_with_origin_reference.format(url=url) metadata = parse_xml(xml_data) actual_origin = parse_swh_reference(metadata) assert actual_origin == url @pytest.fixture def xml_with_empty_reference(): xml_data = """ {swh_reference} """ return xml_data.strip() @pytest.mark.parametrize( "xml_ref", [ "", "", "", """""", ], ) def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref): xml_body = xml_with_empty_reference.format(swh_reference=xml_ref) metadata = parse_xml(xml_body) assert parse_swh_reference(metadata) is None @pytest.fixture -def xml_with_swhid(): - xml_data = """ - - - - - - - - """ - return xml_data.strip() +def 
xml_with_swhid(atom_dataset): + return atom_dataset["entry-data-with-swhid"] @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_parse_swh_reference_swhid(swhid, xml_with_swhid): xml_data = xml_with_swhid.format(swhid=swhid) metadata = parse_xml(xml_data) actual_swhid = parse_swh_reference(metadata) assert actual_swhid is not None expected_swhid = parse_swhid(swhid) assert actual_swhid == expected_swhid @pytest.mark.parametrize( "invalid_swhid,error_msg", [ ("swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235", "Unexpected length"), ( "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa "visit qualifier should be a core SWHID with type", ), ( "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "anchor qualifier should be a core SWHID with type one of", ), # noqa ], ) def test_parse_swh_reference_invalid_swhid(invalid_swhid, error_msg, xml_with_swhid): """Unparsable swhid should raise """ xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid) metadata = parse_xml(xml_invalid_swhid) with pytest.raises(ValidationError, match=error_msg): parse_swh_reference(metadata) diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index da8d2e4b..cb2c92e3 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,441 +1,445 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from functools import partial import os import re from typing import Mapping from django.test.utils import setup_databases # type: ignore from django.urls import reverse import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import pytest from rest_framework import status from rest_framework.test import APIClient import yaml from swh.core.config import read from swh.core.pytest_plugin import get_response_cb from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, EDIT_SE_IRI, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from swh.scheduler import get_scheduler # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. 
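# Credentials for the test account created by the deposit_user fixture below;
# the authenticated_client fixture logs in with this account.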
TEST_USER = { "username": "test", "password": "password", "email": "test@example.org", "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": "test"}, } def pytest_configure(): setup_django_for("testing") @pytest.fixture def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with put/post methods """ cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.put(re.compile("https://"), body=cb) requests_mock_datadir.post(re.compile("https://"), body=cb) return requests_mock_datadir @pytest.fixture() def deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, "scheduler": {"cls": "local", **swh_scheduler_config,}, "storage_metadata": swh_storage_backend_config, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path): """Enforce config for deposit classes inherited from APIConfig.""" cfg = read(deposit_config_path) if "scheduler" in cfg: # scheduler setup: require the check-deposit and load-deposit tasks scheduler = get_scheduler(**cfg["scheduler"]) task_types = [ { "type": "check-deposit", "backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk", "description": "Check deposit metadata/archive before loading", "num_retries": 3, }, { "type": "load-deposit", "backend_name": "swh.loader.package.deposit.tasks.LoadDeposit", "description": "Loading deposit archive into swh archive", "num_retries": 3, }, ] for task_type in task_types: scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. 
""" os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory() deposit_another_collection = deposit_collection_factory("another-collection") @pytest.fixture def deposit_user(db, deposit_collection): """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient try: user = DepositClient._default_manager.get(username=TEST_USER["username"]) except DepositClient.DoesNotExist: user = DepositClient._default_manager.create_user( username=TEST_USER["username"], email=TEST_USER["email"], password=TEST_USER["password"], provider_url=TEST_USER["provider_url"], domain=TEST_USER["domain"], ) user.collections = [deposit_collection.id] user.save() return user @pytest.fixture def client(): """Override pytest-django one which does not work for djangorestframework. """ return APIClient() # <- drf's client -@pytest.yield_fixture +@pytest.fixture def authenticated_client(client, deposit_user): """Returned a logged client + This also patched the client instance to keep a reference on the associated + deposit_user. + """ _token = "%s:%s" % (deposit_user.username, TEST_USER["password"]) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) + client.deposit_client = deposit_user yield client client.logout() @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. 
Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def create_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) # then assert response.status_code == status.HTTP_201_CREATED from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(external_id=external_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. """ deposit = create_deposit( authenticated_client, collection_name, sample_archive, external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL, ) response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"), HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-partial", 
deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10" snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0" deposit.swhid = swhid(DIRECTORY, directory_id) deposit.swhid_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin, "visit": swhid(SNAPSHOT, snapshot_id), "anchor": swhid(REVISION, revision_id), "path": "/", }, ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/data/atom/entry-data-with-origin.xml b/swh/deposit/tests/data/atom/entry-data-with-origin.xml new file mode 100644 index 00000000..0cc06a8b --- /dev/null +++ b/swh/deposit/tests/data/atom/entry-data-with-origin.xml @@ -0,0 +1,13 @@ + + + Awesome Compiler + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + dudess + + + + + + diff --git a/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml b/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml new file mode 100644 index 00000000..dab5b1f8 --- /dev/null +++ b/swh/deposit/tests/data/atom/entry-data-with-swhid-fail-metadata-functional-checks.xml @@ -0,0 +1,12 @@ + + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2017-10-07T15:17:08Z + + + + + + diff --git a/swh/deposit/tests/data/atom/entry-data-with-swhid.xml b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml new file mode 100644 index 00000000..34a59474 --- /dev/null +++ b/swh/deposit/tests/data/atom/entry-data-with-swhid.xml @@ -0,0 +1,13 @@ + + + Awesome Compiler + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + dudess + + + + + + diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py index 8be41c4c..430e5790 100644 --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,141 +1,200 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Union from unittest.mock import patch import pytest from swh.deposit import utils +from swh.model.identifiers import SWHID, parse_swhid +from swh.model.model import MetadataTargetType def test_merge(): """Calling utils.merge on dicts should merge 
without losing information """ d0 = {"author": "someone", "license": [["gpl2"]], "a": 1} d1 = { "author": ["author0", {"name": "author1"}], "license": [["gpl3"]], "b": {"1": "2"}, } d2 = {"author": map(lambda x: x, ["else"]), "license": "mit", "b": {"2": "3",}} d3 = { "author": (v for v in ["no one"]), } actual_merge = utils.merge(d0, d1, d2, d3) expected_merge = { "a": 1, "license": [["gpl2"], ["gpl3"], "mit"], "author": ["someone", "author0", {"name": "author1"}, "else", "no one"], "b": {"1": "2", "2": "3",}, } assert actual_merge == expected_merge def test_merge_2(): d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}} d1 = {"license": "gpl3", "runtime": "GNU/Linux"} expected = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } actual = utils.merge(d0, d1) assert actual == expected def test_merge_edge_cases(): input_dict = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } # against empty dict actual = utils.merge(input_dict, {}) assert actual == input_dict # against oneself actual = utils.merge(input_dict, input_dict, input_dict) assert actual == input_dict def test_merge_one_dict(): """Merge one dict should result in the same dict value """ input_and_expected = {"anything": "really"} actual = utils.merge(input_and_expected) assert actual == input_and_expected def test_merge_raise(): """Calling utils.merge with any no dict argument should raise """ d0 = {"author": "someone", "a": 1} d1 = ["not a dict"] with pytest.raises(ValueError): utils.merge(d0, d1) with pytest.raises(ValueError): utils.merge(d1, d0) with pytest.raises(ValueError): utils.merge(d1) assert utils.merge(d0) == d0 @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_0(mock_normalize): """When date is a list, choose the first date and normalize it Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date(["2017-10-12", "date1"]) expected_date = "2017-10-12 00:00:00+00:00" assert str(actual_date) == expected_date @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_1(mock_normalize): """Providing a date in a reasonable format, everything is fine Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date("2018-06-11 17:02:02") expected_date = "2018-06-11 17:02:02+00:00" assert str(actual_date) == expected_date @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_doing_irrelevant_stuff(mock_normalize): """Providing a date with only the year results in a reasonable date Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date("2017") expected_date = "2017-01-01 00:00:00+00:00" assert str(actual_date) == expected_date + + +@pytest.mark.parametrize( + "swhid_or_origin,expected_type,expected_metadata_context", + [ + ("https://something", MetadataTargetType.ORIGIN, {"origin": None}), + ( + "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", + MetadataTargetType.CONTENT, + {"origin": None}, + ), + ( + "swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah", + MetadataTargetType.SNAPSHOT, + {"origin": "http://blah", "path": None}, + ), + ( + "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path", + MetadataTargetType.DIRECTORY, + {"origin": None, "path": b"/path"}, + ), + ( + 
"swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa + MetadataTargetType.REVISION, + { + "origin": None, + "path": None, + "snapshot": parse_swhid( + "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" + ), + }, + ), + ( + "swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa + MetadataTargetType.RELEASE, + { + "origin": None, + "path": None, + "directory": parse_swhid( + "swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49" + ), + }, + ), + ], +) +def test_compute_metadata_context( + swhid_or_origin: Union[str, SWHID], expected_type, expected_metadata_context +): + if expected_type != MetadataTargetType.ORIGIN: + assert isinstance(swhid_or_origin, str) + swhid_or_origin = parse_swhid(swhid_or_origin) + + object_type, metadata_context = utils.compute_metadata_context(swhid_or_origin) + + assert object_type == expected_type + assert metadata_context == expected_metadata_context diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py index 3b79293e..e306902a 100644 --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -1,83 +1,119 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from types import GeneratorType +from typing import Any, Dict, Tuple, Union import iso8601 -from swh.model.identifiers import normalize_timestamp +from swh.model.identifiers import SWHID, normalize_timestamp, parse_swhid +from swh.model.model import MetadataTargetType def merge(*dicts): """Given an iterator of dicts, merge them losing no information. Args: *dicts: arguments are all supposed to be dict to merge into one Returns: dict merged without losing information """ def _extend(existing_val, value): """Given an existing value and a value (as potential lists), merge them together without repetition. """ if isinstance(value, (list, map, GeneratorType)): vals = value else: vals = [value] for v in vals: if v in existing_val: continue existing_val.append(v) return existing_val d = {} for data in dicts: if not isinstance(data, dict): raise ValueError("dicts is supposed to be a variable arguments of dict") for key, value in data.items(): existing_val = d.get(key) if not existing_val: d[key] = value continue if isinstance(existing_val, (list, map, GeneratorType)): new_val = _extend(existing_val, value) elif isinstance(existing_val, dict): if isinstance(value, dict): new_val = merge(existing_val, value) else: new_val = _extend([existing_val], value) else: new_val = _extend([existing_val], value) d[key] = new_val return d def normalize_date(date): """Normalize date fields as expected by swh workers. If date is a list, elect arbitrarily the first element of that list If date is (then) a string, parse it through dateutil.parser.parse to extract a datetime. Then normalize it through swh.model.identifiers.normalize_timestamp. Returns The swh date object """ if isinstance(date, list): date = date[0] if isinstance(date, str): date = iso8601.parse_date(date) return normalize_timestamp(date) + + +def compute_metadata_context( + swhid_reference: Union[SWHID, str] +) -> Tuple[MetadataTargetType, Dict[str, Any]]: + """Given a SWHID object, determine the context as a dict. 
+
+    The reference may also be a plain origin URL; in that case the target
+    type is ORIGIN and the default context is returned unchanged.
+
+    The parse_swhid calls within are not expected to raise: the qualifier
+    values should have been validated earlier in the processing chain.
+
+    Returns:
+        A tuple (metadata target type, context dict) suitable for building
+        a RawExtrinsicMetadata object.
+
+    """
+    # default context, used when the reference carries no qualifiers
+    metadata_context: Dict[str, Any] = {"origin": None}
+    if isinstance(swhid_reference, SWHID):
+        object_type = MetadataTargetType(swhid_reference.object_type)
+        assert object_type != MetadataTargetType.ORIGIN
+
+        if swhid_reference.metadata:
+            path = swhid_reference.metadata.get("path")
+            metadata_context = {
+                "origin": swhid_reference.metadata.get("origin"),
+                "path": path.encode() if path else None,
+            }
+            snapshot = swhid_reference.metadata.get("visit")
+            if snapshot:
+                metadata_context["snapshot"] = parse_swhid(snapshot)
+
+            anchor = swhid_reference.metadata.get("anchor")
+            if anchor:
+                anchor_swhid = parse_swhid(anchor)
+                # the context key is named after the anchor's object type
+                # (e.g. "revision", "release", "directory")
+                metadata_context[anchor_swhid.object_type] = anchor_swhid
+    else:
+        object_type = MetadataTargetType.ORIGIN
+
+    return object_type, metadata_context
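For reference, a minimal sketch of how the new compute_metadata_context helper
behaves (illustrative values; assumes the swh.model API used elsewhere in this
diff):

    from swh.deposit.utils import compute_metadata_context
    from swh.model.identifiers import parse_swhid
    from swh.model.model import MetadataTargetType

    # a directory SWHID qualified with an origin and a path (hypothetical values)
    swhid = parse_swhid(
        "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0"
        ";origin=https://example.org/repo;path=/src"
    )
    object_type, context = compute_metadata_context(swhid)
    assert object_type == MetadataTargetType.DIRECTORY
    assert context == {"origin": "https://example.org/repo", "path": b"/src"}

    # a plain origin URL maps to the ORIGIN target type with the default context
    object_type, context = compute_metadata_context("https://example.org/repo")
    assert object_type == MetadataTargetType.ORIGIN
    assert context == {"origin": None}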