def parse_requirements(*names):
    """Collect requirement specifiers from the requirements files of ``names``.

    Each name maps to the file ``requirements-<name>.txt``; a falsy name
    (``None`` or ``""``) maps to ``requirements.txt``. Paths are resolved
    relative to the current working directory.

    Args:
        *names: requirement set names, processed in the given order

    Returns:
        list of the stripped, non-empty, non-comment lines of every file that
        exists, in file order. A missing file is skipped and does not prevent
        the files of the following names from being read.

    """
    requirements = []
    for name in names:
        reqf = "requirements-%s.txt" % name if name else "requirements.txt"

        if not path.exists(reqf):
            # Keep going: a later name may still have a requirements file.
            continue

        with open(reqf) as f:
            for raw_line in f:
                line = raw_line.strip()
                # Blank lines and "#" comment lines carry no requirement.
                if line and not line.startswith("#"):
                    requirements.append(line)
    return requirements
}, include_package_data=True, entry_points=""" [console_scripts] swh-deposit=swh.deposit.cli:main [swh.cli.subcommands] deposit=swh.deposit.cli:deposit [swh.workers] deposit.worker=swh.deposit.loader:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-deposit", "Documentation": "https://docs.softwareheritage.org/devel/swh-deposit/", }, ) diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 24b22a10..a2873350 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,962 +1,960 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from abc import ABCMeta, abstractmethod import hashlib - from typing import Sequence, Type -from abc import ABCMeta, abstractmethod -from django.urls import reverse from django.http import HttpResponse from django.shortcuts import render +from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.views import APIView from swh.model import hashutil from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( - APIConfig, + ARCHIVE_KEY, + ARCHIVE_TYPE, + CONT_FILE_IRI, + DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, - CONT_FILE_IRI, - 
ARCHIVE_KEY, METADATA_KEY, + METADATA_TYPE, RAW_METADATA_KEY, STATE_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_LOAD_SUCCESS, - ARCHIVE_TYPE, - METADATA_TYPE, + APIConfig, ) from ..errors import ( - MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, - ERROR_CONTENT, CHECKSUM_MISMATCH, - make_error_dict, - MEDIATION_NOT_ALLOWED, - make_error_response_from_dict, + ERROR_CONTENT, FORBIDDEN, - NOT_FOUND, - make_error_response, + MAX_UPLOAD_SIZE_EXCEEDED, + MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, - ParserError, + NOT_FOUND, PARSING_ERROR, + ParserError, + make_error_dict, + make_error_response, + make_error_response_from_dict, ) -from ..models import Deposit, DepositRequest, DepositCollection, DepositClient +from ..models import Deposit, DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml - ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ def _read_headers(self, request): """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). 
Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, } def _compute_md5(self, filehandler): """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( self, request, deposit_id=None, in_progress=False, external_id=None ): """Save/Update a deposit in db. Args: deposit_id (int): deposit identifier in_progress (dict): The deposit's status external_id (str): The external identifier to associate to the deposit Returns: The Deposit instance saved or updated. 
""" if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: complete_date = None status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: # find a deposit parent (same external id, status load # to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: deposit_parent = None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, deposit, deposit_request_data, replace_metadata=False, replace_archives=False, ): """Save a deposit request with metadata attached to a deposit. 
def _delete_archives(self, collection_name, deposit_id):
    """Delete the archive deposit requests attached to a deposit.

    Args:
        collection_name (str): name of the collection the request targets
        deposit_id (id): The deposit whose archives are deleted

    Returns
        Empty dict when ok.
        Otherwise the dictionary built by make_error_dict: NOT_FOUND when
        the deposit does not exist, BAD_REQUEST when the deposit belongs to
        another collection than collection_name.

    """
    try:
        deposit = Deposit.objects.get(pk=deposit_id)
    except Deposit.DoesNotExist:
        return make_error_dict(
            NOT_FOUND, "The deposit %s does not exist" % deposit_id
        )

    # Same ownership guard as _delete_deposit: without it, archives of a
    # deposit living in another collection could be removed through this
    # collection's endpoint.
    if deposit.collection.name != collection_name:
        summary = "Cannot delete archives of a deposit from another collection"
        description = "Deposit %s does not belong to the collection %s" % (
            deposit_id,
            collection_name,
        )
        return make_error_dict(
            BAD_REQUEST, summary=summary, verbose_description=description
        )

    DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()

    return {}
""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "The deposit %s does not exist" % deposit_id ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_preconditions_on(self, filehandler, md5sum, content_length=None): """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check md5sum (hex str): md5 hash expected from the file's content content_length (int): the expected length if provided. Returns: Either none if no error or a dictionary with a key error detailing the problem. """ if content_length: if content_length > self.config["max_upload_size"]: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, "Upload size limit exceeded (max %s bytes)." % self.config["max_upload_size"], "Please consider sending the archive in " "multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", "The checksum sent %s and the actual checksum " "%s does not match." % (hashutil.hash_to_hex(md5sum), hashutil.hash_to_hex(_md5sum)), ) return None def _binary_upload( self, request, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False, ): """Binary upload routine. Other than such a request, a 415 response is returned. 
Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the " "CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the " "CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, "Only packaging %s is supported" % ACCEPT_PACKAGINGS, "The packaging provided %s is not supported" % packaging, ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, 
def _multipart_upload(
    self,
    request,
    headers,
    collection_name,
    deposit_id=None,
    replace_metadata=False,
    replace_archives=False,
):
    """Multipart upload supported with exactly:

    - 1 archive (application/zip or application/x-tar)
    - 1 atom entry (application/atom+xml)

    Any other combination of parts is rejected with an error dictionary
    before anything is stored.

    Args:
        request (Request): the request holding information to parse
            and inject in db
        headers (dict): request headers formatted
        collection_name (str): the associated client
        deposit_id (id): deposit identifier if provided
        replace_metadata (bool): 'Update or add' request to existing
          deposit. If False (default), this adds new metadata request to
          existing ones. Otherwise, this will replace existing metadata.
        replace_archives (bool): 'Update or add' request to existing
          deposit. If False (default), this adds new archive request to
          existing ones. Otherwise, this will replace existing archives.

    Returns:
        In the optimal case a dict with the following keys:

            - deposit_id: identifier of the stored deposit
            - deposit_date: reception date of the deposit
            - archive: name of the uploaded archive file
            - status: status of the deposit

        Otherwise, an error dictionary, either:

            - built with ERROR_CONTENT when the parts are not exactly one
              archive and one atom entry
            - returned by _check_preconditions_on when the md5 hash
              provided mismatches the archive
            - built with PARSING_ERROR when the atom entry is malformed

    """
    external_id = headers["slug"]

    parts = {
        "application/zip": None,  # expected either zip
        "application/x-tar": None,  # or x-tar
        "application/atom+xml": None,
    }
    content_types_present = set()
    for fh in request.FILES.values():
        if fh.content_type in content_types_present:
            return make_error_dict(
                ERROR_CONTENT,
                "Only 1 application/zip (or application/x-tar) archive "
                "and 1 atom+xml entry is supported (as per sword2.0 "
                "specification)",
                "You provided more than 1 application/(zip|x-tar) "
                "or more than 1 application/atom+xml content-disposition "
                "header in the multipart deposit",
            )

        content_types_present.add(fh.content_type)
        parts[fh.content_type] = fh

    filehandler = parts["application/zip"]
    if not filehandler:
        filehandler = parts["application/x-tar"]
    atom_entry = parts["application/atom+xml"]

    # Counting distinct content types is not enough: zip + x-tar, or an
    # unsupported type paired with a supported one, also gives two types and
    # would leave the archive or the atom entry missing further down.
    if (
        len(content_types_present) != 2
        or filehandler is None
        or atom_entry is None
    ):
        return make_error_dict(
            ERROR_CONTENT,
            "You must provide both 1 application/zip (or "
            "application/x-tar) and 1 atom+xml entry for multipart "
            "deposit",
            "You need to provide only 1 application/(zip|x-tar) "
            "and 1 application/atom+xml content-disposition header "
            "in the multipart deposit",
        )

    precondition_status_response = self._check_preconditions_on(
        filehandler, headers["content-md5sum"]
    )

    if precondition_status_response:
        return precondition_status_response

    try:
        raw_metadata, metadata = self._read_metadata(atom_entry)
    except ParserError:
        return make_error_dict(
            PARSING_ERROR,
            "Malformed xml metadata",
            "The xml received is malformed. "
            "Please ensure your metadata file is correctly formatted.",
        )

    # actual storage of data
    deposit = self._deposit_put(
        request,
        deposit_id=deposit_id,
        in_progress=headers["in-progress"],
        external_id=external_id,
    )
    deposit_request_data = {
        ARCHIVE_KEY: filehandler,
        METADATA_KEY: metadata,
        RAW_METADATA_KEY: raw_metadata,
    }
    self._deposit_request_put(
        deposit, deposit_request_data, replace_metadata, replace_archives
    )

    return {
        "deposit_id": deposit.id,
        "deposit_date": deposit.reception_date,
        "archive": filehandler.name,
        "status": deposit.status,
    }
def _empty_post(self, request, headers, collection_name, deposit_id):
    """Finalize an existing deposit through a POST without content.

    The deposit is loaded by primary key, stamped with the current time as
    completion date, switched to DEPOSIT_STATUS_DEPOSITED and saved.

    Args:
        request (Request): the request holding information to parse
            and inject in db (not read here)
        headers (dict): request headers formatted (not read here)
        collection_name (str): the associated client (not read here)
        deposit_id (id): deposit identifier

    Returns:
        Dictionary with the keys deposit_id, deposit_date (the completion
        date just set), status and archive (always None).

    """
    deposit = Deposit.objects.get(pk=deposit_id)
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.complete_date = timezone.now()
    deposit.save()

    receipt = dict(
        deposit_id=deposit_id,
        deposit_date=deposit.complete_date,
        status=deposit.status,
        archive=None,
    )
    return receipt
def restrict_access(self, request, deposit=None):
    """Refuse non-GET requests on a deposit that is not in partial status.

    Args:
        request (Request): the incoming request; only its method is read
        deposit (Deposit): the targeted deposit, if any

    Returns:
        None when there is no deposit, when the request method is GET or
        when the deposit status is DEPOSIT_STATUS_PARTIAL. Otherwise the
        BAD_REQUEST dictionary built by make_error_dict.

    """
    if not deposit:
        return None

    if request.method == "GET" or deposit.status == DEPOSIT_STATUS_PARTIAL:
        return None

    return make_error_dict(
        BAD_REQUEST,
        summary="You can only act on deposit with status '%s'"
        % (DEPOSIT_STATUS_PARTIAL,),
        verbose_description="This deposit has status '%s'" % deposit.status,
    )
class APIPost(APIBase, metaclass=ABCMeta):
    """Mixin for class to support POST method.

    """

    def post(self, request, collection_name, deposit_id=None, format=None):
        """Endpoint to create/add resources to deposit.

        Returns:
            The deposit receipt (deposit/deposit_receipt.xml) rendered with
            the status code returned by process_post and a Location header
            set to the IRI selected by process_post, when no error occurred.
            The response built by make_error_response_from_dict when either
            the checks or process_post report an error.

        """
        checks = self.checks(request, collection_name, deposit_id)
        if "error" in checks:
            return make_error_response_from_dict(request, checks["error"])

        headers = checks["headers"]
        _status, _iri_key, data = self.process_post(
            request, headers, collection_name, deposit_id
        )

        error = data.get("error")
        if error:
            return make_error_response_from_dict(request, error)

        # Enrich the processing result with what the receipt template needs.
        data["packagings"] = ACCEPT_PACKAGINGS
        iris = self._make_iris(request, collection_name, data["deposit_id"])
        data.update(iris)

        response = render(
            request,
            "deposit/deposit_receipt.xml",
            context=data,
            content_type="application/xml",
            status=_status,
        )
        # Set the header through the response's public item assignment
        # rather than its private ``_headers`` mapping, which is an
        # implementation detail that newer Django releases no longer have.
        response["Location"] = data[_iri_key]
        return response

    @abstractmethod
    def process_post(self, request, headers, collection_name, deposit_id=None):
        """Routine to deal with the deposit's processing.

        Returns
            Tuple of:
            - response status code (200, 201, etc...)
            - key iri (EM_IRI, EDIT_SE_IRI, etc...)
            - dictionary of the processing result

        """
        pass
class APIDelete(APIBase, metaclass=ABCMeta):
    """Mixin adding DELETE support to a deposit endpoint.

    """

    def delete(self, request, collection_name, deposit_id):
        """Endpoint to delete some deposit's resources (archives, deposit).

        Runs the common checks, then delegates to process_delete.

        Returns:
            A 204 (no content) response when neither the checks nor
            process_delete report an error; otherwise the response built by
            make_error_response_from_dict from the reported error.

        """
        precheck = self.checks(request, collection_name, deposit_id)
        if "error" in precheck:
            return make_error_response_from_dict(request, precheck["error"])

        outcome = self.process_delete(request, collection_name, deposit_id)
        failure = outcome.get("error")
        if failure:
            return make_error_response_from_dict(request, failure)

        return HttpResponse(status=status.HTTP_204_NO_CONTENT)

    @abstractmethod
    def process_delete(self, request, collection_name, deposit_id):
        """Routine to delete a resource.

        Implementations return a dictionary; a truthy "error" entry is
        turned into an error response by delete.

        This is mostly not allowed except for the EM_IRI (cf.
        .api.deposit_update.APIUpdateArchive)

        """
def process_post(self, req, headers, collection_name, deposit_id=None):
    """Create a first deposit, dispatching on the request's content type:

    - archive deposit: content type listed in ACCEPT_ARCHIVE_CONTENT_TYPES
    - multipart deposit: content type starting with "multipart/"
    - atom entry deposit: any other content type

    Args:
        req (Request): the request holding the information to parse
            and inject in db
        headers (dict): request headers formatted
        collection_name (str): the associated client
        deposit_id: must be None, this endpoint only creates deposits

    Returns:
        Tuple of the 201 (created) status code, EDIT_SE_IRI and the
        dictionary returned by the selected upload routine (which is that
        routine's error dictionary when the upload is refused).

    """
    assert deposit_id is None

    content_type = req.content_type
    if content_type in ACCEPT_ARCHIVE_CONTENT_TYPES:
        upload = self._binary_upload
    elif content_type.startswith("multipart/"):
        upload = self._multipart_upload
    else:
        upload = self._atom_entry

    data = upload(req, headers, collection_name)
    return status.HTTP_201_CREATED, EDIT_SE_IRI, data
class APIContent(APIBase):
    """Deposit content view: renders a deposit's status together with its
    deposit requests.

    HTTP verbs implemented here: GET

    """

    def get(self, req, collection_name, deposit_id, format=None):
        """Render deposit/content.xml for the given deposit.

        Returns:
            A 200 response rendered from deposit/content.xml; the error
            response built from the checks when they fail; a NOT_FOUND
            error response when the deposit does not exist or is not part
            of the collection.

        """
        checks = self.checks(req, collection_name, deposit_id)
        if "error" in checks:
            return make_error_response_from_dict(req, checks["error"])

        try:
            deposit = Deposit.objects.get(pk=deposit_id)
        except Deposit.DoesNotExist:
            deposit = None

        # An unknown deposit and a deposit of another collection are
        # reported the same way.
        if deposit is None or deposit.collection.name != collection_name:
            return make_error_response(
                req,
                NOT_FOUND,
                "deposit %s does not belong to collection %s"
                % (deposit_id, collection_name),
            )

        deposit_requests = DepositRequest.objects.filter(deposit=deposit)
        current_status = deposit.status
        return render(
            req,
            "deposit/content.xml",
            context={
                "deposit_id": deposit.id,
                "status": current_status,
                "status_detail": DEPOSIT_STATUS_DETAIL[current_status],
                "requests": deposit_requests,
            },
            content_type="application/xml",
            status=status.HTTP_200_OK,
        )
class APIStatus(APIBase):
    """Deposit status.

    What's known as 'State IRI' in the sword specification.

    HTTP verbs supported: GET

    """

    def get(self, req, collection_name, deposit_id, format=None):
        """Render deposit/status.xml for the given deposit.

        The status detail is the converted ``deposit.status_detail`` when
        that conversion yields something truthy, else the generic
        DEPOSIT_STATUS_DETAIL entry of the deposit's status.

        Returns:
            A 200 response rendered from deposit/status.xml; the error
            response built from the checks when they fail; a NOT_FOUND
            error response when the deposit does not exist or is not part
            of the collection.

        """
        checks = self.checks(req, collection_name, deposit_id)
        if "error" in checks:
            return make_error_response_from_dict(req, checks["error"])

        try:
            deposit = Deposit.objects.get(pk=deposit_id)
        except Deposit.DoesNotExist:
            deposit = None

        # An unknown deposit and a deposit of another collection are
        # reported the same way.
        if deposit is None or deposit.collection.name != collection_name:
            return make_error_response(
                req,
                NOT_FOUND,
                "deposit %s does not belong to collection %s"
                % (deposit_id, collection_name),
            )

        status_detail = (
            convert_status_detail(deposit.status_detail)
            or DEPOSIT_STATUS_DETAIL[deposit.status]
        )

        context = {
            "deposit_id": deposit.id,
            "status_detail": status_detail,
        }
        # Attributes missing on the deposit are exposed as None.
        context.update(
            {
                field: getattr(deposit, field, None)
                for field in ("status", "swh_id", "swh_id_context", "external_id")
            }
        )

        return render(
            req,
            "deposit/status.xml",
            context=context,
            content_type="application/xml",
            status=status.HTTP_200_OK,
        )
from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import make_error_dict, BAD_REQUEST -from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser -from ..parsers import SWHAtomEntryParser -from ..parsers import SWHMultiPartParser +from ..errors import BAD_REQUEST, make_error_dict +from ..parsers import ( + SWHAtomEntryParser, + SWHFileUploadTarParser, + SWHFileUploadZipParser, + SWHMultiPartParser, +) +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) def process_post(self, req, headers, collection_name, deposit_id): """Add new content to the existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return "unused", "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) def process_delete(self, req, collection_name, deposit_id): """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing deposit's metadata/archive with new ones. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa Returns: 204 No content """ if req.content_type.startswith("multipart/"): return self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) return self._atom_entry( req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True ) def process_post(self, req, headers, collection_name, deposit_id): """Add new metadata/archive to existing deposit. 
source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa This also deals with an empty post corner case to finalize a deposit. Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ if req.content_type.startswith("multipart/"): return ( status.HTTP_201_CREATED, EM_IRI, self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id ), ) # check for final empty post # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html # #continueddeposit_complete if headers["content-length"] == 0 and headers["in-progress"] is False: data = self._empty_post(req, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) return ( status.HTTP_201_CREATED, EM_IRI, self._atom_entry(req, headers, collection_name, deposit_id=deposit_id), ) def process_delete(self, req, collection_name, deposit_id): """Delete the container (deposit). 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py index 3a18be5e..e9b98ee3 100644 --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -1,109 +1,108 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.deposit import utils - -from ...config import METADATA_TYPE, APIConfig -from ...models import DepositRequest, Deposit - from rest_framework.permissions import AllowAny +from swh.deposit import utils from swh.deposit.api.common import AuthenticatedAPIView -from swh.deposit.errors import make_error_dict, NOT_FOUND +from swh.deposit.errors import NOT_FOUND, make_error_dict + +from ...config import METADATA_TYPE, APIConfig +from ...models import Deposit, DepositRequest class DepositReadMixin: """Deposit Read mixin """ def _deposit_requests(self, deposit, request_type): """Given a deposit, yields its associated deposit_request Args: deposit (Deposit): Deposit to list requests for request_type (str): 'archive' or 'metadata' Yields: deposit requests of type request_type associated to the deposit """ if isinstance(deposit, int): deposit = Deposit.objects.get(pk=deposit) deposit_requests = DepositRequest.objects.filter( type=request_type, deposit=deposit ).order_by("id") for deposit_request in deposit_requests: yield deposit_request def _metadata_get(self, deposit): """Given a deposit, aggregate all metadata requests. Args: deposit (Deposit): The deposit instance to extract metadata from. Returns: metadata dict from the deposit. 
""" metadata = ( m.metadata for m in self._deposit_requests(deposit, request_type=METADATA_TYPE) ) return utils.merge(*metadata) class APIPrivateView(APIConfig, AuthenticatedAPIView): """Mixin intended as private api (so no authentication) based API view (for the private ones). """ authentication_classes = () permission_classes = (AllowAny,) def checks(self, req, collection_name, deposit_id=None): """Override default checks implementation to allow empty collection. """ if deposit_id: try: Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "Deposit with id %s does not exist" % deposit_id ) headers = self._read_headers(req) checks = self.additional_checks(req, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def get( self, request, collection_name=None, deposit_id=None, format=None, *args, **kwargs, ): return super().get(request, collection_name, deposit_id, format) def put( self, request, collection_name=None, deposit_id=None, format=None, *args, **kwargs, ): return super().put(request, collection_name, deposit_id, format) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index dc9a8307..680ec83c 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,230 +1,228 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from itertools import chain import json import re +from shutil import get_unpack_formats import tarfile import zipfile -from itertools import chain -from shutil import get_unpack_formats - from rest_framework import status from swh.scheduler.utils import create_oneshot_task_dict -from . 
import DepositReadMixin, APIPrivateView -from ..common import APIGet -from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED -from ...config import ARCHIVE_TYPE +from . import APIPrivateView, DepositReadMixin +from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED from ...models import Deposit +from ..common import APIGet MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) class APIChecks(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def _check_deposit_archives(self, deposit): """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns tuple (status, error_detail): True, None if all archives are ok, (False, ) otherwise. 
""" requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} def _check_archive(self, archive_request): """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. """ archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as f: files = f.namelist() elif tarfile.is_tarfile(archive_path): with tarfile.open(archive_path) as f: files = f.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None def _check_metadata(self, metadata): """Check to execute on all metadata for mandatory field presence. Args: metadata (dict): Metadata dictionary to check for mandatory fields Returns: tuple (status, error_detail): True, None if metadata are ok (False, ) otherwise. 
""" required_fields = { "author": False, } alternate_fields = { ("name", "title"): False, # alternate field, at least one # of them must be present } for field, value in metadata.items(): for name in required_fields: if name in field: required_fields[name] = True for possible_names in alternate_fields: for possible_name in possible_names: if possible_name in field: alternate_fields[possible_names] = True continue mandatory_result = [k for k, v in required_fields.items() if not v] optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v] if mandatory_result == [] and optional_result == []: return True, None detail = [] if mandatory_result != []: detail.append( {"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} ) if optional_result != []: detail.append( {"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} ) return False, {"metadata": detail} def process_get(self, req, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. 
Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ deposit = Deposit.objects.get(pk=deposit_id) metadata = self._metadata_get(deposit) problems = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status, error_detail = self._check_deposit_archives(deposit) if not archives_status: problems.update(error_detail) metadata_status, error_detail = self._check_metadata(metadata) if not metadata_status: problems.update(error_detail) deposit_status = archives_status and metadata_status # if any problems arose, the deposit is rejected if not deposit_status: deposit.status = DEPOSIT_STATUS_REJECTED deposit.status_detail = problems response = { "status": deposit.status, "details": deposit.status_detail, } else: deposit.status = DEPOSIT_STATUS_VERIFIED response = { "status": deposit.status, } if not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() return status.HTTP_200_OK, json.dumps(response), "application/json" diff --git a/swh/deposit/api/private/deposit_list.py b/swh/deposit/api/private/deposit_list.py index 43089a8b..a5c81c12 100644 --- a/swh/deposit/api/private/deposit_list.py +++ b/swh/deposit/api/private/deposit_list.py @@ -1,66 +1,66 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from rest_framework import serializers from rest_framework.fields import _UnvalidatedField from rest_framework.generics import ListAPIView from rest_framework.pagination 
import PageNumberPagination -from rest_framework import serializers from . import APIPrivateView -from ..converters import convert_status_detail from ...models import Deposit +from ..converters import convert_status_detail class DefaultPagination(PageNumberPagination): page_size = 100 page_size_query_param = "page_size" class StatusDetailField(_UnvalidatedField): """status_detail field is a dict, we want a simple message instead. So, we reuse the convert_status_detail from deposit_status endpoint to that effect. """ def to_representation(self, value): return convert_status_detail(value) class DepositSerializer(serializers.ModelSerializer): status_detail = StatusDetailField() class Meta: model = Deposit fields = "__all__" class APIList(ListAPIView, APIPrivateView): """Deposit request class to list the deposit's status per page. HTTP verbs supported: GET """ serializer_class = DepositSerializer pagination_class = DefaultPagination def get_queryset(self): params = self.request.query_params exclude_like = params.get("exclude") if exclude_like: # sql injection: A priori, nothing to worry about, django does it for # queryset # https://docs.djangoproject.com/en/3.0/topics/security/#sql-injection-protection # noqa # https://docs.djangoproject.com/en/2.2/topics/security/#sql-injection-protection # noqa deposits = ( Deposit.objects.all() .exclude(external_id__startswith=exclude_like) .order_by("id") ) else: deposits = Deposit.objects.all().order_by("id") return deposits diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index 84d1715e..f67f1306 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,192 +1,192 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from contextlib import 
contextmanager import json import os import shutil import tempfile -from contextlib import contextmanager from django.http import FileResponse from rest_framework import status from swh.core import tarball -from swh.model import identifiers from swh.deposit.utils import normalize_date +from swh.model import identifiers -from . import DepositReadMixin, APIPrivateView -from ...config import SWH_PERSON, ARCHIVE_TYPE -from ..common import APIGet +from . import APIPrivateView, DepositReadMixin +from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit +from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. 
""" def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get(self, request, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] with aggregate_tarballs(self.extraction_dir, archive_paths) as path: return FileResponse( open(path, "rb"), status=status.HTTP_200_OK, content_type="application/zip", ) class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. """ def __init__(self): super().__init__() self.provider = self.config["provider"] self.tool = self.config["tool"] def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. 
""" metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} # metadata provider self.provider["provider_name"] = deposit.client.last_name self.provider["provider_url"] = deposit.client.provider_url author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id swhid = identifiers.parse_swhid(swh_persistent_id) parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { "provider": self.provider, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data def process_get(self, request, collection_name, deposit_id): deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) d = {} if data: d = json.dumps(data) return status.HTTP_200_OK, d, "application/json" diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index fcf0492b..9df47390 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,105 +1,103 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser -from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from . 
import APIPrivateView +from ...errors import BAD_REQUEST, make_error_dict +from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit from ..common import APIPut -from ...errors import make_error_dict, BAD_REQUEST -from ...models import Deposit, DEPOSIT_STATUS_DETAIL -from ...models import DEPOSIT_STATUS_LOAD_SUCCESS - MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] class APIUpdateStatus(APIPrivateView, APIPut): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks(self, request, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) return make_error_dict(BAD_REQUEST, msg) return {} def process_put(self, request, headers, collection_name, deposit_id): """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data deposit = Deposit.objects.get(pk=deposit_id) status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] dir_id = swhid(DIRECTORY, directory_id) snp_id = swhid(SNAPSHOT, 
data["snapshot_id"]) rev_id = swhid(REVISION, revision_id) deposit.swh_id = dir_id # new id with contextual information deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin_url, "visit": snp_id, "anchor": rev_id, "path": "/", }, ) else: # rejected deposit.status = status deposit.save() return {} diff --git a/swh/deposit/api/private/urls.py b/swh/deposit/api/private/urls.py index 45ffd309..e48290d6 100644 --- a/swh/deposit/api/private/urls.py +++ b/swh/deposit/api/private/urls.py @@ -1,79 +1,78 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.conf.urls import url from ...config import ( - PRIVATE_GET_RAW_CONTENT, - PRIVATE_PUT_DEPOSIT, - PRIVATE_GET_DEPOSIT_METADATA, PRIVATE_CHECK_DEPOSIT, + PRIVATE_GET_DEPOSIT_METADATA, + PRIVATE_GET_RAW_CONTENT, PRIVATE_LIST_DEPOSITS, + PRIVATE_PUT_DEPOSIT, ) -from .deposit_read import APIReadArchives -from .deposit_read import APIReadMetadata -from .deposit_update_status import APIUpdateStatus from .deposit_check import APIChecks from .deposit_list import APIList +from .deposit_read import APIReadArchives, APIReadMetadata +from .deposit_update_status import APIUpdateStatus urlpatterns = [ # Retrieve deposit's raw archives' content # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/raw/$", APIReadArchives.as_view(), name=PRIVATE_GET_RAW_CONTENT, ), # Update deposit's status # -> PUT url( r"^(?P[^/]+)/(?P[^/]+)/update/$", APIUpdateStatus.as_view(), name=PRIVATE_PUT_DEPOSIT, ), # Retrieve metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/meta/$", APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA, ), # Check archive and metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/check/$", APIChecks.as_view(), name=PRIVATE_CHECK_DEPOSIT, ), # 
Retrieve deposit's raw archives' content # -> GET url( r"^(?P[^/]+)/raw/$", APIReadArchives.as_view(), name=PRIVATE_GET_RAW_CONTENT + "-nc", ), # Update deposit's status # -> PUT url( r"^(?P[^/]+)/update/$", APIUpdateStatus.as_view(), name=PRIVATE_PUT_DEPOSIT + "-nc", ), # Retrieve metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/meta/$", APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA + "-nc", ), # Check archive and metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/check/$", APIChecks.as_view(), name=PRIVATE_CHECK_DEPOSIT + "-nc", ), url(r"^deposits/$", APIList.as_view(), name=PRIVATE_LIST_DEPOSITS), ] diff --git a/swh/deposit/api/service_document.py b/swh/deposit/api/service_document.py index 15a68eba..a36cb304 100644 --- a/swh/deposit/api/service_document.py +++ b/swh/deposit/api/service_document.py @@ -1,34 +1,33 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from django.urls import reverse -from .common import APIBase, ACCEPT_PACKAGINGS -from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import COL_IRI from ..models import DepositClient, DepositCollection +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, ACCEPT_PACKAGINGS, APIBase class APIServiceDocument(APIBase): def get(self, req, *args, **kwargs): client = DepositClient.objects.get(username=req.user) collections = {} for col_id in client.collections: col = DepositCollection.objects.get(pk=col_id) col_uri = req.build_absolute_uri(reverse(COL_IRI, args=[col.name])) collections[col.name] = col_uri context = { "max_upload_size": self.config["max_upload_size"], "accept_packagings": ACCEPT_PACKAGINGS, "accept_content_types": ACCEPT_ARCHIVE_CONTENT_TYPES, "collections": collections, } return render( req, 
"deposit/service_document.xml", context, content_type="application/xml" ) diff --git a/swh/deposit/api/urls.py b/swh/deposit/api/urls.py index f7450d77..e7a686af 100644 --- a/swh/deposit/api/urls.py +++ b/swh/deposit/api/urls.py @@ -1,70 +1,68 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """SWH's deposit api URL Configuration """ from django.conf.urls import url from django.shortcuts import render -from ..config import EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI -from ..config import SD_IRI, COL_IRI, STATE_IRI +from ..config import COL_IRI, CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI, SD_IRI, STATE_IRI from .deposit import APIPostDeposit -from .deposit_status import APIStatus -from .deposit_update import APIUpdateMetadata -from .deposit_update import APIUpdateArchive from .deposit_content import APIContent +from .deposit_status import APIStatus +from .deposit_update import APIUpdateArchive, APIUpdateMetadata from .service_document import APIServiceDocument def api_view(req): return render(req, "api.html") # PUBLIC API urlpatterns = [ # simple view on the api url(r"^$", api_view, name="api"), # SD IRI - Service Document IRI # -> GET url(r"^servicedocument/", APIServiceDocument.as_view(), name=SD_IRI), # Col IRI - Collection IRI # -> POST url(r"^(?P[^/]+)/$", APIPostDeposit.as_view(), name=COL_IRI), # EM IRI - Atom Edit Media IRI (update archive IRI) # -> PUT (update-in-place existing archive) # -> POST (add new archive) url( r"^(?P[^/]+)/(?P[^/]+)/media/$", APIUpdateArchive.as_view(), name=EM_IRI, ), # Edit IRI - Atom Entry Edit IRI (update metadata IRI) # SE IRI - Sword Edit IRI ;; possibly same as Edit IRI # -> PUT (update in place) # -> POST (add new metadata) url( r"^(?P[^/]+)/(?P[^/]+)/metadata/$", APIUpdateMetadata.as_view(), name=EDIT_SE_IRI, ), # State IRI # -> GET url( 
r"^(?P[^/]+)/(?P[^/]+)/status/$", APIStatus.as_view(), name=STATE_IRI, ), # Cont/File IRI # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/content/$", APIContent.as_view(), name=CONT_FILE_IRI, ), # specification is not clear about # FILE-IRI, we assume it's the same as # the CONT-IRI one ] diff --git a/swh/deposit/cli/__init__.py b/swh/deposit/cli/__init__.py index e5c50d1f..7a5473b5 100644 --- a/swh/deposit/cli/__init__.py +++ b/swh/deposit/cli/__init__.py @@ -1,42 +1,43 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging + # WARNING: do not import unnecessary things here to keep cli startup time under # control import click -import logging from swh.core.cli import CONTEXT_SETTINGS logger = logging.getLogger(__name__) @click.group(context_settings=CONTEXT_SETTINGS) @click.pass_context def deposit(ctx): """Deposit main command """ ctx.ensure_object(dict) log_level = ctx.obj.get("log_level", logging.INFO) logger.setLevel(log_level) def main(): logging.basicConfig() return deposit(auto_envvar_prefix="SWH_DEPOSIT") # These import statements MUST be executed after defining the 'deposit' group # since the subcommands in these are defined using this 'deposit' group. from . import client # noqa try: from . 
import admin # noqa except ImportError: # server part is optional logger.debug("admin subcommand not loaded") if __name__ == "__main__": main() diff --git a/swh/deposit/cli/admin.py b/swh/deposit/cli/admin.py index fbaac89a..a56581de 100644 --- a/swh/deposit/cli/admin.py +++ b/swh/deposit/cli/admin.py @@ -1,274 +1,275 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.deposit.cli import deposit @deposit.group("admin") @click.option( "--config-file", "-C", default=None, type=click.Path(exists=True, dir_okay=False,), help="Optional extra configuration file.", ) @click.option( "--platform", default="development", type=click.Choice(["development", "production"]), help="development or production platform", ) @click.pass_context def admin(ctx, config_file, platform): """Server administration tasks (manipulate user or collections)""" from swh.deposit.config import setup_django_for # configuration happens here setup_django_for(platform, config_file=config_file) @admin.group("user") @click.pass_context def user(ctx): """Manipulate user.""" # configuration happens here pass def _create_collection(name): """Create the collection with name if it does not exist. Args: name (str): collection's name Returns: collection (DepositCollection): the existing collection object (created or not) """ # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection try: collection = DepositCollection.objects.get(name=name) click.echo("Collection %s exists, nothing to do." 
% name) except DepositCollection.DoesNotExist: click.echo("Create new collection %s" % name) collection = DepositCollection.objects.create(name=name) click.echo("Collection %s created" % name) return collection @user.command("create") @click.option("--username", required=True, help="User's name") @click.option("--password", required=True, help="Desired user's password (plain).") @click.option("--firstname", default="", help="User's first name") @click.option("--lastname", default="", help="User's last name") @click.option("--email", default="", help="User's email") @click.option("--collection", help="User's collection") @click.option("--provider-url", default="", help="Provider URL") @click.option("--domain", default="", help="The domain") @click.pass_context def user_create( ctx, username, password, firstname, lastname, email, collection, provider_url, domain, ): """Create a user with some needed information (password, collection) If the collection does not exist, the collection is then created alongside. The password is stored encrypted using django's utilities. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient # If collection is not provided, fallback to username if not collection: collection = username click.echo("collection: %s" % collection) # create the collection if it does not exist collection = _create_collection(collection) # user create/update try: user = DepositClient.objects.get(username=username) click.echo("User %s exists, updating information." 
% user) user.set_password(password) except DepositClient.DoesNotExist: click.echo("Create new user %s" % username) user = DepositClient.objects.create_user(username=username, password=password) user.collections = [collection.id] user.first_name = firstname user.last_name = lastname user.email = email user.is_active = True user.provider_url = provider_url user.domain = domain user.save() click.echo("Information registered for user %s" % user) @user.command("list") @click.pass_context def user_list(ctx): """List existing users. This entrypoint is not paginated yet as there is not a lot of entry. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient users = DepositClient.objects.all() if not users: output = "Empty user list" else: output = "\n".join((user.username for user in users)) click.echo(output) @user.command("exists") @click.argument("username", required=True) @click.pass_context def user_exists(ctx, username): """Check if user exists. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient try: DepositClient.objects.get(username=username) click.echo("User %s exists." % username) ctx.exit(0) except DepositClient.DoesNotExist: click.echo("User %s does not exist." % username) ctx.exit(1) @admin.group("collection") @click.pass_context def collection(ctx): """Manipulate collections.""" pass @collection.command("create") @click.option("--name", required=True, help="Collection's name") @click.pass_context def collection_create(ctx, name): _create_collection(name) @collection.command("list") @click.pass_context def collection_list(ctx): """List existing collections. This entrypoint is not paginated yet as there is not a lot of entry. 
""" # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection collections = DepositCollection.objects.all() if not collections: output = "Empty collection list" else: output = "\n".join((col.name for col in collections)) click.echo(output) @admin.group("deposit") @click.pass_context def adm_deposit(ctx): """Manipulate deposit.""" pass @adm_deposit.command("reschedule") @click.option("--deposit-id", required=True, help="Deposit identifier") @click.pass_context def adm_deposit_reschedule(ctx, deposit_id): """Reschedule the deposit loading This will: - check the deposit's status to something reasonable (failed or done). That means that the checks have passed alright but something went wrong during the loading (failed: loading failed, done: loading ok, still for some reasons as in bugs, we need to reschedule it) - reset the deposit's status to 'verified' (prior to any loading but after the checks which are fine) and removes the different archives' identifiers (swh-id, ...) - trigger back the loading task through the scheduler """ # to avoid loading too early django namespaces from datetime import datetime - from swh.deposit.models import Deposit + from swh.deposit.config import ( - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_VERIFIED, APIConfig, ) + from swh.deposit.models import Deposit try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: click.echo("Deposit %s does not exist." % deposit_id) ctx.exit(1) # Check the deposit is in a reasonable state accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE] if deposit.status == DEPOSIT_STATUS_VERIFIED: click.echo("Deposit %s's status already set for rescheduling." % (deposit_id)) ctx.exit(0) if deposit.status not in accepted_statuses: click.echo( "Deposit %s's status be one of %s." 
% (deposit_id, ", ".join(accepted_statuses)) ) ctx.exit(1) task_id = deposit.load_task_id if not task_id: click.echo( "Deposit %s cannot be rescheduled. It misses the " "associated task." % deposit_id ) ctx.exit(1) # Reset the deposit's state deposit.swh_id = None deposit.swh_id_context = None deposit.status = DEPOSIT_STATUS_VERIFIED deposit.save() # Trigger back the deposit scheduler = APIConfig().scheduler scheduler.set_status_tasks( [task_id], status="next_run_not_scheduled", next_run=datetime.now() ) diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py index 471bd7d7..b5fe469d 100644 --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -1,507 +1,509 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging + # WARNING: do not import unnecessary things here to keep cli startup time under # control import os -import logging import sys import click from swh.deposit.cli import deposit - logger = logging.getLogger(__name__) class InputError(ValueError): """Input script error """ pass def generate_slug(): """Generate a slug (sample purposes). """ import uuid return str(uuid.uuid4()) def _url(url): """Force the /1 api version at the end of the url (avoiding confusing issues without it). Args: url (str): api url used by cli users Returns: Top level api url to actually request """ if not url.endswith("/1"): url = "%s/1" % url return url def generate_metadata_file(name, external_id, authors, temp_dir): """Generate a temporary metadata file with the minimum required metadata This generates a xml file in a temporary location and returns the path to that file. This is up to the client of that function to clean up the temporary file. 
Args: name (str): Software's name external_id (str): External identifier (slug) or generated one authors (List[str]): List of author names Returns: Filepath to the metadata generated file """ import xmltodict path = os.path.join(temp_dir, "metadata.xml") # generate a metadata file with the minimum required metadata codemetadata = { "entry": { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:name": name, "codemeta:identifier": external_id, "codemeta:author": [ {"codemeta:name": author_name} for author_name in authors ], }, } logging.debug("Temporary file: %s", path) logging.debug("Metadata dict to generate as xml: %s", codemetadata) s = xmltodict.unparse(codemetadata, pretty=True) logging.debug("Metadata dict as xml generated: %s", s) with open(path, "w") as fp: fp.write(s) return path def _client(url, username, password): """Instantiate a client to access the deposit api server Args: url (str): Deposit api server username (str): User password (str): User's password """ from swh.deposit.client import PublicApiDepositClient client = PublicApiDepositClient( {"url": url, "auth": {"username": username, "password": password},} ) return client def _collection(client): """Retrieve the client's collection """ # retrieve user's collection sd_content = client.service_document() if "error" in sd_content: raise InputError("Service document retrieval: %s" % (sd_content["error"],)) collection = sd_content["service"]["workspace"]["collection"]["sword:name"] return collection def client_command_parse_input( username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url, name, authors, temp_dir, ): """Parse the client subcommand options and make sure the combination is acceptable*. If not, an InputError exception is raised explaining the issue. 
By acceptable, we mean: - A multipart deposit (create or update) requires: - an existing software archive - an existing metadata file or author(s) and name provided in params - A binary deposit (create/update) requires an existing software archive - A metadata deposit (create/update) requires an existing metadata file or author(s) and name provided in params - A deposit update requires a deposit_id This will not prevent all failure cases though. The remaining errors are already dealt with by the underlying api client. Raises: InputError explaining the user input related issue MaintenanceError explaining the api status Returns: dict with the following keys: 'archive': the software archive to deposit 'username': username 'password': associated password 'metadata': the metadata file to deposit 'collection': the username's associated client 'slug': the slug or external id identifying the deposit to make 'partial': if the deposit is partial or not 'client': instantiated class 'url': deposit's server main entry point 'deposit_type': deposit's type (binary, multipart, metadata) 'deposit_id': optional deposit identifier """ if archive_deposit and metadata_deposit: # too many flags use, remove redundant ones (-> multipart deposit) archive_deposit = False metadata_deposit = False if not slug: # generate one as this is mandatory slug = generate_slug() if not metadata: if name and authors: metadata = generate_metadata_file(name, slug, authors, temp_dir) elif not archive_deposit and not partial and not deposit_id: # If we meet all the following conditions: # * there is not an archive-only deposit # * it is not part of a multipart deposit (either create/update # or finish) # * it misses either name or authors raise InputError( "Either a metadata file (--metadata) or both --author and " "--name must be provided, unless this is an archive-only " "deposit." 
) elif name or authors: # If we are generating metadata, then all mandatory metadata # must be present raise InputError( "Either a metadata file (--metadata) or both --author and " "--name must be provided." ) else: # TODO: this is a multipart deposit, we might want to check that # metadata are deposited at some point pass elif name or authors: raise InputError( "Using a metadata file (--metadata) is incompatible with " "--author and --name, which are used to generate one." ) if metadata_deposit: archive = None if archive_deposit: metadata = None if metadata_deposit and not metadata: raise InputError( "Metadata deposit must be provided for metadata " "deposit (either a filepath or --name and --author)" ) if not archive and not metadata and partial: raise InputError( "Please provide an actionable command. See --help for more " "information" ) if replace and not deposit_id: raise InputError("To update an existing deposit, you must provide its id") client = _client(url, username, password) if not collection: collection = _collection(client) return { "archive": archive, "username": username, "password": password, "metadata": metadata, "collection": collection, "slug": slug, "in_progress": partial, "client": client, "url": url, "deposit_id": deposit_id, "replace": replace, } def _subdict(d, keys): "return a dict from d with only given keys" return {k: v for k, v in d.items() if k in keys} def deposit_create(config, logger): """Delegate the actual deposit to the deposit client. """ logger.debug("Create deposit") client = config["client"] keys = ("collection", "archive", "metadata", "slug", "in_progress") return client.deposit_create(**_subdict(config, keys)) def deposit_update(config, logger): """Delegate the actual deposit to the deposit client. 
""" logger.debug("Update deposit") client = config["client"] keys = ( "collection", "deposit_id", "archive", "metadata", "slug", "in_progress", "replace", ) return client.deposit_update(**_subdict(config, keys)) @deposit.command() @click.option("--username", required=True, help="(Mandatory) User's name") @click.option( "--password", required=True, help="(Mandatory) User's associated password" ) @click.option( "--archive", type=click.Path(exists=True), help="(Optional) Software archive to deposit", ) @click.option( "--metadata", type=click.Path(exists=True), help=( "(Optional) Path to xml metadata file. If not provided, " "this will use a file named .metadata.xml" ), ) # noqa @click.option( "--archive-deposit/--no-archive-deposit", default=False, help="(Optional) Software archive only deposit", ) @click.option( "--metadata-deposit/--no-metadata-deposit", default=False, help="(Optional) Metadata only deposit", ) @click.option( "--collection", help="(Optional) User's collection. If not provided, this will be fetched.", ) # noqa @click.option( "--slug", help=( "(Optional) External system information identifier. " "If not provided, it will be generated" ), ) # noqa @click.option( "--partial/--no-partial", default=False, help=( "(Optional) The deposit will be partial, other deposits " "will have to take place to finalize it." ), ) # noqa @click.option( "--deposit-id", default=None, help="(Optional) Update an existing partial deposit with its identifier", ) # noqa @click.option( "--replace/--no-replace", default=False, help="(Optional) Update by replacing existing metadata to a deposit", ) # noqa @click.option( "--url", default="https://deposit.softwareheritage.org", help=( "(Optional) Deposit server api endpoint. 
By default, " "https://deposit.softwareheritage.org/1" ), ) # noqa @click.option("--verbose/--no-verbose", default=False, help="Verbose mode") @click.option("--name", help="Software name") @click.option( "--author", multiple=True, help="Software author(s), this can be repeated as many times" " as there are authors", ) @click.option( "-f", "--format", "output_format", default="logging", type=click.Choice(["logging", "yaml", "json"]), help="Output format results.", ) @click.pass_context def upload( ctx, username, password, archive=None, metadata=None, archive_deposit=False, metadata_deposit=False, collection=None, slug=None, partial=False, deposit_id=None, replace=False, url="https://deposit.softwareheritage.org", verbose=False, name=None, author=None, output_format=None, ): """Software Heritage Public Deposit Client Create/Update deposit through the command line. More documentation can be found at https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. """ import tempfile + from swh.deposit.client import MaintenanceError url = _url(url) config = {} with tempfile.TemporaryDirectory() as temp_dir: try: logger.debug("Parsing cli options") config = client_command_parse_input( username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url, name, author, temp_dir, ) except InputError as e: logger.error("Problem during parsing options: %s", e) sys.exit(1) except MaintenanceError as e: logger.error(e) sys.exit(1) if verbose: logger.info("Parsed configuration: %s" % (config,)) deposit_id = config["deposit_id"] if deposit_id: r = deposit_update(config, logger) else: r = deposit_create(config, logger) print_result(r, output_format) @deposit.command() @click.option( "--url", default="https://deposit.softwareheritage.org", help="(Optional) Deposit server api endpoint. 
By default, " "https://deposit.softwareheritage.org/1", ) @click.option("--username", required=True, help="(Mandatory) User's name") @click.option( "--password", required=True, help="(Mandatory) User's associated password" ) @click.option("--deposit-id", default=None, required=True, help="Deposit identifier.") @click.option( "-f", "--format", "output_format", default="logging", type=click.Choice(["logging", "yaml", "json"]), help="Output format results.", ) @click.pass_context def status(ctx, url, username, password, deposit_id, output_format): """Deposit's status """ from swh.deposit.client import MaintenanceError url = _url(url) logger.debug("Status deposit") try: client = _client(url, username, password) collection = _collection(client) except InputError as e: logger.error("Problem during parsing options: %s", e) sys.exit(1) except MaintenanceError as e: logger.error(e) sys.exit(1) print_result( client.deposit_status(collection=collection, deposit_id=deposit_id), output_format, ) def print_result(data, output_format): import json + import yaml if output_format == "json": click.echo(json.dumps(data)) elif output_format == "yaml": click.echo(yaml.dump(data)) else: logger.info(data) diff --git a/swh/deposit/client.py b/swh/deposit/client.py index 9cd1a4fd..a27c166b 100644 --- a/swh/deposit/client.py +++ b/swh/deposit/client.py @@ -1,651 +1,651 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining an swh-deposit client """ +from abc import ABCMeta, abstractmethod import hashlib -import os import logging -import requests -import xmltodict - -from abc import ABCMeta, abstractmethod +import os from typing import Any, Dict from urllib.parse import urljoin -from swh.core.config import read_raw_config, config_basepath +import requests +import 
xmltodict + +from swh.core.config import config_basepath, read_raw_config logger = logging.getLogger(__name__) class MaintenanceError(ValueError): """Informational maintenance error exception """ pass def _parse(stream, encoding="utf-8"): """Given a xml stream, parse the result. Args: stream (bytes/text): The stream to parse encoding (str): The encoding to use if to decode the bytes stream Returns: A dict of values corresponding to the parsed xml """ if isinstance(stream, bytes): stream = stream.decode(encoding) data = xmltodict.parse(stream, encoding=encoding, process_namespaces=False) if "entry" in data: data = data["entry"] if "sword:error" in data: data = data["sword:error"] return dict(data) def _parse_with_filter(stream, encoding="utf-8", keys=[]): """Given a xml stream, parse the result and filter with keys. Args: stream (bytes/text): The stream to parse encoding (str): The encoding to use if to decode the bytes stream keys ([str]): Keys to filter the parsed result Returns: A dict of values corresponding to the parsed xml filtered by the keys provided. """ data = _parse(stream, encoding=encoding) m = {} for key in keys: m[key] = data.get(key) return m class BaseApiDepositClient: """Deposit client base class """ def __init__(self, config=None, _client=requests): if config is None: config_file = os.environ["SWH_CONFIG_FILENAME"] self.config: Dict[str, Any] = read_raw_config(config_basepath(config_file)) else: self.config = config self._client = _client self.base_url = self.config["url"].strip("/") + "/" auth = self.config["auth"] if auth == {}: self.auth = None else: self.auth = (auth["username"], auth["password"]) def do(self, method, url, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. 
Args: method (str): supported http methods as in self._methods' keys Returns: The request's execution """ if hasattr(self._client, method): method_fn = getattr(self._client, method) else: raise ValueError("Development error, unsupported method %s" % (method)) if self.auth: kwargs["auth"] = self.auth full_url = urljoin(self.base_url, url.lstrip("/")) return method_fn(full_url, *args, **kwargs) class PrivateApiDepositClient(BaseApiDepositClient): """Private API deposit client to: - read a given deposit's archive(s) - read a given deposit's metadata - update a given deposit's status """ def archive_get(self, archive_update_url, archive): """Retrieve the archive from the deposit to a local directory. Args: archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive (str): the local archive's path where to store the raw content Returns: The archive path to the local archive to load. Or None if any problem arose. """ r = self.do("get", archive_update_url, stream=True) if r.ok: with open(archive, "wb") as f: for chunk in r.iter_content(): f.write(chunk) return archive msg = "Problem when retrieving deposit archive at %s" % (archive_update_url,) logger.error(msg) raise ValueError(msg) def metadata_get(self, metadata_url): """Retrieve the metadata information on a given deposit. Args: metadata_url (str): The full deposit metadata url to retrieve locally Returns: The dictionary of metadata for that deposit or None if any problem arose. """ r = self.do("get", metadata_url) if r.ok: return r.json() msg = "Problem when retrieving metadata at %s" % metadata_url logger.error(msg) raise ValueError(msg) def status_update( self, update_status_url, status, revision_id=None, directory_id=None, origin_url=None, ): """Update the deposit's status. 
Args: update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to directory_id (str/None): the directory's identifier to update to origin_url (str/None): deposit's associated origin url """ payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if origin_url: payload["origin_url"] = origin_url self.do("put", update_status_url, json=payload) def check(self, check_url): """Check the deposit's associated data (metadata, archive(s)) Args: check_url (str): the full deposit's check url """ r = self.do("get", check_url) if r.ok: data = r.json() return data["status"] msg = "Problem when checking deposit %s" % check_url logger.error(msg) raise ValueError(msg) class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta): """Base Deposit client to access the public api. """ def __init__(self, config, error_msg=None, empty_result={}): super().__init__(config) self.error_msg = error_msg self.empty_result = empty_result @abstractmethod def compute_url(self, *args, **kwargs): """Compute api url endpoint to query.""" pass @abstractmethod def compute_method(self, *args, **kwargs): """Http method to use on the url""" pass @abstractmethod def parse_result_ok(self, xml_content): """Given an xml result from the api endpoint, parse it and returns a dict. """ pass def compute_information(self, *args, **kwargs): """Compute some more information given the inputs (e.g http headers, ...) """ return {} def parse_result_error(self, xml_content): """Given an error response in xml, parse it into a dict. 
Returns: dict with following keys: 'error': The error message 'detail': Some more detail about the error if any """ return _parse_with_filter( xml_content, keys=["summary", "detail", "sword:verboseDescription"] ) def do_execute(self, method, url, info): """Execute the http query to url using method and info information. By default, execute a simple query to url with the http method. Override this in daughter class to improve the default behavior if needed. """ return self.do(method, url) def execute(self, *args, **kwargs) -> Dict[str, Any]: """Main endpoint to prepare and execute the http query to the api. Raises: MaintenanceError if some api maintenance is happening. Returns: Dict of computed api data """ url = self.compute_url(*args, **kwargs) method = self.compute_method(*args, **kwargs) info = self.compute_information(*args, **kwargs) try: r = self.do_execute(method, url, info) except Exception as e: msg = self.error_msg % (url, e) r = self.empty_result r.update( {"error": msg,} ) return r else: if r.ok: if int(r.status_code) == 204: # 204 returns no body return {"status": r.status_code} else: return self.parse_result_ok(r.text) else: error = self.parse_result_error(r.text) empty = self.empty_result error.update(empty) if r.status_code == 503: summary = error.get("summary") detail = error.get("sword:verboseDescription") # Maintenance error if summary and detail: raise MaintenanceError(f"{summary}: {detail}") error.update( {"status": r.status_code,} ) return error class ServiceDocumentDepositClient(BaseDepositClient): """Service Document information retrieval. """ def __init__(self, config): super().__init__( config, error_msg="Service document failure at %s: %s", empty_result={"collection": None}, ) def compute_url(self, *args, **kwargs): return "/servicedocument/" def compute_method(self, *args, **kwargs): return "get" def parse_result_ok(self, xml_content): """Parse service document's success response. 
""" return _parse(xml_content) class StatusDepositClient(BaseDepositClient): """Status information on a deposit. """ def __init__(self, config): super().__init__( config, error_msg="Status check failure at %s: %s", empty_result={ "deposit_status": None, "deposit_status_detail": None, "deposit_swh_id": None, }, ) def compute_url(self, collection, deposit_id): return "/%s/%s/status/" % (collection, deposit_id) def compute_method(self, *args, **kwargs): return "get" def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ return _parse_with_filter( xml_content, keys=[ "deposit_id", "deposit_status", "deposit_status_detail", "deposit_swh_id", "deposit_swh_id_context", "deposit_external_id", ], ) class BaseCreateDepositClient(BaseDepositClient): """Deposit client base class to post new deposit. """ def __init__(self, config): super().__init__( config, error_msg="Post Deposit failure at %s: %s", empty_result={"deposit_id": None, "deposit_status": None,}, ) def compute_url(self, collection, *args, **kwargs): return "/%s/" % collection def compute_method(self, *args, **kwargs): return "post" def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ return _parse_with_filter( xml_content, keys=[ "deposit_id", "deposit_status", "deposit_status_detail", "deposit_date", ], ) def _compute_information( self, collection, filepath, in_progress, slug, is_archive=True ): """Given a filepath, compute necessary information on that file. Args: filepath (str): Path to a file is_archive (bool): is it an archive or not? 
Returns: dict with keys: 'content-type': content type associated 'md5sum': md5 sum 'filename': filename """ filename = os.path.basename(filepath) if is_archive: md5sum = hashlib.md5(open(filepath, "rb").read()).hexdigest() extension = filename.split(".")[-1] if "zip" in extension: content_type = "application/zip" else: content_type = "application/x-tar" else: content_type = None md5sum = None return { "slug": slug, "in_progress": in_progress, "content-type": content_type, "md5sum": md5sum, "filename": filename, "filepath": filepath, } def compute_information( self, collection, filepath, in_progress, slug, is_archive=True, **kwargs ): info = self._compute_information( collection, filepath, in_progress, slug, is_archive=is_archive ) info["headers"] = self.compute_headers(info) return info def do_execute(self, method, url, info): with open(info["filepath"], "rb") as f: return self.do(method, url, data=f, headers=info["headers"]) class CreateArchiveDepositClient(BaseCreateDepositClient): """Post an archive (binary) deposit client.""" def compute_headers(self, info): return { "SLUG": info["slug"], "CONTENT_MD5": info["md5sum"], "IN-PROGRESS": str(info["in_progress"]), "CONTENT-TYPE": info["content-type"], "CONTENT-DISPOSITION": "attachment; filename=%s" % (info["filename"],), } class UpdateArchiveDepositClient(CreateArchiveDepositClient): """Update (add/replace) an archive (binary) deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return "/%s/%s/media/" % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return "put" if replace else "post" class CreateMetadataDepositClient(BaseCreateDepositClient): """Post a metadata deposit client.""" def compute_headers(self, info): return { "SLUG": info["slug"], "IN-PROGRESS": str(info["in_progress"]), "CONTENT-TYPE": "application/atom+xml;type=entry", } class UpdateMetadataDepositClient(CreateMetadataDepositClient): """Update (add/replace) a metadata deposit 
client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return "/%s/%s/metadata/" % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return "put" if replace else "post" class CreateMultipartDepositClient(BaseCreateDepositClient): """Create a multipart deposit client.""" def _multipart_info(self, info, info_meta): files = [ ( "file", (info["filename"], open(info["filepath"], "rb"), info["content-type"]), ), ( "atom", ( info_meta["filename"], open(info_meta["filepath"], "rb"), "application/atom+xml", ), ), ] headers = { "SLUG": info["slug"], "CONTENT_MD5": info["md5sum"], "IN-PROGRESS": str(info["in_progress"]), } return files, headers def compute_information( self, collection, archive, metadata, in_progress, slug, **kwargs ): info = self._compute_information(collection, archive, in_progress, slug) info_meta = self._compute_information( collection, metadata, in_progress, slug, is_archive=False ) files, headers = self._multipart_info(info, info_meta) return {"files": files, "headers": headers} def do_execute(self, method, url, info): return self.do(method, url, files=info["files"], headers=info["headers"]) class UpdateMultipartDepositClient(CreateMultipartDepositClient): """Update a multipart deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return "/%s/%s/metadata/" % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return "put" if replace else "post" class PublicApiDepositClient(BaseApiDepositClient): """Public api deposit client.""" def service_document(self): """Retrieve service document endpoint's information.""" return ServiceDocumentDepositClient(self.config).execute() def deposit_status(self, collection, deposit_id): """Retrieve status information on a deposit.""" return StatusDepositClient(self.config).execute(collection, deposit_id) def deposit_create( self, collection, slug, archive=None, metadata=None, in_progress=False ): 
def deposit_update(
    self,
    collection,
    deposit_id,
    slug,
    archive=None,
    metadata=None,
    in_progress=False,
    replace=False,
):
    """Update (add/replace) existing deposit (archive, metadata, both).

    The deposit status is fetched first: an error result is returned as is,
    and a deposit whose status is not 'partial' yields an error dict. On a
    successful update the refreshed deposit status is returned.

    """
    current = self.deposit_status(collection, deposit_id)
    if "error" in current:
        return current

    status = current["deposit_status"]
    if status != "partial":
        return {
            "error": "You can only act on deposit with status 'partial'",
            "detail": "The deposit %s has status '%s'" % (deposit_id, status),
            "deposit_status": status,
            "deposit_id": deposit_id,
        }

    # Pick the client class and the file argument(s) it expects.
    has_archive = bool(archive)
    has_metadata = bool(metadata)
    if has_archive and not has_metadata:
        client_class = UpdateArchiveDepositClient
        payload = (archive,)
    elif has_metadata and not has_archive:
        client_class = UpdateMetadataDepositClient
        payload = (metadata,)
    else:
        client_class = UpdateMultipartDepositClient
        payload = (archive, metadata)

    outcome = client_class(self.config).execute(
        collection,
        *payload,
        in_progress,
        slug,
        deposit_id=deposit_id,
        replace=replace,
    )
    if "error" in outcome:
        return outcome
    return self.deposit_status(collection, deposit_id)
def setup_django_for(platform=None, config_file=None):
    """Initialize django for command line tools (e.g. swh.deposit.create_user).

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running
        config_file (str): Extra configuration file (typically for the
            production platform)

    Raises:
        ValueError in case of wrong platform inputs.

    """
    platform_given = platform is not None
    if platform_given and platform not in AUTHORIZED_PLATFORMS:
        raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)

    if platform_given:
        # An already exported settings module takes precedence.
        os.environ.setdefault(
            "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % platform
        )

    if config_file:
        os.environ.setdefault("SWH_CONFIG_FILENAME", config_file)

    # Imported here, after the environment has been prepared.
    import django

    django.setup()
"sword:ErrorForbidden", }, UNAUTHORIZED: { "status": status.HTTP_401_UNAUTHORIZED, "iri": "http://purl.org/net/sword/error/ErrorUnauthorized", "tag": "sword:ErrorUnauthorized", }, NOT_FOUND: { "status": status.HTTP_404_NOT_FOUND, "iri": "http://purl.org/net/sword/error/ErrorNotFound", "tag": "sword:ErrorNotFound", }, ERROR_CONTENT: { "status": status.HTTP_415_UNSUPPORTED_MEDIA_TYPE, "iri": "http://purl.org/net/sword/error/ErrorContent", "tag": "sword:ErrorContent", }, CHECKSUM_MISMATCH: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/ErrorChecksumMismatch", "tag": "sword:ErrorChecksumMismatch", }, BAD_REQUEST: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, PARSING_ERROR: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, MEDIATION_NOT_ALLOWED: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/MediationNotAllowed", "tag": "sword:MediationNotAllowed", }, METHOD_NOT_ALLOWED: { "status": status.HTTP_405_METHOD_NOT_ALLOWED, "iri": "http://purl.org/net/sword/error/MethodNotAllowed", "tag": "sword:MethodNotAllowed", }, MAX_UPLOAD_SIZE_EXCEEDED: { "status": status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, "iri": "http://purl.org/net/sword/error/MaxUploadSizeExceeded", "tag": "sword:MaxUploadSizeExceeded", }, } def make_error_dict(key, summary=None, verbose_description=None): """Utility function to factorize error message dictionary. Args: key (str): Error status key referenced in swh.deposit.errors module summary (str/None): Error message clarifying the status verbose_description (str/None): A more verbose description or work around a potential problem. 
def make_error_response_from_dict(req, error):
    """Utility function to return an http response with error detail.

    Args:
        req (Request): original request
        error (dict): Error described as dict, typically generated
            from the make_error_dict function. It must hold a 'key' entry
            referencing one of the ERRORS entries. It is left unmodified.

    Returns:
        HttpResponse with detailed error.

    Raises:
        KeyError: if ``error`` has no 'key' entry or the key is not a
            known error.

    """
    error_information = ERRORS[error["key"]]
    # Build a fresh context instead of updating ``error`` in place, so the
    # caller's dict does not silently gain the 'status', 'iri' and 'tag'
    # entries.
    context = {**error, **error_information}
    return render(
        req,
        "deposit/error.xml",
        context=context,
        content_type="application/xml",
        status=error_information["status"],
    )
Returns: Dictionary with key 'error' detailing the 'status' and associated 'message' """ error = make_error_dict(key, summary, verbose_description) return make_error_response_from_dict(req, error["error"]) diff --git a/swh/deposit/exception.py b/swh/deposit/exception.py index cdd1f7d0..e0252e00 100644 --- a/swh/deposit/exception.py +++ b/swh/deposit/exception.py @@ -1,38 +1,37 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Optional +from django.db.utils import OperationalError +from django.http import HttpResponse from rest_framework.exceptions import APIException from rest_framework.views import exception_handler -from django.http import HttpResponse - -from django.db.utils import OperationalError def custom_exception_handler( exc: APIException, context: Dict ) -> Optional[HttpResponse]: """Custom deposit exception handler to ensure consistent xml output """ # drf's default exception handler first, to get the standard error response response = exception_handler(exc, context) if isinstance(exc, OperationalError): status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." 
class DepositChecker:
    """Deposit checker implementation.

    Trigger deposit's checks through the private api.

    """

    def __init__(self):
        # The configuration file location comes from the environment.
        path = os.environ["SWH_CONFIG_FILENAME"]
        raw_config = config.read_raw_config(config.config_basepath(path))
        self.config: Dict[str, Any] = raw_config
        self.client = PrivateApiDepositClient(config=self.config["deposit"])

    def check(self, collection: str, deposit_id: str) -> Dict[str, str]:
        """Trigger the check of a deposit and report the outcome.

        Returns:
            dict with key 'status': "eventful" when the private api answers
            "verified", "failed" otherwise (including when the call raises).

        """
        outcome = None
        deposit_check_url = f"/{collection}/{deposit_id}/check/"
        logger.debug("deposit-check-url: %s", deposit_check_url)
        try:
            answer = self.client.check(deposit_check_url)
            logger.debug("Check result: %s", answer)
            if answer == "verified":
                outcome = "eventful"
            else:
                outcome = "failed"
        except Exception:
            logger.exception("Failure during check on '%s'", deposit_check_url)
            outcome = "failed"
        logger.debug("Check status: %s", outcome)
        return {"status": outcome}
if __name__ == "__main__":
    settings_file = "development"

    # Running the script without any sub-command must not crash with an
    # IndexError; the bare command line is handed over to django unchanged.
    subcommand = sys.argv[1] if len(sys.argv) > 1 else None

    if subcommand == "runserver":
        # override the default host:port for the 'runserver' task
        conf = config.load_named_config("deposit/server", default_conf=DEFAULT_CONFIG)
        extra_cmd = ["%s:%s" % (conf["host"], conf["port"])]
        cmd = sys.argv + extra_cmd
    elif subcommand == "test":
        # override the default settings file to read in testing mode
        settings_file = "testing"
        cmd = sys.argv
    else:
        # otherwise, do nothing
        cmd = sys.argv

    # An already exported DJANGO_SETTINGS_MODULE takes precedence.
    os.environ.setdefault(
        "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % settings_file
    )

    try:
        from django.core.management import execute_from_command_line
    except ImportError:
        # The above import may fail for some other reason. Ensure that the
        # issue is really that Django is missing to avoid masking other
        # exceptions on Python 2.
        try:
            import django  # noqa
        except ImportError:
            raise ImportError(
                "Couldn't import Django. Are you sure it's installed and "
                "available on your PYTHONPATH environment variable? Did you "
                "forget to activate a virtual environment?"
            )
        raise

    execute_from_command_line(cmd)
def get_storage() -> Optional[Any]:
    """Return the storage client, instantiating it on first use.

    Outside of the production settings module nothing is instantiated and
    None is returned. The client is kept in the module-level ``swh_storage``
    variable so that it is built only once.

    Raises:
        ValueError: when the production configuration file is not set, does
            not exist, is empty or lacks a 'storage' entry.

    """
    global swh_storage

    if os.environ.get("DJANGO_SETTINGS_MODULE") != "swh.deposit.settings.production":
        # Bypass for now
        return None

    if swh_storage:
        return swh_storage

    config_file = os.environ.get("SWH_CONFIG_FILENAME")
    if not config_file:
        raise ValueError(
            "Production: SWH_CONFIG_FILENAME must be set to the"
            " configuration file needed!"
        )

    if not os.path.exists(config_file):
        raise ValueError(
            "Production: configuration file %s does not exist!" % (config_file,)
        )

    conf = config.load_named_config(config_file)
    if not conf:
        raise ValueError(
            "Production: configuration %s does not exist." % (config_file,)
        )

    storage_config = conf.get("storage")
    if not storage_config:
        raise ValueError(
            "Production: invalid configuration; missing 'storage' config entry."
        )

    swh_storage = get_storage_client(**storage_config)
    return swh_storage
""" storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): obj_dir = parse_swhid(deposit.swh_id_context) assert obj_dir.object_type == DIRECTORY obj_rev = parse_swhid(deposit.swh_anchor_id) assert obj_rev.object_type == REVISION if set(obj_dir.metadata.keys()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Starting migration dir_id = obj_dir.object_id origin = obj_dir.metadata["origin"] check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue rev_id = obj_rev.object_id # Find the snapshot targeting the revision snp_id = snapshot_id_get_from_revision(storage, origin, hash_to_bytes(rev_id)) if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update deposit.swh_id_context = swhid( DIRECTORY, dir_id, metadata={ "origin": origin, "visit": swhid(SNAPSHOT, snp_id.hex()), "anchor": swhid(REVISION, rev_id), "path": "/", }, ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id == deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert old_swh_anchor_id == 
def migrate_deposit_swhid_context_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Migrate deposit whose swh_id_context is not set (initial deposits not
    migrated at the time). Only deposit with status done and
    swh_id_context null are concerned.

    Note:
        Those deposits have their swh_id being the SWHPIDs of the
        revision! So we can align them as well.

    Args:
        apps: registry used to look up the "deposit.Deposit" model
        schema_editor: not used by this function

    """
    storage = get_storage()
    if not storage:
        logging.warning("Nothing to do")
        return None

    Deposit = apps.get_model("deposit", "Deposit")
    for deposit in Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True
    ):
        obj_rev = parse_swhid(deposit.swh_id)
        if obj_rev.object_type == DIRECTORY:
            # Assuming the migration is already done for that deposit
            logger.warning(
                "Deposit id %s: Migration already done, skipping", deposit.id
            )
            continue

        # Ensuring Migration not done
        assert obj_rev.object_type == REVISION

        assert deposit.swh_id is not None
        assert deposit.swh_id_context is None
        assert deposit.swh_anchor_id is None
        assert deposit.swh_anchor_id_context is None

        rev_id = obj_rev.object_id
        rev_id_bytes = hash_to_bytes(rev_id)
        revision = storage.revision_get([rev_id_bytes])[0]
        if not revision:
            logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id)
            continue

        provider_url = deposit.client.provider_url
        external_id = deposit.external_id

        origin = resolve_origin(deposit.id, provider_url, external_id)
        check_origin = storage.origin_get([origin])[0]
        if not check_origin:
            logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin)
            continue

        dir_id = hash_to_hex(revision["directory"])

        # Reference the old values to do some checks later
        old_swh_id = deposit.swh_id
        old_swh_id_context = deposit.swh_id_context
        old_swh_anchor_id = deposit.swh_anchor_id
        old_swh_anchor_id_context = deposit.swh_anchor_id_context

        # retrieve the snapshot from the archive
        snp_id = snapshot_id_get_from_revision(storage, origin, rev_id_bytes)
        if snp_id is None:
            logger.warning(
                "Deposit id %s: Snapshot targeting revision %s not found!",
                deposit.id,
                rev_id,
            )
            continue

        # New SWHIDs ids
        deposit.swh_id = swhid(DIRECTORY, dir_id)
        deposit.swh_id_context = swhid(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": swhid(SNAPSHOT, snp_id.hex()),
                "anchor": swhid(REVISION, rev_id),
                "path": "/",
            },
        )
        # Realign the remaining deposit SWHIDs fields
        deposit.swh_anchor_id = swhid(REVISION, rev_id)
        deposit.swh_anchor_id_context = swhid(
            REVISION, rev_id, metadata={"origin": origin}
        )

        # Sanity checks on the realigned fields: the former swh_id (a
        # revision SWHID) must now be the anchor id, and every field must
        # be set.
        logging.debug("deposit.id: %s", deposit.id)
        logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id)
        assert old_swh_id != deposit.swh_id
        logging.debug(
            "deposit.swh_id_context: %s -> %s",
            old_swh_id_context,
            deposit.swh_id_context,
        )
        assert old_swh_id_context != deposit.swh_id_context
        assert deposit.swh_id_context is not None
        logging.debug(
            "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id
        )
        assert deposit.swh_anchor_id == old_swh_id
        assert deposit.swh_anchor_id is not None
        logging.debug(
            "deposit.swh_anchor_id_context: %s -> %s",
            old_swh_anchor_id_context,
            deposit.swh_anchor_id_context,
        )
        assert deposit.swh_anchor_id_context is not None

        # Commit
        deposit.save()
class Dbversion(models.Model):
    """Db version, stored in the "dbversion" table.

    """

    version = models.IntegerField(primary_key=True)
    release = models.DateTimeField(default=now, null=True)
    description = models.TextField(blank=True, null=True)

    class Meta:
        db_table = "dbversion"

    def __str__(self):
        # String form of a dict holding the three columns, in declaration
        # order.
        columns = ("version", "release", "description")
        return str({column: getattr(self, column) for column in columns})
class Deposit(models.Model):
    """Deposit reception table

    """

    id = models.BigAutoField(primary_key=True)

    # First deposit reception date
    reception_date = models.DateTimeField(auto_now_add=True)
    # Date when the deposit is deemed complete and ready for loading
    complete_date = models.DateTimeField(null=True)
    # collection concerned by the deposit
    collection = models.ForeignKey("DepositCollection", models.DO_NOTHING)
    # Deposit's external identifier
    external_id = models.TextField()
    # Deposit client
    client = models.ForeignKey("DepositClient", models.DO_NOTHING)
    # SWH's loading result identifier
    swh_id = models.TextField(blank=True, null=True)
    swh_id_context = models.TextField(blank=True, null=True)
    # Deposit's status regarding loading
    status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL)
    status_detail = JSONField(null=True)
    # deposit can have one parent
    parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True)
    check_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated checking task id"
    )
    load_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated loading task id"
    )

    class Meta:
        db_table = "deposit"

    def __str__(self):
        """Return the string form of a dict summarizing the deposit.

        The status detail is only included for rejected deposits.

        """
        d = {
            "id": self.id,
            "reception_date": self.reception_date,
            "collection": self.collection.name,
            "external_id": self.external_id,
            "client": self.client.username,
            "status": self.status,
        }

        # Plain equality: `in (DEPOSIT_STATUS_REJECTED)` is a substring test
        # against the status string, not a tuple membership test.
        if self.status == DEPOSIT_STATUS_REJECTED:
            d["status_detail"] = self.status_detail

        return str(d)

    @property
    def origin_url(self):
        """Origin url: the client's provider url joined to the external id."""
        return "%s/%s" % (self.client.provider_url.rstrip("/"), self.external_id)
class DepositCollection(models.Model):
    """Collection of deposits, stored in the "deposit_collection" table."""

    id = models.BigAutoField(primary_key=True)
    # Human readable name for the collection type e.g HAL, arXiv, etc...
    name = models.TextField()

    class Meta:
        db_table = "deposit_collection"

    def __str__(self):
        # String form of a dict holding the identifier and the name.
        summary = dict(id=self.id, name=self.name)
        return str(summary)
def parse_xml(raw_content):
    """Parse xml body.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError in case of a malformed xml

    Returns:
        content parsed as dict.

    """
    try:
        return SWHXMLParser().parse(raw_content)
    except ExpatError as e:
        # Chain explicitly so the expat error is recorded as the direct
        # cause of the ParserError.
        raise ParserError(str(e)) from e
% (key,) ) ALLOWED_HOSTS += conf.get("allowed_hosts", []) private_conf = conf["private"] SECRET_KEY = private_conf["secret_key"] # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { "version": 1, "disable_existing_loggers": False, "formatters": { "standard": { "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "INFO", "class": "logging.StreamHandler", "formatter": "standard", }, }, "loggers": { "django": {"handlers": ["console"], "level": "INFO", "propagate": True,}, }, } # database db_conf = private_conf.get("db", {"name": "unset"}) db = { "ENGINE": "django.db.backends.postgresql", "NAME": db_conf["name"], } db_user = db_conf.get("user") if db_user: db["USER"] = db_user db_pass = db_conf.get("password") if db_pass: db["PASSWORD"] = db_pass db_host = db_conf.get("host") if db_host: db["HOST"] = db_host db_port = db_conf.get("port") if db_port: db["PORT"] = db_port # https://docs.djangoproject.com/en/1.10/ref/settings/#databases DATABASES = { "default": db, } # Upload user directory # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT MEDIA_ROOT = private_conf.get("media_root") diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py index 7f626c1d..17e29af7 100644 --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -1,94 +1,93 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import hashlib -import pytest +import os from django.urls import reverse +import pytest +from swh.deposit.api.private.deposit_check import APIChecks from swh.deposit.config import ( - DEPOSIT_STATUS_DEPOSITED, COL_IRI, + DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_VERIFIED, ) from 
swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml -from swh.deposit.api.private.deposit_check import APIChecks -
@pytest.fixture
def datadir(request):
    """Override default datadir to target main test datadir"""
    return os.path.join(os.path.dirname(str(request.fspath)), "../data")


@pytest.fixture
def ready_deposit_ok(partial_deposit_with_metadata):
    """Returns a deposit ready for checks (it will pass the checks).

    The deposit built by ``partial_deposit_with_metadata`` is saved with
    status DEPOSIT_STATUS_DEPOSITED.

    """
    deposit = partial_deposit_with_metadata
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit


@pytest.fixture
def ready_deposit_verified(partial_deposit_with_metadata):
    """Returns a deposit whose status has been set to DEPOSIT_STATUS_VERIFIED.

    The deposit built by ``partial_deposit_with_metadata`` is saved with that
    status before being returned.

    """
    deposit = partial_deposit_with_metadata
    deposit.status = DEPOSIT_STATUS_VERIFIED
    deposit.save()
    return deposit


@pytest.fixture
def ready_deposit_only_metadata(partial_deposit_only_metadata):
    """Deposit in status ready that will fail the checks (because missing
       archive).

    """
    deposit = partial_deposit_only_metadata
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit


@pytest.fixture
def ready_deposit_invalid_archive(authenticated_client, deposit_collection):
    """Returns a deposit whose uploaded "archive" is not a zip file.

    Posts arbitrary bytes declared as ``application/zip`` on the collection
    IRI, then saves the resulting deposit with status
    DEPOSIT_STATUS_DEPOSITED.

    """
    url = reverse(COL_IRI, args=[deposit_collection.name])

    data = b"some data which is clearly not a zip file"
    md5sum = hashlib.md5(data).hexdigest()

    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=data,
        # + headers
        CONTENT_LENGTH=len(data),
        # other headers need the HTTP_ prefix to be taken into account
        HTTP_SLUG="external-id-invalid",
        HTTP_CONTENT_MD5=md5sum,
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )

    # the deposit id is read back from the parsed response body
    response_content = parse_xml(response.content)
    deposit_id = int(response_content["deposit_id"])
    deposit = Deposit.objects.get(pk=deposit_id)
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit


@pytest.fixture
def swh_checks_deposit():
    """Returns a new APIChecks instance."""
    return APIChecks()


diff --git 
a/swh/deposit/tests/api/test_deposit.py b/swh/deposit/tests/api/test_deposit.py index 2e6cce7b..00c38d91 100644 --- a/swh/deposit/tests/api/test_deposit.py +++ b/swh/deposit/tests/api/test_deposit.py @@ -1,195 +1,194 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib +from io import BytesIO from django.urls import reverse -from io import BytesIO from rest_framework import status from swh.deposit.config import ( COL_IRI, - EDIT_SE_IRI, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_REJECTED, + EDIT_SE_IRI, ) - from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml def test_deposit_post_will_fail_with_401(client): """Without authentication, endpoint refuses access with 401 response """ url = reverse(COL_IRI, args=["hal"]) response = client.post(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED def test_access_to_another_user_collection_is_forbidden( authenticated_client, deposit_another_collection, deposit_user ): """Access to another user collection should return a 403 """ coll2 = deposit_another_collection url = reverse(COL_IRI, args=[coll2.name]) response = authenticated_client.post(url) assert response.status_code == status.HTTP_403_FORBIDDEN msg = "Client %s cannot access collection %s" % (deposit_user.username, coll2.name,) assert msg in response.content.decode("utf-8") def test_delete_on_col_iri_not_supported(authenticated_client, deposit_collection): """Delete on col iri should return a 405 response """ url = reverse(COL_IRI, args=[deposit_collection.name]) response = authenticated_client.delete(url) assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED 
assert "DELETE method is not supported on this endpoint" in response.content.decode( "utf-8" ) def create_deposit_with_rejection_status(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_REJECTED def test_act_on_deposit_rejected_is_not_permitted( authenticated_client, deposit_collection, rejected_deposit, atom_dataset ): deposit = rejected_deposit response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[deposit.collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_400_BAD_REQUEST msg = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) assert msg in response.content.decode("utf-8") def test_add_deposit_when_partial_makes_new_deposit( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Posting deposit on collection when previous is partial makes new deposit """ deposit = partial_deposit assert deposit.status == DEPOSIT_STATUS_PARTIAL # adding a new deposit with the same external id response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", 
data=atom_dataset["entry-data0"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] assert deposit_id != deposit.id # new deposit new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None


def test_add_deposit_when_failed_makes_new_deposit_with_no_parent(
    authenticated_client, deposit_collection, failed_deposit, atom_dataset
):
    """Posting deposit on collection when previous one failed to load makes a
       new deposit with no parent

    """
    deposit = failed_deposit
    assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE

    # adding a new deposit with the same external id as a failed deposit
    # creates a new deposit which is not chained to the failed one
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % deposit.external_id,
        HTTP_SLUG=deposit.external_id,
    )

    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]

    assert deposit_id != deposit.id

    # the new deposit is a distinct record without any parent
    new_deposit = Deposit.objects.get(pk=deposit_id)
    assert new_deposit != deposit
    assert new_deposit.parent is None


def test_add_deposit_when_done_makes_new_deposit_with_parent_old_one( authenticated_client, deposit_collection, completed_deposit, atom_dataset ): """Posting deposit on collection when deposit done makes new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id, 
HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.external_id == new_deposit.external_id assert new_deposit != deposit assert new_deposit.parent == deposit diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py index 2de803d7..f551a317 100644 --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -1,326 +1,326 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import pytest +from io import BytesIO from django.urls import reverse -from io import BytesIO +import pytest from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED -from swh.deposit.models import Deposit, DepositRequest, DepositCollection +from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ atom_error_with_decimal = atom_dataset["error-with-decimal"] response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = 
Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.metadata is not None sw_version = dr.metadata.get("codemeta:softwareVersion") assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): """Posting empty body request should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-empty-body"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-badly-formatted"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): """Posting parsing error prone atom should return 400 """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-parsing-error-prone"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_no_slug_header( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry without a slug header should return a 400 """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], # + headers HTTP_IN_PROGRESS="false", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def 
test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): """Posting an atom entry to an unknown collection should return a 404 """ unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = authenticated_client.post( reverse(COL_IRI, args=[unknown_collection]), # <- unknown collection content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["entry-data0"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = 
"urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["codemeta-sample"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_tei( authenticated_client, deposit_collection, atom_dataset ): """Posting initial atom entry as TEI should return 201 with receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["tei-sample"] # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = 
DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_multiple_steps( authenticated_client, deposit_collection, atom_dataset ): """After initial deposit, updating a deposit should return a 201 """ # given external_id = "urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): deposit = Deposit.objects.get(external_id=external_id) # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_IN_PROGRESS="True", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == "partial" # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 1 atom_entry_data = atom_dataset["entry-data-minimal"] % external_id.encode( "utf-8" ) # noqa update_uri = response._headers["location"][1] # when updating the first deposit post response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_IN_PROGRESS="False", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert len(Deposit.objects.all()) == 1 # now 2 associated requests to a same deposit 
deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id") assert len(deposit_requests) == 2 atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1}, {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data}, ] for i, deposit_request in enumerate(deposit_requests): actual_metadata = deposit_request.metadata assert actual_metadata == expected_meta[i]["metadata"] assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"] assert bool(deposit_request.archive) is False diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py index eced5d17..00b4179b 100644 --- a/swh/deposit/tests/api/test_deposit_binary.py +++ b/swh/deposit/tests/api/test_deposit_binary.py @@ -1,567 +1,562 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import pytest +from io import BytesIO from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse -from io import BytesIO - +import pytest from rest_framework import status -from swh.deposit.config import ( - COL_IRI, - EM_IRI, - DEPOSIT_STATUS_DEPOSITED, -) +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, EM_IRI from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml -from swh.deposit.tests.common import create_arborescence_archive, check_archive +from swh.deposit.tests.common import check_archive, create_arborescence_archive def test_post_deposit_binary_no_slug( authenticated_client, deposit_collection, sample_archive ): """Posting a binary deposit without slug header should return 400 """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = 
authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_binary_support( authenticated_client, deposit_collection, sample_archive ): """Binary upload with content-type not in [zip,x-tar] should return 415 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/octet-stream", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_ok( authenticated_client, deposit_collection, sample_archive ): """Binary upload with correct headers should return 201 with receipt """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % 
(sample_archive["name"],), ) # then response_content = parse_xml(BytesIO(response.content)) assert response.status_code == status.HTTP_201_CREATED deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None response_content = parse_xml(BytesIO(response.content)) assert response_content["deposit_archive"] == sample_archive["name"] assert int(response_content["deposit_id"]) == deposit.id assert response_content["deposit_status"] == deposit.status edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit.id]) assert response._headers["location"] == ( "Location", "http://testserver" + edit_se_iri, ) def test_post_deposit_binary_failure_unsupported_packaging_header( authenticated_client, deposit_collection, sample_archive ): """Bin deposit without supported content_disposition header returns 400 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="something-unsupported", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_no_content_disposition_header( authenticated_client, deposit_collection, sample_archive ): """Binary upload without content_disposition header 
should return 400 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_mediation_not_supported( authenticated_client, deposit_collection, sample_archive ): """Binary upload with mediation should return a 412 response """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_ON_BEHALF_OF="someone", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_412_PRECONDITION_FAILED with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_fail_if_upload_size_limit_exceeded( authenticated_client, deposit_collection, sample_archive, tmp_path ): """Binary upload must not exceed the limit set up... 
""" tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) archive = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file", up_to_size=500 ) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=archive["data"], # + headers CONTENT_LENGTH=archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE assert b"Upload size limit exceeded" in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_2_post_2_different_deposits( authenticated_client, deposit_collection, sample_archive ): """2 posting deposits should return 2 different 201 with receipt """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG="some-external-id-1", HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) deposits = Deposit.objects.all() assert len(deposits) == 1 assert deposits[0] == deposit # second post response = authenticated_client.post( url, content_type="application/x-tar", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG="another-external-id", HTTP_CONTENT_MD5=sample_archive["md5sum"], 
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename1", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id2 = response_content["deposit_id"] deposit2 = Deposit.objects.get(pk=deposit_id2) assert deposit != deposit2 deposits = Deposit.objects.all().order_by("id") assert len(deposits) == 2 assert list(deposits), [deposit == deposit2] def test_post_deposit_binary_and_post_to_add_another_archive( authenticated_client, deposit_collection, sample_archive, tmp_path ): """Updating a deposit should return a 201 with receipt """ tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="true", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit assert deposit_request.type == "archive" check_archive(sample_archive["name"], deposit_request.archive.name) # 2nd archive to upload archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # uri to update the content update_uri = reverse(EM_IRI, 
args=[deposit_collection.name, deposit_id]) # adding another archive for the deposit and finalizing it response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"]), ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = list( DepositRequest.objects.filter(deposit=deposit).order_by("id") ) # 2 deposit requests for the same deposit assert len(deposit_requests) == 2 assert deposit_requests[0].deposit == deposit assert deposit_requests[0].type == "archive" check_archive(sample_archive["name"], deposit_requests[0].archive.name) assert deposit_requests[1].deposit == deposit assert deposit_requests[1].type == "archive" check_archive(archive2["name"], deposit_requests[1].archive.name) # only 1 deposit in db deposits = Deposit.objects.all() assert len(deposits) == 1 def test_post_deposit_then_update_refused( authenticated_client, deposit_collection, sample_archive, atom_dataset, tmp_path ): """Updating a deposit with status 'ready' should return a 400 """ tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", 
HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit check_archive("filename0", deposit_request.archive.name) # updating/adding is forbidden # uri to update the content edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit_id]) em_iri = reverse("em_iri", args=[deposit_collection.name, deposit_id]) # Testing all update/add endpoint should fail # since the status is ready archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file 2" ) # replacing file is no longer possible since the deposit's # status is ready r = authenticated_client.put( em_iri, content_type="application/zip", data=archive2["data"], CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert r.status_code == status.HTTP_400_BAD_REQUEST # adding file is no longer possible since the deposit's status # is ready r = authenticated_client.post( em_iri, content_type="application/zip", data=archive2["data"], CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert r.status_code == status.HTTP_400_BAD_REQUEST # replacing metadata is no longer possible since the deposit's # status is ready r = 
authenticated_client.put( edit_se_iri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST # adding new metadata is no longer possible since the # deposit's status is ready r = authenticated_client.post( edit_se_iri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/zip", size=len(archive_content), charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(atom_dataset["entry-data-deposit-binary"].encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(atom_dataset["entry-data-deposit-binary"]), charset="utf-8", ) # replacing multipart metadata is no longer possible since the # deposit's status is ready r = authenticated_client.put( edit_se_iri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert r.status_code == status.HTTP_400_BAD_REQUEST # adding new metadata is no longer possible since the # deposit's status is ready r = authenticated_client.post( edit_se_iri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert r.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_delete.py b/swh/deposit/tests/api/test_deposit_delete.py index 496af061..76959c24 100644 --- a/swh/deposit/tests/api/test_deposit_delete.py +++ b/swh/deposit/tests/api/test_deposit_delete.py @@ -1,123 +1,123 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the 
top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict +from typing import Dict, Mapping + from django.urls import reverse from rest_framework import status -from typing import Dict, Mapping from swh.deposit.config import ( + ARCHIVE_KEY, + DEPOSIT_STATUS_DEPOSITED, EDIT_SE_IRI, EM_IRI, - ARCHIVE_KEY, METADATA_KEY, - DEPOSIT_STATUS_DEPOSITED, ) - from swh.deposit.models import Deposit, DepositRequest def count_deposit_request_types(deposit_requests) -> Mapping[str, int]: deposit_request_types = defaultdict(int) # type: Dict[str, int] for dr in deposit_requests: deposit_request_types[dr.type] += 1 return deposit_request_types def test_delete_archive_on_partial_deposit_works( authenticated_client, partial_deposit_with_metadata, deposit_collection ): """Removing partial deposit's archive should return a 204 response """ deposit_id = partial_deposit_with_metadata.id deposit = Deposit.objects.get(pk=deposit_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) # deposit request type: 'archive', 1 'metadata' deposit_request_types = count_deposit_request_types(deposit_requests) assert deposit_request_types == {ARCHIVE_KEY: 1, METADATA_KEY: 1} # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit_id) deposit_requests2 = DepositRequest.objects.filter(deposit=deposit) deposit_request_types = count_deposit_request_types(deposit_requests2) assert deposit_request_types == {METADATA_KEY: 1} def test_delete_archive_on_undefined_deposit_fails( authenticated_client, deposit_collection, sample_archive ): """Delete undefined deposit returns a 404 response """ # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, 999]) response = 
authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_404_NOT_FOUND def test_delete_non_partial_deposit( authenticated_client, deposit_collection, deposited_deposit ): """Delete !partial status deposit should return a 400 response """ deposit = deposited_deposit assert deposit.status == DEPOSIT_STATUS_DEPOSITED # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_400_BAD_REQUEST deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None def test_delete_partial_deposit( authenticated_client, deposit_collection, partial_deposit ): """Delete deposit should return a 204 response """ # given deposit = partial_deposit # when url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_204_NO_CONTENT deposit_requests = list(DepositRequest.objects.filter(deposit=deposit)) assert deposit_requests == [] deposits = list(Deposit.objects.filter(pk=deposit.id)) assert deposits == [] def test_delete_on_edit_se_iri_cannot_delete_non_partial_deposit( authenticated_client, deposit_collection, complete_deposit ): """Delete !partial deposit should return a 400 response """ # given deposit = complete_deposit # when url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_400_BAD_REQUEST deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None diff --git a/swh/deposit/tests/api/test_deposit_list.py b/swh/deposit/tests/api/test_deposit_list.py index e36d04ef..3de52950 100644 --- a/swh/deposit/tests/api/test_deposit_list.py +++ b/swh/deposit/tests/api/test_deposit_list.py @@ -1,100 +1,100 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status from swh.deposit.api.converters import convert_status_detail from swh.deposit.config import ( + DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, PRIVATE_LIST_DEPOSITS, - DEPOSIT_STATUS_DEPOSITED, ) STATUS_DETAIL = { "url": { "summary": "At least one compatible url field. Failed", "fields": ["testurl"], }, "metadata": [{"summary": "Mandatory fields missing", "fields": ["9", 10, 1.212],},], "archive": [ {"summary": "Invalid archive", "fields": ["3"],}, {"summary": "Unsupported archive", "fields": [2],}, ], } def test_deposit_list(partial_deposit, deposited_deposit, authenticated_client): """Deposit list api should return all deposits in a paginated way """ partial_deposit.status_detail = STATUS_DETAIL partial_deposit.save() deposit_id = partial_deposit.id deposit_id2 = deposited_deposit.id # NOTE: does not work as documented # https://docs.djangoproject.com/en/1.11/ref/urlresolvers/#django.core.urlresolvers.reverse # noqa # url = reverse(PRIVATE_LIST_DEPOSITS, kwargs={'page_size': 1}) main_url = reverse(PRIVATE_LIST_DEPOSITS) url = "%s?page_size=1" % main_url response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 2 # 2 deposits expected_next = f"{main_url}?page=2&page_size=1" assert data["next"].endswith(expected_next) is True assert data["previous"] is None assert len(data["results"]) == 1 # page of size 1 deposit = data["results"][0] assert deposit["id"] == deposit_id assert deposit["status"] == DEPOSIT_STATUS_PARTIAL expected_status_detail = convert_status_detail(STATUS_DETAIL) assert deposit["status_detail"] == expected_status_detail # then 2nd page response2 = authenticated_client.get(expected_next) assert response2.status_code == status.HTTP_200_OK data2 = 
response2.json() assert data2["count"] == 2 # still 2 deposits assert data2["next"] is None expected_previous = f"{main_url}?page_size=1" assert data2["previous"].endswith(expected_previous) is True assert len(data2["results"]) == 1 # page of size 1 deposit2 = data2["results"][0] assert deposit2["id"] == deposit_id2 assert deposit2["status"] == DEPOSIT_STATUS_DEPOSITED def test_deposit_list_exclude(partial_deposit, deposited_deposit, authenticated_client): """Exclusion pattern on external_id should be respected """ partial_deposit.status_detail = STATUS_DETAIL partial_deposit.save() main_url = reverse(PRIVATE_LIST_DEPOSITS) # Testing exclusion pattern exclude_pattern = "external-id" assert partial_deposit.external_id.startswith(exclude_pattern) assert deposited_deposit.external_id.startswith(exclude_pattern) url = f"{main_url}?page_size=1&exclude=external-id" response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 0 url = "%s?page_size=1&exclude=dummy" % main_url # that won't exclude anything response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 2 diff --git a/swh/deposit/tests/api/test_deposit_multipart.py b/swh/deposit/tests/api/test_deposit_multipart.py index bb4f42d7..c9a4a871 100644 --- a/swh/deposit/tests/api/test_deposit_multipart.py +++ b/swh/deposit/tests/api/test_deposit_multipart.py @@ -1,400 +1,401 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from io import BytesIO + from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse -from io import BytesIO from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED 
from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive def test_post_deposit_multipart_without_slug_header_is_bad_request( authenticated_client, deposit_collection, atom_dataset ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/zip", size=len(archive_content), charset=None, ) data_atom_entry = atom_dataset["entry-data-deposit-binary"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_multipart_zip( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """one multipart deposit (zip+xml) should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=sample_archive["length"], charset=None, ) data_atom_entry = atom_dataset["entry-data-deposit-binary"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers 
HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_tar( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """one multipart deposit (tar+xml) should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) # from django.core.files import uploadedfile data_atom_entry = atom_dataset["entry-data-deposit-binary"] archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/x-tar", size=sample_archive["length"], charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED 
response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_put_to_replace_metadata( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """One multipart deposit followed by a metadata update should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data-deposit-binary"] archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=sample_archive["length"], charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="true", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = 
Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry replace_metadata_uri = response._headers["location"][1] response = authenticated_client.put( replace_metadata_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_204_NO_CONTENT # deposit_id did not change deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert ( deposit_request.raw_metadata == atom_dataset["entry-data-deposit-binary"] ) # FAILURE scenarios def test_post_deposit_multipart_only_archive_and_atom_entry( authenticated_client, deposit_collection ): """Multipart deposit only accepts one archive and one atom+xml""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( 
BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/x-tar", size=len(archive_content), charset=None, ) other_archive_content = b"some-other-content" other_archive = InMemoryUploadedFile( BytesIO(other_archive_content), field_name="atom0", name="atom0", content_type="application/x-tar", size=len(other_archive_content), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": other_archive,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "Only 1 application/zip (or application/x-tar) archive" in response.content.decode("utf-8") ) # when archive.seek(0) response = authenticated_client.post( url, format="multipart", data={"archive": archive,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for " "multipart deposit" in response.content.decode("utf-8") ) is True def test_post_deposit_multipart_400_when_badly_formatted_xml( authenticated_client, deposit_collection, sample_archive, atom_dataset ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = sample_archive["data"] archive = InMemoryUploadedFile( BytesIO(archive_content), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=len(archive_content), charset=None, ) data_atom_entry_ko = atom_dataset["entry-data-ko"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry_ko.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry_ko), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": 
atom_entry,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) assert b"Malformed xml metadata" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py index 8982f232..c882f817 100644 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -1,283 +1,282 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse import pytest from rest_framework import status -from swh.deposit.config import ( - DEPOSIT_STATUS_VERIFIED, - PRIVATE_CHECK_DEPOSIT, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_REJECTED, - COL_IRI, -) from swh.deposit.api.private.deposit_check import ( - MANDATORY_ARCHIVE_INVALID, - MANDATORY_FIELDS_MISSING, - MANDATORY_ARCHIVE_UNSUPPORTED, ALTERNATE_FIELDS_MISSING, + MANDATORY_ARCHIVE_INVALID, MANDATORY_ARCHIVE_MISSING, + MANDATORY_ARCHIVE_UNSUPPORTED, + MANDATORY_FIELDS_MISSING, +) +from swh.deposit.config import ( + COL_IRI, + DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_REJECTED, + DEPOSIT_STATUS_VERIFIED, + PRIVATE_CHECK_DEPOSIT, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, create_archive_with_archive, ) - PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), ] @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def 
test_deposit_ok( authenticated_client, deposit_collection, ready_deposit_ok, extension ): """Proper deposit should succeed the checks (-> status ready) """ deposit = ready_deposit_ok for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_invalid_tarball( tmp_path, authenticated_client, deposit_collection, extension ): """Deposit with tarball (of 1 tarball) should fail the checks: rejected """ deposit = create_deposit_archive_with_archive( tmp_path, extension, authenticated_client, deposit_collection.name ) for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED def test_deposit_ko_missing_tarball( authenticated_client, deposit_collection, ready_deposit_only_metadata ): """Deposit without archive should fail the checks: rejected """ deposit = ready_deposit_only_metadata assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) 
== 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_deposit_ko_unsupported_tarball( tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive ): """Deposit with an unsupported tarball should fail the checks: rejected """ deposit = ready_deposit_invalid_archive assert DEPOSIT_STATUS_DEPOSITED == deposit.status for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED # metadata check failure assert len(details["metadata"]) == 2 mandatory = details["metadata"][0] assert mandatory["summary"] == MANDATORY_FIELDS_MISSING assert set(mandatory["fields"]) == set(["author"]) alternate = details["metadata"][1] assert alternate["summary"] == ALTERNATE_FIELDS_MISSING assert alternate["fields"] == ["name or title"] deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_deposit_metadata_ok( authenticated_client, deposit_collection, ready_deposit_ok ): """Proper deposit should succeed the checks (-> status ready) with all **MUST** metadata using the codemeta metadata test set """ deposit = ready_deposit_ok assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert 
deposit.status == DEPOSIT_STATUS_VERIFIED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_metadata_ok(swh_checks_deposit): actual_check, detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "name": "foo", "author": "someone", } ) assert actual_check is True assert detail is None def test_check_metadata_ok2(swh_checks_deposit): actual_check, detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "title": "bar", "author": "someone", } ) assert actual_check is True assert detail is None def test_check_metadata_ko(swh_checks_deposit): """Missing optional field should be caught """ actual_check, error_detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "author": "someone", } ) expected_error = { "metadata": [ { "summary": "Mandatory alternate fields are missing", "fields": ["name or title"], } ] } assert actual_check is False assert error_detail == expected_error def test_check_metadata_ko2(swh_checks_deposit): """Missing mandatory fields should be caught """ actual_check, error_detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "title": "foobar", } ) expected_error = { "metadata": [{"summary": "Mandatory fields are missing", "fields": ["author"],}] } assert actual_check is False assert error_detail == expected_error def create_deposit_archive_with_archive( root_path, archive_extension, client, collection_name ): # we create the holding archive to a given extension archive = create_arborescence_archive( root_path, "archive1", "file1", b"some content in file", extension=archive_extension, ) # now we create an archive holding the first created archive invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) # we deposit it response = client.post( reverse(COL_IRI, args=[collection_name]), 
content_type="application/x-tar", data=invalid_archive["data"], CONTENT_LENGTH=invalid_archive["length"], HTTP_MD5SUM=invalid_archive["md5sum"], HTTP_SLUG="external-id", HTTP_IN_PROGRESS=False, HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_status = response_content["deposit_status"] assert deposit_status == DEPOSIT_STATUS_DEPOSITED deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status return deposit diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py index 1724a2a9..6c265130 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py @@ -1,87 +1,86 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import zipfile from django.urls import reverse from rest_framework import status -from swh.deposit.config import PRIVATE_GET_RAW_CONTENT, EM_IRI +from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT from swh.deposit.tests.common import create_arborescence_archive - PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]), reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id]), ] def test_access_to_existing_deposit_with_one_archive( authenticated_client, deposit_collection, complete_deposit, sample_archive ): """Access to deposit should stream a 200 
response with its raw content """ deposit = complete_deposit for url in private_get_raw_url_endpoints(deposit_collection, deposit): r = authenticated_client.get(url) assert r.status_code == status.HTTP_200_OK assert r._headers["content-type"][1] == "application/zip" # read the stream data = b"".join(r.streaming_content) # extract the file from the zip zfile = zipfile.ZipFile(io.BytesIO(data)) assert zfile.namelist() == ["file1"] assert zfile.open("file1").read() == b"some content in file" def test_access_to_existing_deposit_with_multiple_archives( tmp_path, authenticated_client, deposit_collection, partial_deposit, sample_archive ): """Access to deposit should stream a 200 response with its raw contents """ deposit = partial_deposit archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # Add a second archive to deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=deposit.external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED for url in private_get_raw_url_endpoints(deposit_collection, deposit): r = authenticated_client.get(url) assert r.status_code == status.HTTP_200_OK assert r._headers["content-type"][1] == "application/zip" # read the stream data = b"".join(r.streaming_content) # extract the file from the zip zfile = zipfile.ZipFile(io.BytesIO(data)) assert set(zfile.namelist()) == {"file1", "file2"} assert zfile.open("file1").read() == b"some content in file" assert zfile.open("file2").read() == b"some other content in file" diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py 
b/swh/deposit/tests/api/test_deposit_private_read_metadata.py index 475ab1b8..b04fb3dd 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -1,551 +1,550 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status +from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit -from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON, EDIT_SE_IRI - PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" deposit_id = deposit if isinstance(deposit, int) else deposit.id return [ reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]), reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]), ] def update_deposit(authenticated_client, collection, deposit, atom_dataset): for atom_data in ["entry-data2", "entry-data3"]: update_deposit_with_metadata( authenticated_client, collection, deposit, atom_dataset[atom_data] ) return deposit def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata): # update deposit's metadata response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=metadata, HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED return deposit def test_read_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private metadata read api to existing deposit should return metadata 
""" deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": "0.0.1", }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_revision_with_parent( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private read metadata to a deposit (with parent) returns metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa" swh_id = "swh:1:rev:%s" % rev_id fake_parent = Deposit( swh_id=swh_id, 
client=deposit.client, collection=deposit.collection ) fake_parent.save() deposit.parent = fake_parent deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": "0.0.1", }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], }, } assert data == expected_meta def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """date(Created|Published) provided, uses author/committer date """ deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) # add metadata to the deposit with datePublished and dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 """ ) update_deposit_with_metadata( 
authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": ["http://www.w3.org/2005/Atom"], "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": [ "some awesome author", "another one", "no one", {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, ], "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2017-10-07T15:17:08Z", "2015-04-06T17:08:47+02:00", ], "codemeta:datePublished": "2017-05-03T16:08:47+02:00", "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ {"codemeta:name": "GNU General Public License v3.0 only"}, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": ["some-external-id", "hal-01243065"], "id": "hal-01243065", "title": "Composing a Web of Audio " "Applications", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", } expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "origin_metadata": { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": "0.0.1", }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { 
"negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_4( authenticated_client, deposit_collection, atom_dataset, partial_deposit ): """dateCreated/datePublished not provided, revision uses complete_date """ deposit = partial_deposit codemeta_entry_data = atom_dataset["metadata"] % "" deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) # will use the deposit completed date as fallback date deposit.complete_date = "2016-04-06" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"}, "codemeta:description": "this is the " "description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio " 
"Applications", } expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id), } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": "0.0.1", }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_read_metadata_5( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """dateCreated/datePublished provided, revision uses author/committer date If multiple dateCreated provided, the first occurrence (of dateCreated) is selected. If multiple datePublished provided, the first occurrence (of datePublished) is selected. 
""" deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", } metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"}, "codemeta:dateCreated": [ "2015-04-06T17:08:47+02:00", "2016-04-06T17:08:47+02:00", ], "codemeta:datePublished": [ "2017-05-03T16:08:47+02:00", "2018-05-03T16:08:47+02:00", ], "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio " "Applications", } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": 
"https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": "0.0.1", }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1428332927}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_access_to_nonexisting_deposit_returns_404_response( authenticated_client, deposit_collection, ): """Read unknown collection should return a 404 response """ unknown_id = 999 try: Deposit.objects.get(pk=unknown_id) except Deposit.DoesNotExist: assert True for url in private_get_raw_url_endpoints(deposit_collection, unknown_id): response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND msg = "Deposit with id %s does not exist" % unknown_id assert msg in response.content.decode("utf-8") diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py index fce57539..f93801de 100644 --- a/swh/deposit/tests/api/test_deposit_private_update_status.py +++ b/swh/deposit/tests/api/test_deposit_private_update_status.py @@ -1,194 +1,191 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json from django.urls import reverse from rest_framework import status -from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT - from 
swh.deposit.api.private.deposit_update_status import MANDATORY_KEYS - -from swh.deposit.models import Deposit from swh.deposit.config import ( - PRIVATE_PUT_DEPOSIT, - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, + PRIVATE_PUT_DEPOSIT, ) - +from swh.deposit.models import Deposit +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_PUT_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]), ] def test_update_deposit_status_success_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with load success should require all information to succeed """ deposit = ready_deposit_verified expected_status = DEPOSIT_STATUS_LOAD_SUCCESS origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" full_body_info = { "status": DEPOSIT_STATUS_LOAD_SUCCESS, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): dir_id = swhid(DIRECTORY, directory_id) rev_id = swhid(REVISION, revision_id) snp_id = swhid(SNAPSHOT, snapshot_id) expected_swh_id = "swh:1:dir:%s" % directory_id expected_swh_id_context = ( f"{dir_id};origin={origin_url};" + f"visit={snp_id};anchor={rev_id};path=/" ) response = authenticated_client.put( url, content_type="application/json", data=json.dumps(full_body_info), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == expected_status assert deposit.swh_id == expected_swh_id assert 
deposit.swh_id_context == expected_swh_id_context # Reset deposit deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_rejected_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with rejected status needs few information to succeed """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_FAILURE}), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE assert deposit.swh_id is None assert deposit.swh_id_context is None # Reset status deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_success_with_incomplete_data( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit status with status success and incomplete information should fail """ deposit = ready_deposit_verified origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" new_status = DEPOSIT_STATUS_LOAD_SUCCESS full_body_info = { "status": new_status, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): for key in MANDATORY_KEYS: # Crafting body with missing information so that it raises body = copy.deepcopy(full_body_info) body.pop(key) # make the body incomplete response = authenticated_client.put( url, content_type="application/json", data=json.dumps(body), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( f"deposit status to {new_status} requires information {key}" in response.content.decode("utf-8") ) def 
test_update_deposit_status_will_fail_with_unknown_status( authenticated_client, deposit_collection, ready_deposit_verified ): """Unknown status for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": "unknown"}) ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_will_fail_with_no_status_key( authenticated_client, deposit_collection, ready_deposit_verified ): """No status provided for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"something": "something"}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_success_without_swh_id_fail( authenticated_client, deposit_collection, ready_deposit_verified ): """Providing successful status without swh_id should return a 400 """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_SUCCESS}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_schedule.py b/swh/deposit/tests/api/test_deposit_schedule.py index 21091cb2..4218797e 100644 --- a/swh/deposit/tests/api/test_deposit_schedule.py +++ b/swh/deposit/tests/api/test_deposit_schedule.py @@ -1,85 +1,81 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import datetime - from io import BytesIO 
from django.urls import reverse import pytest from rest_framework import status -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, -) +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.parsers import parse_xml @pytest.fixture() def deposit_config(deposit_config): """Overrides the `deposit_config` fixture define in swh/deposit/tests/conftest.py to re-enable the checks.""" config_d = copy.deepcopy(deposit_config) config_d["checks"] = True return config_d def now() -> datetime.datetime: return datetime.datetime.now(tz=datetime.timezone.utc) def test_add_deposit_schedules_check( authenticated_client, deposit_collection, sample_archive, swh_scheduler ): """Posting deposit on collection creates a checker task """ external_id = "external-id-schedules-check" url = reverse(COL_IRI, args=[deposit_collection.name]) timestamp_before_call = now() response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) timestamp_after_call = now() assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_DEPOSITED deposit_id = response_content["deposit_id"] tasks = swh_scheduler.grab_ready_tasks("check-deposit") assert len(tasks) == 1 task = tasks[0] assert timestamp_before_call <= task.pop("next_run") <= timestamp_after_call assert task == { "arguments": { "args": [], "kwargs": {"collection": "test", "deposit_id": int(deposit_id),}, }, "current_interval": datetime.timedelta(days=1), "id": 1, "policy": "oneshot", "priority": None, "retries_left": 3, "status": 
"next_run_scheduled", "type": "check-deposit", } diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py index 4b03f7c7..c8f5f89e 100644 --- a/swh/deposit/tests/api/test_deposit_status.py +++ b/swh/deposit/tests/api/test_deposit_status.py @@ -1,120 +1,121 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from django.urls import reverse from io import BytesIO + +from django.urls import reverse from rest_framework import status from swh.deposit.config import ( - STATE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, + STATE_IRI, ) from swh.deposit.models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.parsers import parse_xml def test_post_deposit_with_status_check(authenticated_client, deposited_deposit): """Successful but not loaded deposit should have a status 'deposited' """ deposit = deposited_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # check status status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_200_OK r = parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert r["deposit_status"] == DEPOSIT_STATUS_DEPOSITED assert r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED] assert r["deposit_external_id"] == deposit.external_id def test_status_unknown_deposit(authenticated_client, deposit_collection): """Unknown deposit status should return 404 response """ unknown_deposit_id = 999 status_url = reverse(STATE_IRI, args=[deposit_collection.name, unknown_deposit_id]) status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_404_NOT_FOUND def 
test_status_unknown_collection(authenticated_client, deposited_deposit): """Unknown collection status should return 404 response""" deposit = deposited_deposit unknown_collection = "something-unknown" status_url = reverse(STATE_IRI, args=[unknown_collection, deposit.id]) status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_404_NOT_FOUND def test_status_deposit_rejected(authenticated_client, rejected_deposit): """Rejected deposit status should be 'rejected' with detailed summary """ deposit = rejected_deposit # _status_detail = {'url': {'summary': 'Wrong url'}} url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert status_response.status_code == status.HTTP_200_OK r = parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert r["deposit_status"] == DEPOSIT_STATUS_REJECTED assert r["deposit_status_detail"] == "Deposit failed the checks" if deposit.swh_id: assert r["deposit_swh_id"] == deposit.swh_id def test_status_with_http_accept_header_should_not_break( authenticated_client, partial_deposit ): """Asking deposit status with Accept header should return 200 """ deposit = partial_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) response = authenticated_client.get(status_url) assert response.status_code == status.HTTP_200_OK response = authenticated_client.get( status_url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8" ) assert response.status_code == status.HTTP_200_OK def test_status_complete_deposit(authenticated_client, complete_deposit): """Successful and loaded deposit should be 'done' and have detailed swh ids """ deposit = complete_deposit url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert status_response.status_code == status.HTTP_200_OK r = 
parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert r["deposit_status"] == DEPOSIT_STATUS_LOAD_SUCCESS assert ( r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS] ) assert deposit.swh_id is not None assert r["deposit_swh_id"] == deposit.swh_id assert deposit.swh_id_context is not None assert r["deposit_swh_id_context"] == deposit.swh_id_context diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py index 43b268cd..0b173c4f 100644 --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,395 +1,394 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status -from swh.deposit.models import Deposit, DepositRequest, DepositCollection from swh.deposit.config import EDIT_SE_IRI, EM_IRI +from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml - -from swh.deposit.tests.common import create_arborescence_archive, check_archive +from swh.deposit.tests.common import check_archive, create_arborescence_archive def test_replace_archive_to_deposit_is_possible( tmp_path, partial_deposit, deposit_collection, authenticated_client, sample_archive, atom_dataset, ): """Replace all archive with another one should return a 204 response """ tmp_path = str(tmp_path) # given deposit = partial_deposit requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(sample_archive["name"], requests[0].archive.name) # we have no metadata for that deposit requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 0 
response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.put( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_204_NO_CONTENT requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(archive2["name"], requests[0].archive.name) # check we did not touch the other parts requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 def test_replace_metadata_to_deposit_is_possible( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, ): """Replace all metadata with another one should return a 204 response """ # given deposit = partial_deposit_with_metadata raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = 
reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_204_NO_CONTENT requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, deposit_collection, partial_deposit_with_metadata, sample_archive, ): """Add another archive to a deposit return a 201 response """ tmp_path = str(tmp_path) deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests) == 1 check_archive(sample_archive["name"], requests[0].archive.name) requests_meta0 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta0) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED requests = 
DepositRequest.objects.filter(deposit=deposit, type="archive").order_by( "id" ) assert len(requests) == 2 # first archive still exists check_archive(sample_archive["name"], requests[0].archive.name) # a new one was added check_archive(archive2["name"], requests[1].archive.name) # check we did not touch the other parts requests_meta1 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta1) == 1 assert set(requests_meta0) == set(requests_meta1) def test_add_metadata_to_deposit_is_possible( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, ): """Add metadata with another one should return a 204 response """ deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) atom_entry = atom_dataset["entry-data1"] response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 2 expected_raw_meta0 = atom_dataset["entry-data0"] % ( deposit.external_id.encode("utf-8") ) # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_unknown_deposit( deposit_collection, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 1000 try: 
Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_add_metadata_to_unknown_collection( partial_deposit, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ deposit = partial_deposit unknown_collection_name = "unknown-collection" try: DepositCollection.objects.get(name=unknown_collection_name) except DepositCollection.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[unknown_collection_name, deposit.id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_replace_metadata_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 998 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_add_archive_to_unknown_deposit( authenticated_client, 
deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 997 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_replace_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Replacing archive to unknown deposit should return a 404 response """ unknown_deposit_id = 996 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_post_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (POST) archive with wrong content type should return 400 """ deposit = partial_deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/x-gtar-compressed", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] def 
test_put_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (PUT) archive with wrong content type should return 400 """ # given deposit = partial_deposit # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] diff --git a/swh/deposit/tests/api/test_exception.py b/swh/deposit/tests/api/test_exception.py index 0d71926b..a606397f 100644 --- a/swh/deposit/tests/api/test_exception.py +++ b/swh/deposit/tests/api/test_exception.py @@ -1,53 +1,52 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.deposit.exception import custom_exception_handler - +from django.db.utils import OperationalError from rest_framework.exceptions import APIException from rest_framework.response import Response -from django.db.utils import OperationalError +from swh.deposit.exception import custom_exception_handler def test_custom_exception_handler_operational_error(mocker): """Operation error are translated to service unavailable """ fake_exception = OperationalError("Fake internal error", 503) response = custom_exception_handler(fake_exception, {}) assert response is not None assert response.status_code == 503 status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." 
assert ( response.content.decode("utf-8") == f""" {status} {detail} """ ) def test_custom_exception_handler_default_behavior_maintained(mocker): """Other internal errors are transmitted as is """ fake_exception = APIException("Fake internal error", 500) fake_response = Response( exception=fake_exception, status=fake_exception.status_code ) mock_exception_handler = mocker.patch("swh.deposit.exception.exception_handler") mock_exception_handler.return_value = fake_response response = custom_exception_handler(fake_exception, {}) assert response is not None assert response == fake_response diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py index b1cc9119..0adea4f5 100644 --- a/swh/deposit/tests/api/test_parser.py +++ b/swh/deposit/tests/api/test_parser.py @@ -1,134 +1,133 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import io - from collections import OrderedDict +import io from swh.deposit.parsers import SWHXMLParser def test_parsing_without_duplicates(): xml_no_duplicate = io.BytesIO( b""" Awesome Compiler GPL3.0 https://opensource.org/licenses/GPL-3.0 Python3 author1 Inria ocaml http://issuetracker.com """ ) actual_result = SWHXMLParser().parse(xml_no_duplicate) expected_dict = OrderedDict( [ ("@xmlns", "http://www.w3.org/2005/Atom"), ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), ("title", "Awesome Compiler"), ( "codemeta:license", OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), ), ("codemeta:runtimePlatform", "Python3"), ( "codemeta:author", OrderedDict( [("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")] ), ), ("codemeta:programmingLanguage", "ocaml"), ("codemeta:issueTracker", "http://issuetracker.com"), ] ) assert 
expected_dict == actual_result def test_parsing_with_duplicates(): xml_with_duplicates = io.BytesIO( b""" Another Compiler GNU/Linux GPL3.0 https://opensource.org/licenses/GPL-3.0 Un*x author1 Inria author2 Inria ocaml haskell spdx http://spdx.org python3 """ ) actual_result = SWHXMLParser().parse(xml_with_duplicates) expected_dict = OrderedDict( [ ("@xmlns", "http://www.w3.org/2005/Atom"), ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), ("title", "Another Compiler"), ("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]), ( "codemeta:license", [ OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), OrderedDict( [("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")] ), ], ), ( "codemeta:author", [ OrderedDict( [ ("codemeta:name", "author1"), ("codemeta:affiliation", "Inria"), ] ), OrderedDict( [ ("codemeta:name", "author2"), ("codemeta:affiliation", "Inria"), ] ), ], ), ("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]), ] ) assert expected_dict == actual_result diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index d3b0b9ea..2e793fa4 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,463 +1,463 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import logging import os import re from unittest.mock import MagicMock from click.testing import CliRunner import pytest -from swh.deposit.client import PublicApiDepositClient, MaintenanceError -from swh.deposit.cli.client import generate_slug, _url, _client, _collection, InputError from swh.deposit.cli import deposit as cli -from ..conftest import TEST_USER +from swh.deposit.cli.client import InputError, _client, 
_collection, _url, generate_slug +from swh.deposit.client import MaintenanceError, PublicApiDepositClient +from ..conftest import TEST_USER EXAMPLE_SERVICE_DOCUMENT = { "service": {"workspace": {"collection": {"sword:name": "softcol",}}} } @pytest.fixture def datadir(request): """Override default datadir to target main test datadir""" return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def slug(): return generate_slug() @pytest.fixture def client_mock(mocker, slug): """A successful deposit client with hard-coded default values """ mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT mock_client.deposit_create.return_value = '{"foo": "bar"}' return mock_client @pytest.fixture def client_mock_api_down(mocker, slug): """A mock client whose connection with api fails due to maintenance issue """ mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.side_effect = MaintenanceError( "Database backend maintenance: Temporarily unavailable, try again later." 
) return mock_client def test_url(): assert _url("http://deposit") == "http://deposit/1" assert _url("https://other/1") == "https://other/1" def test_client(): client = _client("http://deposit", "user", "pass") assert isinstance(client, PublicApiDepositClient) def test_collection_error(): mock_client = MagicMock() mock_client.service_document.return_value = {"error": "something went wrong"} with pytest.raises(InputError) as e: _collection(mock_client) assert "Service document retrieval: something went wrong" == str(e.value) def test_collection_ok(): mock_client = MagicMock() mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT collection_name = _collection(mock_client) assert collection_name == "softcol" def test_collection_ko_because_downtime(): mock_client = MagicMock() mock_client.service_document.side_effect = MaintenanceError("downtime") with pytest.raises(MaintenanceError, match="downtime"): _collection(mock_client) def test_deposit_with_server_down_for_maintenance( sample_archive, mocker, caplog, client_mock_api_down, slug, tmp_path ): """ Deposit failure due to maintenance down time should be explicit """ runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert caplog.record_tuples == [ ( "swh.deposit.cli.client", logging.ERROR, "Database backend maintenance: Temporarily unavailable, try again later.", ) ] client_mock_api_down.service_document.assert_called_once_with() def test_single_minimal_deposit( sample_archive, mocker, caplog, client_mock, slug, tmp_path ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( 
"tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"foo": "bar"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection="softcol", in_progress=False, metadata=metadata_path, slug=slug, ) with open(metadata_path) as fd: assert ( fd.read() == f"""\ \ttest-project \t{slug} \t \t\tJane Doe \t """ ) def test_metadata_validation(sample_archive, mocker, caplog, tmp_path): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa slug = generate_slug() mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT mock_client.deposit_create.return_value = '{"foo": "bar"}' metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) with open(metadata_path, "a"): pass # creates the file runner = CliRunner() # Test missing author result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert " --author " in message # Clear mocking state 
caplog.clear() mock_client.reset_mock() # Test missing name result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert " --name " in message # Clear mocking state caplog.clear() mock_client.reset_mock() # Test both --metadata and --author result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--metadata", metadata_path, "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert re.search("--metadata.*is incompatible with", message) # Clear mocking state caplog.clear() mock_client.reset_mock() def test_single_deposit_slug_generation( sample_archive, mocker, caplog, tmp_path, client_mock ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa slug = "my-slug" collection = "my-collection" metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--slug", slug, "--collection", collection, "--author", "Jane Doe", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", 
logging.INFO, '{"foo": "bar"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection=collection, in_progress=False, metadata=metadata_path, slug=slug, ) with open(metadata_path) as fd: assert ( fd.read() == """\ \ttest-project \tmy-slug \t \t\tJane Doe \t """ ) def test_multisteps_deposit( sample_archive, atom_dataset, mocker, caplog, datadir, client_mock, slug ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit """ # noqa slug = generate_slug() mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#create-an-incomplete-deposit client_mock.deposit_create.return_value = '{"deposit_id": "42"}' runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--partial", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection="softcol", in_progress=True, metadata=None, slug=slug, ) # Clear mocking state caplog.clear() client_mock.reset_mock() # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'), ] client_mock.deposit_create.assert_called_once_with( 
archive=None, collection="softcol", in_progress=False, metadata=metadata_path, slug=slug, ) # Clear mocking state caplog.clear() client_mock.reset_mock() diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 8599a4c1..faa9b08a 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,425 +1,423 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import base64 -import pytest -import psycopg2 - -import yaml +import os +from typing import Mapping -from django.urls import reverse from django.test.utils import setup_databases # type: ignore - -# mypy is asked to ignore the import statement above because setup_databases -# is not part of the d.t.utils.__all__ variable. - +from django.urls import reverse +import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT +import pytest from rest_framework import status from rest_framework.test import APIClient -from typing import Mapping +import yaml -from swh.scheduler import get_scheduler -from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT -from swh.deposit.config import setup_django_for -from swh.deposit.parsers import parse_xml from swh.deposit.config import ( COL_IRI, - EDIT_SE_IRI, DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, - DEPOSIT_STATUS_LOAD_FAILURE, + EDIT_SE_IRI, + setup_django_for, ) +from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid +from swh.scheduler import get_scheduler + +# mypy is asked to ignore the import statement 
above because setup_databases +# is not part of the d.t.utils.__all__ variable. TEST_USER = { "username": "test", "password": "password", "email": "test@example.org", "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": "test"}, } def pytest_configure(): setup_django_for("testing") @pytest.fixture() def deposit_config(swh_scheduler_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, "provider": { "provider_name": "", "provider_type": "deposit_client", "provider_url": "", "metadata": {}, }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, }, "scheduler": {"cls": "local", "args": swh_scheduler_config,}, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path, swh_scheduler_config): """Enforce config for deposit classes inherited from APIConfig.""" scheduler = get_scheduler("local", swh_scheduler_config) task_type = { "type": "load-deposit", "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", "description": "Load deposit task", } scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with 
psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory() deposit_another_collection = deposit_collection_factory("another-collection") @pytest.fixture def deposit_user(db, deposit_collection): """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient try: user = DepositClient._default_manager.get(username=TEST_USER["username"]) except DepositClient.DoesNotExist: user = DepositClient._default_manager.create_user( username=TEST_USER["username"], email=TEST_USER["email"], password=TEST_USER["password"], provider_url=TEST_USER["provider_url"], domain=TEST_USER["domain"], ) user.collections = [deposit_collection.id] user.save() return user @pytest.fixture def client(): """Override pytest-django one which does not work for djangorestframework. 
""" return APIClient() # <- drf's client @pytest.yield_fixture def authenticated_client(client, deposit_user): """Returned a logged client """ _token = "%s:%s" % (deposit_user.username, TEST_USER["password"]) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) yield client client.logout() @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def create_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) # then assert response.status_code == status.HTTP_201_CREATED from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(external_id=external_id) if deposit.status != deposit_status: deposit.status = deposit_status 
deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. """ deposit = create_deposit( authenticated_client, collection_name, sample_archive, external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL, ) response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"), HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture 
@pytest.fixture
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Build a deposit whose loading succeeded, with its swh identifiers set.

    The deposit is created with the status 'load success', then its
    `swh_id` and `swh_id_context` are filled in and saved.

    """
    origin_url = "https://hal.archives-ouvertes.fr/hal-01727745"
    dir_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
    rev_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10"
    snp_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0"

    loaded = create_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-complete",
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )

    loaded.swh_id = swhid(DIRECTORY, dir_id)

    # Context qualifiers attached to the directory identifier
    context = {
        "origin": origin_url,
        "visit": swhid(SNAPSHOT, snp_id),
        "anchor": swhid(REVISION, rev_id),
        "path": "/",
    }
    loaded.swh_id_context = swhid(DIRECTORY, dir_id, metadata=context)

    loaded.save()
    return loaded
class SWHDepositTestClient(PrivateApiDepositClient):
    """Private api client for tests, which sends its requests through the
       client it is given instead of the default request client.

    """

    def __init__(self, client, config):
        super().__init__(config=config)
        self.client = client

    def archive_get(self, archive_update_url, archive_path, log=None):
        """Stream the archive served at `archive_update_url` into `archive_path`
           and return that path.

        """
        response = self.client.get(archive_update_url)
        with open(archive_path, "wb") as archive:
            for piece in response.streaming_content:
                archive.write(piece)

        return archive_path

    def metadata_get(self, metadata_url, log=None):
        """Fetch `metadata_url` and return its json-decoded body.

        """
        response = self.client.get(metadata_url)
        raw_body = response.content.decode("utf-8")
        return json.loads(raw_body)

    def status_update(
        self,
        update_status_url,
        status,
        revision_id=None,
        directory_id=None,
        origin_url=None,
    ):
        """Put the new status on `update_status_url` as a json payload.

        Only the optional fields with a truthy value are sent along.

        """
        optional_fields = (
            ("revision_id", revision_id),
            ("directory_id", directory_id),
            ("origin_url", origin_url),
        )
        payload = {"status": status}
        payload.update({name: value for name, value in optional_fields if value})

        self.client.put(
            update_status_url, content_type="application/json", data=json.dumps(payload)
        )

    def check(self, check_url):
        """Fetch `check_url` and return the "status" entry of its json body.

        """
        response = self.client.get(check_url)
        return json.loads(response.content.decode("utf-8"))["status"]
def check_snapshot(expected_snapshot, storage):
    """Check for snapshot match.

    Provide the hashes as hexadecimal, the conversion is done
    within the method.

    Args:
        expected_snapshot (dict): full snapshot with hex ids
        storage (Storage): storage in which the snapshot is looked up

    Raises:
        AssertionError: when the snapshot is not found, or when its branches
            differ from the expected ones

    """
    expected_snapshot_id = expected_snapshot["id"]
    expected_branches = expected_snapshot["branches"]
    # The storage to read from must be provided alongside the snapshot id
    snap = snapshot_get_all_branches(storage, hash_to_bytes(expected_snapshot_id))
    if snap is None:
        # display known snapshots instead if possible
        if hasattr(storage, "_snapshots"):  # in-mem storage
            from pprint import pprint

            for snap_id, (_snap, _) in storage._snapshots.items():
                snapd = _snap.to_dict()
                snapd["id"] = hash_to_hex(snapd["id"])
                # decode_target reads `.target_type`/`.target` attributes, so
                # it is fed the branch objects, not their dict form.
                # NOTE(review): assumes the stored snapshot objects expose a
                # `branches` mapping - confirm against the in-memory storage.
                snapd["branches"] = {
                    name.decode("utf-8"): decode_target(target)
                    for name, target in _snap.branches.items()
                }
                pprint(snapd)
        raise AssertionError("Snapshot is not found")

    # Keys are the decoded branch names; values are the decoded targets
    branches = {
        branch_name.decode("utf-8"): decode_target(branch)
        for branch_name, branch in snap["branches"].items()
    }
    assert expected_branches == branches
@pytest.fixture
def requests_mock_datadir(datadir, requests_mock_datadir):
    """Extend the overridden fixture so that put requests on https urls are
       answered too, through `get_response_cb` bound to `datadir`.

    """
    https_urls = re.compile("https://")
    respond_from_datadir = partial(get_response_cb, datadir=datadir)
    requests_mock_datadir.put(https_urls, body=respond_from_datadir)
    return requests_mock_datadir
def read_served_path(
    datadir,
    base_url: str,
    api_url: str,
    convert_fn: Optional[Callable[[str], Any]] = None,
) -> Any:
    """Read the content of the file served for `api_url`.

    Args:
        datadir: root directory of the served files
        base_url: base url of the api, used to locate the file
        api_url: api endpoint whose served file is read
        convert_fn: optional conversion applied to the utf-8 decoded content

    Returns:
        The raw content (bytes) when no `convert_fn` is provided, otherwise
        whatever `convert_fn` returns for the decoded content

    """
    archive_path = build_expected_path(datadir, base_url, api_url)
    with open(archive_path, "rb") as f:
        content = f.read()
    if convert_fn:
        # convert_fn works on text, hence the decoding of the raw bytes
        content = convert_fn(content.decode("utf-8"))
    return content
def test_metadata_get(datadir, requests_mock_datadir):
    """Reading metadata through private api should return the decoded json

    """
    api_url = "/1/private/test/1/metadata"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    actual_metadata = client.metadata_get(api_url)

    # The payload must have been decoded, not returned as its raw text
    assert not isinstance(actual_metadata, str)
    expected_content = read_served_path(
        datadir, client.base_url, api_url, convert_fn=json.loads
    )
    assert actual_metadata == expected_content
class FakeRequestClientPut:
    """Stand-in for a request client which only records how `put` got called.

    """

    # Arguments of the last `put` call; None as long as `put` was not called
    args = None
    kwargs = None

    def put(self, *positional, **named):
        """Remember the positional and keyword arguments of this call.

        """
        self.args, self.kwargs = positional, named
def test_merge_raise():
    """utils.merge should refuse any argument which is not a dict

    """
    valid = {"author": "someone", "a": 1}
    invalid = ["not a dict"]

    # The invalid argument is rejected whatever its position
    for arguments in ((valid, invalid), (invalid, valid), (invalid,)):
        with pytest.raises(ValueError):
            utils.merge(*arguments)

    assert utils.merge(valid) == valid
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_1(mock_normalize):
    """A date provided in a reasonable format is parsed without trouble

    Note: We do not test swh.model.identifiers which is already tested
    in swh.model

    """
    normalized = utils.normalize_date("2018-06-11 17:02:02")

    assert str(normalized) == "2018-06-11 17:02:02+00:00"
def merge(*dicts):
    """Given an iterator of dicts, merge them losing no information.

    Values found under the same key are gathered in a list, without
    repetition, except when both values are dicts, in which case they are
    merged recursively. A key whose current value is missing or falsy simply
    takes the next value as is. The input dicts are never modified.

    Args:
        *dicts: arguments are all supposed to be dict to merge into one

    Returns:
        dict merged without losing information

    Raises:
        ValueError: if one of the arguments is not a dict

    """

    def _extend(existing_val, value):
        """Given an existing value and a value (as potential lists), merge
           them together without repetition.

        A new list is returned: `existing_val` may be a list owned by the
        caller, or a map/generator which has no `append` at all.

        """
        merged = list(existing_val)
        if isinstance(value, (list, map, GeneratorType)):
            vals = value
        else:
            vals = [value]
        for v in vals:
            if v in merged:
                continue
            merged.append(v)
        return merged

    d = {}
    for data in dicts:
        if not isinstance(data, dict):
            raise ValueError("dicts is supposed to be a variable arguments of dict")

        for key, value in data.items():
            existing_val = d.get(key)
            if not existing_val:
                d[key] = value
                continue
            if isinstance(existing_val, (list, map, GeneratorType)):
                new_val = _extend(existing_val, value)
            elif isinstance(existing_val, dict) and isinstance(value, dict):
                new_val = merge(existing_val, value)
            else:
                # scalar, or dict facing a non-dict: gather both in a list
                new_val = _extend([existing_val], value)
            d[key] = new_val
    return d


def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list, elect arbitrarily the first element of that
    list

    If date is (then) a string, parse it through
    iso8601.parse_date to extract a datetime.

    Then normalize it through
    swh.model.identifiers.normalize_timestamp.

    Returns
        The swh date object

    """
    if isinstance(date, list):
        date = date[0]
    if isinstance(date, str):
        date = iso8601.parse_date(date)

    return normalize_timestamp(date)