diff --git a/requirements-swh-server.txt b/requirements-swh-server.txt index d5f32737..489c96bf 100644 --- a/requirements-swh-server.txt +++ b/requirements-swh-server.txt @@ -1,4 +1,5 @@ swh.core[http] >= 0.4 swh.loader.core >= 0.0.71 swh.scheduler >= 0.7.0 swh.model >= 0.3.8 +swh.auth[django] >= 0.3.3 diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index 52a64b16..e10ac098 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,1263 +1,1269 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union import uuid import attr from django.core.files.uploadedfile import UploadedFile from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.template.loader import render_to_string from django.urls import reverse from django.utils import timezone from rest_framework import status -from rest_framework.authentication import BaseAuthentication, BasicAuthentication +from rest_framework.authentication import BaseAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView from swh.deposit.api.checks import check_metadata from swh.deposit.api.converters import convert_status_detail +from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.models import Deposit from swh.deposit.utils import compute_metadata_context from swh.model import hashutil from swh.model.identifiers import ( ExtendedObjectType, ExtendedSWHID, QualifiedSWHID, ValidationError, ) from swh.model.model import ( MetadataAuthority, 
MetadataAuthorityType, Origin, RawExtrinsicMetadata, ) from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_IRI, EM_IRI, METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, SE_IRI, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, NOT_FOUND, PARSING_ERROR, DepositError, ParserError, ) from ..models import DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml from ..utils import extended_swhid_from_qualified, parse_swh_reference ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] @attr.s class ParsedRequestHeaders: content_type = attr.ib(type=str) content_length = attr.ib(type=Optional[int]) in_progress = attr.ib(type=bool) content_disposition = attr.ib(type=Optional[str]) content_md5sum = attr.ib(type=Optional[bytes]) packaging = attr.ib(type=Optional[str]) slug = attr.ib(type=Optional[str]) on_behalf_of = attr.ib(type=Optional[str]) metadata_relevant = attr.ib(type=Optional[str]) swhid = attr.ib(type=Optional[str]) @attr.s class Receipt: """Data computed while handling the request body that will be served in the Deposit Receipt.""" deposit_id = attr.ib(type=int) deposit_date = attr.ib(type=datetime.datetime) status = attr.ib(type=str) archive = attr.ib(type=Optional[str]) def _compute_md5(filehandler: UploadedFile) -> bytes: h = hashlib.md5() for chunk in filehandler: h.update(chunk) # type: ignore return h.digest() def get_deposit_by_id( deposit_id: int, collection_name: Optional[str] = None ) -> Deposit: """Gets an existing Deposit object if it exists, or raises `DepositError`. 
If `collection` is not None, also checks the deposit belongs to the collection.""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: raise DepositError(NOT_FOUND, f"Deposit {deposit_id} does not exist") if collection_name and deposit.collection.name != collection_name: get_collection_by_name(collection_name) # raises if does not exist raise DepositError( NOT_FOUND, f"Deposit {deposit_id} does not belong to collection {collection_name}", ) return deposit def get_collection_by_name(collection_name: str): """Gets an existing Deposit object if it exists, or raises `DepositError`.""" try: collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown collection name {collection_name}") assert collection is not None return collection def guess_deposit_origin_url(deposit: Deposit): """Guesses an origin url for the given deposit.""" external_id = deposit.external_id if not external_id: # The client provided neither an origin_url nor a slug. That's inconvenient, # but SWORD requires we support it. So let's generate a random slug. 
external_id = str(uuid.uuid4()) return "%s/%s" % (deposit.client.provider_url.rstrip("/"), external_id) def check_client_origin(client: DepositClient, origin_url: str): provider_url = client.provider_url.rstrip("/") + "/" if not origin_url.startswith(provider_url): raise DepositError( FORBIDDEN, f"Cannot create origin {origin_url}, it must start with " f"{provider_url}", ) class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ - authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) - permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) + authentication_classes: Sequence[Type[BaseAuthentication]] = ( + KeycloakBasicAuthentication, + ) + permission_classes: Sequence[Type[BasePermission]] = ( + IsAuthenticated, + HasDepositPermission, + ) class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ _client: Optional[DepositClient] = None def _read_headers(self, request: Request) -> ParsedRequestHeaders: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). 
Args: request: Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) return ParsedRequestHeaders( content_type=request.content_type, content_length=content_length, in_progress=in_progress, content_disposition=meta.get("HTTP_CONTENT_DISPOSITION"), content_md5sum=content_md5sum, packaging=meta.get("HTTP_PACKAGING"), slug=meta.get("HTTP_SLUG"), on_behalf_of=meta.get("HTTP_ON_BEHALF_OF"), metadata_relevant=meta.get("HTTP_METADATA_RELEVANT"), swhid=meta.get("HTTP_X_CHECK_SWHID"), ) def _deposit_put(self, deposit: Deposit, in_progress: bool = False) -> None: """Save/Update a deposit in db. 
Args: deposit: deposit being updated/created in_progress: deposit status """ if in_progress is False: self._complete_deposit(deposit) else: deposit.status = DEPOSIT_STATUS_PARTIAL deposit.save() def _complete_deposit(self, deposit: Deposit) -> None: """Marks the deposit as 'deposited', then schedule a check task if configured to do so.""" deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() if not deposit.origin_url: deposit.origin_url = guess_deposit_origin_url(deposit) if self.config["checks"]: scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, retries_left=3, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. 
Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None return deposit_request def _delete_archives(self, collection_name: str, deposit: Deposit) -> Dict: """Delete archive references from the deposit id. """ DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit: Deposit) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. 
""" if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit.id, collection_name, ) raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_file_length( self, filehandler: UploadedFile, content_length: Optional[int] = None, ) -> None: """Check the filehandler passed as argument has exactly the expected content_length Args: filehandler: The file to check content_length: the expected length if provided. Raises: DepositError if the actual length does not match """ max_upload_size = self.config["max_upload_size"] if content_length: length = filehandler.size if length != content_length: raise DepositError(status.HTTP_412_PRECONDITION_FAILED, "Wrong length") if filehandler.size > max_upload_size: raise DepositError( MAX_UPLOAD_SIZE_EXCEEDED, f"Upload size limit exceeded (max {max_upload_size} bytes)." "Please consider sending the archive in multiple steps.", ) def _check_file_md5sum( self, filehandler: UploadedFile, md5sum: Optional[bytes], ) -> None: """Check the filehandler passed as argument has the expected md5sum Args: filehandler: The file to check md5sum: md5 hash expected from the file's content Raises: DepositError if the md5sum does not match """ if md5sum: _md5sum = _compute_md5(filehandler) if _md5sum != md5sum: raise DepositError( CHECKSUM_MISMATCH, "Wrong md5 hash", f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.", ) def _binary_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Binary upload routine. Other than such a request, a 415 response is returned. 
Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers.content_length if not content_length: raise DepositError( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers.content_disposition if not content_disposition: raise DepositError( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers.packaging if packaging and packaging not in ACCEPT_PACKAGINGS: raise DepositError( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler, content_length) self._check_file_md5sum(filehandler, headers.content_md5sum) # actual storage of data archive_metadata = filehandler self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, 
replace_archives=replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=filehandler.name, ) def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. 
Raises: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ content_types_present = set() data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value content_type = fh.content_type if content_type in content_types_present: raise DepositError( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) content_types_present.add(content_type) assert content_type is not None data[content_type] = fh if len(content_types_present) != 2: raise DepositError( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not filehandler: filehandler = data["application/x-tar"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler) self._check_file_md5sum(filehandler, headers.content_md5sum) try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: raise DepositError( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. 
" "Please ensure your metadata file is correctly formatted.", ) self._set_deposit_origin_from_metadata(deposit, metadata, headers) # actual storage of data self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) assert filehandler is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, archive=filehandler.name, status=deposit.status, ) def _store_metadata_deposit( self, deposit: Deposit, swhid_reference: Union[str, QualifiedSWHID], metadata: Dict, raw_metadata: bytes, deposit_origin: Optional[str] = None, ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]: """When all user inputs pass the checks, this associates the raw_metadata to the swhid_reference in the raw extrinsic metadata storage. In case of any issues, a bad request response is returned to the user with the details. Checks: - metadata are technically parsable - metadata pass the functional checks - SWHID (if any) is technically valid Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to metadata: Full dict of metadata to check for validity (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata deposit_origin: Optional deposit origin url to use if any (e.g. deposit update scenario provides one) Raises: DepositError in case of incorrect inputs from the deposit client (e.g. functionally invalid metadata, ...) 
Returns: Tuple of target swhid, deposit, and deposit request """ metadata_ok, error_details = check_metadata(metadata) if not metadata_ok: assert error_details, "Details should be set when a failure occurs" raise DepositError( BAD_REQUEST, "Functional metadata checks failure", convert_status_detail(error_details), ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit.client.provider_url, metadata={"name": deposit.client.last_name}, ) metadata_fetcher = self.swh_deposit_fetcher() # replace metadata within the deposit backend deposit_request_data = { METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } # actually add the metadata to the completed deposit deposit_request = self._deposit_request_put(deposit, deposit_request_data) target_swhid: ExtendedSWHID # origin URL or CoreSWHID if isinstance(swhid_reference, str): target_swhid = Origin(swhid_reference).swhid() metadata_context = {} else: metadata_context = compute_metadata_context(swhid_reference) if deposit_origin: # metadata deposit update on completed deposit metadata_context["origin"] = deposit_origin target_swhid = extended_swhid_from_qualified(swhid_reference) self._check_swhid_in_archive(target_swhid) # metadata deposited by the client metadata_object = RawExtrinsicMetadata( target=target_swhid, # core swhid or origin discovery_date=deposit_request.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata, **metadata_context, ) # metadata on the metadata object swh_deposit_authority = self.swh_deposit_authority() swh_deposit_fetcher = self.swh_deposit_fetcher() metametadata_object = RawExtrinsicMetadata( target=metadata_object.swhid(), discovery_date=deposit_request.date, authority=swh_deposit_authority, fetcher=swh_deposit_fetcher, format="xml-deposit-info", metadata=render_to_string( "deposit/deposit_info.xml", context={"deposit": deposit} ).encode(), ) # write to metadata storage 
self.storage_metadata.metadata_authority_add( [metadata_authority, swh_deposit_authority] ) self.storage_metadata.metadata_fetcher_add( [metadata_fetcher, swh_deposit_fetcher] ) self.storage_metadata.raw_extrinsic_metadata_add( [metadata_object, metametadata_object] ) return (target_swhid, deposit, deposit_request) def _check_swhid_in_archive(self, target_swhid: ExtendedSWHID) -> None: """Check the target object already exists in the archive, and raises a BAD_REQUEST if it does not.""" if target_swhid.object_type in (ExtendedObjectType.CONTENT,): if list( self.storage.content_missing_per_sha1_git([target_swhid.object_id]) ): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this content " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in ( ExtendedObjectType.DIRECTORY, ExtendedObjectType.REVISION, ExtendedObjectType.RELEASE, ExtendedObjectType.SNAPSHOT, ): target_type_name = target_swhid.object_type.name.lower() method = getattr(self.storage, target_type_name + "_missing") if list(method([target_swhid.object_id])): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this {target_type_name} " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in (ExtendedObjectType.ORIGIN,): if None in list(self.storage.origin_get_by_sha1([target_swhid.object_id])): raise DepositError( BAD_REQUEST, "Cannot load metadata on origin, it is not (yet?) known to the " "archive.", ) else: # This should not happen, because target_swhid is generated from either # a core swhid or an origin URL. # Let's just check it again so the "switch" is exhaustive. raise ValueError( f"_check_swhid_in_archive expected core SWHID or origin SWHID, " f"but got {target_swhid}." ) def _atom_entry( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Atom entry deposit. 
Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if metadata is None: raise DepositError( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. 
" "If the body is empty, there is no metadata.", ) self._set_deposit_origin_from_metadata(deposit, metadata, headers) # Determine if we are in the metadata-only deposit case try: swhid_ref = parse_swh_reference(metadata) except ValidationError as e: raise DepositError( PARSING_ERROR, "Invalid SWHID reference", str(e), ) if swhid_ref is not None and ( deposit.origin_url or deposit.parent or deposit.external_id ): raise DepositError( BAD_REQUEST, " is for metadata-only deposits and " " / / Slug are for " "code deposits, only one may be used on a given deposit.", ) if swhid_ref is not None: deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( deposit, swhid_ref, metadata, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS if isinstance(swhid_ref, QualifiedSWHID): deposit.swhid = str(extended_swhid_from_qualified(swhid_ref)) deposit.swhid_context = str(swhid_ref) deposit.complete_date = depo_request.date deposit.reception_date = depo_request.date deposit.save() return Receipt( deposit_id=deposit.id, deposit_date=depo_request.date, status=deposit.status, archive=None, ) self._deposit_put( deposit=deposit, in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=None, ) def _set_deposit_origin_from_metadata(self, deposit, metadata, headers): create_origin = metadata.get("swh:deposit", {}).get("swh:create_origin") add_to_origin = metadata.get("swh:deposit", {}).get("swh:add_to_origin") if create_origin and add_to_origin: raise DepositError( BAD_REQUEST, " and are mutually exclusive, " "as they respectively create a new origin and add to an existing " "origin.", ) if create_origin: origin_url = create_origin["swh:origin"]["@url"] check_client_origin(deposit.client, origin_url) deposit.origin_url = 
origin_url if add_to_origin: origin_url = add_to_origin["swh:origin"]["@url"] check_client_origin(deposit.client, origin_url) deposit.parent = ( Deposit.objects.filter( client=deposit.client, origin_url=origin_url, status=DEPOSIT_STATUS_LOAD_SUCCESS, ) .order_by("-id")[0:1] .get() ) deposit.origin_url = origin_url if "atom:external_identifier" in metadata: # Deprecated tag. # When clients stopped using it, this should raise an error # unconditionally if deposit.origin_url: raise DepositError( BAD_REQUEST, " is deprecated, you should only use " " and from now on.", ) if headers.slug and metadata["atom:external_identifier"] != headers.slug: raise DepositError( BAD_REQUEST, "The tag and Slug header are deprecated, " " or " "should be used instead.", ) def _empty_post( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> Receipt: """Empty post to finalize a deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be finalized """ self._complete_deposit(deposit) assert deposit.complete_date is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.complete_date, status=deposit.status, archive=None, ) def additional_checks( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit], ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. 
""" return {} def get_client(self, request) -> DepositClient: # This class depends on AuthenticatedAPIView, so request.user.username # is always set username = request.user.username assert username is not None if self._client is None: try: self._client = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown client name {username}") assert self._client.username == username return self._client def checks( self, request: Request, collection_name: str, deposit: Optional[Deposit] = None ) -> ParsedRequestHeaders: if deposit is None: collection = get_collection_by_name(collection_name) else: assert collection_name == deposit.collection.name collection = deposit.collection client = self.get_client(request) collection_id = collection.id collections = client.collections assert collections is not None if collection_id not in collections: raise DepositError( FORBIDDEN, f"Client {client.username} cannot access collection {collection_name}", ) headers = self._read_headers(request) if deposit is not None: self.restrict_access(request, headers, deposit) if headers.on_behalf_of: raise DepositError(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") self.additional_checks(request, headers, collection_name, deposit) return headers def restrict_access( self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit ) -> None: """Allow modifications on deposit with status 'partial' only, reject the rest. 
""" if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) def _basic_not_allowed_method(self, request: Request, method: str): raise DepositError( METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", ) def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get( self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. 
Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) r = self.process_get(request, collection_name, deposit) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/tar" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support POST method. """ def post( self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) status, iri_key, receipt = self.process_post( request, headers, collection_name, deposit ) return self._make_deposit_receipt( request, collection_name, status, iri_key, receipt, ) def _make_deposit_receipt( self, request, collection_name: str, status: int, iri_key: str, receipt: Receipt, ) -> HttpResponse: """Returns an HttpResponse with a SWORD Deposit receipt as content.""" # Build the IRIs in the receipt args = [collection_name, receipt.deposit_id] iris = { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_IRI, CONT_FILE_IRI, SE_IRI, STATE_IRI] } context = { **attr.asdict(receipt), **iris, "packagings": ACCEPT_PACKAGINGS, } response = render( request, "deposit/deposit_receipt.xml", context=context, content_type="application/xml", status=status, ) response._headers["location"] = "Location", iris[iri_key] # type: ignore return response @abstractmethod def process_post( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_IRI, etc...) - Receipt """ pass class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put( self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 
class APIDelete(APIBase, metaclass=ABCMeta):
    """Mixin giving a view its DELETE handler.

    """

    def delete(
        self, request: Request, collection_name: str, deposit_id: Optional[int] = None
    ) -> HttpResponse:
        """Delete some of a deposit's resources (archives, deposit).

        The deposit is resolved, the shared checks are run, then the actual
        removal is delegated to :meth:`process_delete`.

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        assert deposit_id is not None
        target = get_deposit_by_id(deposit_id, collection_name)
        self.checks(request, collection_name, target)
        self.process_delete(request, collection_name, target)

        no_content = HttpResponse(status=status.HTTP_204_NO_CONTENT)
        return no_content

    @abstractmethod
    def process_delete(
        self, request: Request, collection_name: str, deposit: Deposit
    ) -> None:
        """Hook performing the removal of a resource.

        This is mostly not allowed except for the EM_IRI (cf.
        .api.deposit_update.APIUpdateArchive)

        """
        pass
logger = logging.getLogger(__name__)


OIDC_DEPOSIT_CLIENT_ID = "swh-deposit"
DEPOSIT_PERMISSION = "swh.deposit.api"


def convert_response(request, content):
    """Convert response from drf's basic authentication mechanism to a
    swh-deposit one.

    Args:
        request (Request): Use to build the response
        content (bytes): The drf's answer

    Returns:
        Response with the same status error as before, only the
        body is now an swh-deposit compliant one.

    """
    from json import loads

    content = loads(content.decode("utf-8"))
    detail = content.get("detail")
    if detail:
        verbose_description = "API is protected by basic authentication"
    else:
        detail = "API is protected by basic authentication"
        verbose_description = None

    response = make_error_response(
        request, UNAUTHORIZED, summary=detail, verbose_description=verbose_description
    )
    response["WWW-Authenticate"] = 'Basic realm=""'
    return response


class WrapBasicAuthenticationResponseMiddleware:
    """Middleware to capture potential authentication error and convert
    them to standard deposit response.

    This is to be installed in django's settings.py module.

    """

    def __init__(self, get_response):
        super().__init__()
        self.get_response = get_response

    def __call__(self, request):
        response = self.get_response(request)

        # Compare by value: ``is`` on integers only holds when both sides
        # happen to be the very same object.
        if response.status_code == status.HTTP_401_UNAUTHORIZED:
            # Read the header through the public accessor rather than the
            # private ``_headers`` dict.
            if response.get("Content-Type") == "application/json":
                return convert_response(request, response.content)

        return response


class HasDepositPermission(BasePermission):
    """Allows access to authenticated users with the DEPOSIT_PERMISSION.

    """

    def has_permission(self, request, view):
        """Tell whether the request's user holds DEPOSIT_PERMISSION.

        Returns:
            False when the user is not a DepositClient or carries no oidc
            user; otherwise the result of the oidc user's permission check.

        """
        user = request.user
        # Explicit checks instead of ``assert``: assertions are stripped when
        # python runs with -O, and a permission class should deny, not crash.
        if not isinstance(user, DepositClient) or user.oidc_user is None:
            return False
        return user.oidc_user.has_perm(DEPOSIT_PERMISSION)


class KeycloakBasicAuthentication(BasicAuthentication):
    """Keycloak authentication against username/password.

    Deposit users keep sending `Basic authentication` queries to the deposit
    server. Transparently, the deposit server no longer authenticates the users
    itself: it delegates the authentication queries to the keycloak instance.

    Technically, reuses :class:`rest_framework.BasicAuthentication` and overrides the
    :func:`authenticate_credentials` method to discuss with keycloak.

    As an implementation detail, this also uses the django cache mechanism to avoid too
    many authentication requests to keycloak. A cached profile is only trusted when the
    submitted password matches the password verifier cached alongside it.

    """

    _client: Optional[KeycloakOpenIDConnect] = None

    @property
    def client(self):
        """Keycloak client, built from the configuration file on first access."""
        if self._client is None:
            self._client = KeycloakOpenIDConnect.from_configfile(
                client_id=OIDC_DEPOSIT_CLIENT_ID
            )
        return self._client

    @staticmethod
    def _verifier_cache_key(user_id: str) -> str:
        """Cache key of the password verifier stored next to a cached profile."""
        return f"oidc_user_{user_id}_verifier"

    def get_user(self, user_id: str) -> Optional[OIDCUser]:
        """Retrieve user from cache if any.

        This performs no credential check; callers must not treat the
        returned user as authenticated on its own.

        Returns:
            The user rebuilt from the cached oidc profile, None when nothing
            is cached or when the cached profile cannot be turned into a user
            (the failure is then reported to sentry).

        """
        oidc_profile = cache.get(f"oidc_user_{user_id}")
        if oidc_profile:
            try:
                return oidc_user_from_profile(self.client, oidc_profile)
            except Exception as e:
                # best effort: an unusable cache entry falls back to a login
                capture_exception(e)
        return None

    def authenticate_credentials(self, user_id, password, request):
        """Authenticate the user_id/password against keycloak.

        Raises:
            AuthenticationFailed in case of authentication failure

        Returns:
            Tuple of deposit_client, None.

        """
        # local import, django is already a dependency of this module
        from django.contrib.auth.hashers import check_password, make_password

        verifier_key = self._verifier_cache_key(user_id)

        oidc_user = self.get_user(user_id)
        if oidc_user is not None:
            # A cached profile only proves that *someone* logged in as user_id
            # recently. Without this check any password would be accepted for
            # that user for as long as the cache entry lives.
            verifier = cache.get(verifier_key)
            if not verifier or not check_password(password, verifier):
                oidc_user = None

        oidc_profile = None
        ttl: Optional[int] = None
        if oidc_user is None:
            try:
                oidc_profile = self.client.login(user_id, password)
            except Exception as e:
                raise AuthenticationFailed(e) from e

            oidc_user = oidc_user_from_profile(self.client, oidc_profile)
            ttl = int(
                oidc_user.refresh_expires_at.timestamp() - timezone.now().timestamp()
            )

        # Making sure the associated deposit client is correctly configured in backend
        try:
            deposit_client = DepositClient.objects.get(username=user_id)
        except DepositClient.DoesNotExist:
            raise AuthenticationFailed(f"Unknown user {user_id}")

        if not deposit_client.is_active:
            raise AuthenticationFailed(f"Deactivated user {user_id}")

        deposit_client.oidc_user = oidc_user

        if oidc_profile is not None and ttl is not None and ttl > 0:
            # cache the oidc_profile user while it's valid, together with a
            # salted hash of the password which obtained it
            cache.set(f"oidc_user_{user_id}", oidc_profile, timeout=ttl)
            cache.set(verifier_key, make_password(password), timeout=ttl)

        return (deposit_client, None)
@user.command("create")
@click.option("--username", required=True, help="User's name")
@click.option("--password", help="(Deprecated) Desired user password (plain).")
@click.option("--firstname", default="", help="User's first name")
@click.option("--lastname", default="", help="User's last name")
@click.option("--email", default="", help="User's email")
@click.option("--collection", help="User's collection")
@click.option("--provider-url", default="", help="Provider URL")
@click.option("--domain", default="", help="The domain")
@click.pass_context
def user_create(
    ctx,
    username: str,
    password: str,
    firstname: str,
    lastname: str,
    email: str,
    collection: str,
    provider_url: str,
    domain: str,
):
    """Create or update a user with some needed information (collection, ...)

    If the collection does not exist, the collection is then created
    alongside.

    The --password option is deprecated: it is still accepted so existing
    invocations keep working, but its value is ignored and nothing is stored.

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositClient

    # If collection is not provided, fallback to username
    if not collection:
        collection = username

    # create the collection if it does not exist
    collection_ = _create_collection(collection)

    # user create/update
    try:
        user = DepositClient.objects.get(username=username)  # type: ignore
        click.echo(f"Update user '{username}'.")
        action_done = "updated"
    except DepositClient.DoesNotExist:
        click.echo(f"Create user '{username}'.")
        # Not saved yet: the single save() below writes the fully initialised
        # user, so a failure cannot leave a half-configured row behind.
        user = DepositClient(username=username)
        action_done = "created"

    user.collections = [collection_.id]
    user.first_name = firstname
    user.last_name = lastname
    user.email = email
    user.is_active = True
    user.provider_url = provider_url
    user.domain = domain
    user.save()

    click.echo(f"User '{username}' {action_done}.")
@adm_deposit.command("reschedule")
@click.option("--deposit-id", required=True, help="Deposit identifier")
@click.pass_context
def adm_deposit_reschedule(ctx, deposit_id):
    """Reschedule the deposit loading

    This will:

    - check the deposit's status to something reasonable (failed or done). That
      means that the checks have passed alright but something went wrong during the
      loading (failed: loading failed, done: loading ok, still for some reasons as in
      bugs, we need to reschedule it)
    - reset the deposit's status to 'verified' (prior to any loading but after the
      checks which are fine) and removes the different archives' identifiers
      (swh-id, ...)
    - trigger back the loading task through the scheduler

    Exits with code 1 when the deposit does not exist, is in a status which
    cannot be rescheduled, or has no loading task id; exits with code 0 when
    the deposit is already in the 'verified' status.

    """
    # to avoid loading too early django namespaces
    from datetime import datetime, timezone

    from swh.deposit.config import (
        DEPOSIT_STATUS_LOAD_FAILURE,
        DEPOSIT_STATUS_LOAD_SUCCESS,
        DEPOSIT_STATUS_VERIFIED,
        APIConfig,
    )
    from swh.deposit.models import Deposit

    try:
        deposit = Deposit.objects.get(pk=deposit_id)
    except Deposit.DoesNotExist:
        click.echo(f"Deposit {deposit_id} does not exist.")
        ctx.exit(1)

    # Check the deposit is in a reasonable state
    accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE]
    if deposit.status == DEPOSIT_STATUS_VERIFIED:
        click.echo(f"Deposit {deposit_id} already set for rescheduling.")
        ctx.exit(0)

    if deposit.status not in accepted_statuses:
        click.echo(
            f"Deposit {deposit_id} cannot be rescheduled (status: {deposit.status}).\n"
            "Rescheduling deposit is only accepted for deposit with status: "
            f"{', '.join(accepted_statuses)}."
        )
        ctx.exit(1)

    task_id = deposit.load_task_id
    if not task_id:
        click.echo(
            f"Deposit {deposit_id} cannot be rescheduled. It misses the "
            "associated scheduler task id (field load_task_id)."
        )
        ctx.exit(1)

    # Build the scheduler before touching the deposit: if the configuration is
    # broken we fail here, instead of leaving a deposit reset to 'verified'
    # that a later run would report as "already set for rescheduling".
    scheduler = APIConfig().scheduler

    # Reset the deposit's state
    deposit.swhid = None
    deposit.swhid_context = None
    deposit.status = DEPOSIT_STATUS_VERIFIED
    deposit.save()

    # Schedule back the deposit loading task. The timestamp is timezone-aware
    # so it does not depend on the timezone of the host running the command.
    scheduler.set_status_tasks(
        [task_id],
        status="next_run_not_scheduled",
        next_run=datetime.now(tz=timezone.utc),
    )
def setup_django_for(platform=None, config_file=None):
    """Initialize django for command line tools (swh.deposit.create_user) so
       they get the needed db access.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running
        config_file (str): Extra configuration file (typically for the
                           production platform)

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform is not None:
        if platform not in AUTHORIZED_PLATFORMS:
            raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)
        # a settings module already present in the environment is kept
        os.environ.setdefault(
            "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % platform
        )

    # likewise, an already exported configuration filename is kept
    if config_file and "SWH_CONFIG_FILENAME" not in os.environ:
        os.environ["SWH_CONFIG_FILENAME"] = config_file

    import django

    django.setup()
""" def __init__(self): self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) self.tool = { "name": "swh-deposit", "version": __version__, "configuration": {"sword_version": "2"}, } self.storage: StorageInterface = get_storage(**self.config["storage"]) self.storage_metadata: StorageInterface = get_storage( **self.config["storage_metadata"] ) def swh_deposit_authority(self): return MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url=self.config["swh_authority_url"], metadata={}, ) def swh_deposit_fetcher(self): return MetadataFetcher( name=self.tool["name"], version=self.tool["version"], metadata=self.tool["configuration"], ) diff --git a/swh/deposit/exception.py b/swh/deposit/exception.py index e0252e00..5c6a224a 100644 --- a/swh/deposit/exception.py +++ b/swh/deposit/exception.py @@ -1,37 +1,38 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Optional from django.db.utils import OperationalError from django.http import HttpResponse from rest_framework.exceptions import APIException -from rest_framework.views import exception_handler def custom_exception_handler( exc: APIException, context: Dict ) -> Optional[HttpResponse]: """Custom deposit exception handler to ensure consistent xml output """ + from rest_framework.views import exception_handler + # drf's default exception handler first, to get the standard error response response = exception_handler(exc, context) if isinstance(exc, OperationalError): status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." 
data = f""" {status} {detail} """.encode( "utf-8" ) return HttpResponse(data, status=503, content_type="application/xml") return response diff --git a/swh/deposit/models.py b/swh/deposit/models.py index da708e4f..cad6861f 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,239 +1,243 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb import datetime +from typing import Optional from django.contrib.auth.models import User, UserManager from django.contrib.postgres.fields import ArrayField, JSONField from django.db import models from django.utils.timezone import now +from swh.auth.django.models import OIDCUser + from .config import ( ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, METADATA_TYPE, ) class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = "dbversion" def __str__(self): return str( { "version": self.version, "release": self.release, "description": self.description, } ) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ("expired", "expired"), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ("loading", "loading"), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed 
class Deposit(models.Model):
    """Deposit reception table

    """

    id = models.BigAutoField(primary_key=True)

    # First deposit reception date
    reception_date = models.DateTimeField(auto_now_add=True)
    # Date when the deposit is deemed complete and ready for loading
    complete_date = models.DateTimeField(null=True)
    # collection concerned by the deposit
    collection = models.ForeignKey("DepositCollection", models.DO_NOTHING)
    # Deprecated: Deposit's external identifier
    external_id = models.TextField(null=True)
    # URL of the origin of this deposit, null if this is a metadata-only deposit
    origin_url = models.TextField(null=True)
    # Deposit client
    client = models.ForeignKey("DepositClient", models.DO_NOTHING)
    # SWH's loading result identifier
    swhid = models.TextField(blank=True, null=True)
    swhid_context = models.TextField(blank=True, null=True)
    # Deposit's status regarding loading
    status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL)
    status_detail = JSONField(null=True)
    # deposit can have one parent
    parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True)
    check_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated checking task id"
    )
    load_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated loading task id"
    )

    class Meta:
        db_table = "deposit"

    def __str__(self):
        """Render the main fields as a dict string; the status detail is
        only included for rejected deposits."""
        d = {
            "id": self.id,
            "reception_date": self.reception_date,
            "collection": self.collection.name,
            "external_id": self.external_id,
            "origin_url": self.origin_url,
            "client": self.client.username,
            "status": self.status,
        }

        # Plain equality: ``in (DEPOSIT_STATUS_REJECTED)`` is not a tuple
        # membership test (no trailing comma) but a substring test against the
        # status string, which also matches e.g. an empty status.
        if self.status == DEPOSIT_STATUS_REJECTED:
            d["status_detail"] = self.status_detail

        return str(d)
class DepositCollection(models.Model):
    """Collection deposits are attached to, stored in the
    ``deposit_collection`` table."""

    id = models.BigAutoField(primary_key=True)
    # Human readable name for the collection type e.g HAL, arXiv, etc...
    name = models.TextField()

    class Meta:
        db_table = "deposit_collection"

    def __str__(self):
        """Render the identifier and name as a dict string."""
        summary = {"id": self.id, "name": self.name}
        return str(summary)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ ALLOWED_HOSTS = ["127.0.0.1", "localhost"] # Application definition INSTALLED_APPS = [ "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.staticfiles", "django.contrib.sessions", "django.contrib.messages", "django.contrib.postgres", # for JSONField, ArrayField "swh.deposit.apps.DepositConfig", ] MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", "django.contrib.auth.middleware.AuthenticationMiddleware", "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", "swh.deposit.auth.WrapBasicAuthenticationResponseMiddleware", "swh.deposit.errors.DepositErrorMiddleware", ] ROOT_URLCONF = "swh.deposit.urls" TEMPLATES = [ { "BACKEND": "django.template.backends.django.DjangoTemplates", "DIRS": [], "APP_DIRS": True, "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", "django.contrib.auth.context_processors.auth", "django.contrib.messages.context_processors.messages", ], }, }, ] # Password validation # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ { "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", # noqa }, {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, ] # Internationalization # https://docs.djangoproject.com/en/1.10/topics/i18n/ LANGUAGE_CODE = "en-us" TIME_ZONE = "UTC" USE_I18N = True 
USE_L10N = True USE_TZ = True # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.10/howto/static-files/ STATIC_URL = "/static/" REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.BasicAuthentication", - ), + "DEFAULT_AUTHENTICATION_CLASSES": ("swh.deposit.auth.KeycloakBasicAuthentication",), "EXCEPTION_HANDLER": "swh.deposit.exception.custom_exception_handler", } FILE_UPLOAD_HANDLERS = [ "django.core.files.uploadhandler.MemoryFileUploadHandler", "django.core.files.uploadhandler.TemporaryFileUploadHandler", ] + +CACHES = {"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache",}} diff --git a/swh/deposit/settings/testing.py b/swh/deposit/settings/testing.py index bc1ee990..ca0fe59e 100644 --- a/swh/deposit/settings/testing.py +++ b/swh/deposit/settings/testing.py @@ -1,42 +1,42 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .common import * # noqa from .common import ALLOWED_HOSTS from .development import * # noqa from .development import INSTALLED_APPS # django setup ALLOWED_HOSTS += ["testserver"] INSTALLED_APPS += ["pytest_django"] # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { "version": 1, "disable_existing_loggers": True, "formatters": { "standard": { "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "ERROR", "class": "logging.StreamHandler", "formatter": "standard", }, }, "loggers": {"swh.deposit": {"handlers": ["console"], "level": "ERROR",},}, } # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT # SECURITY WARNING: Override this in the production.py module 
MEDIA_ROOT = "/tmp/swh-deposit/test/uploads/" FILE_UPLOAD_HANDLERS = [ "django.core.files.uploadhandler.MemoryFileUploadHandler", ] diff --git a/swh/deposit/tests/api/test_collection.py b/swh/deposit/tests/api/test_collection.py index 4d1beb1a..513a3b87 100644 --- a/swh/deposit/tests/api/test_collection.py +++ b/swh/deposit/tests/api/test_collection.py @@ -1,75 +1,83 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_REJECTED from swh.deposit.parsers import parse_xml -def test_deposit_post_will_fail_with_401(client): +def test_deposit_post_will_fail_with_401(unauthorized_client): """Without authentication, endpoint refuses access with 401 response """ url = reverse(COL_IRI, args=["hal"]) - response = client.post(url) + response = unauthorized_client.post(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED +def test_deposit_post_insufficient_permission(insufficient_perm_client): + """With connection ok but insufficient permission, endpoint refuses access""" + url = reverse(COL_IRI, args=["hal"]) + response = insufficient_perm_client.post(url) + assert response.status_code == status.HTTP_403_FORBIDDEN + assert b"permission" in response.content + + def test_access_to_another_user_collection_is_forbidden( authenticated_client, deposit_another_collection, deposit_user ): """Access to another user collection should return a 403 """ coll2 = deposit_another_collection url = reverse(COL_IRI, args=[coll2.name]) response = authenticated_client.post(url) assert response.status_code == status.HTTP_403_FORBIDDEN msg = "Client %s cannot access collection %s" % (deposit_user.username, coll2.name,) 
assert msg in response.content.decode("utf-8") def test_delete_on_col_iri_not_supported(authenticated_client, deposit_collection): """Delete on col iri should return a 405 response """ url = reverse(COL_IRI, args=[deposit_collection.name]) response = authenticated_client.delete(url) assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED assert "DELETE method is not supported on this endpoint" in response.content.decode( "utf-8" ) def create_deposit_with_rejection_status(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_REJECTED diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py index 7af4f398..b545acd4 100644 --- a/swh/deposit/tests/api/test_collection_add_to_origin.py +++ b/swh/deposit/tests/api/test_collection_add_to_origin.py @@ -1,159 +1,157 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import COL_IRI, 
DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom -from ..conftest import create_deposit +from ..conftest import internal_create_deposit def test_add_deposit_with_add_to_origin( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit with creates a new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_add_to_origin_conflict( authenticated_client, - another_authenticated_client, deposit_collection, deposit_another_collection, atom_dataset, sample_archive, deposit_user, deposit_another_user, ): """Posting a deposit with an referencing an origin owned by a different client raises an error """ external_id = "foobar" origin_url = deposit_another_user.provider_url + external_id # create a deposit for that other user, with the same slug - create_deposit( - another_authenticated_client, - deposit_another_collection.name, - sample_archive, + internal_create_deposit( + deposit_another_user, + deposit_another_collection, external_id, 
DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"must start with" in response.content def test_add_deposit_add_to_wrong_origin( authenticated_client, deposit_collection, atom_dataset, sample_archive, ): """Posting a deposit with an referencing an origin not starting with the provider_url raises an error """ origin_url = "http://example.org/foo" # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"must start with" in response.content def test_add_deposit_with_add_to_origin_and_external_identifier( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit with creates a new deposit with parent """ # given multiple deposit already loaded origin_url = deposit_user.provider_url + completed_deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-add-to-origin-and-external-id"] % origin_url, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"<external_identifier> is deprecated" in response.content def test_post_deposit_atom_403_add_to_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """Creating an origin for a prefix not owned by the client is forbidden """ origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, 
args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN expected_msg = ( f"Cannot create origin {origin_url}, " f"it must start with {deposit_user.provider_url}" ) assert expected_msg in response.content.decode() diff --git a/swh/deposit/tests/api/test_collection_reuse_slug.py b/swh/deposit/tests/api/test_collection_reuse_slug.py index 31c0d3b1..99670b57 100644 --- a/swh/deposit/tests/api/test_collection_reuse_slug.py +++ b/swh/deposit/tests/api/test_collection_reuse_slug.py @@ -1,287 +1,283 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, SE_IRI, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom -from ..conftest import create_deposit +from ..conftest import internal_create_deposit def test_act_on_deposit_rejected_is_not_permitted( authenticated_client, deposit_collection, rejected_deposit, atom_dataset ): deposit = rejected_deposit response = post_atom( authenticated_client, reverse(SE_IRI, args=[deposit.collection.name, deposit.id]), data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_400_BAD_REQUEST msg = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) assert msg in response.content.decode("utf-8") def test_add_deposit_when_partial_makes_new_deposit( authenticated_client, deposit_collection, partial_deposit, atom_dataset, deposit_user, ): """Posting 
deposit on collection when previous is partial makes new deposit """ deposit = partial_deposit assert deposit.status == DEPOSIT_STATUS_PARTIAL origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id # new deposit new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_when_failed_makes_new_deposit_with_no_parent( authenticated_client, deposit_collection, failed_deposit, atom_dataset, deposit_user ): """Posting deposit on collection when deposit done makes new deposit with parent """ deposit = failed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_when_done_makes_new_deposit_with_parent_old_one( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit 
on collection when deposit done makes new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_with_external_identifier( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Even though is deprecated, it should still be allowed when it matches the slug, so that we don't break existing clients """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["error-with-external-identifier"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == 
new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_external_id_conflict_no_parent( authenticated_client, - another_authenticated_client, deposit_collection, deposit_another_collection, atom_dataset, - sample_archive, deposit_user, + deposit_another_user, ): """Posting a deposit with an external_id conflicting with an external_id of a different client does not create a parent relationship """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id # create a deposit for that other user, with the same slug - other_deposit = create_deposit( - another_authenticated_client, - deposit_another_collection.name, - sample_archive, + other_deposit = internal_create_deposit( + deposit_another_user, + deposit_another_collection, external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert other_deposit.id != deposit_id new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_external_id_conflict_with_parent( authenticated_client, - another_authenticated_client, deposit_collection, deposit_another_collection, completed_deposit, atom_dataset, - sample_archive, deposit_user, + deposit_another_user, ): """Posting a deposit with an external_id conflicting with an external_id of a different client creates a parent relationship with the deposit of the right client instead of the last matching deposit This test does not have an equivalent for origin url 
conflicts, as these can not happen (assuming clients do not have provider_url overlaps) """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # create a deposit for that other user, with the same slug - other_deposit = create_deposit( - another_authenticated_client, - deposit_another_collection.name, - sample_archive, + other_deposit = internal_create_deposit( + deposit_another_user, + deposit_another_collection, deposit.external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] assert deposit_id != deposit.id assert other_deposit.id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.external_id == new_deposit.external_id assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url diff --git a/swh/deposit/tests/api/test_exception.py b/swh/deposit/tests/api/test_exception.py index a606397f..f1ea1e8c 100644 --- a/swh/deposit/tests/api/test_exception.py +++ b/swh/deposit/tests/api/test_exception.py @@ -1,52 +1,52 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.db.utils import OperationalError from rest_framework.exceptions import APIException from 
rest_framework.response import Response from swh.deposit.exception import custom_exception_handler def test_custom_exception_handler_operational_error(mocker): """Operation error are translated to service unavailable """ fake_exception = OperationalError("Fake internal error", 503) response = custom_exception_handler(fake_exception, {}) assert response is not None assert response.status_code == 503 status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." assert ( response.content.decode("utf-8") == f""" {status} {detail} """ ) def test_custom_exception_handler_default_behavior_maintained(mocker): """Other internal errors are transmitted as is """ fake_exception = APIException("Fake internal error", 500) fake_response = Response( exception=fake_exception, status=fake_exception.status_code ) - mock_exception_handler = mocker.patch("swh.deposit.exception.exception_handler") + mock_exception_handler = mocker.patch("rest_framework.views.exception_handler") mock_exception_handler.return_value = fake_response response = custom_exception_handler(fake_exception, {}) assert response is not None assert response == fake_response diff --git a/swh/deposit/tests/api/test_get_file.py b/swh/deposit/tests/api/test_get_file.py index 1d9d0ee3..4a26208a 100644 --- a/swh/deposit/tests/api/test_get_file.py +++ b/swh/deposit/tests/api/test_get_file.py @@ -1,51 +1,53 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests 'GET File-IRI'.""" from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import CONT_FILE_IRI from swh.deposit.models import DEPOSIT_STATUS_DETAIL from swh.deposit.parsers import parse_xml def test_api_deposit_content_nominal( - client, complete_deposit, 
partial_deposit_only_metadata + authenticated_client, complete_deposit, partial_deposit_only_metadata ): """Retrieve information on deposit should return 200 response """ for deposit in [complete_deposit, partial_deposit_only_metadata]: expected_deposit = { "swh:deposit_id": str(deposit.id), "swh:deposit_status": deposit.status, "swh:deposit_status_detail": DEPOSIT_STATUS_DETAIL[deposit.status], } url = reverse(CONT_FILE_IRI, args=[deposit.collection.name, deposit.id]) - response = client.get(url) + response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK actual_deposit = dict(parse_xml(response.content)) del actual_deposit["swh:deposit_date"] assert set(actual_deposit.items()) >= set(expected_deposit.items()) -def test_api_deposit_content_unknown(client, complete_deposit, deposit_collection): +def test_api_deposit_content_unknown( + authenticated_client, complete_deposit, deposit_collection +): """Retrieve information on unknown deposit or collection should return 404 """ unknown_deposit_id = 999 unknown_collection = "unknown" for collection, deposit_id in [ (deposit_collection.name, unknown_deposit_id), (unknown_collection, complete_deposit.id), (complete_deposit.collection.name, complete_deposit.id + 10), ]: url = reverse(CONT_FILE_IRI, args=[collection, deposit_id]) - response = client.get(url) + response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND diff --git a/swh/deposit/tests/api/test_service_document.py b/swh/deposit/tests/api/test_service_document.py index d57d3a85..666a151f 100644 --- a/swh/deposit/tests/api/test_service_document.py +++ b/swh/deposit/tests/api/test_service_document.py @@ -1,82 +1,82 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import 
reverse_lazy as reverse from rest_framework import status from swh.deposit.config import SD_IRI def test_service_document_no_auth_fails(client): """Without authentication, service document endpoint should return 401 """ url = reverse(SD_IRI) response = client.get(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED def test_service_document_no_auth_with_http_auth_should_not_break(client): """Without auth, sd endpoint through browser should return 401 """ url = reverse(SD_IRI) response = client.get(url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8") assert response.status_code == status.HTTP_401_UNAUTHORIZED -def test_service_document(authenticated_client, deposit_user): +def test_service_document(authenticated_client): """With authentication, service document list user's collection """ url = reverse(SD_IRI) response = authenticated_client.get(url) - check_response(response, deposit_user.username) + check_response(response, authenticated_client.deposit_client.username) -def test_service_document_with_http_accept_header(authenticated_client, deposit_user): +def test_service_document_with_http_accept_header(authenticated_client): """With authentication, with browser, sd list user's collection """ url = reverse(SD_IRI) response = authenticated_client.get( url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8" ) - check_response(response, deposit_user.username) + check_response(response, authenticated_client.deposit_client.username) def check_response(response, username): assert response.status_code == status.HTTP_200_OK assert ( response.content.decode("utf-8") == """ 2.0 %s The Software Heritage (SWH) Archive %s Software Collection application/zip application/x-tar Collection Policy Software Heritage Archive Collect, Preserve, Share false false http://purl.org/net/sword/package/SimpleZip http://testserver/1/%s/ %s """ % (500, username, username, username, username) ) # noqa diff --git a/swh/deposit/tests/cli/test_admin.py 
b/swh/deposit/tests/cli/test_admin.py index c05659b3..32afd670 100644 --- a/swh/deposit/tests/cli/test_admin.py +++ b/swh/deposit/tests/cli/test_admin.py @@ -1,317 +1,317 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.deposit.cli.admin import admin as cli from swh.deposit.config import ( DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_VERIFIED, ) from swh.deposit.models import DepositClient, DepositCollection from swh.scheduler.utils import create_oneshot_task_dict @pytest.fixture(autouse=True) def enable_db_access_for_all_tests(db): pass def test_cli_admin_user_list_nothing(cli_runner): result = cli_runner.invoke(cli, ["user", "list",]) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == "Empty user list\n" def test_cli_admin_user_list_with_users(cli_runner, deposit_user): result = cli_runner.invoke(cli, ["user", "list",]) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == f"{deposit_user.username}\n" # only 1 user def test_cli_admin_collection_list_nothing(cli_runner): result = cli_runner.invoke(cli, ["collection", "list",]) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == "Empty collection list\n" def test_cli_admin_collection_list_with_collections(cli_runner, deposit_collection): from swh.deposit.tests.conftest import create_deposit_collection new_collection = create_deposit_collection("something") result = cli_runner.invoke(cli, ["collection", "list",]) assert result.exit_code == 0, f"Unexpected output: {result.output}" collections = "\n".join([deposit_collection.name, new_collection.name]) assert result.output == f"{collections}\n" def test_cli_admin_user_exists_unknown(cli_runner): result = 
cli_runner.invoke(cli, ["user", "exists", "unknown"]) assert result.exit_code == 1, f"Unexpected output: {result.output}" assert result.output == "User unknown does not exist.\n" def test_cli_admin_user_exists(cli_runner, deposit_user): result = cli_runner.invoke(cli, ["user", "exists", deposit_user.username]) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == f"User {deposit_user.username} exists.\n" def test_cli_admin_create_collection(cli_runner): collection_name = "something" try: DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: pass result = cli_runner.invoke( cli, ["collection", "create", "--name", collection_name,] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" collection = DepositCollection.objects.get(name=collection_name) assert collection is not None assert ( result.output == f"""Create collection '{collection_name}'. Collection '{collection_name}' created. """ ) result2 = cli_runner.invoke( cli, ["collection", "create", "--name", collection_name,] ) assert result2.exit_code == 0, f"Unexpected output: {result.output}" assert ( result2.output == f"""Collection '{collection_name}' exists, skipping. """ ) def test_cli_admin_user_create(cli_runner): user_name = "user" collection_name = user_name try: DepositClient.objects.get(username=user_name) except DepositClient.DoesNotExist: pass try: DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: pass result = cli_runner.invoke( cli, ["user", "create", "--username", user_name, "--password", "password",] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" user = DepositClient.objects.get(username=user_name) assert user is not None collection = DepositCollection.objects.get(name=collection_name) assert collection is not None assert ( result.output == f"""Create collection '{user_name}'. Collection '{collection_name}' created. Create user '{user_name}'. 
User '{user_name}' created. """ ) assert collection.name == collection_name assert user.username == user_name first_password = user.password assert first_password is not None assert user.collections == [collection.id] assert user.is_active is True assert user.domain == "" assert user.provider_url == "" assert user.email == "" assert user.first_name == "" assert user.last_name == "" # create a user that already exists result2 = cli_runner.invoke( cli, [ "user", "create", "--username", "user", "--password", "another-password", # changing password "--collection", collection_name, # specifying the collection this time "--firstname", "User", "--lastname", "no one", "--email", "user@org.org", "--provider-url", "http://some-provider.org", "--domain", "domain", ], ) assert result2.exit_code == 0, f"Unexpected output: {result2.output}" user = DepositClient.objects.get(username=user_name) assert user is not None assert user.username == user_name assert user.collections == [collection.id] assert user.is_active is True second_password = user.password assert second_password is not None - assert second_password != first_password, "Password should have changed" + assert second_password == first_password, "Password not changed (no longer used)" assert user.domain == "domain" assert user.provider_url == "http://some-provider.org" assert user.email == "user@org.org" assert user.first_name == "User" assert user.last_name == "no one" assert ( result2.output == f"""Collection '{collection_name}' exists, skipping. Update user '{user_name}'. User '{user_name}' updated. 
""" ) def test_cli_admin_reschedule_unknown_deposit(cli_runner): """Rescheduling unknown deposit should report failure """ unknown_deposit_id = 666 from swh.deposit.models import Deposit try: Deposit.objects.get(id=unknown_deposit_id) except Deposit.DoesNotExist: pass result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", unknown_deposit_id] ) assert result.output == f"Deposit {unknown_deposit_id} does not exist.\n" assert result.exit_code == 1 def test_cli_admin_reschedule_verified_deposit(cli_runner, complete_deposit): """Rescheduling verified deposit should do nothing but report """ deposit = complete_deposit deposit.status = "verified" deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == f"Deposit {deposit.id} already set for rescheduling.\n" assert result.exit_code == 0 @pytest.mark.parametrize( "status_to_check", [DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_DEPOSITED] ) def test_cli_admin_reschedule_unaccepted_deposit_status( status_to_check, cli_runner, complete_deposit ): """Rescheduling verified deposit should do nothing but report """ deposit = complete_deposit deposit.status = status_to_check # not accepted status will fail the check deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == ( f"Deposit {deposit.id} cannot be rescheduled (status: {deposit.status}).\n" "Rescheduling deposit is only accepted for deposit with status: done, failed.\n" ) assert result.exit_code == 1 def test_cli_admin_reschedule_missing_task_id(cli_runner, complete_deposit): """Rescheduling deposit with no load_task_id cannot work. """ deposit = complete_deposit deposit.load_task_id = "" # drop the load-task-id so it fails the check deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == ( f"Deposit {deposit.id} cannot be rescheduled. 
It misses the " "associated scheduler task id (field load_task_id).\n" ) assert result.exit_code == 1 def test_cli_admin_reschedule_nominal(cli_runner, complete_deposit, swh_scheduler): """Rescheduling deposit with no load_task_id cannot work. """ deposit = complete_deposit from swh.deposit.models import Deposit # create a task to keep a reference on it task = create_oneshot_task_dict( "load-deposit", url=deposit.origin_url, deposit_id=deposit.id, retries_left=3 ) scheduled_task = swh_scheduler.create_tasks([task])[0] # disable it swh_scheduler.set_status_tasks([scheduled_task["id"]], status="disabled") # Now update the deposit state with some swhid and relevant load_task_id deposit = complete_deposit deposit.load_task_id = scheduled_task["id"] deposit.swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74" deposit.swhid_context = f"{deposit.swhid};origin=https://url/external-id" deposit.save() # Reschedule it result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.exit_code == 0 # Now, ensure the deposit and the associated task are in the right shape deposit = Deposit.objects.get(id=deposit.id) # got reset to a state which allows rescheduling assert deposit.id assert deposit.swhid is None assert deposit.swhid_context is None assert deposit.status == DEPOSIT_STATUS_VERIFIED task = swh_scheduler.search_tasks(task_id=deposit.load_task_id)[0] assert task["status"] == "next_run_not_scheduled" diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 8cd0fee8..b591cf90 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,482 +1,583 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 +from copy import deepcopy from functools import partial from io import 
BytesIO import os import re -from typing import Mapping +from typing import TYPE_CHECKING, Dict, Mapping from django.test.utils import setup_databases # type: ignore from django.urls import reverse_lazy as reverse import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import pytest from rest_framework import status from rest_framework.test import APIClient import yaml +from swh.auth.pytest_plugin import keycloak_mock_factory from swh.core.config import read from swh.core.pytest_plugin import get_response_cb +from swh.deposit.auth import DEPOSIT_PERMISSION from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, SE_IRI, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, post_atom, ) from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler +if TYPE_CHECKING: + from swh.deposit.models import Deposit, DepositClient, DepositCollection + + # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. 
+USERNAME = "test" +EMAIL = "test@example.org" +COLLECTION = "test" TEST_USER = { - "username": "test", - "password": "password", - "email": "test@example.org", + "username": USERNAME, + "password": "", + "email": EMAIL, "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", - "collection": {"name": "test"}, + "collection": {"name": COLLECTION}, +} + +USER_INFO = { + "name": USERNAME, + "email": EMAIL, + "email_verified": False, + "family_name": "", + "given_name": "", + "groups": [], + "preferred_username": USERNAME, + "sub": "ffffffff-bbbb-4444-aaaa-14f61e6b7200", } +USERNAME2 = "test2" +EMAIL2 = "test@example.org" +COLLECTION2 = "another-collection" -ANOTHER_TEST_USER = { - "username": "test2", - "password": "password2", - "email": "test@example2.org", +TEST_USER2 = { + "username": USERNAME2, + "password": "", + "email": EMAIL2, "provider_url": "https://hal-test.archives-ouvertes.example/", "domain": "archives-ouvertes.example/", - "collection": {"name": "another-collection"}, + "collection": {"name": COLLECTION2}, } +KEYCLOAK_SERVER_URL = "https://auth.swh.org/SWHTest" +KEYCLOAK_REALM_NAME = "SWHTest" +CLIENT_ID = "swh-deposit" + + +keycloak_mock_auth_success = keycloak_mock_factory( + server_url=KEYCLOAK_SERVER_URL, + realm_name=KEYCLOAK_REALM_NAME, + client_id=CLIENT_ID, + auth_success=True, + user_info=USER_INFO, + user_permissions=[DEPOSIT_PERMISSION], +) + + +keycloak_mock_auth_failure = keycloak_mock_factory( + server_url=KEYCLOAK_SERVER_URL, + realm_name=KEYCLOAK_REALM_NAME, + client_id=CLIENT_ID, + auth_success=False, +) + def pytest_configure(): setup_django_for("testing") @pytest.fixture def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with put/post methods """ cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.put(re.compile("https://"), body=cb) requests_mock_datadir.post(re.compile("https://"), body=cb) return requests_mock_datadir 
@pytest.fixture() def deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, "scheduler": {"cls": "local", **swh_scheduler_config,}, "storage": swh_storage_backend_config, "storage_metadata": swh_storage_backend_config, "swh_authority_url": "http://deposit.softwareheritage.example/", + "keycloak": { + "server_url": KEYCLOAK_SERVER_URL, + "realm_name": KEYCLOAK_REALM_NAME, + }, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path): """Enforce config for deposit classes inherited from APIConfig.""" cfg = read(deposit_config_path) if "scheduler" in cfg: # scheduler setup: require the check-deposit and load-deposit tasks scheduler = get_scheduler(**cfg["scheduler"]) task_types = [ { "type": "check-deposit", "backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk", "description": "Check deposit metadata/archive before loading", "num_retries": 3, }, { "type": "load-deposit", "backend_name": "swh.loader.package.deposit.tasks.LoadDeposit", "description": "Loading deposit archive into swh archive", "num_retries": 3, }, ] for task_type in task_types: scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def 
execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection -def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): +def deposit_collection_factory(collection_name): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection -deposit_collection = deposit_collection_factory() -deposit_another_collection = deposit_collection_factory("another-collection") +deposit_collection = deposit_collection_factory(COLLECTION) +deposit_another_collection = deposit_collection_factory(COLLECTION2) -def _create_deposit_user(db, collection, user_data): +def _create_deposit_user( + collection: "DepositCollection", user_data: Dict +) -> "DepositClient": """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient - try: - user = DepositClient._default_manager.get(username=user_data["username"]) - except DepositClient.DoesNotExist: - user = DepositClient._default_manager.create_user( - username=user_data["username"], - email=user_data["email"], - password=user_data["password"], - provider_url=user_data["provider_url"], - domain=user_data["domain"], - ) - user.collections = [collection.id] 
- user.save() + user_data_d = deepcopy(user_data) + user_data_d.pop("collection", None) + user, _ = DepositClient.objects.get_or_create( # type: ignore + username=user_data_d["username"], + defaults={**user_data_d, "collections": [collection.id]}, + ) return user @pytest.fixture def deposit_user(db, deposit_collection): - return _create_deposit_user(db, deposit_collection, TEST_USER) + return _create_deposit_user(deposit_collection, TEST_USER) @pytest.fixture def deposit_another_user(db, deposit_another_collection): - return _create_deposit_user(db, deposit_another_collection, ANOTHER_TEST_USER) + return _create_deposit_user(deposit_another_collection, TEST_USER2) @pytest.fixture -def client(): - """Override pytest-django one which does not work for djangorestframework. +def anonymous_client(): + """Create an anonymous client (no credentials during queries to the deposit) """ return APIClient() # <- drf's client -def _create_authenticated_client(client, user, user_data): - """Returned a logged client +def mock_keycloakopenidconnect(mocker, keycloak_mock): + """Mock swh.deposit.auth.KeycloakOpenIDConnect to return the keycloak_mock + + """ + mock = mocker.patch("swh.deposit.auth.KeycloakOpenIDConnect") + mock.from_configfile.return_value = keycloak_mock + return mock + + +@pytest.fixture +def mock_keycloakopenidconnect_ok(mocker, keycloak_mock_auth_success): + """Mock keycloak so it always accepts connection for user with the right + permissions + + """ + return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) + + +@pytest.fixture +def mock_keycloakopenidconnect_ko(mocker, keycloak_mock_auth_failure): + """Mock keycloak so it always refuses connections.""" + return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_failure) + + +@pytest.fixture +def unauthorized_client(anonymous_client, mock_keycloakopenidconnect_ko): + """Create an unauthorized client (will see their authentication fail) + + """ + return anonymous_client + + +def 
_create_authenticated_client(client, user): + """Return a client whose credentials will be proposed to the deposit server. This also patches the client instance to keep a reference on the associated deposit_user. """ - _token = "%s:%s" % (user.username, user_data["password"]) + _token = "%s:%s" % (user.username, "irrelevant-in-test-context") token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) client.deposit_client = user yield client client.logout() @pytest.fixture -def authenticated_client(client, deposit_user): - yield from _create_authenticated_client(client, deposit_user, TEST_USER) +def authenticated_client(mock_keycloakopenidconnect_ok, anonymous_client, deposit_user): + yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture -def another_authenticated_client(deposit_another_user): - client = APIClient() - yield from _create_authenticated_client( - client, deposit_another_user, ANOTHER_TEST_USER - ) +def insufficient_perm_client( + mocker, keycloak_mock_auth_success, anonymous_client, deposit_user +): + """keycloak accepts connection but client returned has no deposit permission, so access + is not allowed. + + """ + keycloak_mock_auth_success.user_permissions = [] + mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) + yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files.
Returns: Dict mapping each atom name (filename without extension) to its content (str, decoded from utf-8) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data +def internal_create_deposit( + client: "DepositClient", + collection: "DepositCollection", + external_id: str, + status: str, +) -> "Deposit": + """Create a deposit for a given collection with internal tool + + """ + from swh.deposit.models import Deposit + + deposit = Deposit( + client=client, external_id=external_id, status=status, collection=collection + ) + deposit.save() + return deposit + + def create_deposit( - authenticated_client, + client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = post_archive( - authenticated_client, + client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS=str(in_progress).lower(), ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() from swh.deposit.models import Deposit response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["swh:deposit_id"] deposit = Deposit._default_manager.get(id=deposit_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, **kwargs, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`.
""" deposit = create_deposit( authenticated_client, collection_name, deposit_status=DEPOSIT_STATUS_PARTIAL, **kwargs, ) origin_url = deposit.client.provider_url + deposit.external_id response = post_atom( authenticated_client, reverse(SE_IRI, args=[collection_name, deposit.id]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, in_progress=in_progress, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory( deposit_status=DEPOSIT_STATUS_PARTIAL, in_progress=True ) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive=sample_archive, external_id="external-id-partial", in_progress=True, deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def 
partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["swh:deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10") snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0") deposit.swhid = f"swh:1:dir:{directory_id}" deposit.swhid_context = str( QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=revision_id), path=b"/", ) ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/test_backend.py b/swh/deposit/tests/test_backend.py new file mode 100644 index 00000000..ca31d40a --- /dev/null +++ b/swh/deposit/tests/test_backend.py @@ -0,0 +1,71 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License 
version 3, or any later version +# See top-level LICENSE file for more information + +import pytest +from rest_framework.exceptions import AuthenticationFailed + +from swh.deposit.auth import KeycloakBasicAuthentication +from swh.deposit.tests.conftest import TEST_USER + +REQUEST_OBJECT = "request-unused" +PASSWORD = "some-deposit-pass" + + +@pytest.fixture +def backend_success(mock_keycloakopenidconnect_ok, deposit_config, db): + """Backend whose connection to keycloak will systematically succeed.""" + return KeycloakBasicAuthentication() + + +@pytest.fixture +def backend_failure(mock_keycloakopenidconnect_ko, deposit_config): + """Backend whose connection to keycloak will systematically fail.""" + return KeycloakBasicAuthentication() + + +def test_backend_authentication_refused(backend_failure): + with pytest.raises(AuthenticationFailed): + backend_failure.authenticate_credentials( + TEST_USER["username"], PASSWORD, REQUEST_OBJECT + ) + + +def test_backend_authentication_db_misconfigured(backend_success): + """Keycloak configured ok, backend db misconfigured (missing user), this raises""" + with pytest.raises(AuthenticationFailed, match="Unknown"): + backend_success.authenticate_credentials( + TEST_USER["username"], PASSWORD, REQUEST_OBJECT + ) + + +def test_backend_authentication_user_inactive(backend_success, deposit_user): + """Keycloak configured ok, backend db configured, user inactive, this raises""" + deposit_user.is_active = False + deposit_user.save() + + with pytest.raises(AuthenticationFailed, match="Deactivated"): + backend_success.authenticate_credentials( + deposit_user.username, PASSWORD, REQUEST_OBJECT + ) + + +def test_backend_authentication_ok(backend_success, deposit_user): + """Keycloak configured ok, backend db configured ok, user logs in + + """ + user0, _ = backend_success.authenticate_credentials( + deposit_user.username, PASSWORD, REQUEST_OBJECT + ) + + assert user0 is not None + + # A second authentication call should leverage the django 
cache feature. + + user1, _ = backend_success.authenticate_credentials( + deposit_user.username, PASSWORD, REQUEST_OBJECT + ) + assert user1 is not None + + assert user0 == user1, "Should have been retrieved from the cache"