diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py index 1d82a28e..2243aee8 100644 --- a/swh/deposit/api/checks.py +++ b/swh/deposit/api/checks.py @@ -1,240 +1,247 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Functional Metadata checks: Mandatory fields: - 'author' - 'name' or 'title' Suggested fields: - metadata-provenance """ import dataclasses import functools import re from typing import Dict, Iterator, Optional, Tuple, cast import urllib from xml.etree import ElementTree import pkg_resources import xmlschema from swh.deposit.errors import FORBIDDEN, DepositError from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" INVALID_DATE_FORMAT = "Invalid date format" SUGGESTED_FIELDS_MISSING = "Suggested fields are missing" METADATA_PROVENANCE_KEY = "swh:metadata-provenance" AFFILIATION_NO_NAME = "Reason: affiliation does not have a element" def extra_validator( element: ElementTree.Element, xsd_element: xmlschema.validators.elements.Xsd11Element, ) -> Optional[Iterator[xmlschema.XMLSchemaValidationError]]: """Performs extra checks on Atom elements that cannot be implemented purely within XML Schema. For now, this only checks URIs are absolute.""" type_name = xsd_element.type.name if type_name == "{http://www.w3.org/2001/XMLSchema}anyURI": # Check their URI is absolute. # This could technically be implemented in the schema like this: # # # # # # # However, this would give an unreadable error, so we implement it here # in Python instead. 
def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
    """Check metadata for mandatory field presence and XML schema validity.

    Args:
        metadata: root element of the parsed metadata document to check

    Returns:
        tuple (status, error_detail):
          - (True, None) if metadata are ok and suggested fields are also present
          - (True, detail) if metadata are ok but some suggestions are missing
          - (False, detail) otherwise.

        ``detail`` is a dict of the form ``{"metadata": [entry, ...]}`` where each
        entry holds a "summary" and the list of "fields" it is about.

    """

    def validate(schema, element) -> None:
        # Validate ``element`` against ``schema`` with the extra (non-XSD) checks.
        schema.validate(
            element,
            extra_validator=cast(
                # ExtraValidatorType is a callable with "SchemaType" as second
                # argument, but extra_validator() is actually passed Xsd11Element
                # as second argument
                # https://github.com/sissaschool/xmlschema/issues/291
                xmlschema.aliases.ExtraValidatorType,
                extra_validator,
            ),
        )

    # at least one field of each group below is mandatory
    alternate_fields = (
        ("atom:name", "atom:title", "codemeta:name"),
        ("atom:author", "codemeta:author"),
    )
    # all() stops at the first field found, so each group is searched only as
    # far as needed.
    mandatory_result = [
        " or ".join(possible_names)
        for possible_names in alternate_fields
        if all(
            metadata.find(possible_name, namespaces=NAMESPACES) is None
            for possible_name in possible_names
        )
    ]

    # provenance metadata is optional
    suggested_fields = []
    if parse_swh_metadata_provenance(metadata) is None:
        suggested_fields = [
            {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]}
        ]

    if mandatory_result:
        detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}]
        return False, {"metadata": detail + suggested_fields}

    deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
    # Compare with None explicitly: an Element without children is falsy, so a
    # bare truth test would silently skip validation of an empty <swh:deposit/>.
    if deposit_elt is not None:
        try:
            validate(schemas().swh, deposit_elt)
        except xmlschema.exceptions.XMLSchemaException as e:
            return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}

    detail = []
    for child in metadata:
        # NOTE(review): this is a substring test, not an equality test, between
        # the child's tag and the schema element name -- confirm it is intended.
        for schema_element in schemas().codemeta.root_elements:
            if child.tag in schema_element.name:
                break
        else:
            # Tag is not specified in the schema, don't validate it
            continue

        try:
            validate(schemas().codemeta, child)
        except xmlschema.exceptions.XMLSchemaException as e:
            detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
        else:
            # Manually validate <codemeta:affiliation>. Unfortunately, this cannot
            # be validated by codemeta.xsd, because Codemeta has conflicting
            # requirements:
            # 1. https://codemeta.github.io/terms/ requires it to be Text
            #    (represented by simple content), but
            # 2. https://doi.org/10.5063/SCHEMA/CODEMETA-2.0 requires it to be an
            #    Organization (represented by complex content)
            # And this is (legitimately) not representable in XML Schema.
            #
            # See https://github.com/codemeta/codemeta/pull/239 for a discussion
            # about this issue.
            for affiliation in child.findall(
                "codemeta:affiliation", namespaces=NAMESPACES
            ):
                if len(affiliation) > 0:
                    # This is a complex element (as required by
                    # https://doi.org/10.5063/SCHEMA/CODEMETA-2.0), then we want to
                    # make sure there is at least a name.
                    has_name = bool(
                        affiliation.findtext("codemeta:name", namespaces=NAMESPACES)
                    )
                else:
                    # This is a simple element (as required by
                    # https://codemeta.github.io/terms/); it must not be
                    # completely empty.
                    has_name = bool(affiliation.text and affiliation.text.strip())

                if not has_name:
                    detail.append(
                        {
                            "fields": [schema_element.prefixed_name],
                            "summary": AFFILIATION_NO_NAME,
                        }
                    )
                    # One report per child element is enough
                    break

    if detail:
        return False, {"metadata": detail + suggested_fields}

    if suggested_fields:  # it's fine but warn about missing suggested fields
        return True, {"metadata": suggested_fields}

    return True, None
class CollectionAPI(ListAPIView, APIPost):
    """Deposit request class defining api endpoints for sword deposit.

    What's known as 'Col-IRI' in the sword specification.

    HTTP verbs supported: GET and POST

    """

    parser_classes = (
        SWHMultiPartParser,
        SWHFileUploadZipParser,
        SWHFileUploadTarParser,
        SWHAtomEntryParser,
    )

    serializer_class = DepositSerializer
    pagination_class = DefaultPagination

    def get(self, request, *args, **kwargs):
        """List the user's collection if the user has access to said collection."""
        self.checks(request, kwargs["collection_name"])
        page = super().get(request, *args, **kwargs).data

        # Pagination link headers: only the directions the paginator provided
        link_parts = [
            f'<{url}>; rel="{rel}"'
            for rel, url in ((rel, page.get(rel)) for rel in ("next", "previous"))
            if url is not None
        ]

        context = {
            "count": page["count"],
            "results": list(map(dict, page["results"])),
        }
        response = render(
            request,
            "deposit/collection_list.xml",
            context=context,
            content_type="application/xml",
            status=status.HTTP_200_OK,
        )
        response["Link"] = ",".join(link_parts)
        return response

    def get_queryset(self):
        """Return the authenticated user's deposits, ordered by id (pagination is
        handled by the `pagination_class` class attribute).

        """
        user_id = self.request.user.id
        return Deposit.objects.filter(client=user_id).order_by("id")

    def process_post(
        self,
        req,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Optional[Deposit] = None,
    ) -> Tuple[int, str, Receipt]:
        """Create a first deposit as:
        - archive deposit (1 zip)
        - multipart (1 zip + 1 atom entry)
        - atom entry

        Args:
            req (Request): the request holding the information to parse
                and inject in db
            headers: parsed request headers; the slug is used as external id
            collection_name (str): the associated client
            deposit: must be None, the deposit is created here

        Returns:
            A (201 created, EDIT_IRI, deposit receipt) tuple. Failures are
            reported by whichever upload routine handles the request's
            content type.

        """
        assert deposit is None

        new_deposit = self._deposit_create(
            req, collection_name, external_id=headers.slug
        )

        # Pick the upload routine from the request's content type
        if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES:
            upload = self._binary_upload
        elif req.content_type.startswith("multipart/"):
            upload = self._multipart_upload
        else:
            upload = self._atom_entry

        receipt = upload(req, headers, collection_name, new_deposit)
        return status.HTTP_201_CREATED, EDIT_IRI, receipt

    def _deposit_create(
        self, request, collection_name: str, external_id: Optional[str]
    ) -> Deposit:
        """Build (without saving) a new Deposit for the requesting client.

        When an external id is given, the client's most recent successfully
        loaded deposit with the same external id becomes the parent.

        """
        collection = get_collection_by_name(collection_name)
        client = self.get_client(request)

        parent: Optional[Deposit] = None
        if external_id:
            # TODO: delete this when clients stopped relying on the slug
            # find a deposit parent (same external id, status load to success)
            candidates = Deposit.objects.filter(
                client=client,
                external_id=external_id,
                status=DEPOSIT_STATUS_LOAD_SUCCESS,
            ).order_by("-id")
            try:
                parent = candidates[0:1].get()
            except Deposit.DoesNotExist:
                # then no parent for that deposit, parent stays None
                pass

        return Deposit(
            collection=collection,
            external_id=external_id or "",
            client=client,
            parent=parent,
        )
the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from abc import ABCMeta, abstractmethod import datetime import hashlib import json from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union import uuid from xml.etree import ElementTree import attr from django.core.files.uploadedfile import UploadedFile from django.http import FileResponse, HttpResponse from django.shortcuts import render from django.template.loader import render_to_string from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated from rest_framework.request import Request from rest_framework.views import APIView from swh.deposit.api.checks import check_metadata, check_url_match_provider from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.models import DEPOSIT_METADATA_ONLY, Deposit from swh.deposit.parsers import parse_xml from swh.deposit.utils import ( NAMESPACES, compute_metadata_context, parse_swh_metadata_provenance, ) from swh.model import hashutil from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, Origin, RawExtrinsicMetadata, ) from swh.model.swhids import ( ExtendedObjectType, ExtendedSWHID, QualifiedSWHID, ValidationError, ) from swh.scheduler.utils import create_oneshot_task_dict from ..config import ( ARCHIVE_KEY, ARCHIVE_TYPE, CONT_FILE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, EDIT_IRI, EM_IRI, METADATA_TYPE, RAW_METADATA_KEY, SE_IRI, STATE_IRI, APIConfig, ) from ..errors import ( BAD_REQUEST, CHECKSUM_MISMATCH, ERROR_CONTENT, FORBIDDEN, MAX_UPLOAD_SIZE_EXCEEDED, MEDIATION_NOT_ALLOWED, 
def get_deposit_by_id(
    deposit_id: int, collection_name: Optional[str] = None
) -> Deposit:
    """Get an existing Deposit object, or raise `DepositError`.

    Args:
        deposit_id: primary key of the deposit to look up
        collection_name: when given (and non-empty), the deposit must also belong
            to the collection of that name

    Returns:
        The matching deposit

    Raises:
        DepositError: with NOT_FOUND when the deposit does not exist, when the
            named collection does not exist, or when the deposit belongs to
            another collection.

    """
    try:
        deposit = Deposit.objects.get(pk=deposit_id)
    except Deposit.DoesNotExist as e:
        # Chain explicitly so the lookup failure stays attached as the cause
        raise DepositError(NOT_FOUND, f"Deposit {deposit_id} does not exist") from e

    if collection_name and deposit.collection.name != collection_name:
        # Report an unknown collection in preference to a mismatch
        get_collection_by_name(collection_name)  # raises if does not exist

        raise DepositError(
            NOT_FOUND,
            f"Deposit {deposit_id} does not belong to collection {collection_name}",
        )

    return deposit
def __init__(self):
    """Select authentication and permission classes from the configuration.

    Reads the 'authentication_provider' configuration key once the parent
    initializers have run; supported values are 'basic' and 'keycloak'.

    Raises:
        ValueError: if the key is missing or holds any other value

    """
    super().__init__()
    auth_provider = self.config.get("authentication_provider")

    authentication_classes: Sequence[Type[BaseAuthentication]]
    permission_classes: Sequence[Type[BasePermission]]
    if auth_provider == "basic":
        authentication_classes = (BasicAuthentication,)
        permission_classes = (IsAuthenticated,)
    elif auth_provider == "keycloak":
        authentication_classes = (KeycloakBasicAuthentication,)
        permission_classes = (IsAuthenticated, HasDepositPermission)
    else:
        # The trailing space after "with" matters: the two literals are
        # concatenated into a single sentence.
        raise ValueError(
            "Configuration key 'authentication_provider' should be provided with "
            f"either 'basic' or 'keycloak' value not {auth_provider!r}."
        )

    self.authentication_classes: Sequence[
        Type[BaseAuthentication]
    ] = authentication_classes
    self.permission_classes: Sequence[Type[BasePermission]] = permission_classes
metadata_relevant=meta.get("HTTP_METADATA_RELEVANT"), swhid=meta.get("HTTP_X_CHECK_SWHID"), ) def _deposit_put(self, deposit: Deposit, in_progress: bool = False) -> None: """Save/Update a deposit in db. Args: deposit: deposit being updated/created in_progress: deposit status """ if in_progress is False: self._complete_deposit(deposit) else: deposit.status = DEPOSIT_STATUS_PARTIAL deposit.save() def _complete_deposit(self, deposit: Deposit) -> None: """Marks the deposit as 'deposited', then schedule a check task if configured to do so.""" deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() if not deposit.origin_url: deposit.origin_url = guess_deposit_origin_url(deposit) if self.config["checks"]: scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, retries_left=3, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() def _deposit_request_put( self, deposit: Deposit, deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. 
Args: deposit: The deposit concerned by the request deposit_request_data: The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata: Flag defining if we add or update existing metadata to the deposit replace_archives: Flag defining if we add or update archives to existing deposit Returns: the DepositRequest object stored in the backend """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) if raw_metadata: deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None return deposit_request def _delete_archives(self, collection_name: str, deposit: Deposit) -> Dict: - """Delete archive references from the deposit id. - - """ + """Delete archive references from the deposit id.""" DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} def _delete_deposit(self, collection_name: str, deposit: Deposit) -> Dict: """Delete deposit reference. Args: collection_name: Client's collection deposit: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. 
def _check_file_length(
    self,
    filehandler: UploadedFile,
    content_length: Optional[int] = None,
) -> None:
    """Check the filehandler passed as argument has exactly the
    expected content_length, and does not exceed the configured maximum
    upload size.

    Args:
        filehandler: The file to check
        content_length: the expected length if provided.

    Raises:
        DepositError if the actual length does not match, or if the file is
        larger than the 'max_upload_size' configuration value

    """
    max_upload_size = self.config["max_upload_size"]
    actual_length = filehandler.size

    if content_length and actual_length != content_length:
        # NOTE(review): every other DepositError in this file is built from a
        # constant of ..errors; confirm an HTTP status integer is accepted here.
        raise DepositError(status.HTTP_412_PRECONDITION_FAILED, "Wrong length")

    # The size limit applies whether or not an expected length was announced
    if actual_length > max_upload_size:
        raise DepositError(
            MAX_UPLOAD_SIZE_EXCEEDED,
            f"Upload size limit exceeded (max {max_upload_size} bytes). "
            "Please consider sending the archive in multiple steps.",
        )
Raises: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers.content_length if not content_length: raise DepositError( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers.content_disposition if not content_disposition: raise DepositError( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers.packaging if packaging and packaging not in ACCEPT_PACKAGINGS: raise DepositError( BAD_REQUEST, f"Only packaging {ACCEPT_PACKAGINGS} is supported", f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] assert isinstance(filehandler, UploadedFile), filehandler self._check_file_length(filehandler, content_length) self._check_file_md5sum(filehandler, headers.content_md5sum) # actual storage of data archive_metadata = filehandler self._deposit_put( - deposit=deposit, in_progress=headers.in_progress, + deposit=deposit, + in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, archive=filehandler.name, ) def _read_metadata(self, metadata_stream) -> Tuple[bytes, ElementTree.Element]: """ Given a metadata stream, reads the metadata and returns the metadata in three forms: * verbatim (as raw bytes), for archival in long-term storage * parsed as a Python dict, for archival in postgresql's jsonb type * parsed as ElementTree, to extract information 
def _multipart_upload(
    self,
    request: Request,
    headers: ParsedRequestHeaders,
    collection_name: str,
    deposit: Deposit,
    replace_metadata: bool = False,
    replace_archives: bool = False,
) -> Receipt:
    """Multipart upload supported with exactly:
    - 1 archive (zip or x-tar)
    - 1 atom entry

    Args:
        request: the request holding information to parse
            and inject in db
        headers: parsed request headers
        collection_name: the associated client
        deposit: deposit to be updated
        replace_metadata: 'Update or add' request to existing
          deposit. If False (default), this adds new metadata request to
          existing ones. Otherwise, this will replace existing metadata.
        replace_archives: 'Update or add' request to existing
          deposit. If False (default), this adds new archive request to
          existing ones. Otherwise, this will replace existing archives.

    Returns:
        The receipt for the stored deposit

    Raises:
        DepositError:
          - ERROR_CONTENT if the parts are not exactly one archive and one
            atom entry
          - PARSING_ERROR if the atom entry is not well-formed xml
          - whatever the archive length and md5 checks raise

    """
    content_types_present = set()

    data: Dict[str, Optional[Any]] = {
        "application/zip": None,  # expected either zip
        "application/x-tar": None,  # or x-tar
        "application/atom+xml": None,
    }
    for fh in request.FILES.values():
        content_type = fh.content_type
        if content_type in content_types_present:
            raise DepositError(
                ERROR_CONTENT,
                "Only 1 application/zip (or application/x-tar) archive "
                "and 1 atom+xml entry is supported (as per sword2.0 "
                "specification)",
                "You provided more than 1 application/(zip|x-tar) "
                "or more than 1 application/atom+xml content-disposition "
                "header in the multipart deposit",
            )

        content_types_present.add(content_type)
        assert content_type is not None
        data[content_type] = fh

    filehandler = data["application/zip"]
    if not filehandler:
        filehandler = data["application/x-tar"]
    atom_entry = data["application/atom+xml"]

    # Two parts are not enough: they must be one archive AND one atom entry.
    # Without the explicit checks, e.g. zip + x-tar or two unsupported content
    # types would get past the count and crash further down instead of being
    # reported to the client.
    if len(content_types_present) != 2 or filehandler is None or atom_entry is None:
        raise DepositError(
            ERROR_CONTENT,
            "You must provide both 1 application/zip (or "
            "application/x-tar) and 1 atom+xml entry for multipart "
            "deposit",
            "You need to provide only 1 application/(zip|x-tar) "
            "and 1 application/atom+xml content-disposition header "
            "in the multipart deposit",
        )

    assert isinstance(filehandler, UploadedFile), filehandler

    # No expected length is passed: only the upload size limit is checked
    self._check_file_length(filehandler)
    self._check_file_md5sum(filehandler, headers.content_md5sum)

    try:
        raw_metadata, metadata_tree = self._read_metadata(atom_entry)
    except ParserError:
        raise DepositError(
            PARSING_ERROR,
            "Malformed xml metadata",
            "The xml received is malformed. "
            "Please ensure your metadata file is correctly formatted.",
        )

    self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)

    # actual storage of data
    self._deposit_put(
        deposit=deposit,
        in_progress=headers.in_progress,
    )
    deposit_request_data = {
        ARCHIVE_KEY: filehandler,
        RAW_METADATA_KEY: raw_metadata,
    }
    self._deposit_request_put(
        deposit, deposit_request_data, replace_metadata, replace_archives
    )

    return Receipt(
        deposit_id=deposit.id,
        deposit_date=deposit.reception_date,
        archive=filehandler.name,
        status=deposit.status,
    )
Checks: - metadata are technically parsable - metadata pass the functional checks - SWHID (if any) is technically valid Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to metadata_tree: Full element tree of metadata to check for validity (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata deposit_origin: Optional deposit origin url to use if any (e.g. deposit update scenario provides one) Raises: DepositError in case of incorrect inputs from the deposit client (e.g. functionally invalid metadata, ...) Returns: Tuple of target swhid, deposit, and deposit request """ metadata_ok, error_details = check_metadata(metadata_tree) if not metadata_ok: assert error_details, "Details should be set when a failure occurs" raise DepositError( BAD_REQUEST, "Functional metadata checks failure", convert_status_detail(error_details), ) metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit.client.provider_url, + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit.client.provider_url, ) metadata_fetcher = self.swh_deposit_fetcher() # replace metadata within the deposit backend deposit_request_data = { RAW_METADATA_KEY: raw_metadata, } # actually add the metadata to the completed deposit deposit_request = self._deposit_request_put(deposit, deposit_request_data) target_swhid: ExtendedSWHID # origin URL or CoreSWHID if isinstance(swhid_reference, str): target_swhid = Origin(swhid_reference).swhid() metadata_context = {} else: metadata_context = compute_metadata_context(swhid_reference) if deposit_origin: # metadata deposit update on completed deposit metadata_context["origin"] = deposit_origin target_swhid = extended_swhid_from_qualified(swhid_reference) self._check_swhid_in_archive(target_swhid) # metadata deposited by the client metadata_object = RawExtrinsicMetadata( target=target_swhid, # core swhid or origin 
discovery_date=deposit_request.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata, **metadata_context, ) # metadata on the metadata object swh_deposit_authority = self.swh_deposit_authority() swh_deposit_fetcher = self.swh_deposit_fetcher() metametadata_object = RawExtrinsicMetadata( target=metadata_object.swhid(), discovery_date=deposit_request.date, authority=swh_deposit_authority, fetcher=swh_deposit_fetcher, format="xml-deposit-info", metadata=render_to_string( "deposit/deposit_info.xml", context={"deposit": deposit} ).encode(), ) # write to metadata storage self.storage_metadata.metadata_authority_add( [metadata_authority, swh_deposit_authority] ) self.storage_metadata.metadata_fetcher_add( [metadata_fetcher, swh_deposit_fetcher] ) self.storage_metadata.raw_extrinsic_metadata_add( [metadata_object, metametadata_object] ) return (target_swhid, deposit, deposit_request) def _check_swhid_in_archive(self, target_swhid: ExtendedSWHID) -> None: """Check the target object already exists in the archive, and raises a BAD_REQUEST if it does not.""" if target_swhid.object_type in (ExtendedObjectType.CONTENT,): if list( self.storage.content_missing_per_sha1_git([target_swhid.object_id]) ): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this content " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in ( ExtendedObjectType.DIRECTORY, ExtendedObjectType.REVISION, ExtendedObjectType.RELEASE, ExtendedObjectType.SNAPSHOT, ): target_type_name = target_swhid.object_type.name.lower() method = getattr(self.storage, target_type_name + "_missing") if list(method([target_swhid.object_id])): raise DepositError( BAD_REQUEST, f"Cannot load metadata on {target_swhid}, this {target_type_name} " f"object does not exist in the archive (yet?).", ) elif target_swhid.object_type in (ExtendedObjectType.ORIGIN,): if None in 
list(self.storage.origin_get_by_sha1([target_swhid.object_id])): raise DepositError( BAD_REQUEST, "Cannot load metadata on origin, it is not (yet?) known to the " "archive.", ) else: # This should not happen, because target_swhid is generated from either # a core swhid or an origin URL. # Let's just check it again so the "switch" is exhaustive. raise ValueError( f"_check_swhid_in_archive expected core SWHID or origin SWHID, " f"but got {target_swhid}." ) def _atom_entry( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, replace_metadata: bool = False, replace_archives: bool = False, ) -> Receipt: """Atom entry deposit. Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be updated replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Raises: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ metadata_stream = request.data empty_atom_entry_summary = "Empty body request is not supported." empty_atom_entry_desc = ( "Atom entry request is about non-empty metadata deposit." ) if not metadata_stream: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) try: raw_metadata, metadata_tree = self._read_metadata(metadata_stream) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. 
" "Please ensure your metadata file is correctly formatted.", ) if len(metadata_tree) == 0: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers) # Determine if we are in the metadata-only deposit case try: swhid_ref = parse_swh_reference(metadata_tree) except ValidationError as e: raise DepositError( - PARSING_ERROR, "Invalid SWHID reference", str(e), + PARSING_ERROR, + "Invalid SWHID reference", + str(e), ) if swhid_ref is not None and ( deposit.origin_url or deposit.parent or deposit.external_id ): raise DepositError( BAD_REQUEST, " is for metadata-only deposits and " " / / Slug are for " "code deposits, only one may be used on a given deposit.", ) if swhid_ref is not None: # It's suggested to user to provide it metadata_provenance_url = parse_swh_metadata_provenance(metadata_tree) if metadata_provenance_url: # If the provenance is provided, ensure it matches client provider url check_url_match_provider( metadata_provenance_url, deposit.client.provider_url ) deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( deposit, swhid_ref, metadata_tree, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS if isinstance(swhid_ref, QualifiedSWHID): deposit.swhid = str(extended_swhid_from_qualified(swhid_ref)) deposit.swhid_context = str(swhid_ref) deposit.type = DEPOSIT_METADATA_ONLY deposit.complete_date = depo_request.date deposit.reception_date = depo_request.date deposit.save() return Receipt( deposit_id=deposit.id, deposit_date=depo_request.date, status=deposit.status, archive=None, ) self._deposit_put( - deposit=deposit, in_progress=headers.in_progress, + deposit=deposit, + in_progress=headers.in_progress, ) self._deposit_request_put( deposit, {RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return Receipt( deposit_id=deposit.id, deposit_date=deposit.reception_date, status=deposit.status, 
archive=None, ) def _set_deposit_origin_from_metadata(self, deposit, metadata, headers): (create_origin, add_to_origin) = parse_swh_deposit_origin(metadata) if create_origin and add_to_origin: raise DepositError( BAD_REQUEST, " and are mutually exclusive, " "as they respectively create a new origin and add to an existing " "origin.", ) if create_origin: origin_url = create_origin check_url_match_provider(origin_url, deposit.client.provider_url) deposit.origin_url = origin_url if add_to_origin: origin_url = add_to_origin check_url_match_provider(origin_url, deposit.client.provider_url) deposit.parent = ( Deposit.objects.filter( client=deposit.client, origin_url=origin_url, status=DEPOSIT_STATUS_LOAD_SUCCESS, ) .order_by("-id")[0:1] .get() ) deposit.origin_url = origin_url external_identifier_element = metadata.find( "atom:external_identifier", namespaces=NAMESPACES ) if external_identifier_element is not None: # Deprecated tag. # When clients stopped using it, this should raise an error # unconditionally if deposit.origin_url: raise DepositError( BAD_REQUEST, " is deprecated, you should only use " " and from now on.", ) if headers.slug and external_identifier_element.text != headers.slug: raise DepositError( BAD_REQUEST, "The tag and Slug header are deprecated, " " or " "should be used instead.", ) def _empty_post( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> Receipt: """Empty post to finalize a deposit. 
Args: request: the request holding information to parse and inject in db headers: parsed request headers collection_name: the associated client deposit: deposit to be finalized """ self._complete_deposit(deposit) assert deposit.complete_date is not None return Receipt( deposit_id=deposit.id, deposit_date=deposit.complete_date, status=deposit.status, archive=None, ) def additional_checks( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit], ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def get_client(self, request) -> DepositClient: # This class depends on AuthenticatedAPIView, so request.user.username # is always set username = request.user.username assert username is not None if self._client is None: try: self._client = DepositClient.objects.get( # type: ignore username=username ) except DepositClient.DoesNotExist: raise DepositError(NOT_FOUND, f"Unknown client name {username}") assert self._client.username == username return self._client def checks( self, request: Request, collection_name: str, deposit: Optional[Deposit] = None ) -> ParsedRequestHeaders: if deposit is None: collection = get_collection_by_name(collection_name) else: assert collection_name == deposit.collection.name collection = deposit.collection client = self.get_client(request) collection_id = collection.id collections = client.collections assert collections is not None if collection_id not in collections: raise DepositError( FORBIDDEN, f"Client {client.username} cannot access collection {collection_name}", ) headers = self._read_headers(request) if deposit is not None: self.restrict_access(request, headers, deposit) if headers.on_behalf_of: raise DepositError(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") self.additional_checks(request, headers, collection_name, deposit) return headers def restrict_access( self, request: Request, headers: 
ParsedRequestHeaders, deposit: Deposit ) -> None: - """Allow modifications on deposit with status 'partial' only, reject the rest. - - """ + """Allow modifications on deposit with status 'partial' only, reject the rest.""" if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = f"This deposit has status '{deposit.status}'" raise DepositError( BAD_REQUEST, summary=summary, verbose_description=description ) def _basic_not_allowed_method(self, request: Request, method: str): raise DepositError( - METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint", + METHOD_NOT_ALLOWED, + f"{method} method is not supported on this endpoint", ) def get( self, request: Request, *args, **kwargs ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") def post(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") def put(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") def delete(self, request: Request, *args, **kwargs) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") class APIGet(APIBase, metaclass=ABCMeta): - """Mixin for class to support GET method. - - """ + """Mixin for class to support GET method.""" def get( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. 
Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) r = self.process_get(request, collection_name, deposit) status, content, content_type = r if content_type == "swh/generator": with content as path: return FileResponse( open(path, "rb"), status=status, content_type="application/tar" ) if content_type == "application/json": return HttpResponse( json.dumps(content), status=status, content_type=content_type ) return HttpResponse(content, status=status, content_type=content_type) @abstractmethod def process_get( self, request: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class APIPost(APIBase, metaclass=ABCMeta): - """Mixin for class to support POST method. - - """ + """Mixin for class to support POST method.""" def post( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) status, iri_key, receipt = self.process_post( request, headers, collection_name, deposit ) return self._make_deposit_receipt( - request, collection_name, status, iri_key, receipt, + request, + collection_name, + status, + iri_key, + receipt, ) def _make_deposit_receipt( self, request, collection_name: str, status: int, iri_key: str, receipt: Receipt, ) -> HttpResponse: """Returns an HttpResponse with a SWORD Deposit receipt as content.""" # Build the IRIs in the receipt args = [collection_name, receipt.deposit_id] iris = { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_IRI, CONT_FILE_IRI, SE_IRI, STATE_IRI] } context = { **attr.asdict(receipt), **iris, "packagings": ACCEPT_PACKAGINGS, } response = render( request, "deposit/deposit_receipt.xml", context=context, content_type="application/xml", status=status, ) response["Location"] = iris[iri_key] return response @abstractmethod def process_post( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_IRI, etc...) - Receipt """ pass class APIPut(APIBase, metaclass=ABCMeta): - """Mixin for class to support PUT method. - - """ + """Mixin for class to support PUT method.""" def put( # type: ignore self, request: Request, collection_name: str, deposit_id: int ) -> HttpResponse: """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ if deposit_id is None: deposit = None else: deposit = get_deposit_by_id(deposit_id, collection_name) headers = self.checks(request, collection_name, deposit) self.process_put(request, headers, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put( self, request: Request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class APIDelete(APIBase, metaclass=ABCMeta): - """Mixin for class to support DELETE method. - - """ + """Mixin for class to support DELETE method.""" def delete( # type: ignore self, request: Request, collection_name: str, deposit_id: Optional[int] = None ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ assert deposit_id is not None deposit = get_deposit_by_id(deposit_id, collection_name) self.checks(request, collection_name, deposit) self.process_delete(request, collection_name, deposit) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete( self, request: Request, collection_name: str, deposit: Deposit ) -> None: """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. 
.api.deposit_update.APIUpdateArchive) """ pass diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py index 7f222669..67e8bf85 100644 --- a/swh/deposit/api/edit.py +++ b/swh/deposit/api/edit.py @@ -1,142 +1,142 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.request import Request from swh.deposit.models import Deposit from swh.model.swhids import QualifiedSWHID from ..config import DEPOSIT_STATUS_LOAD_SUCCESS from ..errors import BAD_REQUEST, DepositError, ParserError from ..parsers import SWHAtomEntryParser, SWHMultiPartParser from .common import APIDelete, APIPut, ParsedRequestHeaders class EditAPI(APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. - What's known as 'Edit-IRI' in the sword specification. + What's known as 'Edit-IRI' in the sword specification. - HTTP verbs supported: PUT, DELETE + HTTP verbs supported: PUT, DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def restrict_access( self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit ) -> None: """Relax restriction access to allow metadata update on deposit with status "done" when a swhid is provided. """ if ( request.method == "PUT" and headers.swhid is not None and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS ): # Allow metadata update on deposit with status "done" when swhid provided return # otherwise, let the standard access restriction check occur super().restrict_access(request, headers, deposit) def process_put( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit, ) -> None: """This allows the following scenarios: - multipart: replace all the deposit (status partial) metadata and archive with the provided ones. 
- atom: replace all the deposit (status partial) metadata with the provided ones. - with swhid, atom: Add new metatada to deposit (status done) with provided ones and push such metadata to the metadata storage directly. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart Raises: 400 if any of the following occur: - the swhid provided and the deposit swhid do not match - the provided metadata xml file is malformed - the provided xml atom entry is empty - the provided swhid does not exist in the archive """ # noqa swhid = headers.swhid if swhid is None: if request.content_type.startswith("multipart/"): self._multipart_upload( request, headers, collection_name, deposit=deposit, replace_archives=True, replace_metadata=True, ) else: # standard metadata update (replace all metadata already provided to the # deposit by the new ones) self._atom_entry( request, headers, collection_name, deposit=deposit, replace_metadata=True, ) return # Update metadata on a deposit already ingested # Write to the metadata storage (and the deposit backend) # no ingestion triggered assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS if swhid != deposit.swhid: raise DepositError( BAD_REQUEST, f"Mismatched provided SWHID {swhid} with deposit's {deposit.swhid}.", "The provided SWHID does not match the deposit to update. " "Please ensure you send the correct deposit SWHID.", ) try: raw_metadata, metadata_tree = self._read_metadata(request.data) except ParserError: raise DepositError( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if len(metadata_tree) == 0: raise DepositError( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. 
" "If the body is empty, there is no metadata.", ) _, deposit, deposit_request = self._store_metadata_deposit( deposit, QualifiedSWHID.from_string(swhid), metadata_tree, raw_metadata, deposit.origin_url, ) def process_delete(self, req, collection_name: str, deposit: Deposit) -> None: """Delete the container (deposit). - source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa + source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ self._delete_deposit(collection_name, deposit) diff --git a/swh/deposit/api/edit_media.py b/swh/deposit/api/edit_media.py index 102cae40..b673ff75 100644 --- a/swh/deposit/api/edit_media.py +++ b/swh/deposit/api/edit_media.py @@ -1,100 +1,100 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Optional, Tuple from rest_framework import status from ..config import CONT_FILE_IRI from ..errors import BAD_REQUEST, DepositError from ..models import Deposit from ..parsers import SWHFileUploadTarParser, SWHFileUploadZipParser from .common import ( ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut, ParsedRequestHeaders, Receipt, ) class EditMediaAPI(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. - What's known as 'EM IRI' in the sword specification. + What's known as 'EM IRI' in the sword specification. - HTTP verbs supported: PUT, POST, DELETE + HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put( self, req, headers: ParsedRequestHeaders, collection_name: str, deposit: Deposit ) -> None: """Replace existing content for the existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) raise DepositError(BAD_REQUEST, msg) self._binary_upload( req, headers, collection_name, deposit=deposit, replace_archives=True ) def process_post( self, req, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Add new content to the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ assert deposit is not None if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) raise DepositError(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit), ) def process_delete(self, req, collection_name: str, deposit: Deposit) -> None: """Delete content (archives) from existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ self._delete_archives(collection_name, deposit) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py index 0adbd25c..acf91c02 100644 --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -1,83 +1,89 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Optional from rest_framework.permissions import AllowAny from rest_framework.views import APIView from ...config import METADATA_TYPE, APIConfig from ...models import Deposit, DepositRequest class DepositReadMixin: - """Deposit Read mixin - - """ + """Deposit Read mixin""" def _deposit_requests(self, deposit: Deposit, request_type: str): """Given a deposit, yields its associated deposit_request Args: deposit: Deposit to list requests for request_type: 'archive' or 'metadata' Yields: deposit requests of type request_type associated to the deposit, most recent first """ deposit_requests = DepositRequest.objects.filter( type=request_type, deposit=deposit ).order_by("-id") for deposit_request in deposit_requests: yield deposit_request def _metadata_get(self, deposit: Deposit) -> Optional[bytes]: """Retrieve the last non-empty raw metadata object for that deposit, if any Args: deposit: The deposit instance to extract metadata from """ for deposit_request in self._deposit_requests( deposit, request_type=METADATA_TYPE ): if deposit_request.raw_metadata is not None: return deposit_request.raw_metadata return None class APIPrivateView(APIConfig, APIView): """Mixin intended as private api (so no authentication) based API view - (for the private ones). + (for the private ones). 
""" def __init__(self): super().__init__() self.authentication_classes = () self.permission_classes = (AllowAny,) def checks(self, req, collection_name, deposit=None): - """Override default checks implementation to allow empty collection. - - """ + """Override default checks implementation to allow empty collection.""" headers = self._read_headers(req) self.additional_checks(req, headers, collection_name, deposit) return {"headers": headers} def get( - self, request, collection_name=None, deposit_id=None, *args, **kwargs, + self, + request, + collection_name=None, + deposit_id=None, + *args, + **kwargs, ): return super().get(request, collection_name, deposit_id) def put( - self, request, collection_name=None, deposit_id=None, *args, **kwargs, + self, + request, + collection_name=None, + deposit_id=None, + *args, + **kwargs, ): return super().put(request, collection_name, deposit_id) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index 90598dc8..a8807908 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,200 +1,206 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from itertools import chain import re from shutil import get_unpack_formats import tarfile from typing import Dict, Optional, Tuple from xml.etree import ElementTree import zipfile from rest_framework import status from rest_framework.request import Request from swh.scheduler.utils import create_oneshot_task_dict from . 
import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED from ...models import Deposit, DepositRequest from ..checks import check_metadata from ..common import APIGet MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) class APIChecks(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to trigger the deposit checks on deposit archives and metadata. Only GET is supported. """ def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns tuple (status, details): True, None if all archives are ok, (False, ) otherwise. 
""" requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) requests.reverse() if len(requests) == 0: # no associated archive is refused - return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} + return False, { + "archive": [ + { + "summary": MANDATORY_ARCHIVE_MISSING, + } + ] + } errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} def _check_archive( self, archive_request: DepositRequest ) -> Tuple[bool, Optional[str]]: """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. """ archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zipfile_: files = zipfile_.namelist() elif tarfile.is_tarfile(archive_path): with tarfile.open(archive_path) as tarfile_: files = tarfile_.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None def process_get( self, req: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Dict, str]: """Trigger the checks on the deposit archives and then on the deposit metadata. If any problems (or warnings) are raised, the deposit status and status detail are updated accordingly. 
If all checks are ok, the deposit status is updated to the 'verified' status (details updated with warning if any) and a loading task is scheduled for the deposit to be ingested. Otherwise, the deposit is marked as 'rejected' with the error details. A json response is returned to the caller with the deposit checks. Args: req: Client request collection_name: Collection owning the deposit deposit: Deposit concerned by the reading Returns: Tuple (status, json response, content-type) """ raw_metadata = self._metadata_get(deposit) details_dict: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status_ok, details = self._check_deposit_archives(deposit) if not archives_status_ok: assert details is not None details_dict.update(details) if raw_metadata is None: metadata_status_ok = False details_dict["metadata"] = [{"summary": "Missing Atom document"}] else: metadata_tree = ElementTree.fromstring(raw_metadata) metadata_status_ok, details = check_metadata(metadata_tree) # Ensure in case of error, we do have the rejection details assert metadata_status_ok or ( not metadata_status_ok and details is not None ) # we can have warnings even if checks are ok (e.g. 
missing suggested field) details_dict.update(details or {}) deposit_status_ok = archives_status_ok and metadata_status_ok # if any details_dict arose, the deposit is rejected deposit.status = ( DEPOSIT_STATUS_VERIFIED if deposit_status_ok else DEPOSIT_STATUS_REJECTED ) response: Dict = { "status": deposit.status, } if details_dict: deposit.status_detail = details_dict response["details"] = details_dict # Deposit ok, then we schedule the deposit loading task (if not already done) if deposit_status_ok and not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() return status.HTTP_200_OK, response, "application/json" diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index 40b8f0f6..fb88d9a4 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,219 +1,217 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager import os import shutil import tempfile from typing import Any, Dict, Optional, Tuple from xml.etree import ElementTree from rest_framework import status from swh.core import tarball from swh.deposit.utils import NAMESPACES, normalize_date from swh.model.hashutil import hash_to_hex from swh.model.model import MetadataAuthorityType from swh.model.swhids import CoreSWHID from . 
import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "tar", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get( self, request, collection_name: str, deposit: Deposit ) -> Tuple[int, Any, str]: """Build a unique tarball from the multiple received and stream that content to the client. 
Args: request (Request): collection_name: Collection owning the deposit deposit: Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE) ] return ( status.HTTP_200_OK, aggregate_tarballs(self.extraction_dir, archive_paths), "swh/generator", ) class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin): - """Class in charge of aggregating metadata on a deposit. - - """ + """Class in charge of aggregating metadata on a deposit.""" def _parse_dates( self, deposit: Deposit, metadata: ElementTree.Element ) -> Tuple[dict, dict]: """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date_elt = metadata.find("codemeta:datePublished", namespaces=NAMESPACES) author_date_elt = metadata.find("codemeta:dateCreated", namespaces=NAMESPACES) author_date: Any commit_date: Any if author_date_elt is None and commit_date_elt is None: author_date = commit_date = deposit.complete_date elif commit_date_elt is None: author_date = commit_date = author_date_elt.text # type: ignore elif author_date_elt is None: author_date = commit_date = commit_date_elt.text else: author_date = author_date_elt.text commit_date = commit_date_elt.text return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit: Deposit) -> Dict[str, Any]: """Read and aggregate multiple deposit information into one unified dictionary. 
Args: deposit: Deposit to retrieve information from Returns: Dictionary of deposit information read by the deposit loader, with the following keys: **origin** (Dict): Information about the origin **raw_metadata** (str): List of raw metadata received for the deposit **provider** (Dict): the metadata provider information about the deposit client **tool** (Dict): the deposit information **deposit** (Dict): deposit information relevant to build the revision (author_date, committer_date, etc...) """ raw_metadata = self._metadata_get(deposit) author_date: Optional[dict] commit_date: Optional[dict] if raw_metadata: metadata_tree = ElementTree.fromstring(raw_metadata) author_date, commit_date = self._parse_dates(deposit, metadata_tree) release_notes_elements = metadata_tree.findall( "codemeta:releaseNotes", namespaces=NAMESPACES ) else: author_date = commit_date = None release_notes_elements = [] if deposit.parent and deposit.parent.swhid: parent_swhid = deposit.parent.swhid assert parent_swhid is not None swhid = CoreSWHID.from_string(parent_swhid) parent_revision = hash_to_hex(swhid.object_id) parents = [parent_revision] else: parents = [] release_notes: Optional[str] if release_notes_elements: release_notes = "\n\n".join( element.text for element in release_notes_elements if element.text ) else: release_notes = None return { "origin": {"type": "deposit", "url": deposit.origin_url}, "provider": { "provider_name": deposit.client.last_name, "provider_url": deposit.client.provider_url, "provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value, "metadata": {}, }, "tool": self.tool, "raw_metadata": raw_metadata, "deposit": { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, "release_notes": release_notes, }, } def process_get( self, request, collection_name: str, deposit: Deposit ) -> Tuple[int, 
Dict, str]: data = self.metadata_read(deposit) return status.HTTP_200_OK, data if data else {}, "application/json" diff --git a/swh/deposit/api/sword_edit.py b/swh/deposit/api/sword_edit.py index e038a0c5..1378a5c1 100644 --- a/swh/deposit/api/sword_edit.py +++ b/swh/deposit/api/sword_edit.py @@ -1,81 +1,81 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Optional, Tuple from rest_framework import status from swh.storage import get_storage from swh.storage.interface import StorageInterface from ..config import EDIT_IRI, EM_IRI from ..models import Deposit from ..parsers import SWHAtomEntryParser, SWHMultiPartParser from .common import APIPost, ParsedRequestHeaders, Receipt class SwordEditAPI(APIPost): """Deposit request class defining api endpoints for sword deposit. - What's known as 'SE-IRI' in the sword specification. + What's known as 'SE-IRI' in the sword specification. - HTTP verbs supported: POST + HTTP verbs supported: POST """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def __init__(self): super().__init__() self.storage_metadata: StorageInterface = get_storage( **self.config["storage_metadata"] ) def process_post( self, request, headers: ParsedRequestHeaders, collection_name: str, deposit: Optional[Deposit] = None, ) -> Tuple[int, str, Receipt]: """Add new metadata/archive to existing deposit. This allows the following scenarios to occur: - multipart: Add new metadata and archive to a deposit in status partial with the provided ones. - empty atom: Allows to finalize a deposit in status partial (transition to deposited). 
source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#continueddeposit_complete Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ # noqa assert deposit is not None if request.content_type.startswith("multipart/"): receipt = self._multipart_upload( request, headers, collection_name, deposit=deposit ) return (status.HTTP_201_CREATED, EM_IRI, receipt) content_length = headers.content_length or 0 if content_length == 0 and headers.in_progress is False: # check for final empty post receipt = self._empty_post(request, headers, collection_name, deposit) return (status.HTTP_200_OK, EDIT_IRI, receipt) receipt = self._atom_entry(request, headers, collection_name, deposit=deposit) return (status.HTTP_201_CREATED, EM_IRI, receipt) diff --git a/swh/deposit/api/utils.py b/swh/deposit/api/utils.py index 7a9aff1b..43c8f370 100644 --- a/swh/deposit/api/utils.py +++ b/swh/deposit/api/utils.py @@ -1,36 +1,36 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework import serializers from rest_framework.fields import _UnvalidatedField from rest_framework.pagination import PageNumberPagination from swh.deposit.api.converters import convert_status_detail from swh.deposit.models import Deposit class DefaultPagination(PageNumberPagination): page_size = 100 page_size_query_param = "page_size" class StatusDetailField(_UnvalidatedField): """status_detail field 
is a dict, we want a simple message instead. - So, we reuse the convert_status_detail from deposit_status - endpoint to that effect. + So, we reuse the convert_status_detail from deposit_status + endpoint to that effect. """ def to_representation(self, value): return convert_status_detail(value) class DepositSerializer(serializers.ModelSerializer): status_detail = StatusDetailField() raw_metadata = _UnvalidatedField() class Meta: model = Deposit fields = "__all__" diff --git a/swh/deposit/auth.py b/swh/deposit/auth.py index 655d05a5..f1edd20f 100644 --- a/swh/deposit/auth.py +++ b/swh/deposit/auth.py @@ -1,186 +1,182 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Optional from django.core.cache import cache from django.utils import timezone from rest_framework import status from rest_framework.authentication import BasicAuthentication from rest_framework.exceptions import AuthenticationFailed from rest_framework.permissions import BasePermission from sentry_sdk import capture_exception from swh.auth.django.models import OIDCUser from swh.auth.django.utils import oidc_user_from_profile from swh.auth.keycloak import ( KeycloakError, KeycloakOpenIDConnect, keycloak_error_message, ) from swh.deposit.models import DepositClient from .errors import UNAUTHORIZED, make_error_response logger = logging.getLogger(__name__) OIDC_DEPOSIT_CLIENT_ID = "swh-deposit" DEPOSIT_PERMISSION = "swh.deposit.api" def convert_response(request, content): """Convert response from drf's basic authentication mechanism to a - swh-deposit one. + swh-deposit one. 
- Args: - request (Request): Use to build the response - content (bytes): The drf's answer + Args: + request (Request): Use to build the response + content (bytes): The drf's answer - Returns: + Returns: - Response with the same status error as before, only the - body is now an swh-deposit compliant one. + Response with the same status error as before, only the + body is now an swh-deposit compliant one. """ from json import loads content = loads(content.decode("utf-8")) detail = content.get("detail") if detail: verbose_description = "API is protected by basic authentication" else: detail = "API is protected by basic authentication" verbose_description = None response = make_error_response( request, UNAUTHORIZED, summary=detail, verbose_description=verbose_description ) response["WWW-Authenticate"] = 'Basic realm=""' return response class WrapBasicAuthenticationResponseMiddleware: """Middleware to capture potential authentication error and convert - them to standard deposit response. + them to standard deposit response. - This is to be installed in django's settings.py module. + This is to be installed in django's settings.py module. """ def __init__(self, get_response): super().__init__() self.get_response = get_response def __call__(self, request): response = self.get_response(request) if response.status_code is status.HTTP_401_UNAUTHORIZED: content_type = response.get("content-type") if content_type == "application/json": return convert_response(request, response.content) return response class HasDepositPermission(BasePermission): - """Allows access to authenticated users with the DEPOSIT_PERMISSION. - - """ + """Allows access to authenticated users with the DEPOSIT_PERMISSION.""" def has_permission(self, request, view): assert isinstance(request.user, DepositClient) return request.user.oidc_user.has_perm(DEPOSIT_PERMISSION) class KeycloakBasicAuthentication(BasicAuthentication): """Keycloack authentication against username/password. 
Deposit users will continue sending `Basic authentication` queries to the deposit server. Transparently, the deposit server will stop authenticate itself the users. It will delegate the authentication queries to the keycloak instance. Technically, reuses :class:`rest_framework.BasicAuthentication` and overrides the func:`authenticate_credentials` method to discuss with keycloak. As an implementation detail, this also uses the django cache mechanism to avoid too many authentication request to keycloak. """ _client: Optional[KeycloakOpenIDConnect] = None @property def client(self): if self._client is None: self._client = KeycloakOpenIDConnect.from_configfile( client_id=OIDC_DEPOSIT_CLIENT_ID ) return self._client def _cache_key(self, user_id: str) -> str: - """Internal key to use to store user id token. - - """ + """Internal key to use to store user id token.""" return f"oidc_user_{self.client.realm_name}_{self.client.client_id}_{user_id}" def get_user(self, user_id: str) -> Optional[OIDCUser]: - """Retrieve user from cache if any. - - """ + """Retrieve user from cache if any.""" oidc_profile = cache.get(self._cache_key(user_id)) if oidc_profile: try: return oidc_user_from_profile(self.client, oidc_profile) except Exception as e: logger.warning("Error during cache token retrieval: %s", e) capture_exception(e) return None def authenticate_credentials(self, user_id, password, request): """Authenticate the user_id/password against keycloak. Raises: AuthenticationFailed in case of authentication failure Returns: Tuple of deposit_client, None. 
""" oidc_user = self.get_user(user_id) ttl: Optional[int] = None if not oidc_user: try: oidc_profile = self.client.login(user_id, password) except KeycloakError as e: logger.debug("KeycloakError: e: %s", e) error_msg = keycloak_error_message(e) raise AuthenticationFailed(error_msg) oidc_user = oidc_user_from_profile(self.client, oidc_profile) ttl = int( oidc_user.refresh_expires_at.timestamp() - timezone.now().timestamp() ) # Making sure the associated deposit client is correctly configured in backend try: deposit_client = DepositClient.objects.get(username=user_id) except DepositClient.DoesNotExist: raise AuthenticationFailed(f"Unknown user {user_id}") if not deposit_client.is_active: raise AuthenticationFailed(f"Deactivated user {user_id}") deposit_client.oidc_user = oidc_user if ttl: # cache the oidc_profile user while it's valid cache.set( - self._cache_key(user_id), oidc_profile, timeout=max(0, ttl), + self._cache_key(user_id), + oidc_profile, + timeout=max(0, ttl), ) return (deposit_client, None) diff --git a/swh/deposit/cli/__init__.py b/swh/deposit/cli/__init__.py index 0e110ce4..663035c5 100644 --- a/swh/deposit/cli/__init__.py +++ b/swh/deposit/cli/__init__.py @@ -1,44 +1,43 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group logger = logging.getLogger(__name__) @swh_cli_group.group(context_settings=CONTEXT_SETTINGS) @click.pass_context def deposit(ctx): - """Deposit main command - """ + """Deposit main command""" ctx.ensure_object(dict) log_level = ctx.obj.get("log_level", logging.INFO) logger.setLevel(log_level) def main(): logging.basicConfig() return 
deposit(auto_envvar_prefix="SWH_DEPOSIT") # These import statements MUST be executed after defining the 'deposit' group # since the subcommands in these are defined using this 'deposit' group. from . import client # noqa try: from . import admin # noqa except ImportError: # server part is optional logger.debug("admin subcommand not loaded") if __name__ == "__main__": main() diff --git a/swh/deposit/cli/admin.py b/swh/deposit/cli/admin.py index 2c55a996..594ca177 100644 --- a/swh/deposit/cli/admin.py +++ b/swh/deposit/cli/admin.py @@ -1,287 +1,289 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control from __future__ import annotations from typing import TYPE_CHECKING import click from swh.deposit.cli import deposit if TYPE_CHECKING: from swh.deposit.models import DepositCollection @deposit.group("admin") @click.option( "--config-file", "-C", default=None, - type=click.Path(exists=True, dir_okay=False,), + type=click.Path( + exists=True, + dir_okay=False, + ), help="Optional extra configuration file.", ) @click.option( "--platform", default="development", type=click.Choice(["development", "production"]), help="development or production platform", ) @click.pass_context def admin(ctx, config_file: str, platform: str): """Server administration tasks (manipulate user or collections)""" from swh.deposit.config import setup_django_for # configuration happens here setup_django_for(platform, config_file=config_file) @admin.group("user") @click.pass_context def user(ctx): """Manipulate user.""" # configuration happens here pass def _create_collection(name: str) -> DepositCollection: """Create the collection with name if it does not exist. 
Args: name: collection name Returns: collection: the existing collection object """ # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection try: collection = DepositCollection.objects.get(name=name) click.echo(f"Collection '{name}' exists, skipping.") except DepositCollection.DoesNotExist: click.echo(f"Create collection '{name}'.") collection = DepositCollection.objects.create(name=name) click.echo(f"Collection '{name}' created.") return collection @user.command("create") @click.option("--username", required=True, help="User's name") @click.option("--password", help="(Deprecated) Desired user password (plain).") @click.option("--firstname", default="", help="User's first name") @click.option("--lastname", default="", help="User's last name") @click.option("--email", default="", help="User's email") @click.option("--collection", help="User's collection") @click.option("--provider-url", default="", help="Provider URL") @click.option("--domain", default="", help="The domain") @click.pass_context def user_create( ctx, username: str, password: str, firstname: str, lastname: str, email: str, collection: str, provider_url: str, domain: str, ): """Create a user with some needed information (password, collection) If the collection does not exist, the collection is then created alongside. The password is stored encrypted using django's utilities. 
""" # to avoid loading too early django namespaces from swh.deposit.models import DepositClient # If collection is not provided, fallback to username if not collection: collection = username # create the collection if it does not exist collection_ = _create_collection(collection) # user create/update try: user = DepositClient.objects.get(username=username) # type: ignore click.echo(f"Update user '{username}'.") action_done = "updated" except DepositClient.DoesNotExist: click.echo(f"Create user '{username}'.") user = DepositClient(username=username) user.save() action_done = "created" if password: user.set_password(password) user.collections = [collection_.id] user.first_name = firstname user.last_name = lastname user.email = email user.is_active = True user.provider_url = provider_url user.domain = domain user.save() click.echo(f"User '{username}' {action_done}.") @user.command("list") @click.pass_context def user_list(ctx): """List existing users. - This entrypoint is not paginated yet as there is not a lot of - entry. + This entrypoint is not paginated yet as there is not a lot of + entry. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient users = DepositClient.objects.all() if not users: output = "Empty user list" else: output = "\n".join((user.username for user in users)) click.echo(output) @user.command("exists") @click.argument("username", required=True) @click.pass_context def user_exists(ctx, username: str): - """Check if user exists. 
- """ + """Check if user exists.""" # to avoid loading too early django namespaces from swh.deposit.models import DepositClient try: DepositClient.objects.get(username=username) # type: ignore click.echo(f"User {username} exists.") ctx.exit(0) except DepositClient.DoesNotExist: click.echo(f"User {username} does not exist.") ctx.exit(1) @admin.group("collection") @click.pass_context def collection(ctx): """Manipulate collections.""" pass @collection.command("create") @click.option("--name", required=True, help="Collection's name") @click.pass_context def collection_create(ctx, name): _create_collection(name) @collection.command("list") @click.pass_context def collection_list(ctx): """List existing collections. - This entrypoint is not paginated yet as there is not a lot of - entry. + This entrypoint is not paginated yet as there is not a lot of + entry. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection collections = DepositCollection.objects.all() if not collections: output = "Empty collection list" else: output = "\n".join((col.name for col in collections)) click.echo(output) @admin.group("deposit") @click.pass_context def adm_deposit(ctx): """Manipulate deposit.""" pass @adm_deposit.command("reschedule") @click.option("--deposit-id", required=True, help="Deposit identifier") @click.pass_context def adm_deposit_reschedule(ctx, deposit_id): """Reschedule the deposit loading This will: - check the deposit's status to something reasonable (failed or done). That means that the checks have passed alright but something went wrong during the loading (failed: loading failed, done: loading ok, still for some reasons as in bugs, we need to reschedule it) - reset the deposit's status to 'verified' (prior to any loading but after the checks which are fine) and removes the different archives' identifiers (swh-id, ...) 
- trigger back the loading task through the scheduler """ # to avoid loading too early django namespaces import datetime from swh.deposit.config import ( DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_VERIFIED, APIConfig, ) from swh.deposit.models import Deposit try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: click.echo(f"Deposit {deposit_id} does not exist.") ctx.exit(1) # Check the deposit is in a reasonable state accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE] if deposit.status == DEPOSIT_STATUS_VERIFIED: click.echo(f"Deposit {deposit_id} already set for rescheduling.") ctx.exit(0) if deposit.status not in accepted_statuses: click.echo( f"Deposit {deposit_id} cannot be rescheduled (status: {deposit.status}).\n" "Rescheduling deposit is only accepted for deposit with status: " f"{', '.join(accepted_statuses)}." ) ctx.exit(1) task_id = deposit.load_task_id if not task_id: click.echo( f"Deposit {deposit_id} cannot be rescheduled. It misses the " "associated scheduler task id (field load_task_id)." 
) ctx.exit(1) # Reset the deposit's state deposit.swhid = None deposit.swhid_context = None deposit.status = DEPOSIT_STATUS_VERIFIED deposit.save() # Schedule back the deposit loading task scheduler = APIConfig().scheduler scheduler.set_status_tasks( [task_id], status="next_run_not_scheduled", next_run=datetime.datetime.now(tz=datetime.timezone.utc), ) diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py index 7df3e7b0..2ec34fce 100644 --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -1,648 +1,634 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from contextlib import contextmanager from datetime import datetime, timezone import logging # WARNING: do not import unnecessary things here to keep cli startup time under # control import os import sys from typing import TYPE_CHECKING, Any, Collection, Dict, List, Optional import warnings import xml.etree.ElementTree as ET import click from swh.deposit.cli import deposit from swh.deposit.utils import NAMESPACES as NS logger = logging.getLogger(__name__) if TYPE_CHECKING: from swh.deposit.client import PublicApiDepositClient class InputError(ValueError): - """Input script error - - """ + """Input script error""" pass @contextmanager def trap_and_report_exceptions(): - """Trap and report exceptions (InputError, MaintenanceError) in a unified way. - - """ + """Trap and report exceptions (InputError, MaintenanceError) in a unified way.""" from swh.deposit.client import MaintenanceError try: yield except InputError as e: logger.error("Problem during parsing options: %s", e) sys.exit(1) except MaintenanceError as e: logger.error(e) sys.exit(1) def _url(url: str) -> str: """Force the /1 api version at the end of the url (avoiding confusing issues without it). 
Args: url (str): api url used by cli users Returns: Top level api url to actually request """ if not url.endswith("/1"): url = "%s/1" % url return url def generate_metadata( deposit_client: str, name: str, authors: List[str], external_id: Optional[str] = None, create_origin: Optional[str] = None, metadata_provenance_url: Optional[str] = None, ) -> str: """Generate sword compliant xml metadata with the minimum required metadata. The Atom spec, https://tools.ietf.org/html/rfc4287, says that: - atom:entry elements MUST contain one or more atom:author elements - atom:entry elements MUST contain exactly one atom:title element. - atom:entry elements MUST contain exactly one atom:updated element. However, we are also using CodeMeta, so we want some basic information to be mandatory. Therefore, we generate the following mandatory fields: - http://www.w3.org/2005/Atom#updated - http://www.w3.org/2005/Atom#author - http://www.w3.org/2005/Atom#title - https://doi.org/10.5063/SCHEMA/CODEMETA-2.0#name (yes, in addition to http://www.w3.org/2005/Atom#title, even if they have somewhat the same meaning) - https://doi.org/10.5063/SCHEMA/CODEMETA-2.0#author Args: deposit_client: Deposit client username, name: Software name authors: List of author names create_origin: Origin concerned by the deposit metadata_provenance_url: Provenance metadata url Returns: metadata xml string """ # generate a metadata file with the minimum required metadata document = ET.Element(f"{{{NS['atom']}}}entry") now = datetime.now(tz=timezone.utc) ET.SubElement(document, f"{{{NS['atom']}}}updated").text = str(now) ET.SubElement(document, f"{{{NS['atom']}}}author").text = deposit_client ET.SubElement(document, f"{{{NS['atom']}}}title").text = name ET.SubElement(document, f"{{{NS['codemeta']}}}name").text = name for author_name in authors: author = ET.SubElement(document, f"{{{NS['codemeta']}}}author") ET.SubElement(author, f"{{{NS['codemeta']}}}name").text = author_name if external_id: ET.SubElement(document, 
f"{{{NS['codemeta']}}}identifier").text = external_id swh_deposit_elt = ET.Element(f"{{{NS['swh']}}}deposit") if create_origin: elt = ET.SubElement(swh_deposit_elt, f"{{{NS['swh']}}}create_origin") ET.SubElement(elt, f"{{{NS['swh']}}}origin").set("url", create_origin) if metadata_provenance_url: elt = ET.SubElement(swh_deposit_elt, f"{{{NS['swh']}}}metadata-provenance") ET.SubElement(elt, f"{{{NS['schema']}}}url").text = metadata_provenance_url if len(swh_deposit_elt): document.append(swh_deposit_elt) s = ET.tostring(document, encoding="utf-8").decode() logging.debug("Atom entry dict to generate as xml: %s", s) return s def _collection(client: PublicApiDepositClient) -> str: - """Retrieve the client's collection - - """ + """Retrieve the client's collection""" # retrieve user's collection sd_content = client.service_document() if "error" in sd_content: msg = sd_content["error"] raise InputError(f"Service document retrieval: {msg}") collection = sd_content["app:service"]["app:workspace"][0]["app:collection"][ "sword:name" ] return collection def client_command_parse_input( client, username: str, archive: Optional[str], metadata: Optional[str], collection: Optional[str], slug: Optional[str], create_origin: Optional[str], metadata_provenance_url: Optional[str], partial: bool, deposit_id: Optional[int], swhid: Optional[str], replace: bool, url: str, name: Optional[str], authors: List[str], temp_dir: str, ) -> Dict[str, Any]: """Parse the client subcommand options and make sure the combination is acceptable*. If not, an InputError exception is raised explaining the issue. 
By acceptable, we mean: - A multipart deposit (create or update) requires: - an existing software archive - an existing metadata file or author(s) and name provided in params - A binary deposit (create/update) requires an existing software archive - A metadata deposit (create/update) requires an existing metadata file or author(s) and name provided in params - A deposit update requires a deposit_id This will not prevent all failure cases though. The remaining errors are already dealt with by the underlying api client. Raises: InputError explaining the user input related issue MaintenanceError explaining the api status Returns: dict with the following keys: "archive": the software archive to deposit "username": username "metadata": the metadata file to deposit "collection": the user's collection under which to put the deposit "create_origin": the origin concerned by the deposit "metadata_provenance_url": the metadata provenance url "in_progress": if the deposit is partial or not "url": deposit's server main entry point "deposit_id": optional deposit identifier "swhid": optional deposit swhid "replace": whether the given deposit is to be replaced or not """ if not metadata: if name and authors: metadata_path = os.path.join(temp_dir, "metadata.xml") logging.debug("Temporary file: %s", metadata_path) metadata_xml = generate_metadata( username, name, authors, external_id=slug, create_origin=create_origin, metadata_provenance_url=metadata_provenance_url, ) logging.debug("Metadata xml generated: %s", metadata_xml) with open(metadata_path, "w") as f: f.write(metadata_xml) metadata = metadata_path elif archive is not None and not partial and not deposit_id: # If we meet all the following conditions: # * this is not an archive-only deposit request # * it is not part of a multipart deposit (either create/update # or finish) # * it misses either name or authors raise InputError( "For metadata deposit request, either a metadata file with " "--metadata or both --author and 
--name must be provided. " ) elif name or authors: # If we are generating metadata, then all mandatory metadata # must be present raise InputError( "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided." ) else: # TODO: this is a multipart deposit, we might want to check that # metadata are deposited at some point pass elif name or authors or create_origin: raise InputError( "Using --metadata flag is incompatible with " "--author and --name and --create-origin (those are used to generate one " "metadata file)." ) if not archive and not metadata: raise InputError( "Please provide an actionable command. See --help for more information" ) if metadata: from xml.etree import ElementTree from swh.deposit.utils import ( parse_swh_deposit_origin, parse_swh_metadata_provenance, ) metadata_tree = ElementTree.fromstring(open(metadata).read()) (create_origin, add_to_origin) = parse_swh_deposit_origin(metadata_tree) if create_origin and add_to_origin: logger.error( "The metadata file provided must not contain both " '"" and "" tags', ) elif not create_origin and not add_to_origin: logger.warning( "The metadata file provided should contain " '"" or "" tag', ) meta_prov_url = parse_swh_metadata_provenance(metadata_tree) if not meta_prov_url: logger.warning( "The metadata file provided should contain " '"" tag' ) if replace and not deposit_id: raise InputError("To update an existing deposit, you must provide its id") if not collection: collection = _collection(client) return { "archive": archive, "username": username, "metadata": metadata, "collection": collection, "slug": slug, "in_progress": partial, "url": url, "deposit_id": deposit_id, "swhid": swhid, "replace": replace, } def _subdict(d: Dict[str, Any], keys: Collection[str]) -> Dict[str, Any]: "return a dict from d with only given keys" return {k: v for k, v in d.items() if k in keys} def credentials_decorator(f): - """Add default --url, --username and 
def output_format_decorator(f):
    """Attach the ``-f/--format`` output option to the click command ``f``.

    The chosen value is passed to the command as ``output_format`` and must be
    one of "logging" (the default), "yaml" or "json".

    """
    supported_formats = click.Choice(["logging", "yaml", "json"])
    add_format_option = click.option(
        "-f",
        "--format",
        "output_format",
        default="logging",
        type=supported_formats,
        help="Output format results.",
    )
    return add_format_option(f)
), ) @click.option( "--partial/--no-partial", default=False, help=( "(Optional) The deposit will be partial, other deposits " "will have to take place to finalize it." ), ) @click.option( "--deposit-id", default=None, help="(Optional) Update an existing partial deposit with its identifier", ) @click.option( "--swhid", default=None, help="(Optional) Update existing completed deposit (status done) with new metadata", ) @click.option( "--replace/--no-replace", default=False, help="(Optional) Update by replacing existing metadata to a deposit", ) @click.option("--verbose/--no-verbose", default=False, help="Verbose mode") @click.option("--name", help="Software name") @click.option( "--author", multiple=True, help="Software author(s), this can be repeated as many times" " as there are authors", ) @output_format_decorator @click.pass_context def upload( ctx, username: str, password: str, archive: Optional[str], metadata: Optional[str], archive_deposit: bool, metadata_deposit: bool, collection: Optional[str], slug: Optional[str], create_origin: Optional[str], metadata_provenance_url: Optional[str], partial: bool, deposit_id: Optional[int], swhid: Optional[str], replace: bool, url: str, verbose: bool, name: Optional[str], author: List[str], output_format: Optional[str], ): """Software Heritage Public Deposit Client - Create/Update deposit through the command line. + Create/Update deposit through the command line. -More documentation can be found at -https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. + More documentation can be found at + https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. 
""" import tempfile from swh.deposit.client import PublicApiDepositClient if archive_deposit or metadata_deposit: warnings.warn( '"archive_deposit" and "metadata_deposit" option arguments are ' "deprecated and have no effect; simply do not provide the archive " "for a metadata-only deposit, and do not provide a metadata for a" "archive-only deposit.", DeprecationWarning, ) if slug: if create_origin and slug != create_origin: raise InputError( '"--slug" flag has been deprecated in favor of "--create-origin" flag. ' "You mentioned both with different values, please only " 'use "--create-origin".' ) warnings.warn( '"--slug" flag has been deprecated in favor of "--create-origin" flag. ' 'Please, start using "--create-origin" instead of "--slug"', DeprecationWarning, ) url = _url(url) client = PublicApiDepositClient(url=url, auth=(username, password)) with tempfile.TemporaryDirectory() as temp_dir: with trap_and_report_exceptions(): logger.debug("Parsing cli options") config = client_command_parse_input( client, username, archive, metadata, collection, slug, create_origin, metadata_provenance_url, partial, deposit_id, swhid, replace, url, name, author, temp_dir, ) if verbose: logger.info("Parsed configuration: %s", config) keys = [ "archive", "collection", "in_progress", "metadata", "slug", ] if config["deposit_id"]: keys += ["deposit_id", "replace", "swhid"] data = client.deposit_update(**_subdict(config, keys)) else: data = client.deposit_create(**_subdict(config, keys)) print_result(data, output_format) @deposit.command() @credentials_decorator @click.option("--deposit-id", default=None, required=True, help="Deposit identifier.") @output_format_decorator @click.pass_context def status(ctx, url, username, password, deposit_id, output_format): - """Deposit's status - - """ + """Deposit's status""" from swh.deposit.client import PublicApiDepositClient url = _url(url) logger.debug("Status deposit") with trap_and_report_exceptions(): client = 
def print_result(data: Dict[str, Any], output_format: Optional[str]) -> None:
    """Render ``data`` according to ``output_format``.

    "json" and "yaml" serialize ``data`` and echo it through click; any other
    value (including None) sends ``data`` to the module logger at INFO level.

    """
    import json

    import yaml

    if output_format not in ("json", "yaml"):
        logger.info(data)
        return

    if output_format == "json":
        rendered = json.dumps(data)
    else:
        rendered = yaml.dump(data)
    click.echo(rendered)
def compute_unified_information(
    collection: str,
    in_progress: bool,
    slug: str,
    *,
    filepath: Optional[str] = None,
    swhid: Optional[str] = None,
    **kwargs,
) -> Dict[str, Any]:
    """Given a filepath, compute necessary information on that file.

    Args:
        collection: Deposit collection (accepted so callers can forward their
          positional arguments unchanged; not part of the result)
        in_progress: value stored as-is under the 'in_progress' key
        slug: external id to use
        filepath: Path to the file to compute the necessary information out of
        swhid: Deposit swhid if any
        **kwargs: ignored

    Returns:
        dict with keys:

            'slug': external id to use
            'in_progress': the in_progress flag received
            'swhid': deposit swhid

        and, only when ``filepath`` is provided:

            'content-type': "application/zip" when the last dot-separated part
              of the file name contains "zip", "application/x-tar" otherwise
            'md5sum': hexadecimal md5 digest of the file content
            'filename': base name of ``filepath``
            'filepath': filepath

    """
    result: Dict[str, Any] = {
        "slug": slug,
        "in_progress": in_progress,
        "swhid": swhid,
    }

    if filepath:
        filename = os.path.basename(filepath)

        # Hash in bounded chunks inside a context manager: the file handle is
        # always released and large archives are never held in memory at once.
        digest = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                digest.update(chunk)

        extension = filename.split(".")[-1]
        if "zip" in extension:
            content_type = "application/zip"
        else:
            content_type = "application/x-tar"

        result.update(
            {
                "content-type": content_type,
                "md5sum": digest.hexdigest(),
                "filename": filename,
                "filepath": filepath,
            }
        )

    return result
def archive_get(self, archive_update_url: str, archive: str) -> Optional[str]:
    """Retrieve the archive from the deposit to a local directory.

    Args:
        archive_update_url (str): The full deposit archive(s)'s raw content
          to retrieve locally
        archive (str): the local archive's path where to store the raw content

    Returns:
        The archive path to the local archive to load.

    Raises:
        ValueError: when the server does not answer with a successful status.

    """
    response = self.do("get", archive_update_url, stream=True)
    if not response.ok:
        msg = "Problem when retrieving deposit archive at %s" % (archive_update_url,)
        logger.error(msg)
        raise ValueError(msg)

    with open(archive, "wb") as f:
        # requests' iter_content defaults to chunk_size=1, i.e. one write per
        # byte; ask for sizeable chunks so large archives download quickly.
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    return archive
class BaseDepositClient(BaseApiDepositClient):
    """Base Deposit client to access the public api.

    Subclasses describe one endpoint through compute_url, compute_method and
    parse_result_ok; execute() chains them into a single http exchange.

    """

    def __init__(
        self, config=None, url=None, auth=None, error_msg=None, empty_result=None
    ):
        """
        Args:
            config: deprecated configuration dict, forwarded to the parent class
            url: deposit server url, forwarded to the parent class
            auth: credentials, forwarded to the parent class
            error_msg: %-style template formatted with (url, exception) when
              the http query itself raises
            empty_result: default entries merged into every failure result

        """
        super().__init__(url=url, auth=auth, config=config)
        self.error_msg = error_msg
        # Private copy: neither a shared default argument nor the caller's
        # dict may be altered by this client.
        self.empty_result = dict(empty_result) if empty_result else {}

    def compute_url(self, *args, **kwargs):
        """Compute api url endpoint to query."""
        raise NotImplementedError

    def compute_method(self, *args, **kwargs):
        """Http method to use on the url"""
        raise NotImplementedError

    def parse_result_ok(
        self, xml_content: str, headers: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """Given an xml result from the api endpoint, parse it and returns a
        dict.

        """
        raise NotImplementedError

    def compute_information(self, *args, **kwargs) -> Dict[str, Any]:
        """Compute some more information given the inputs (e.g http headers,
        ...)

        """
        return {}

    def parse_result_error(self, xml_content: str) -> Dict[str, Any]:
        """Given an error response in xml, parse it into a dict.

        Returns:
            dict with following keys:

                'summary': text of the atom:summary element (None if absent)
                'detail': stripped text of the detail element ('' if absent)
                'sword:verboseDescription': stripped text of the
                  sword:verboseDescription element ('' if absent)

        Raises:
            xml.etree.ElementTree.ParseError: if xml_content is not valid xml

        """
        data = ElementTree.fromstring(xml_content)
        return {
            "summary": data.findtext("atom:summary", namespaces=NAMESPACES),
            "detail": data.findtext("detail", "", namespaces=NAMESPACES).strip(),
            "sword:verboseDescription": data.findtext(
                "sword:verboseDescription", "", namespaces=NAMESPACES
            ).strip(),
        }

    def do_execute(self, method: str, url: str, info: Dict, **kwargs) -> Response:
        """Execute the http query to url using method and info information.

        By default, execute a simple query to url with the http method. Override this in
        subclass to improve the default behavior if needed.

        """
        return self.do(method, url, **kwargs)

    def compute_params(self, **kwargs) -> Dict[str, Any]:
        """Determine the params out of the kwargs"""
        return {}

    def execute(self, *args, **kwargs) -> Dict[str, Any]:
        """Main endpoint to prepare and execute the http query to the api.

        Raises:
            MaintenanceError if some api maintenance is happening.

        Returns:
            Dict of computed api data. On failure, the dict holds the
            empty_result entries plus either an "error" message (the query
            raised) or the parsed error and the http "status" (error response).

        """
        url = self.compute_url(*args, **kwargs)
        method = self.compute_method(*args, **kwargs)
        info = self.compute_information(*args, **kwargs)
        params = self.compute_params(**kwargs)

        try:
            response = self.do_execute(method, url, info, params=params)
        except Exception as e:
            # Work on a copy: updating self.empty_result in place would leak
            # this "error" entry into every later result of the client.
            result = dict(self.empty_result)
            result["error"] = self.error_msg % (url, e)
            return result

        if response.ok:
            if int(response.status_code) == 204:  # 204 returns no body
                return {"status": response.status_code}
            headers = dict(response.headers) if response.headers else None
            return self.parse_result_ok(response.text, headers)

        try:
            error = self.parse_result_error(response.text)
        except ElementTree.ParseError:
            logger.warning(
                "Error message in response is not xml parsable: %s",
                response.text,
            )
            error = {}
        error.update(self.empty_result)

        if response.status_code == 503:
            summary = error.get("summary")
            detail = error.get("sword:verboseDescription")
            # Maintenance error
            if summary and detail:
                raise MaintenanceError(f"{summary}: {detail}")

        error["status"] = response.status_code
        return error
class StatusDepositClient(BaseDepositClient):
    """Client reading the status endpoint of one deposit."""

    def __init__(self, config=None, url=None, auth=None):
        # Entries merged into failure results by the base class.
        status_placeholders = {
            "deposit_status": None,
            "deposit_status_detail": None,
            "deposit_swh_id": None,
        }
        super().__init__(
            url=url,
            auth=auth,
            config=config,
            error_msg="Status check failure at %s: %s",
            empty_result=status_placeholders,
        )

    def compute_url(self, collection, deposit_id):
        """Status endpoint of ``deposit_id`` within ``collection``."""
        endpoint = "/%s/%s/status/"
        return endpoint % (collection, deposit_id)

    def compute_method(self, *args, **kwargs):
        """The status endpoint is read with a GET."""
        return "get"

    def parse_result_ok(
        self, xml_content: str, headers: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """Extract the deposit fields out of the xml status document.

        Each returned key maps to the text of the matching ``swh:<key>``
        element, or None when the element is absent.

        """
        document = ElementTree.fromstring(xml_content)
        deposit: Dict[str, Any] = {}
        for field in (
            "deposit_id",
            "deposit_status",
            "deposit_status_detail",
            "deposit_swh_id",
            "deposit_swh_id_context",
            "deposit_external_id",
        ):
            deposit[field] = document.findtext("swh:" + field, namespaces=NAMESPACES)
        return deposit
class BaseCreateDepositClient(BaseDepositClient):
    """Common ground of the clients posting a new deposit."""

    def __init__(self, config=None, url=None, auth=None):
        # Entries merged into failure results by the base class.
        creation_placeholders = {
            "swh:deposit_id": None,
            "swh:deposit_status": None,
        }
        super().__init__(
            url=url,
            auth=auth,
            config=config,
            error_msg="Post Deposit failure at %s: %s",
            empty_result=creation_placeholders,
        )

    def compute_url(self, collection, *args, **kwargs):
        """Endpoint of the collection receiving the deposit."""
        return "/%s/" % collection

    def compute_method(self, *args, **kwargs):
        """Deposits are created with a POST."""
        return "post"

    def parse_result_ok(
        self, xml_content: str, headers: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """Extract the deposit fields out of the xml deposit receipt.

        Each returned key maps to the text of the matching ``swh:<key>``
        element, or None when the element is absent.

        """
        receipt = ElementTree.fromstring(xml_content)
        deposit: Dict[str, Any] = {}
        for field in (
            "deposit_id",
            "deposit_status",
            "deposit_status_detail",
            "deposit_date",
        ):
            deposit[field] = receipt.findtext("swh:" + field, namespaces=NAMESPACES)
        return deposit

    def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]:
        """Default headers: the information dict itself."""
        return info

    def do_execute(self, method, url, info, **kwargs):
        """Send the file at ``info["filepath"]`` as the request body.

        Extra keyword arguments are accepted but not forwarded.

        """
        with open(info["filepath"], "rb") as payload:
            response = self.do(method, url, data=payload, headers=info["headers"])
            return response
class UpdateMetadataOnDoneDepositClient(CreateMetadataDepositClient):
    """Client updating the metadata of a "done" deposit.

    The deposit swhid is mandatory: it is sent in the X_CHECK_SWHID header.

    """

    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        """Atom endpoint of ``deposit_id`` within ``collection``."""
        endpoint = f"/{collection}/{deposit_id}/atom/"
        return endpoint

    def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]:
        """Atom entry content type plus the swhid read from ``info``."""
        headers: Dict[str, Any] = {}
        headers["CONTENT-TYPE"] = "application/atom+xml;type=entry"
        headers["X_CHECK_SWHID"] = info["swhid"]
        return headers

    def compute_method(self, *args, **kwargs) -> str:
        """This update always uses PUT."""
        return "put"
class CreateMultipartDepositClient(BaseCreateDepositClient):
    """Create a multipart deposit client."""

    def _multipart_info(self, info, info_meta):
        """Build the multipart ``files`` payload and the request headers.

        Args:
            info: unified information on the archive file
            info_meta: unified information on the metadata file

        Returns:
            tuple (files, headers). ``files`` holds two open binary file
            objects ("file" and "atom" parts); do_execute closes them once the
            request has been sent.

        """
        headers = {
            "CONTENT_MD5": info["md5sum"],
            "IN-PROGRESS": str(info["in_progress"]),
        }
        if "slug" in info:
            headers["SLUG"] = info["slug"]

        archive_name, archive_type = info["filename"], info["content-type"]
        metadata_name = info_meta["filename"]

        archive_file = open(info["filepath"], "rb")
        try:
            metadata_file = open(info_meta["filepath"], "rb")
        except Exception:
            # Do not leak the first handle when the second open fails.
            archive_file.close()
            raise

        files = [
            ("file", (archive_name, archive_file, archive_type)),
            ("atom", (metadata_name, metadata_file, "application/atom+xml")),
        ]
        return files, headers

    def compute_information(self, *args, **kwargs) -> Dict[str, Any]:
        """Gather the multipart payload for the archive and metadata files.

        Expects the ``archive_path`` and ``metadata_path`` keyword arguments.

        """
        info = compute_unified_information(
            *args,
            filepath=kwargs["archive_path"],
        )
        info_meta = compute_unified_information(
            *args,
            filepath=kwargs["metadata_path"],
        )
        files, headers = self._multipart_info(info, info_meta)
        return {"files": files, "headers": headers}

    def do_execute(self, method, url, info, **kwargs):
        """Send the multipart request, then release the uploaded files.

        Extra keyword arguments are accepted but not forwarded.

        """
        try:
            return self.do(method, url, files=info["files"], headers=info["headers"])
        finally:
            # The handles opened by _multipart_info are only needed while the
            # request is sent; close them whatever the outcome.
            for _part, (_name, fileobj, _content_type) in info["files"]:
                fileobj.close()
def deposit_create(
    self,
    collection: str,
    slug: Optional[str],
    archive: Optional[str] = None,
    metadata: Optional[str] = None,
    in_progress: bool = False,
):
    """Create a new deposit (archive, metadata, both as multipart).

    An archive alone goes through the archive client and a metadata file alone
    through the metadata client. Every other combination is sent as a
    multipart deposit.

    """
    has_archive = bool(archive)
    has_metadata = bool(metadata)

    if has_archive == has_metadata:
        multipart_client = CreateMultipartDepositClient(
            url=self.base_url, auth=self.auth
        )
        return multipart_client.execute(
            collection,
            in_progress,
            slug,
            archive_path=archive,
            metadata_path=metadata,
        )

    if has_archive:
        archive_client = CreateArchiveDepositClient(url=self.base_url, auth=self.auth)
        return archive_client.execute(
            collection, in_progress, slug, archive_path=archive
        )

    metadata_client = CreateMetadataDepositClient(url=self.base_url, auth=self.auth)
    return metadata_client.execute(
        collection, in_progress, slug, metadata_path=metadata
    )
deposit_id, } if archive and not metadata: result = UpdateArchiveDepositClient( url=self.base_url, auth=self.auth ).execute( collection, in_progress, slug, deposit_id=deposit_id, archive_path=archive, replace=replace, ) elif not archive and metadata and swhid is None: result = UpdateMetadataOnPartialDepositClient( url=self.base_url, auth=self.auth ).execute( collection, in_progress, slug, deposit_id=deposit_id, metadata_path=metadata, replace=replace, ) elif not archive and metadata and swhid is not None: result = UpdateMetadataOnDoneDepositClient( url=self.base_url, auth=self.auth ).execute( collection, in_progress, slug, deposit_id=deposit_id, metadata_path=metadata, swhid=swhid, ) else: result = UpdateMultipartDepositClient( url=self.base_url, auth=self.auth ).execute( collection, in_progress, slug, deposit_id=deposit_id, archive_path=archive, metadata_path=metadata, replace=replace, ) if "error" in result: return result return self.deposit_status(collection, deposit_id) def deposit_metadata_only( - self, collection: str, metadata: Optional[str] = None, + self, + collection: str, + metadata: Optional[str] = None, ): assert metadata is not None return CreateMetadataOnlyDepositClient( url=self.base_url, auth=self.auth ).execute(collection, metadata_path=metadata) diff --git a/swh/deposit/config.py b/swh/deposit/config.py index 758ecf09..e3bba9ff 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,117 +1,121 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config from swh.deposit import __version__ from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface from 
swh.storage import get_storage from swh.storage.interface import StorageInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_IRI = "edit_iri" SE_IRI = "se_iri" EM_IRI = "em_iri" CONT_FILE_IRI = "cont_file_iri" SD_IRI = "servicedocument" COL_IRI = "upload" STATE_IRI = "state_iri" PRIVATE_GET_RAW_CONTENT = "private-download" PRIVATE_CHECK_DEPOSIT = "check-deposit" PRIVATE_PUT_DEPOSIT = "private-update" PRIVATE_GET_DEPOSIT_METADATA = "private-read" PRIVATE_LIST_DEPOSITS = "private-deposit-list" ARCHIVE_KEY = "archive" RAW_METADATA_KEY = "raw-metadata" ARCHIVE_TYPE = "archive" METADATA_TYPE = "metadata" AUTHORIZED_PLATFORMS = ["development", "production", "testing"] DEPOSIT_STATUS_REJECTED = "rejected" DEPOSIT_STATUS_PARTIAL = "partial" DEPOSIT_STATUS_DEPOSITED = "deposited" DEPOSIT_STATUS_VERIFIED = "verified" DEPOSIT_STATUS_LOAD_SUCCESS = "done" DEPOSIT_STATUS_LOAD_FAILURE = "failed" # Release author for deposit SWH_PERSON = { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org", } DEFAULT_CONFIG = { "max_upload_size": 209715200, "checks": True, } def setup_django_for(platform=None, config_file=None): """Setup function for command line tools (swh.deposit.create_user) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running config_file (str): Extra configuration file (typically for the production platform) Raises: ValueError in case of wrong platform inputs. 
""" if platform is not None: if platform not in AUTHORIZED_PLATFORMS: raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS) if "DJANGO_SETTINGS_MODULE" not in os.environ: os.environ["DJANGO_SETTINGS_MODULE"] = "swh.deposit.settings.%s" % platform if config_file: os.environ.setdefault("SWH_CONFIG_FILENAME", config_file) import django django.setup() class APIConfig: """API Configuration centralized class. This loads explicitly the configuration file out of the SWH_CONFIG_FILENAME environment variable. """ def __init__(self): self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) self.tool = { "name": "swh-deposit", "version": __version__, "configuration": {"sword_version": "2"}, } self.storage: StorageInterface = get_storage(**self.config["storage"]) self.storage_metadata: StorageInterface = get_storage( **self.config["storage_metadata"] ) def swh_deposit_authority(self): return MetadataAuthority( - type=MetadataAuthorityType.REGISTRY, url=self.config["swh_authority_url"], + type=MetadataAuthorityType.REGISTRY, + url=self.config["swh_authority_url"], ) def swh_deposit_fetcher(self): - return MetadataFetcher(name=self.tool["name"], version=self.tool["version"],) + return MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + ) diff --git a/swh/deposit/errors.py b/swh/deposit/errors.py index ca8385ac..568d1d8b 100644 --- a/swh/deposit/errors.py +++ b/swh/deposit/errors.py @@ -1,198 +1,194 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of providing the standard sword errors """ import logging from django.shortcuts import render from rest_framework import status FORBIDDEN = "forbidden" UNAUTHORIZED = "unauthorized" 
NOT_FOUND = "unknown" BAD_REQUEST = "bad-request" ERROR_CONTENT = "error-content" CHECKSUM_MISMATCH = "checksum-mismatch" MEDIATION_NOT_ALLOWED = "mediation-not-allowed" METHOD_NOT_ALLOWED = "method-not-allowed" MAX_UPLOAD_SIZE_EXCEEDED = "max_upload_size_exceeded" PARSING_ERROR = "parsing-error" logger = logging.getLogger(__name__) class ParserError(ValueError): - """Specific parsing error detected when parsing the xml metadata input - - """ + """Specific parsing error detected when parsing the xml metadata input""" pass ERRORS = { FORBIDDEN: { "status": status.HTTP_403_FORBIDDEN, "iri": "http://purl.org/net/sword/error/ErrorForbidden", "tag": "sword:ErrorForbidden", }, UNAUTHORIZED: { "status": status.HTTP_401_UNAUTHORIZED, "iri": "http://purl.org/net/sword/error/ErrorUnauthorized", "tag": "sword:ErrorUnauthorized", }, NOT_FOUND: { "status": status.HTTP_404_NOT_FOUND, "iri": "http://purl.org/net/sword/error/ErrorNotFound", "tag": "sword:ErrorNotFound", }, ERROR_CONTENT: { "status": status.HTTP_415_UNSUPPORTED_MEDIA_TYPE, "iri": "http://purl.org/net/sword/error/ErrorContent", "tag": "sword:ErrorContent", }, CHECKSUM_MISMATCH: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/ErrorChecksumMismatch", "tag": "sword:ErrorChecksumMismatch", }, BAD_REQUEST: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, PARSING_ERROR: { "status": status.HTTP_400_BAD_REQUEST, "iri": "http://purl.org/net/sword/error/ErrorBadRequest", "tag": "sword:ErrorBadRequest", }, MEDIATION_NOT_ALLOWED: { "status": status.HTTP_412_PRECONDITION_FAILED, "iri": "http://purl.org/net/sword/error/MediationNotAllowed", "tag": "sword:MediationNotAllowed", }, METHOD_NOT_ALLOWED: { "status": status.HTTP_405_METHOD_NOT_ALLOWED, "iri": "http://purl.org/net/sword/error/MethodNotAllowed", "tag": "sword:MethodNotAllowed", }, MAX_UPLOAD_SIZE_EXCEEDED: { "status": 
status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, "iri": "http://purl.org/net/sword/error/MaxUploadSizeExceeded", "tag": "sword:MaxUploadSizeExceeded", }, } def make_error_dict(key, summary=None, verbose_description=None): """Utility function to factorize error message dictionary. Args: key (str): Error status key referenced in swh.deposit.errors module summary (str/None): Error message clarifying the status verbose_description (str/None): A more verbose description or work around a potential problem. Returns: Dictionary with key 'error' detailing the 'status' and associated 'message' """ return { "error": { "key": key, "summary": summary, "verboseDescription": verbose_description, }, } def make_error_response_from_dict(req, error): """Utility function to return an http response with error detail. Args: req (Request): original request error (dict): Error described as dict, typically generated from the make_error_dict function. Returns: HttpResponse with detailed error. """ error_information = ERRORS[error["key"]] context = error context.update(error_information) return render( req, "deposit/error.xml", context=error, content_type="application/xml", status=error_information["status"], ) def make_error_response(req, key, summary=None, verbose_description=None): """Utility function to create an http response with detailed error. Args: req (Request): original request key (str): Error status key referenced in swh.deposit.errors module summary (str): Error message clarifying the status verbose_description (str / None): A more verbose description or work around a potential problem. 
Returns: Dictionary with key 'error' detailing the 'status' and associated 'message' """ error = make_error_dict(key, summary, verbose_description) return make_error_response_from_dict(req, error["error"]) class DepositError(ValueError): - """Represents an error that should be reported to the client - - """ + """Represents an error that should be reported to the client""" def __init__(self, key, summary, verbose_description=None): self.key = key self.summary = summary self.verbose_description = verbose_description def to_dict(self): return make_error_dict(self.key, self.summary, self.verbose_description) class DepositErrorMiddleware: """A Django middleware that catches DepositError and returns a proper error response.""" # __init__ and __call__ are boilerplate to make a pass-through Django # middleware def __init__(self, get_response): self.get_response = get_response def __call__(self, request): response = self.get_response(request) return response def process_exception(self, request, exception): if isinstance(exception, DepositError): logger.info( "%s %s -> %s('%s'):\n%s", request.method, request.path, exception.key, exception.summary, exception.verbose_description, ) return make_error_response_from_dict(request, exception.to_dict()["error"]) else: return None diff --git a/swh/deposit/exception.py b/swh/deposit/exception.py index 5c6a224a..de56ed05 100644 --- a/swh/deposit/exception.py +++ b/swh/deposit/exception.py @@ -1,38 +1,36 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Optional from django.db.utils import OperationalError from django.http import HttpResponse from rest_framework.exceptions import APIException def custom_exception_handler( exc: APIException, context: Dict ) -> Optional[HttpResponse]: - """Custom deposit exception 
handler to ensure consistent xml output - - """ + """Custom deposit exception handler to ensure consistent xml output""" from rest_framework.views import exception_handler # drf's default exception handler first, to get the standard error response response = exception_handler(exc, context) if isinstance(exc, OperationalError): status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." data = f""" {status} {detail} """.encode( "utf-8" ) return HttpResponse(data, status=503, content_type="application/xml") return response diff --git a/swh/deposit/migrations/0001_initial.py b/swh/deposit/migrations/0001_initial.py index bc91890a..e62bed92 100644 --- a/swh/deposit/migrations/0001_initial.py +++ b/swh/deposit/migrations/0001_initial.py @@ -1,141 +1,155 @@ # -*- coding: utf-8 -*- # Generated by Django 1.10.7 on 2017-09-24 10:03 from __future__ import unicode_literals from django.conf import settings import django.contrib.auth.models import django.contrib.postgres.fields import django.contrib.postgres.fields.jsonb from django.db import migrations, models import django.db.models.deletion import django.utils.timezone class Migration(migrations.Migration): initial = True dependencies = [ ("auth", "0008_alter_user_username_max_length"), ] operations = [ migrations.CreateModel( name="Dbversion", fields=[ ("version", models.IntegerField(primary_key=True, serialize=False)), ( "release", models.DateTimeField(default=django.utils.timezone.now, null=True), ), ("description", models.TextField(blank=True, null=True)), ], - options={"db_table": "dbversion",}, + options={ + "db_table": "dbversion", + }, ), migrations.CreateModel( name="Deposit", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("reception_date", models.DateTimeField(auto_now_add=True)), ("complete_date", models.DateTimeField(null=True)), ("external_id", models.TextField()), ("swh_id", models.TextField(blank=True, null=True)), ( "status", 
models.TextField( choices=[ ("partial", "partial"), ("expired", "expired"), ("ready", "ready"), ("injecting", "injecting"), ("success", "success"), ("failure", "failure"), ], default="partial", ), ), ], - options={"db_table": "deposit",}, + options={ + "db_table": "deposit", + }, ), migrations.CreateModel( name="DepositClient", fields=[ ( "user_ptr", models.OneToOneField( auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to=settings.AUTH_USER_MODEL, ), ), ( "collections", django.contrib.postgres.fields.ArrayField( base_field=models.IntegerField(), null=True, size=None ), ), ], - options={"db_table": "deposit_client",}, + options={ + "db_table": "deposit_client", + }, bases=("auth.user",), - managers=[("objects", django.contrib.auth.models.UserManager()),], + managers=[ + ("objects", django.contrib.auth.models.UserManager()), + ], ), migrations.CreateModel( name="DepositCollection", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("name", models.TextField()), ], - options={"db_table": "deposit_collection",}, + options={ + "db_table": "deposit_collection", + }, ), migrations.CreateModel( name="DepositRequest", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("date", models.DateTimeField(auto_now_add=True)), ("metadata", django.contrib.postgres.fields.jsonb.JSONField(null=True)), ( "deposit", models.ForeignKey( on_delete=django.db.models.deletion.DO_NOTHING, to="deposit.Deposit", ), ), ], - options={"db_table": "deposit_request",}, + options={ + "db_table": "deposit_request", + }, ), migrations.CreateModel( name="DepositRequestType", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("name", models.TextField()), ], - options={"db_table": "deposit_request_type",}, + options={ + "db_table": "deposit_request_type", + }, ), migrations.AddField( model_name="depositrequest", name="type", field=models.ForeignKey( 
on_delete=django.db.models.deletion.DO_NOTHING, to="deposit.DepositRequestType", ), ), migrations.AddField( model_name="deposit", name="client", field=models.ForeignKey( on_delete=django.db.models.deletion.DO_NOTHING, to="deposit.DepositClient", ), ), migrations.AddField( model_name="deposit", name="collection", field=models.ForeignKey( on_delete=django.db.models.deletion.DO_NOTHING, to="deposit.DepositCollection", ), ), ] diff --git a/swh/deposit/migrations/0003_temporaryarchive.py b/swh/deposit/migrations/0003_temporaryarchive.py index 737fb2b6..a2ac8395 100644 --- a/swh/deposit/migrations/0003_temporaryarchive.py +++ b/swh/deposit/migrations/0003_temporaryarchive.py @@ -1,24 +1,26 @@ # -*- coding: utf-8 -*- # Generated by Django 1.10.7 on 2017-10-06 13:06 from __future__ import unicode_literals from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("deposit", "0002_depositrequest_archive"), ] operations = [ migrations.CreateModel( name="TemporaryArchive", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("path", models.TextField()), ("date", models.DateTimeField(auto_now_add=True)), ], - options={"db_table": "deposit_temporary_archive",}, + options={ + "db_table": "deposit_temporary_archive", + }, ), ] diff --git a/swh/deposit/migrations/0004_delete_temporaryarchive.py b/swh/deposit/migrations/0004_delete_temporaryarchive.py index 8c995aea..d30b9211 100644 --- a/swh/deposit/migrations/0004_delete_temporaryarchive.py +++ b/swh/deposit/migrations/0004_delete_temporaryarchive.py @@ -1,16 +1,18 @@ # -*- coding: utf-8 -*- # Generated by Django 1.10.7 on 2017-10-18 09:03 from __future__ import unicode_literals from django.db import migrations class Migration(migrations.Migration): dependencies = [ ("deposit", "0003_temporaryarchive"), ] operations = [ - migrations.DeleteModel(name="TemporaryArchive",), + migrations.DeleteModel( + name="TemporaryArchive", + ), ] diff --git 
a/swh/deposit/migrations/0007_auto_20171129_1609.py b/swh/deposit/migrations/0007_auto_20171129_1609.py index ee2f158a..0db3703c 100644 --- a/swh/deposit/migrations/0007_auto_20171129_1609.py +++ b/swh/deposit/migrations/0007_auto_20171129_1609.py @@ -1,18 +1,20 @@ # -*- coding: utf-8 -*- # Generated by Django 1.10.7 on 2017-11-29 16:09 from __future__ import unicode_literals from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("deposit", "0006_depositclient_url"), ] operations = [ migrations.AlterField( - model_name="depositclient", name="url", field=models.TextField(null=False), + model_name="depositclient", + name="url", + field=models.TextField(null=False), ), ] diff --git a/swh/deposit/migrations/0010_auto_20180110_0953.py b/swh/deposit/migrations/0010_auto_20180110_0953.py index 469208ed..9df2b0c5 100644 --- a/swh/deposit/migrations/0010_auto_20180110_0953.py +++ b/swh/deposit/migrations/0010_auto_20180110_0953.py @@ -1,24 +1,26 @@ # -*- coding: utf-8 -*- # Generated by Django 1.10.7 on 2018-01-10 09:53 from __future__ import unicode_literals from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("deposit", "0009_deposit_parent"), ] operations = [ migrations.RenameField( - model_name="depositclient", old_name="url", new_name="provider_url", + model_name="depositclient", + old_name="url", + new_name="provider_url", ), migrations.AddField( model_name="depositclient", name="domain", field=models.TextField(default=""), preserve_default=False, ), ] diff --git a/swh/deposit/migrations/0015_depositrequest_typemigration.py b/swh/deposit/migrations/0015_depositrequest_typemigration.py index a9974287..f9748109 100644 --- a/swh/deposit/migrations/0015_depositrequest_typemigration.py +++ b/swh/deposit/migrations/0015_depositrequest_typemigration.py @@ -1,40 +1,47 @@ # -*- coding: utf-8 -*- # Generated by Django 1.11.18 on 2019-04-12 16:40 from __future__ import unicode_literals from 
django.db import migrations, models def populate_deposit_type2(apps, schema_editor): # We can't import the DepositRequest model directly as it may be a newer # version than this migration expects. We use the historical version. DepositRequest = apps.get_model("deposit", "DepositRequest") for deposit in DepositRequest.objects.all(): deposit.type2 = deposit.type.name deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0014_auto_20180720_1221"), ] operations = [ migrations.AddField( model_name="depositrequest", name="type2", field=models.CharField( choices=[("archive", "archive"), ("metadata", "metadata")], max_length=8, null=True, ), ), migrations.RunPython(populate_deposit_type2), - migrations.RemoveField(model_name="depositrequest", name="type",), + migrations.RemoveField( + model_name="depositrequest", + name="type", + ), migrations.RenameField( - model_name="depositrequest", old_name="type2", new_name="type", + model_name="depositrequest", + old_name="type2", + new_name="type", + ), + migrations.DeleteModel( + name="DepositRequestType", ), - migrations.DeleteModel(name="DepositRequestType",), ] diff --git a/swh/deposit/migrations/0018_migrate_swhids.py b/swh/deposit/migrations/0018_migrate_swhids.py index d5835824..b9bb2e57 100644 --- a/swh/deposit/migrations/0018_migrate_swhids.py +++ b/swh/deposit/migrations/0018_migrate_swhids.py @@ -1,348 +1,346 @@ # -*- coding: utf-8 -*- # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import unicode_literals import logging import os from typing import Any, Dict, Optional, Tuple from django.db import migrations from swh.core import config from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.swhids import 
CoreSWHID, ObjectType, QualifiedSWHID from swh.storage import get_storage as get_storage_client from swh.storage.algos.snapshot import snapshot_id_get_from_revision SWH_PROVIDER_URL = "https://www.softwareheritage.org" logger = logging.getLogger(__name__) swh_storage = None def get_storage() -> Optional[Any]: - """Instantiate a storage client - - """ + """Instantiate a storage client""" settings = os.environ.get("DJANGO_SETTINGS_MODULE") if settings != "swh.deposit.settings.production": # Bypass for now return None global swh_storage if not swh_storage: config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( "Production: SWH_CONFIG_FILENAME must be set to the" " configuration file needed!" ) if not os.path.exists(config_file): raise ValueError( "Production: configuration file %s does not exist!" % (config_file,) ) conf = config.load_named_config(config_file) if not conf: raise ValueError( "Production: configuration %s does not exist." % (config_file,) ) storage_config = conf.get("storage") if not storage_config: raise ValueError( "Production: invalid configuration; missing 'storage' config entry." ) swh_storage = get_storage_client(**storage_config) return swh_storage def migrate_deposit_swhid_context_not_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit SWHIDs to the new format. Only deposit with status done and swh_id_context not null are concerned. 
""" storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): obj_dir = QualifiedSWHID.from_string(deposit.swh_id_context) assert obj_dir.object_type == ObjectType.DIRECTORY obj_rev = CoreSWHID.from_string(deposit.swh_anchor_id) assert obj_rev.object_type == ObjectType.REVISION if set(obj_dir.qualifiers()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Starting migration dir_id = obj_dir.object_id origin = obj_dir.origin assert origin check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue rev_id = obj_rev.object_id # Find the snapshot targeting the revision snp_id = snapshot_id_get_from_revision(storage, origin, hash_to_bytes(rev_id)) if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update deposit.swh_id_context = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=dir_id, origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), anchor=CoreSWHID( object_type=ObjectType.REVISION, object_id=hash_to_bytes(rev_id) ), path=b"/", ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id == deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != 
deposit.swh_id_context logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert old_swh_anchor_id == deposit.swh_anchor_id logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert old_swh_anchor_id_context == deposit.swh_anchor_id_context # Commit deposit.save() def resolve_origin(deposit_id: int, provider_url: str, external_id: str) -> str: """Resolve the origin from provider-url and external-id For some edge case, only the external_id is used as there is some old inconsistency from testing which exists. """ map_edge_case_origin: Dict[Tuple[int, str], str] = { ( 76, "hal-01588782", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782", ( 87, "hal-01588927", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927", (89, "hal-01588935"): "https://hal-preprod.archives-ouvertes.fr/hal-01588935", ( 88, "hal-01588928", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588928", ( 90, "hal-01588942", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942", (143, "hal-01592430"): "https://hal-preprod.archives-ouvertes.fr/hal-01592430", ( 75, "hal-01588781", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781", } origin = map_edge_case_origin.get((deposit_id, external_id)) if origin: return origin # Some simpler origin edge cases (mostly around the initial deposits) map_origin = { ( SWH_PROVIDER_URL, "je-suis-gpl", ): "https://forge.softwareheritage.org/source/jesuisgpl/", ( SWH_PROVIDER_URL, "external-id", ): "https://hal.archives-ouvertes.fr/external-id", } key = (provider_url, external_id) return map_origin.get(key, f"{provider_url.rstrip('/')}/{external_id}") def migrate_deposit_swhid_context_null(apps, schema_editor) -> None: """Migrate deposit SWHIDs to the new format. Migrate deposit whose swh_id_context is not set (initial deposits not migrated at the time). 
Only deposit with status done and swh_id_context null are concerned. Note: Those deposits have their swh_id being the SWHPIDs of the revision! So we can align them as well. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True ): obj_rev = CoreSWHID.from_string(deposit.swh_id) if obj_rev.object_type == ObjectType.DIRECTORY: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Ensuring Migration not done assert obj_rev.object_type == ObjectType.REVISION assert deposit.swh_id is not None assert deposit.swh_id_context is None assert deposit.swh_anchor_id is None assert deposit.swh_anchor_id_context is None rev_id = obj_rev.object_id rev_id_bytes = hash_to_bytes(rev_id) revision = storage.revision_get([rev_id_bytes])[0] if not revision: logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id) continue provider_url = deposit.client.provider_url external_id = deposit.external_id origin = resolve_origin(deposit.id, provider_url, external_id) check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue dir_id = hash_to_hex(revision["directory"]) # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # retrieve the snapshot from the archive snp_id = snapshot_id_get_from_revision(storage, origin, rev_id_bytes) if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # New SWHIDs ids deposit.swh_id = CoreSWHID( object_type=ObjectType.DIRECTORY, 
object_id=hash_to_bytes(dir_id) ) deposit.swh_id_context = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=dir_id, origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id), anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes), path=b"/", ) # Realign the remaining deposit SWHIDs fields deposit.swh_anchor_id = str( CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id_bytes) ) deposit.swh_anchor_id_context = str( QualifiedSWHID( object_type=ObjectType.REVISION, object_id=rev_id_bytes, origin=origin ) ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id != deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context assert deposit.swh_id_context is not None logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert deposit.swh_anchor_id == old_swh_id assert deposit.swh_anchor_id is not None logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert deposit.swh_anchor_id_context is not None deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0017_auto_20190925_0906"), ] operations = [ # Migrate and make the operations possibly reversible # https://docs.djangoproject.com/en/3.0/ref/migration-operations/#django.db.migrations.operations.RunPython.noop # noqa migrations.RunPython( migrate_deposit_swhid_context_not_null, reverse_code=migrations.RunPython.noop, ), migrations.RunPython( migrate_deposit_swhid_context_null, reverse_code=migrations.RunPython.noop ), ] diff --git a/swh/deposit/migrations/0019_auto_20200519_1035.py b/swh/deposit/migrations/0019_auto_20200519_1035.py index f54ee98d..f1e0911b 100644 --- 
a/swh/deposit/migrations/0019_auto_20200519_1035.py +++ b/swh/deposit/migrations/0019_auto_20200519_1035.py @@ -1,17 +1,23 @@ # -*- coding: utf-8 -*- # Generated by Django 1.11.23 on 2020-05-19 10:35 from __future__ import unicode_literals from django.db import migrations class Migration(migrations.Migration): dependencies = [ ("deposit", "0018_migrate_swhids"), ] operations = [ - migrations.RemoveField(model_name="deposit", name="swh_anchor_id",), - migrations.RemoveField(model_name="deposit", name="swh_anchor_id_context",), + migrations.RemoveField( + model_name="deposit", + name="swh_anchor_id", + ), + migrations.RemoveField( + model_name="deposit", + name="swh_anchor_id_context", + ), ] diff --git a/swh/deposit/migrations/0020_auto_20200929_0855.py b/swh/deposit/migrations/0020_auto_20200929_0855.py index 2ec6cef2..dafeb8c8 100644 --- a/swh/deposit/migrations/0020_auto_20200929_0855.py +++ b/swh/deposit/migrations/0020_auto_20200929_0855.py @@ -1,21 +1,25 @@ # -*- coding: utf-8 -*- # Generated by Django 1.11.23 on 2020-09-29 08:55 from __future__ import unicode_literals from django.db import migrations class Migration(migrations.Migration): dependencies = [ ("deposit", "0019_auto_20200519_1035"), ] operations = [ migrations.RenameField( - model_name="deposit", old_name="swh_id", new_name="swhid", + model_name="deposit", + old_name="swh_id", + new_name="swhid", ), migrations.RenameField( - model_name="deposit", old_name="swh_id_context", new_name="swhid_context", + model_name="deposit", + old_name="swh_id_context", + new_name="swhid_context", ), ] diff --git a/swh/deposit/migrations/0021_deposit_origin_url_20201124_1438.py b/swh/deposit/migrations/0021_deposit_origin_url_20201124_1438.py index 399a9bf1..326a407b 100644 --- a/swh/deposit/migrations/0021_deposit_origin_url_20201124_1438.py +++ b/swh/deposit/migrations/0021_deposit_origin_url_20201124_1438.py @@ -1,28 +1,32 @@ from django.db import migrations, models from swh.deposit.api.common import 
guess_deposit_origin_url from swh.deposit.models import Deposit def fill_origin_url(apps, schema_editor): for deposit in Deposit.objects.all(): if deposit.origin_url is None: deposit.origin_url = guess_deposit_origin_url(deposit) deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0020_auto_20200929_0855"), ] operations = [ migrations.AddField( - model_name="deposit", name="origin_url", field=models.TextField(null=True), + model_name="deposit", + name="origin_url", + field=models.TextField(null=True), ), # migrations.RunPython(fill_origin_url), migrations.AlterField( - model_name="deposit", name="external_id", field=models.TextField(null=True), + model_name="deposit", + name="external_id", + field=models.TextField(null=True), ), ] diff --git a/swh/deposit/migrations/0022_auto_20220223_1542.py b/swh/deposit/migrations/0022_auto_20220223_1542.py index 70bf4e5f..f78c42ed 100644 --- a/swh/deposit/migrations/0022_auto_20220223_1542.py +++ b/swh/deposit/migrations/0022_auto_20220223_1542.py @@ -1,58 +1,61 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.db import migrations, models from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.models import ( DEPOSIT_CODE, DEPOSIT_METADATA_ONLY, DEPOSIT_TYPES, Deposit, ) def fill_deposit_type(apps, schema_editor): """Fill the new field metadata_only on existing data. This will mark metadata only deposits all deposits whose status is done, their complete date is exactly the reception date, and they have their swhid filled in. 
""" for deposit in Deposit.objects.all(): deposit.type = ( DEPOSIT_METADATA_ONLY if ( deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS and deposit.complete_date == deposit.reception_date and deposit.complete_date is not None and deposit.swhid is not None and deposit.swhid_context is not None ) else DEPOSIT_CODE ) deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0021_deposit_origin_url_20201124_1438"), ] operations = [ migrations.AddField( model_name="deposit", name="type", field=models.CharField( - choices=DEPOSIT_TYPES, default=DEPOSIT_CODE, max_length=4, + choices=DEPOSIT_TYPES, + default=DEPOSIT_CODE, + max_length=4, ), preserve_default=False, ), # Migrate and make the operations possibly reversible migrations.RunPython( - fill_deposit_type, reverse_code=migrations.RunPython.noop, + fill_deposit_type, + reverse_code=migrations.RunPython.noop, ), ] diff --git a/swh/deposit/models.py b/swh/deposit/models.py index 3d8702d4..af357405 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,269 +1,261 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb import datetime from typing import Optional from django.contrib.auth.models import User, UserManager from django.contrib.postgres.fields import ArrayField, JSONField from django.db import models from django.utils.timezone import now from swh.auth.django.models import OIDCUser from .config import ( ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, METADATA_TYPE, ) class Dbversion(models.Model): - """Db version - - """ + """Db version""" version = models.IntegerField(primary_key=True) release 
= models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = "dbversion" app_label = "deposit" def __str__(self): return str( { "version": self.version, "release": self.release, "description": self.description, } ) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ("expired", "expired"), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ("loading", "loading"), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed meaning.""" DEPOSIT_STATUS_DETAIL = { DEPOSIT_STATUS_PARTIAL: "Deposit is partially received. To finalize it, " "In-Progress header should be false", "expired": "Deposit has been there too long and is now " "deemed ready to be garbage collected", DEPOSIT_STATUS_DEPOSITED: "Deposit is ready for additional checks " "(tarball ok, metadata, etc...)", DEPOSIT_STATUS_VERIFIED: "Deposit is fully received, checked, and " "ready for loading", DEPOSIT_STATUS_REJECTED: "Deposit failed the checks", "loading": "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: "The deposit has been successfully " "loaded into the Software Heritage archive", DEPOSIT_STATUS_LOAD_FAILURE: "The deposit loading into the " "Software Heritage archive failed", } class DepositClient(User): - """Deposit client - - """ + """Deposit client""" collections = ArrayField(models.IntegerField(), null=True) objects = UserManager() # type: ignore # this typing hint is due to a mypy/django-stubs limitation, # see https://github.com/typeddjango/django-stubs/issues/174 provider_url = models.TextField(null=False) domain = models.TextField(null=False) oidc_user: Optional[OIDCUser] = None class Meta: db_table = "deposit_client" app_label = "deposit" def __str__(self): return str( { "id": 
self.id, "collections": self.collections, "username": super().username, "domain": self.domain, "provider_url": self.provider_url, } ) DEPOSIT_METADATA_ONLY = "meta" DEPOSIT_CODE = "code" DEPOSIT_TYPES = [ (DEPOSIT_METADATA_ONLY, DEPOSIT_METADATA_ONLY), (DEPOSIT_CODE, DEPOSIT_CODE), ] class Deposit(models.Model): - """Deposit reception table - - """ + """Deposit reception table""" id = models.BigAutoField(primary_key=True) # First deposit reception date reception_date = models.DateTimeField(auto_now_add=True) # Date when the deposit is deemed complete and ready for loading complete_date = models.DateTimeField(null=True) # collection concerned by the deposit collection = models.ForeignKey("DepositCollection", models.DO_NOTHING) # Deprecated: Deposit's external identifier external_id = models.TextField(null=True) # URL of the origin of this deposit, null if this is a metadata-only deposit origin_url = models.TextField(null=True) # Deposit client client = models.ForeignKey("DepositClient", models.DO_NOTHING) # SWH's loading result identifier swhid = models.TextField(blank=True, null=True) swhid_context = models.TextField(blank=True, null=True) # Deposit's status regarding loading status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL) status_detail = JSONField(null=True) # deposit can have one parent parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True) check_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated checking task id" ) load_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated loading task id" ) type = models.CharField(max_length=4, choices=DEPOSIT_TYPES, default=DEPOSIT_CODE) raw_metadata: Optional[str] = None class Meta: db_table = "deposit" app_label = "deposit" def __str__(self): d = { "id": self.id, "type": self.type, "status": self.status, "reception_date": self.reception_date, "complete_date": self.complete_date, "collection": 
self.collection.name, "external_id": self.external_id, "origin_url": self.origin_url, "client": self.client.username, } if self.status in (DEPOSIT_STATUS_REJECTED): d["status_detail"] = self.status_detail return str(d) def set_raw_metadata(self, raw_metadata: str) -> None: """Set the metadata raw out of a 'metadata' typed deposit request. This is specifically used during listing. """ self.raw_metadata = raw_metadata def client_directory_path(instance: "DepositRequest", filename: str) -> str: """Callable to determine the upload archive path. This defaults to MEDIA_ROOT/client_/%Y%m%d-%H%M%S.%f/. The format "%Y%m%d-%H%M%S.%f" is the reception date of the associated deposit formatted using strftime. Args: instance: DepositRequest concerned by the upload filename: Filename of the uploaded file Returns: The upload archive path. """ reception_date = instance.deposit.reception_date assert isinstance(reception_date, datetime.datetime) folder = reception_date.strftime("%Y%m%d-%H%M%S.%f") return f"client_{instance.deposit.client.id}/{folder}/{filename}" REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE), (METADATA_TYPE, METADATA_TYPE)] class DepositRequest(models.Model): - """Deposit request associated to one deposit. 
- - """ + """Deposit request associated to one deposit.""" id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) raw_metadata = models.TextField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.CharField(max_length=8, choices=REQUEST_TYPES, null=True) class Meta: db_table = "deposit_request" app_label = "deposit" def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str( { "id": self.id, "deposit": self.deposit, "metadata": meta, "archive": archive_name, } ) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... name = models.TextField() class Meta: db_table = "deposit_collection" app_label = "deposit" def __str__(self): return str({"id": self.id, "name": self.name}) diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py index 5dfc4795..5b68179c 100644 --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,94 +1,86 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining parsers with SWORD 2.0 supported mediatypes. 
""" import logging from xml.etree import ElementTree from django.conf import settings from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser from swh.deposit.errors import ParserError logger = logging.getLogger(__name__) class SWHFileUploadZipParser(FileUploadParser): - """File upload parser limited to zip archive. - - """ + """File upload parser limited to zip archive.""" media_type = "application/zip" class SWHFileUploadTarParser(FileUploadParser): - """File upload parser limited to tarball (tar, tar.gz, tar.*) archives. - - """ + """File upload parser limited to tarball (tar, tar.gz, tar.*) archives.""" media_type = "application/x-tar" class SWHXMLParser(BaseParser): """ XML parser. """ media_type = "application/xml" def parse(self, stream, media_type=None, parser_context=None): """ Parses the incoming bytestream as XML and returns the resulting data. """ parser_context = parser_context or {} encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET) parser = ElementTree.XMLParser(encoding=encoding) return ElementTree.parse(stream, parser=parser) class SWHAtomEntryParser(SWHXMLParser): - """Atom entry parser limited to specific mediatype - - """ + """Atom entry parser limited to specific mediatype""" media_type = "application/atom+xml;type=entry" def parse(self, stream, media_type=None, parser_context=None): # We do not actually want to parse the stream yet # because we want to keep the raw data as well # this is done later in the atom entry call # (cf. swh.deposit.api.common.APIBase._atom_entry) return stream class SWHMultiPartParser(MultiPartParser): - """Multipart parser limited to a subset of mediatypes. - - """ + """Multipart parser limited to a subset of mediatypes.""" media_type = "multipart/*; *" def parse_xml(raw_content): """Parse xml body. Args: raw_content (bytes): The content to parse Raises: ParserError in case of a malformed xml Returns: content parsed as dict. 
""" try: return ElementTree.fromstring(raw_content) except ElementTree.ParseError as e: raise ParserError(str(e)) diff --git a/swh/deposit/settings/common.py b/swh/deposit/settings/common.py index 38e26d79..5a85b99d 100644 --- a/swh/deposit/settings/common.py +++ b/swh/deposit/settings/common.py @@ -1,114 +1,124 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ Django settings for swh project. Generated by 'django-admin startproject' using Django 1.10.7. For more information on this file, see https://docs.djangoproject.com/en/1.10/topics/settings/ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.10/ref/settings/ """ import os # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ ALLOWED_HOSTS = ["127.0.0.1", "localhost"] # Application definition INSTALLED_APPS = [ "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.staticfiles", "django.contrib.sessions", "django.contrib.messages", "django.contrib.postgres", # for JSONField, ArrayField "swh.deposit.apps.DepositConfig", ] MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", "django.contrib.auth.middleware.AuthenticationMiddleware", "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", "swh.deposit.auth.WrapBasicAuthenticationResponseMiddleware", "swh.deposit.errors.DepositErrorMiddleware", ] ROOT_URLCONF = 
"swh.deposit.urls" TEMPLATES = [ { "BACKEND": "django.template.backends.django.DjangoTemplates", "DIRS": [], "APP_DIRS": True, "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", "django.contrib.auth.context_processors.auth", "django.contrib.messages.context_processors.messages", ], }, }, ] # Password validation # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ { "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", # noqa }, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] # Internationalization # https://docs.djangoproject.com/en/1.10/topics/i18n/ LANGUAGE_CODE = "en-us" TIME_ZONE = "UTC" USE_I18N = True USE_L10N = True USE_TZ = True # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.10/howto/static-files/ STATIC_URL = "/static/" REST_FRAMEWORK = { "EXCEPTION_HANDLER": "swh.deposit.exception.custom_exception_handler", } FILE_UPLOAD_HANDLERS = [ "django.core.files.uploadhandler.MemoryFileUploadHandler", "django.core.files.uploadhandler.TemporaryFileUploadHandler", ] -CACHES = {"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache",}} +CACHES = { + "default": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache", + } +} diff --git a/swh/deposit/settings/development.py b/swh/deposit/settings/development.py index 86670558..f7a63e9c 100644 --- a/swh/deposit/settings/development.py +++ 
b/swh/deposit/settings/development.py @@ -1,52 +1,59 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .common import * # noqa # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = "development-key" # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { "version": 1, "disable_existing_loggers": False, "formatters": { "standard": { "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "DEBUG", "class": "logging.StreamHandler", "formatter": "standard", }, }, "loggers": { - "django": {"handlers": ["console"], "level": "DEBUG", "propagate": True,}, + "django": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": True, + }, "django.db.backends": { "handlers": ["console"], "level": "INFO", "propagate": False, }, - "swh.deposit": {"handlers": ["console"], "level": "DEBUG",}, + "swh.deposit": { + "handlers": ["console"], + "level": "DEBUG", + }, }, } # https://docs.djangoproject.com/en/1.10/ref/settings/#databases DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql", "NAME": "swh-deposit-dev", # this is no longer used in test env } } # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT # SECURITY WARNING: Override this in the production.py module MEDIA_ROOT = "/tmp/swh-deposit/uploads/" diff --git a/swh/deposit/settings/testing.py b/swh/deposit/settings/testing.py index ea67376c..10e683f2 100644 --- a/swh/deposit/settings/testing.py +++ b/swh/deposit/settings/testing.py @@ -1,46 +1,51 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of 
this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .common import * # noqa from .common import ALLOWED_HOSTS from .development import * # noqa from .development import INSTALLED_APPS # django setup ALLOWED_HOSTS += ["testserver"] INSTALLED_APPS += ["pytest_django"] # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { "version": 1, "disable_existing_loggers": True, "formatters": { "standard": { "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "ERROR", "class": "logging.StreamHandler", "formatter": "standard", }, }, - "loggers": {"swh.deposit": {"handlers": ["console"], "level": "ERROR",},}, + "loggers": { + "swh.deposit": { + "handlers": ["console"], + "level": "ERROR", + }, + }, } # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT # SECURITY WARNING: Override this in the production.py module MEDIA_ROOT = "/tmp/swh-deposit/test/uploads/" FILE_UPLOAD_HANDLERS = [ "django.core.files.uploadhandler.MemoryFileUploadHandler", ] REST_FRAMEWORK = { "EXCEPTION_HANDLER": "swh.deposit.exception.custom_exception_handler", } diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py index 6dbcfe8f..d9c676f3 100644 --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -1,94 +1,90 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import os from django.urls import reverse_lazy as reverse import pytest from swh.deposit.api.private.deposit_check import APIChecks from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_VERIFIED, ) from 
swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES @pytest.fixture def datadir(request): """Override default datadir to target main test datadir""" return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def ready_deposit_ok(partial_deposit_with_metadata): - """Returns a deposit ready for checks (it will pass the checks). - - """ + """Returns a deposit ready for checks (it will pass the checks).""" deposit = partial_deposit_with_metadata deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def ready_deposit_verified(partial_deposit_with_metadata): - """Returns a deposit ready for checks (it will pass the checks). - - """ + """Returns a deposit ready for checks (it will pass the checks).""" deposit = partial_deposit_with_metadata deposit.status = DEPOSIT_STATUS_VERIFIED deposit.save() return deposit @pytest.fixture def ready_deposit_only_metadata(partial_deposit_only_metadata): """Deposit in status ready that will fail the checks (because missing - archive). + archive). 
""" deposit = partial_deposit_only_metadata deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def ready_deposit_invalid_archive(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG="external-id-invalid", HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def swh_checks_deposit(): return APIChecks() diff --git a/swh/deposit/tests/api/test_basic_auth.py b/swh/deposit/tests/api/test_basic_auth.py index fe214994..c5ca171c 100644 --- a/swh/deposit/tests/api/test_basic_auth.py +++ b/swh/deposit/tests/api/test_basic_auth.py @@ -1,32 +1,30 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module to check at least one basic authentication works. 
""" from django.urls import reverse_lazy as reverse import pytest from swh.deposit.config import SD_IRI from .test_service_document import check_response @pytest.fixture() def deposit_config(common_deposit_config): return { **common_deposit_config, "authentication_provider": "basic", } def test_service_document_basic(basic_authenticated_client): - """With authentication, service document list user's collection - - """ + """With authentication, service document list user's collection""" url = reverse(SD_IRI) response = basic_authenticated_client.get(url) check_response(response, basic_authenticated_client.deposit_client.username) diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py index e3e55704..b287d358 100644 --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -1,1087 +1,1088 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # disable flake8 on this file because of line length # flake8: noqa import pprint import re import textwrap from typing import Any, Dict from xml.etree import ElementTree import pytest from swh.deposit.api.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, ) METADATA_PROVENANCE_DICT: Dict[str, Any] = { "swh:deposit": { METADATA_PROVENANCE_KEY: {"schema:url": "some-metadata-provenance-url"} } } XMLNS = """xmlns="http://www.w3.org/2005/Atom" xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" xmlns:schema="http://schema.org/" """ PROVENANCE_XML = """ some-metadata-provenance-url """ _parameters1 = [ pytest.param(textwrap.dedent(metadata_ok), id=id_) for (id_, metadata_ok,) in [ ( "atom-only-with-name", f"""\ something something-else foo someone {PROVENANCE_XML} """, ), 
( "atom-only-with-title", f"""\ something something-else bar someone """, ), ( "atom-only-and-external_identifier", f"""\ something something-else foo no one {PROVENANCE_XML} """, ), ( "atom-and-codemeta-minimal", f"""\ some url bar no one {PROVENANCE_XML} """, ), ( "unknown-codemeta-inner-element-after", f"""\ some url bar someone should allow anything here {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-after", f"""\ some url bar someone should allow anything here {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-before", f"""\ some url bar should allow anything here someone {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-before-and-after", f"""\ some url bar should allow anything here someone should allow anything here {PROVENANCE_XML} """, ), ( "identifier-is-halid", f"""\ some url bar The Author hal-12345 {PROVENANCE_XML} """, ), ( "identifier-is-propertyvalue", f"""\ some url bar The Author schema:PropertyValue HAL-ID hal-02527911 {PROVENANCE_XML} """, ), ( "codemeta-dates", f"""\ some url some id nar no one 2020-12-21 2020-12-21 2020-12-25 2020-12-25 {PROVENANCE_XML} """, ), ( "codemeta-date-month", # Allowed by ISO8601, therefore by schema:Date, but not by xsd:date f"""\ some url some id nar no one 2020-12 2020-12 2020-12 {PROVENANCE_XML} """, ), ( "codemeta-date-year", # Allowed by ISO8601, therefore by schema:Date, but not by xsd:date f"""\ some url some id nar no one 2020 2020 2020 {PROVENANCE_XML} """, ), ( "codemeta-datetimes", # technically, only Date is allowed for datePublished; but we allow DateTime # for backward compatibility with old swh-deposit versions f"""\ some url some id nar no one 2020-12-21T12:00:00 2020-12-21T12:00:00 2020-12-25T12:00:00 {PROVENANCE_XML} """, ), ( "author-two-names", f"""\ some url bar someone an alias {PROVENANCE_XML} """, ), ( # Required by codemeta.jsonld, but forbidden by # https://codemeta.github.io/terms/ "element-in--affiliation", f"""\ some url bar someone My Orga {PROVENANCE_XML} """, 
), ( # Forbidden by codemeta.jsonld, but required by # https://codemeta.github.io/terms/ "chardata-in-affiliation", f"""\ some url bar someone My Orga {PROVENANCE_XML} """, ), ( "swh:add_to_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( "swh:reference-origin", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( "swh:reference-object", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( # a full example with every tag we know "codemeta-full", f"""\ something foo someone The Author http://example.org/~theauthor/ author@example.org University 1 https://sandbox.orcid.org/0000-0002-9227-8514 A Contributor University 2 A Maintainer University 3 University 3 A Maintainer something something else http://example.org/ Blah blah 1.0.0 1.0.0 kw1 kw2 Blah blah http://example.org/ http://example.org/ http://example.org/ {PROVENANCE_XML} """, ), ] ] @pytest.mark.parametrize( - "metadata_ok", _parameters1, + "metadata_ok", + _parameters1, ) def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit): actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok)) assert actual_check is True, f"Unexpected result:\n{pprint.pformat(detail)}" if "swh:deposit" in metadata_ok: # no missing suggested field assert detail is None else: # missing suggested field assert detail == { "metadata": [ { "fields": [METADATA_PROVENANCE_KEY], "summary": SUGGESTED_FIELDS_MISSING, } ] } _parameters2 = [ pytest.param(textwrap.dedent(metadata_ko), expected_summary, id=id_) for (id_, metadata_ko, expected_summary) in [ ( "no-name-or-title", f"""\ something something-else someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( "no-author", f"""\ something something-else foobar {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( 
"wrong-title-namespace", f"""\ something something-else bar someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( "wrong-author-namespace", f"""\ something something-else foobar foo {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( "wrong-author-tag", f"""\ something something-else bar someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ] ] @pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2) def test_api_checks_check_metadata_ko( metadata_ko, expected_summary, swh_checks_deposit ): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == {"metadata": [expected_summary]} _parameters3 = [ pytest.param(textwrap.dedent(metadata_ko), expected_summary, id=id_) for (id_, metadata_ko, expected_summary) in [ ( "child-element-in-name", f"""\ some url bar no one {PROVENANCE_XML} """, [ { "summary": ".*Reason: a simple content element can't have child elements.*", "fields": ["codemeta:name"], }, ], ), ( "affiliation-with-no-name", f"""\ some url bar someone http://example.org {PROVENANCE_XML} """, [ { "summary": ".*Reason: affiliation does not have a element.*", "fields": ["codemeta:author"], }, ], ), ( "empty-affiliation", f"""\ some url bar someone {PROVENANCE_XML} """, [ { "summary": ".*Reason: affiliation does not have a element.*", "fields": ["codemeta:author"], }, ], ), ( "chardata-in-author", f"""\ some url bar no one {PROVENANCE_XML} """, [ { "summary": ".*Reason: character data between child elements.*", "fields": ["codemeta:author"], }, ], ), ( "author-with-no-name", f"""\ some url bar should allow anything here {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? 
expected.*", "fields": ["codemeta:author"], }, ], ), ( "contributor-with-no-name", f"""\ some url bar should allow anything here abc {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? expected.*", "fields": ["codemeta:contributor"], }, ], ), ( "maintainer-with-no-name", f"""\ some url bar should allow anything here abc {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? expected.*", "fields": ["codemeta:maintainer"], }, ], ), ( "id-is-not-url", f"""\ some url bar The Author http://not a url/ {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", "fields": ["codemeta:author"], }, ], ), ( "identifier-is-invalid-url", f"""\ some url bar The Author http://[invalid-url/ {PROVENANCE_XML} """, [ { "summary": ( r".*Reason: 'http://\[invalid-url/' is not a valid URI.*" ), "fields": ["codemeta:author"], }, ], ), ( "identifier-is-not-url", f"""\ some url bar The Author http://not a url/ {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", "fields": ["codemeta:author"], }, ], ), ( "identifier-is-not-url2", f"""\ some url bar The Author not a url {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'not a url' is not an absolute URI.*", "fields": ["codemeta:author"], }, ], ), ( "invalid-dates", f"""\ something something-else bar someone 2020-aa-21 2020-12-bb {PROVENANCE_XML} """, [ { "summary": ".*Reason: invalid value '2020-aa-21'.*", "fields": ["codemeta:datePublished"], }, { "summary": ".*Reason: invalid value '2020-12-bb'.*", "fields": ["codemeta:dateCreated"], }, ], ), ( "invalid-dateModified", f"""\ some url someid bar no one 2020-12-aa {PROVENANCE_XML} """, [ { "summary": ".*Reason: invalid value '2020-12-aa'.*", "fields": ["codemeta:dateModified"], }, ], ), ( "invalid-embargoDate", f"""\ some url someid bar no one 2022-02-28T12:00:00 {PROVENANCE_XML} """, [ { "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*", "fields": ["codemeta:embargoDate"], }, ], ), ( 
"error-and-missing-provenance", f"""\ some url bar no one """, [ { "summary": ".*Reason: character data between child elements.*", "fields": ["codemeta:author"], }, { "summary": "Suggested fields are missing", "fields": ["swh:metadata-provenance"], }, ], ), ( "unknown-tag-in-swh-namespace", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: Unexpected child with tag 'swh:invalid'.*" r"Instance:.*swh:invalid.*" ), "fields": ["swh:deposit"], } ], ), ( "multiple-swh:add_to_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: Unexpected child with tag 'swh:add_to_origin'.*" ), "fields": ["swh:deposit"], } ], ), ( "swh:add_to_origin-and-swh:create_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-create-and-add".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:reference-and-swh:create_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-create-and-reference".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:add_to_origin-and-swh:reference", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-add-and-reference".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:reference-two-children", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", "fields": ["swh:deposit"], }, ], ), ( "swh:reference-two-origins", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", "fields": 
["swh:deposit"], }, ], ), ( "swh:reference-two-objects", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:object'.*", "fields": ["swh:deposit"], }, ], ), ] ] @pytest.mark.parametrize("metadata_ko,expected_summaries", _parameters3) def test_api_checks_check_metadata_ko_schema( metadata_ko, expected_summaries, swh_checks_deposit ): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert len(error_detail["metadata"]) == len(expected_summaries), error_detail[ "metadata" ] for (detail, expected_summary) in zip(error_detail["metadata"], expected_summaries): assert detail["fields"] == expected_summary["fields"] # xmlschema returns very detailed errors, we cannot reasonably test them # for equality summary = detail["summary"] assert re.match( expected_summary["summary"], summary, re.DOTALL ), f"Failed to match {expected_summary['summary']!r} with:\n{summary}" diff --git a/swh/deposit/tests/api/test_collection.py b/swh/deposit/tests/api/test_collection.py index ade4e7cd..a4340095 100644 --- a/swh/deposit/tests/api/test_collection.py +++ b/swh/deposit/tests/api/test_collection.py @@ -1,95 +1,90 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_REJECTED from swh.deposit.parsers import parse_xml def test_deposit_post_will_fail_with_401(unauthorized_client): - """Without authentication, endpoint refuses access with 401 response - - """ + """Without authentication, endpoint refuses access with 401 response""" url = reverse(COL_IRI, args=["hal"]) response = 
unauthorized_client.post(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED def test_deposit_post_insufficient_permission(insufficient_perm_client): """With connection ok but insufficient permission, endpoint refuses access""" url = reverse(COL_IRI, args=["hal"]) response = insufficient_perm_client.post(url) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"permission" in response.content def test_access_to_another_user_collection_is_forbidden( authenticated_client, deposit_another_collection, deposit_user ): - """Access to another user collection should return a 403 - - """ + """Access to another user collection should return a 403""" coll2 = deposit_another_collection url = reverse(COL_IRI, args=[coll2.name]) response = authenticated_client.post(url) assert response.status_code == status.HTTP_403_FORBIDDEN - msg = "Client %s cannot access collection %s" % (deposit_user.username, coll2.name,) + msg = "Client %s cannot access collection %s" % ( + deposit_user.username, + coll2.name, + ) assert msg in response.content.decode("utf-8") def test_put_on_col_iri_not_supported(authenticated_client, deposit_collection): - """Delete on col iri should return a 405 response - - """ + """Delete on col iri should return a 405 response""" url = reverse(COL_IRI, args=[deposit_collection.name]) response = authenticated_client.put(url) assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED assert "PUT method is not supported on this endpoint" in response.content.decode( "utf-8" ) def test_delete_on_col_iri_not_supported(authenticated_client, deposit_collection): - """Delete on col iri should return a 405 response - - """ + """Delete on col iri should return a 405 response""" url = reverse(COL_IRI, args=[deposit_collection.name]) response = authenticated_client.delete(url) assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED assert "DELETE method is not supported on this endpoint" in response.content.decode( "utf-8" ) def 
create_deposit_with_rejection_status(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_REJECTED diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py index 8f3f02e3..666bedc6 100644 --- a/swh/deposit/tests/api/test_collection_add_to_origin.py +++ b/swh/deposit/tests/api/test_collection_add_to_origin.py @@ -1,152 +1,149 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom from swh.deposit.utils import NAMESPACES from ..conftest import internal_create_deposit def test_add_deposit_with_add_to_origin( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): - """Posting deposit with creates a new deposit with parent - - """ + """Posting deposit with creates a new 
deposit with parent""" # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_add_to_origin_conflict( authenticated_client, deposit_collection, deposit_another_collection, atom_dataset, sample_archive, deposit_user, deposit_another_user, ): """Posting a deposit with an referencing an origin owned by a different client raises an error """ external_id = "foobar" origin_url = deposit_another_user.provider_url + external_id # create a deposit for that other user, with the same slug internal_create_deposit( deposit_another_user, deposit_another_collection, external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"must start with" in response.content def test_add_deposit_add_to_wrong_origin( - authenticated_client, deposit_collection, atom_dataset, sample_archive, + authenticated_client, + deposit_collection, + 
atom_dataset, + sample_archive, ): """Posting a deposit with an referencing an origin not starting with the provider_url raises an error """ origin_url = "http://example.org/foo" # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, ) assert response.status_code == status.HTTP_403_FORBIDDEN, response.content.decode() assert b"must start with" in response.content def test_add_deposit_with_add_to_origin_and_external_identifier( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): - """Posting deposit with creates a new deposit with parent - - """ + """Posting deposit with creates a new deposit with parent""" # given multiple deposit already loaded origin_url = deposit_user.provider_url + completed_deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-add-to-origin-and-external-id"] % origin_url, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"<external_identifier> is deprecated" in response.content def test_post_deposit_atom_403_add_to_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """Creating an origin for a prefix not owned by the client is forbidden - - """ + """Creating an origin for a prefix not owned by the client is forbidden""" origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-add-to-origin"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN assert "URL mismatch" in response.content.decode() diff --git 
a/swh/deposit/tests/api/test_collection_list.py b/swh/deposit/tests/api/test_collection_list.py index c92b1094..c6395008 100644 --- a/swh/deposit/tests/api/test_collection_list.py +++ b/swh/deposit/tests/api/test_collection_list.py @@ -1,122 +1,116 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from requests.utils import parse_header_links from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL from swh.deposit.models import DepositCollection from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES def test_deposit_collection_list_is_auth_protected(anonymous_client): - """Deposit list should require authentication - - """ + """Deposit list should require authentication""" url = reverse(COL_IRI, args=("test",)) response = anonymous_client.get(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED assert b"protected by basic authentication" in response.content def test_deposit_collection_list_collection_access_restricted_to_user_coll( deposit_another_collection, deposit_user, authenticated_client ): - """Deposit list api should restrict access to user's collection - - """ + """Deposit list api should restrict access to user's collection""" collection_id = authenticated_client.deposit_client.collections[0] coll = DepositCollection.objects.get(pk=collection_id) # authenticated_client has access to the "coll" collection coll2 = deposit_another_collection assert coll.name != coll2.name # but does not have access to that coll2 collection url = reverse(COL_IRI, args=(coll2.name,)) response = authenticated_client.get(url) # so it gets rejected access to the listing of that coll2 collection assert response.status_code == 
status.HTTP_403_FORBIDDEN msg = f"{deposit_user.username} cannot access collection {coll2.name}" assert msg in response.content.decode("utf-8") def test_deposit_collection_list_nominal( partial_deposit, deposited_deposit, authenticated_client ): - """Deposit list api should return the user deposits in a paginated way - - """ + """Deposit list api should return the user deposits in a paginated way""" client_id = authenticated_client.deposit_client.id assert partial_deposit.client.id == client_id assert deposited_deposit.client.id == client_id # Both deposit were deposited by the authenticated client # so requesting the listing of the deposits, both should be listed deposit_id = str(partial_deposit.id) deposit_id2 = str(deposited_deposit.id) coll = partial_deposit.collection # requesting the listing of the deposit for the user's collection url = reverse(COL_IRI, args=(coll.name,)) response = authenticated_client.get(f"{url}?page_size=1") assert response.status_code == status.HTTP_200_OK data = parse_xml(response.content) assert ( data.findtext("swh:count", namespaces=NAMESPACES) == "2" ) # total result of 2 deposits if consuming all results header_link = parse_header_links(response["Link"]) assert len(header_link) == 1 # only 1 next link expected_next = f"{url}?page=2&page_size=1" assert header_link[0]["url"].endswith(expected_next) assert header_link[0]["rel"] == "next" # only one deposit in the response assert len(data.findall("atom:entry", namespaces=NAMESPACES)) == 1 assert data.findtext("atom:entry/swh:id", namespaces=NAMESPACES) == str(deposit_id) assert ( data.findtext("atom:entry/swh:status", namespaces=NAMESPACES) == DEPOSIT_STATUS_PARTIAL ) # then 2nd page response2 = authenticated_client.get(expected_next) assert response2.status_code == status.HTTP_200_OK data2 = parse_xml(response2.content) assert ( data2.findtext("swh:count", namespaces=NAMESPACES) == "2" ) # still total of 2 deposits across all results expected_previous = f"{url}?page_size=1" 
header_link2 = parse_header_links(response2["Link"]) assert len(header_link2) == 1 # only 1 previous link assert header_link2[0]["url"].endswith(expected_previous) assert header_link2[0]["rel"] == "previous" # only 1 deposit in the response assert len(data2.findall("atom:entry", namespaces=NAMESPACES)) == 1 assert data2.findtext("atom:entry/swh:id", namespaces=NAMESPACES) == str( deposit_id2 ) assert ( data2.findtext("atom:entry/swh:status", namespaces=NAMESPACES) == DEPOSIT_STATUS_DEPOSITED ) # Retrieve every deposit in one query (no page_size parameter) response3 = authenticated_client.get(url) assert response3.status_code == status.HTTP_200_OK data3 = parse_xml(response3.content) assert ( data3.findtext("swh:count", namespaces=NAMESPACES) == "2" ) # total result of 2 deposits across all results deposits3 = data3.findall("atom:entry/swh:id", namespaces=NAMESPACES) # list here assert isinstance(deposits3, list) assert len(deposits3) == 2 header_link3 = parse_header_links(response3["Link"]) assert header_link3 == [] # no pagination as all results received in one round assert deposits3[0].text == str(deposit_id) assert deposits3[1].text == str(deposit_id2) diff --git a/swh/deposit/tests/api/test_collection_post_atom.py b/swh/deposit/tests/api/test_collection_post_atom.py index ae2f7953..a613b4ec 100644 --- a/swh/deposit/tests/api/test_collection_post_atom.py +++ b/swh/deposit/tests/api/test_collection_post_atom.py @@ -1,835 +1,839 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests the handling of the Atom content when doing a POST Col-IRI.""" import datetime import textwrap import uuid import warnings from xml.etree import ElementTree import attr from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from 
swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_SUCCESS, APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.tests.common import post_atom from swh.deposit.utils import ( NAMESPACES, compute_metadata_context, extended_swhid_from_qualified, ) from swh.model.hypothesis_strategies import ( directories, present_contents, releases, revisions, snapshots, ) from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, RawExtrinsicMetadata, ) from swh.model.swhids import ObjectType, QualifiedSWHID from swh.storage.interface import PagedResult def _insert_object(swh_storage, swhid): """Insert an object with the given swhid in the archive""" if swhid.object_type == ObjectType.CONTENT: with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = present_contents().example() swh_storage.content_add([attr.evolve(obj, sha1_git=swhid.object_id)]) else: object_type_name = swhid.object_type.name.lower() strategy = { "directory": directories, "revision": revisions, "release": releases, "snapshot": snapshots, }[object_type_name] method = getattr(swh_storage, object_type_name + "_add") with warnings.catch_warnings(): # hypothesis doesn't like us using .example(), but we know what we're doing warnings.simplefilter("ignore") obj = strategy().example() method([attr.evolve(obj, id=swhid.object_id)]) def _assert_deposit_info_on_metadata( swh_storage, metadata_swhid, deposit, metadata_fetcher ): swh_authority = MetadataAuthority( - MetadataAuthorityType.REGISTRY, "http://deposit.softwareheritage.example/", + MetadataAuthorityType.REGISTRY, + "http://deposit.softwareheritage.example/", ) page_results = swh_storage.raw_extrinsic_metadata_get(metadata_swhid, swh_authority) assert len(page_results.results) == 1 assert page_results.next_page_token is None expected_xml_data = textwrap.dedent( 
f"""\ {deposit.id} https://hal-test.archives-ouvertes.fr/ test """ ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( target=metadata_swhid, discovery_date=deposit.complete_date, authority=swh_authority, fetcher=metadata_fetcher, format="xml-deposit-info", metadata=expected_xml_data.encode(), ) ], next_page_token=None, ) def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): - """Posting an initial atom entry should return 201 with deposit receipt - - """ + """Posting an initial atom entry should return 201 with deposit receipt""" atom_error_with_decimal = atom_dataset["error-with-decimal"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.raw_metadata is not None sw_version = ElementTree.fromstring(dr.raw_metadata).findtext( "codemeta:softwareVersion", namespaces=NAMESPACES ) assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): - """Posting empty body request should return a 400 response - - """ + """Posting empty body request should return a 400 response""" atom_content = atom_dataset["entry-data-empty-body"] response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_content, HTTP_SLUG="external-id", ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_with_empty_request( 
authenticated_client, deposit_collection ): - """Posting empty request should return a 400 response - - """ + """Posting empty request should return a 400 response""" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data={}, HTTP_SLUG="external-id", CONTENT_LENGTH=0, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): - """Posting a badly formatted atom should return a 400 response - - """ + """Posting a badly formatted atom should return a 400 response""" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-badly-formatted"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): - """Posting parsing error prone atom should return 400 - - """ + """Posting parsing error prone atom should return 400""" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-parsing-error-prone"], HTTP_SLUG="external-id", ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_post_deposit_atom_400_both_create_origin_and_add_to_origin( authenticated_client, deposit_collection, atom_dataset ): - """Posting a badly formatted atom should return a 400 response - - """ + """Posting a badly formatted atom should return a 400 response""" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data-with-both-create-origin-and-add-to-origin"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( 
b"<swh:create_origin> and <swh:add_to_origin> " b"are mutually exclusive" ) in response.content def test_post_deposit_atom_403_create_wrong_origin_url_prefix( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """Creating an origin for a prefix not owned by the client is forbidden - - """ + """Creating an origin for a prefix not owned by the client is forbidden""" origin_url = "http://example.org/foo" response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_403_FORBIDDEN assert "URL mismatch" in response.content.decode() def test_post_deposit_atom_use_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Posting an atom entry with a slug header but no origin url generates an origin url from the slug """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_no_origin_url_nor_slug_header( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): - """Posting an atom entry without an origin url or a slug header should generate one - - """ + """Posting an atom entry without an origin url or a slug header should generate one""" url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) mocker.patch("uuid.uuid4", 
return_value=slug) # when response = post_atom( authenticated_client, url, data=atom_dataset["entry-data-no-origin-url"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_with_slug_and_external_identifier( authenticated_client, deposit_collection, deposit_user, atom_dataset, mocker ): """Even though is deprecated, it should still be allowed when it matches the slug, so that we don't break existing clients """ url = reverse(COL_IRI, args=[deposit_collection.name]) slug = str(uuid.uuid4()) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % slug, HTTP_IN_PROGRESS="false", HTTP_SLUG=slug, ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + slug assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_atom_with_mismatched_slug_and_external_identifier( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry with mismatched slug header and external_identifier should return a 400 """ external_id = "foobar" url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = post_atom( authenticated_client, url, data=atom_dataset["error-with-external-identifier"] % external_id, HTTP_IN_PROGRESS="false", HTTP_SLUG="something", ) assert ( b"The <external_identifier> tag and 
Slug header are deprecated" in response.content ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_external_identifier( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ was deprecated before was introduced, clients should get an error when trying to use both """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-external-identifier-and-create-origin"].format( - external_id=external_id, url=origin_url, + external_id=external_id, + url=origin_url, ) # when response = post_atom( - authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", + authenticated_client, + url, + data=document, + HTTP_IN_PROGRESS="false", ) assert b"<external_identifier> is deprecated" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_with_create_origin_and_reference( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """ and are mutually exclusive - - """ + """ and are mutually exclusive""" external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) document = atom_dataset["error-with-reference-and-create-origin"].format( - external_id=external_id, url=origin_url, + external_id=external_id, + url=origin_url, ) # when response = post_atom( - authenticated_client, url, data=document, HTTP_IN_PROGRESS="false", + authenticated_client, + url, + data=document, + HTTP_IN_PROGRESS="false", ) assert b"only one may be used on a given deposit" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): - """Posting an atom entry to an unknown collection should return a 404 - - """ + """Posting an atom entry to an unknown collection should return a 
404""" unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = post_atom( authenticated_client, reverse(COL_IRI, args=[unknown_collection]), data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND assert b"Unknown collection" in response.content def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """Posting an initial atom entry should return 201 with deposit receipt - - """ + """Posting an initial atom entry should return 201 with deposit receipt""" # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["entry-data0"] % origin_url # when date_before = datetime.datetime.now(tz=datetime.timezone.utc) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) date_after = datetime.datetime.now(tz=datetime.timezone.utc) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert ( date_before <= datetime.datetime.fromisoformat( response_content.findtext("swh:deposit_date", namespaces=NAMESPACES) ) <= date_after ) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( 
authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """Posting an initial atom entry should return 201 with deposit receipt - - """ + """Posting an initial atom entry should return 201 with deposit receipt""" # given origin_url = deposit_user.provider_url + "1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(origin_url=origin_url) atom_entry_data = atom_dataset["codemeta-sample"] % origin_url # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_entry_data, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_deposit_metadata_invalid( authenticated_client, deposit_collection, atom_dataset ): - """Posting invalid swhid reference is bad request returned to client - - """ + """Posting invalid swhid reference is bad request returned to client""" invalid_swhid = "swh:1:dir :31b5c8cc985d190b5a7ef4878128ebfdc2358f49" xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=invalid_swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Invalid SWHID reference" in response.content def test_deposit_metadata_invalid_metadata_provenance( authenticated_client, deposit_collection, atom_dataset ): - """Posting invalid metadata provenance is bad 
request returned to client - - """ + """Posting invalid metadata provenance is bad request returned to client""" invalid_swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" xml_data = atom_dataset["entry-data-with-swhid"].format( swhid=invalid_swhid, metadata_provenance_url=( "https://inria.halpreprod.archives-ouvertes.fr/hal-abcdefgh" ), ) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_403_FORBIDDEN assert b"URL mismatch" in response.content def test_deposit_metadata_fails_functional_checks( authenticated_client, deposit_collection, atom_dataset ): - """Posting functionally invalid metadata swhid is bad request returned to client - - """ + """Posting functionally invalid metadata swhid is bad request returned to client""" swhid = "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49" invalid_xml_data = atom_dataset[ "entry-data-with-swhid-fail-metadata-functional-checks" ].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=invalid_xml_data, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Functional metadata checks failure" in response.content @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", 
"swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_swhid( - swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, + swhid, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, ): - """Posting a swhid reference is stored on raw extrinsic metadata storage - - """ + """Posting a swhid reference is stored on raw extrinsic metadata storage""" swhid_reference = QualifiedSWHID.from_string(swhid) swhid_target = extended_swhid_from_qualified(swhid_reference) xml_data = atom_dataset["entry-data-with-swhid"].format( swhid=swhid, metadata_provenance_url="https://hal-test.archives-ouvertes.fr/hal-abcdefgh", ) deposit_client = authenticated_client.deposit_client _insert_object(swh_storage, swhid_reference) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.swhid == str(swhid_target) assert deposit.swhid_context == str(swhid_reference) assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() 
metadata_fetcher = MetadataFetcher( - name=config.tool["name"], version=config.tool["version"], + name=config.tool["name"], + version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( swhid_target, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata_context = compute_metadata_context(swhid_reference) metadata = RawExtrinsicMetadata( target=swhid_target, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), **metadata_context, ) - assert page_results == PagedResult(results=[metadata], next_page_token=None,) + assert page_results == PagedResult( + results=[metadata], + next_page_token=None, + ) # Get metadata about the deposited metadata object and check it: _assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( - "url", ["https://gitlab.org/user/repo", "https://whatever.else/repo",] + "url", + [ + "https://gitlab.org/user/repo", + "https://whatever.else/repo", + ], ) def test_deposit_metadata_origin( - url, authenticated_client, deposit_collection, atom_dataset, swh_storage, + url, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, ): - """Posting a swhid reference is stored on raw extrinsic metadata storage - - """ + """Posting a swhid reference is stored on raw extrinsic metadata storage""" xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) origin_swhid = Origin(url).swhid() deposit_client = authenticated_client.deposit_client swh_storage.origin_add([Origin(url)]) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) 
assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) # we got not swhid as input so we cannot have those assert deposit.swhid is None assert deposit.swhid_context is None assert deposit.complete_date == deposit.reception_date assert deposit.complete_date is not None assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url, + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit_client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( - name=config.tool["name"], version=config.tool["version"], + name=config.tool["name"], + version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher # Get the deposited metadata object and check it: page_results = swh_storage.raw_extrinsic_metadata_get( origin_swhid, metadata_authority ) assert len(page_results.results) == 1 assert page_results.next_page_token is None metadata = RawExtrinsicMetadata( target=origin_swhid, discovery_date=deposit.complete_date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=xml_data.encode(), ) - assert page_results == PagedResult(results=[metadata], next_page_token=None,) + assert page_results == PagedResult( + results=[metadata], + next_page_token=None, + ) # Get metadata about the deposited metadata object and check it: 
_assert_deposit_info_on_metadata( swh_storage, metadata.swhid(), deposit, metadata_fetcher ) @pytest.mark.parametrize( "swhid", [ "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo", ], ) def test_deposit_metadata_unknown_swhid( - swhid, authenticated_client, deposit_collection, atom_dataset, swh_storage, + swhid, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, ): - """Posting a swhid reference is rejected if the referenced object is unknown - - """ + """Posting a swhid reference is rejected if the referenced object is unknown""" xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "object does not exist" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) @pytest.mark.parametrize( "swhid", [ "swh:1:ori:01b5c8cc985d190b5a7ef4878128ebfdc2358f49", "swh:1:emd:11b5c8cc985d190b5a7ef4878128ebfdc2358f49", ], ) def test_deposit_metadata_extended_swhid( - swhid, authenticated_client, 
deposit_collection, atom_dataset, swh_storage, + swhid, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, ): """Posting a swhid reference is rejected if the referenced SWHID is for an extended object type """ xml_data = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=swhid) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "Invalid SWHID reference" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) def test_deposit_metadata_unknown_origin( - authenticated_client, deposit_collection, atom_dataset, swh_storage, + authenticated_client, + deposit_collection, + atom_dataset, + swh_storage, ): - """Posting a swhid reference is stored on raw extrinsic metadata storage - - """ + """Posting a swhid reference is stored on raw extrinsic metadata storage""" url = "https://gitlab.org/user/repo" xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url) response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=xml_data, ) assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() response_content = ElementTree.fromstring(response.content) assert "known to the archive" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) diff --git a/swh/deposit/tests/api/test_collection_post_binary.py b/swh/deposit/tests/api/test_collection_post_binary.py index 5e39aeb9..26e7f5a4 100644 --- a/swh/deposit/tests/api/test_collection_post_binary.py +++ b/swh/deposit/tests/api/test_collection_post_binary.py @@ -1,356 +1,341 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # 
See top-level LICENSE file for more information """Tests the handling of the binary content when doing a POST Col-IRI.""" import uuid from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( check_archive, create_arborescence_archive, post_archive, ) from swh.deposit.utils import NAMESPACES def test_post_deposit_binary_no_slug( authenticated_client, deposit_collection, sample_archive, deposit_user, mocker ): - """Posting a binary deposit without slug header should generate one - - """ + """Posting a binary deposit without slug header should generate one""" id_ = str(uuid.uuid4()) mocker.patch("uuid.uuid4", return_value=id_) url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = post_archive( - authenticated_client, url, sample_archive, in_progress="false", + authenticated_client, + url, + sample_archive, + in_progress="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + id_ assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_binary_support( authenticated_client, deposit_collection, sample_archive ): - """Binary upload with content-type not in [zip,x-tar] should return 415 - - """ + """Binary upload with content-type not in [zip,x-tar] should return 415""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, sample_archive, HTTP_SLUG=external_id, content_type="application/octet-stream", 
HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_ok( authenticated_client, deposit_collection, sample_archive ): - """Binary upload with correct headers should return 201 with receipt - - """ + """Binary upload with correct headers should return 201 with receipt""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then response_content = parse_xml(response.content) assert response.status_code == status.HTTP_201_CREATED deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_request = DepositRequest.objects.get(deposit=deposit) check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None response_content = parse_xml(response.content) assert ( response_content.findtext("swh:deposit_archive", namespaces=NAMESPACES) == sample_archive["name"] ) assert ( int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id ) assert ( response_content.findtext("swh:deposit_status", namespaces=NAMESPACES) == deposit.status ) # deprecated tags assert ( response_content.findtext("atom:deposit_archive", namespaces=NAMESPACES) == sample_archive["name"] ) assert ( int(response_content.findtext("atom:deposit_id", namespaces=NAMESPACES)) == deposit.id ) assert ( response_content.findtext("atom:deposit_status", namespaces=NAMESPACES) == deposit.status ) from django.urls import 
reverse as reverse_strict edit_iri = reverse_strict("edit_iri", args=[deposit_collection.name, deposit.id]) assert response["location"] == f"http://testserver{edit_iri}" def test_post_deposit_binary_failure_unsupported_packaging_header( authenticated_client, deposit_collection, sample_archive ): - """Bin deposit without supported content_disposition header returns 400 - - """ + """Bin deposit without supported content_disposition header returns 400""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG=external_id, HTTP_PACKAGING="something-unsupported", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( b"The packaging provided something-unsupported is not supported" in response.content ) with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_no_content_disposition_header( authenticated_client, deposit_collection, sample_archive ): - """Binary upload without content_disposition header should return 400 - - """ + """Binary upload without content_disposition header should return 400""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION=None, ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"CONTENT_DISPOSITION header is mandatory" in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_mediation_not_supported( authenticated_client, deposit_collection, sample_archive ): - """Binary upload with mediation should return a 412 response - - """ + """Binary upload with mediation should return a 412 response""" # given url = reverse(COL_IRI, 
args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", HTTP_ON_BEHALF_OF="someone", ) # then assert response.status_code == status.HTTP_412_PRECONDITION_FAILED with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_fail_if_upload_size_limit_exceeded( authenticated_client, deposit_collection, sample_archive, tmp_path ): - """Binary upload must not exceed the limit set up... - - """ + """Binary upload must not exceed the limit set up...""" tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) archive = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file", up_to_size=5000 ) external_id = "some-external-id" # when response = post_archive( authenticated_client, url, archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE assert b"Upload size limit exceeded" in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_fail_if_content_length_missing( authenticated_client, deposit_collection, sample_archive, tmp_path ): - """The Content-Length header is mandatory - - """ + """The Content-Length header is mandatory""" tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) archive = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file", up_to_size=500 ) external_id = "some-external-id" # when response = post_archive( authenticated_client, url, archive, CONTENT_LENGTH=None, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"the CONTENT_LENGTH header must be sent." 
in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_2_post_2_different_deposits( authenticated_client, deposit_collection, sample_archive ): - """2 posting deposits should return 2 different 201 with receipt - - """ + """2 posting deposits should return 2 different 201 with receipt""" url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG="some-external-id-1", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) deposits = Deposit.objects.all() assert len(deposits) == 1 assert deposits[0] == deposit # second post response = post_archive( authenticated_client, url, sample_archive, content_type="application/x-tar", HTTP_SLUG="another-external-id", HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id2 = int( response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) ) deposit2 = Deposit.objects.get(pk=deposit_id2) assert deposit != deposit2 deposits = Deposit.objects.all().order_by("id") assert len(deposits) == 2 assert list(deposits), [deposit == deposit2] diff --git a/swh/deposit/tests/api/test_collection_post_multipart.py b/swh/deposit/tests/api/test_collection_post_multipart.py index aa0759e8..20f1bdb4 100644 --- a/swh/deposit/tests/api/test_collection_post_multipart.py +++ b/swh/deposit/tests/api/test_collection_post_multipart.py @@ -1,392 +1,393 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information 
"""Tests handling of multipart requests to POST Col-IRI.""" from io import BytesIO import uuid from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, post_multipart from swh.deposit.utils import NAMESPACES def test_post_deposit_multipart( authenticated_client, deposit_collection, atom_dataset, mocker, deposit_user, sample_archive, ): # given external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data0"] % origin_url # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry, HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_multipart_without_origin_url( authenticated_client, deposit_collection, atom_dataset, mocker, deposit_user, sample_archive, ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data-deposit-binary"] id_ = str(uuid.uuid4()) mocker.patch("uuid.uuid4", return_value=id_) # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry, HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) 
deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == deposit_user.provider_url + id_ assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_post_deposit_multipart_zip( authenticated_client, deposit_collection, atom_dataset, sample_archive ): - """one multipart deposit (zip+xml) should be accepted - - """ + """one multipart deposit (zip+xml) should be accepted""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data-deposit-binary"] external_id = "external-id" # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry, HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.raw_metadata is None else: assert ( parse_xml(deposit_request.raw_metadata).findtext( "atom:id", namespaces=NAMESPACES ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_tar( authenticated_client, deposit_collection, atom_dataset, sample_archive ): - """one multipart deposit (tar+xml) should be accepted - - """ + """one multipart deposit (tar+xml) should be accepted""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = 
atom_dataset["entry-data-deposit-binary"] external_id = "external-id" # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry, HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.raw_metadata is None else: assert ( parse_xml(deposit_request.raw_metadata).findtext( "atom:id", namespaces=NAMESPACES ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_put_to_replace_metadata( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """One multipart deposit followed by a metadata update should be - accepted + accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data-deposit-binary"] external_id = "external-id" # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry, HTTP_IN_PROGRESS="true", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id 
assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( parse_xml(deposit_request.raw_metadata).findtext( "atom:id", namespaces=NAMESPACES ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry replace_metadata_uri = response["location"] response = authenticated_client.put( replace_metadata_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_204_NO_CONTENT # deposit_id did not change deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( parse_xml(deposit_request.raw_metadata).findtext( "atom:id", namespaces=NAMESPACES ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert ( deposit_request.raw_metadata == atom_dataset["entry-data-deposit-binary"] ) # FAILURE scenarios def test_post_deposit_multipart_only_archive_and_atom_entry( authenticated_client, deposit_collection ): """Multipart deposit only accepts one archive and one atom+xml""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), 
field_name="archive0", name="archive0", content_type="application/x-tar", size=len(archive_content), charset=None, ) other_archive_content = b"some-other-content" other_archive = InMemoryUploadedFile( BytesIO(other_archive_content), field_name="atom0", name="atom0", content_type="application/x-tar", size=len(other_archive_content), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", - data={"archive": archive, "atom_entry": other_archive,}, + data={ + "archive": archive, + "atom_entry": other_archive, + }, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "Only 1 application/zip (or application/x-tar) archive" in response.content.decode("utf-8") ) # when archive.seek(0) response = authenticated_client.post( url, format="multipart", - data={"archive": archive,}, + data={ + "archive": archive, + }, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for " "multipart deposit" in response.content.decode("utf-8") ) is True def test_post_deposit_multipart_400_when_badly_formatted_xml( authenticated_client, deposit_collection, sample_archive, atom_dataset ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry_ko = atom_dataset["entry-data-ko"] # when response = post_multipart( authenticated_client, url, sample_archive, data_atom_entry_ko, HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) assert b"Malformed xml metadata" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_multipart_if_upload_size_limit_exceeded( authenticated_client, deposit_collection, atom_dataset, sample_archive ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive = { **sample_archive, 
"data": sample_archive["data"] * 100, } data_atom_entry = atom_dataset["entry-data-deposit-binary"] external_id = "external-id" # when response = post_multipart( authenticated_client, url, archive, data_atom_entry, HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE assert b"Upload size limit exceeded" in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) diff --git a/swh/deposit/tests/api/test_collection_reuse_slug.py b/swh/deposit/tests/api/test_collection_reuse_slug.py index 2d533657..ed6fe1c0 100644 --- a/swh/deposit/tests/api/test_collection_reuse_slug.py +++ b/swh/deposit/tests/api/test_collection_reuse_slug.py @@ -1,283 +1,281 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, SE_IRI, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom from swh.deposit.utils import NAMESPACES from ..conftest import internal_create_deposit def test_act_on_deposit_rejected_is_not_permitted( authenticated_client, deposit_collection, rejected_deposit, atom_dataset ): deposit = rejected_deposit response = post_atom( authenticated_client, reverse(SE_IRI, args=[deposit.collection.name, deposit.id]), data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_400_BAD_REQUEST print(response.content) assert ( parse_xml(response.content).findtext("atom:summary", namespaces=NAMESPACES) == f"You can only act on deposit with status 
'{DEPOSIT_STATUS_PARTIAL}'" ) def test_add_deposit_when_partial_makes_new_deposit( authenticated_client, deposit_collection, partial_deposit, atom_dataset, deposit_user, ): - """Posting deposit on collection when previous is partial makes new deposit - - """ + """Posting deposit on collection when previous is partial makes new deposit""" deposit = partial_deposit assert deposit.status == DEPOSIT_STATUS_PARTIAL origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id # new deposit new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_when_failed_makes_new_deposit_with_no_parent( authenticated_client, deposit_collection, failed_deposit, atom_dataset, deposit_user ): """Posting deposit on collection when deposit done makes new deposit with parent """ deposit = failed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id 
new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_when_done_makes_new_deposit_with_parent_old_one( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Posting deposit on collection when deposit done makes new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_with_external_identifier( authenticated_client, deposit_collection, completed_deposit, atom_dataset, deposit_user, ): """Even though is deprecated, it should still be allowed when it matches the slug, so that we don't break existing clients """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), 
data=atom_dataset["error-with-external-identifier"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.origin_url == origin_url assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url def test_add_deposit_external_id_conflict_no_parent( authenticated_client, deposit_collection, deposit_another_collection, atom_dataset, deposit_user, deposit_another_user, ): """Posting a deposit with an external_id conflicting with an external_id of a different client does not create a parent relationship """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id # create a deposit for that other user, with the same slug other_deposit = internal_create_deposit( deposit_another_user, deposit_another_collection, external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert other_deposit.id != deposit_id new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit.parent is None assert new_deposit.origin_url == origin_url def test_add_deposit_external_id_conflict_with_parent( authenticated_client, deposit_collection, deposit_another_collection, completed_deposit, atom_dataset, deposit_user, deposit_another_user, ): """Posting a deposit with an external_id 
conflicting with an external_id of a different client creates a parent relationship with the deposit of the right client instead of the last matching deposit This test does not have an equivalent for origin url conflicts, as these can not happen (assuming clients do not have provider_url overlaps) """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS origin_url = deposit_user.provider_url + deposit.external_id # create a deposit for that other user, with the same slug other_deposit = internal_create_deposit( deposit_another_user, deposit_another_collection, deposit.external_id, DEPOSIT_STATUS_LOAD_SUCCESS, ) # adding a new deposit with the same external id as a completed deposit response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data0"] % origin_url, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id assert other_deposit.id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.external_id == new_deposit.external_id assert new_deposit != deposit assert new_deposit.parent == deposit assert new_deposit.origin_url == origin_url diff --git a/swh/deposit/tests/api/test_converters.py b/swh/deposit/tests/api/test_converters.py index a446d1c7..f4bd177a 100644 --- a/swh/deposit/tests/api/test_converters.py +++ b/swh/deposit/tests/api/test_converters.py @@ -1,98 +1,128 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.deposit.api.converters import 
convert_status_detail def test_convert_status_detail_empty(): for status_detail in [{}, {"dummy-keys": []}, None]: assert convert_status_detail(status_detail) is None def test_convert_status_detail(): status_detail = { "url": { "summary": "At least one url field must be compatible with the client's domain name. The following url fields failed the check", # noqa "fields": ["blahurl", "testurl"], }, "metadata": [ - {"summary": "Mandatory fields missing", "fields": ["url", "title"],}, + { + "summary": "Mandatory fields missing", + "fields": ["url", "title"], + }, { "summary": "Alternate fields missing", "fields": ["name or title", "url or badurl"], }, ], - "archive": [{"summary": "Unreadable archive", "fields": ["1"],}], + "archive": [ + { + "summary": "Unreadable archive", + "fields": ["1"], + } + ], } expected_status_detail = """- Mandatory fields missing (url, title) - Alternate fields missing (name or title, url or badurl) - Unreadable archive (1) - At least one url field must be compatible with the client's domain name. The following url fields failed the check (blahurl, testurl) """ # noqa actual_status_detail = convert_status_detail(status_detail) assert actual_status_detail == expected_status_detail def test_convert_status_detail_2(): status_detail = { "url": { "summary": "At least one compatible url field. Failed", "fields": ["testurl"], }, - "metadata": [{"summary": "Mandatory fields missing", "fields": ["name"],},], + "metadata": [ + { + "summary": "Mandatory fields missing", + "fields": ["name"], + }, + ], "archive": [ - {"summary": "Invalid archive", "fields": ["2"],}, - {"summary": "Unsupported archive", "fields": ["1"],}, + { + "summary": "Invalid archive", + "fields": ["2"], + }, + { + "summary": "Unsupported archive", + "fields": ["1"], + }, ], "loading": ["error1", "error 2"], } expected_status_detail = """- Mandatory fields missing (name) - Invalid archive (2) - Unsupported archive (1) - At least one compatible url field. 
Failed (testurl) - error1 - error 2 """ actual_status_detail = convert_status_detail(status_detail) assert actual_status_detail == expected_status_detail def test_convert_status_detail_3(): status_detail = { - "url": {"summary": "At least one compatible url field",}, + "url": { + "summary": "At least one compatible url field", + }, } expected_status_detail = "- At least one compatible url field\n" actual_status_detail = convert_status_detail(status_detail) assert actual_status_detail == expected_status_detail def test_convert_status_detail_edge_case(): status_detail = { "url": { "summary": "At least one compatible url field. Failed", "fields": ["testurl"], }, "metadata": [ - {"summary": "Mandatory fields missing", "fields": ["9", 10, 1.212],}, + { + "summary": "Mandatory fields missing", + "fields": ["9", 10, 1.212], + }, ], "archive": [ - {"summary": "Invalid archive", "fields": ["3"],}, - {"summary": "Unsupported archive", "fields": [2],}, + { + "summary": "Invalid archive", + "fields": ["3"], + }, + { + "summary": "Unsupported archive", + "fields": [2], + }, ], } expected_status_detail = """- Mandatory fields missing (9, 10, 1.212) - Invalid archive (3) - Unsupported archive (2) - At least one compatible url field. 
Failed (testurl) """ actual_status_detail = convert_status_detail(status_detail) assert actual_status_detail == expected_status_detail diff --git a/swh/deposit/tests/api/test_delete.py b/swh/deposit/tests/api/test_delete.py index af274d87..780882e4 100644 --- a/swh/deposit/tests/api/test_delete.py +++ b/swh/deposit/tests/api/test_delete.py @@ -1,135 +1,125 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from typing import Dict, Mapping import xml.etree.ElementTree as ET from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import ( ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, EDIT_IRI, EM_IRI, METADATA_TYPE, ) from swh.deposit.models import Deposit, DepositRequest from swh.deposit.utils import NAMESPACES def count_deposit_request_types(deposit_requests) -> Mapping[str, int]: deposit_request_types = defaultdict(int) # type: Dict[str, int] for dr in deposit_requests: deposit_request_types[dr.type] += 1 return deposit_request_types def test_delete_archive_on_partial_deposit_works( authenticated_client, partial_deposit_with_metadata, deposit_collection ): - """Removing partial deposit's archive should return a 204 response - - """ + """Removing partial deposit's archive should return a 204 response""" deposit_id = partial_deposit_with_metadata.id deposit = Deposit.objects.get(pk=deposit_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) # deposit request type: 'archive', 1 'metadata' deposit_request_types = count_deposit_request_types(deposit_requests) assert deposit_request_types == {ARCHIVE_TYPE: 1, METADATA_TYPE: 1} # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) response = authenticated_client.delete(update_uri) # then assert 
response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit_id) deposit_requests2 = DepositRequest.objects.filter(deposit=deposit) deposit_request_types = count_deposit_request_types(deposit_requests2) assert deposit_request_types == {METADATA_TYPE: 1} def test_delete_archive_on_undefined_deposit_fails( authenticated_client, deposit_collection, sample_archive ): - """Delete undefined deposit returns a 404 response - - """ + """Delete undefined deposit returns a 404 response""" # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, 999]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_404_NOT_FOUND def test_delete_non_partial_deposit( authenticated_client, deposit_collection, deposited_deposit ): - """Delete !partial status deposit should return a 400 response - - """ + """Delete !partial status deposit should return a 400 response""" deposit = deposited_deposit assert deposit.status == DEPOSIT_STATUS_DEPOSITED # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(response.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None def test_delete_partial_deposit( authenticated_client, deposit_collection, partial_deposit ): - """Delete deposit should return a 204 response - - """ + """Delete deposit should return a 204 response""" # given deposit = partial_deposit # when url = reverse(EDIT_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_204_NO_CONTENT deposit_requests = list(DepositRequest.objects.filter(deposit=deposit)) assert deposit_requests == [] deposits = 
list(Deposit.objects.filter(pk=deposit.id)) assert deposits == [] def test_delete_on_edit_iri_cannot_delete_non_partial_deposit( authenticated_client, deposit_collection, complete_deposit ): - """Delete !partial deposit should return a 400 response - - """ + """Delete !partial deposit should return a 400 response""" # given deposit = complete_deposit # when url = reverse(EDIT_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(response.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py index c2dda547..6f7acc76 100644 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -1,216 +1,208 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING from swh.deposit.api.private.deposit_check import ( MANDATORY_ARCHIVE_INVALID, MANDATORY_ARCHIVE_MISSING, MANDATORY_ARCHIVE_UNSUPPORTED, ) from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, create_archive_with_archive, ) from swh.deposit.utils import NAMESPACES 
PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), ] @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_ok( authenticated_client, deposit_collection, ready_deposit_ok, extension ): - """Proper deposit should succeed the checks (-> status ready) - - """ + """Proper deposit should succeed the checks (-> status ready)""" deposit = ready_deposit_ok for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED # Deposit is ok but it's missing suggested fields in its metadata detected by # the checks status_detail = deposit.status_detail["metadata"] assert len(status_detail) == 1 suggested = status_detail[0] assert suggested["summary"] == SUGGESTED_FIELDS_MISSING assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_invalid_tarball( tmp_path, authenticated_client, deposit_collection, extension ): - """Deposit with tarball (of 1 tarball) should fail the checks: rejected - - """ + """Deposit with tarball (of 1 tarball) should fail the checks: rejected""" deposit = create_deposit_archive_with_archive( tmp_path, extension, authenticated_client, deposit_collection.name ) for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = 
response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED def test_deposit_ko_missing_tarball( authenticated_client, deposit_collection, ready_deposit_only_metadata ): - """Deposit without archive should fail the checks: rejected - - """ + """Deposit without archive should fail the checks: rejected""" deposit = ready_deposit_only_metadata assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_deposit_ko_unsupported_tarball( tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive ): - """Deposit with an unsupported tarball should fail the checks: rejected - - """ + """Deposit with an unsupported tarball should fail the checks: rejected""" deposit = ready_deposit_invalid_archive assert DEPOSIT_STATUS_DEPOSITED == deposit.status for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED # metadata check failure assert 
len(details["metadata"]) == 1 mandatory = details["metadata"][0] assert mandatory["summary"] == "Missing Atom document" deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_deposit_metadata_ok( authenticated_client, deposit_collection, ready_deposit_ok ): """Proper deposit should succeed the checks (-> status ready) - with all **MUST** metadata + with all **MUST** metadata - using the codemeta metadata test set + using the codemeta metadata test set """ deposit = ready_deposit_ok assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def create_deposit_archive_with_archive( root_path, archive_extension, client, collection_name ): # we create the holding archive to a given extension archive = create_arborescence_archive( root_path, "archive1", "file1", b"some content in file", extension=archive_extension, ) # now we create an archive holding the first created archive invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) # we deposit it response = client.post( reverse(COL_IRI, args=[collection_name]), content_type="application/x-tar", data=invalid_archive["data"], CONTENT_LENGTH=invalid_archive["length"], HTTP_MD5SUM=invalid_archive["md5sum"], HTTP_SLUG="external-id", HTTP_IN_PROGRESS=False, HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_status = response_content.findtext( "swh:deposit_status", 
namespaces=NAMESPACES ) assert deposit_status == DEPOSIT_STATUS_DEPOSITED deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status return deposit diff --git a/swh/deposit/tests/api/test_deposit_private_list.py b/swh/deposit/tests/api/test_deposit_private_list.py index a5c135ac..7017d8d4 100644 --- a/swh/deposit/tests/api/test_deposit_private_list.py +++ b/swh/deposit/tests/api/test_deposit_private_list.py @@ -1,165 +1,172 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.api.converters import convert_status_detail from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS, PRIVATE_LIST_DEPOSITS from swh.deposit.models import DEPOSIT_CODE, DEPOSIT_METADATA_ONLY, DepositClient from swh.deposit.tests.conftest import internal_create_deposit STATUS_DETAIL = { "url": { "summary": "At least one compatible url field. 
Failed", "fields": ["testurl"], }, - "metadata": [{"summary": "Mandatory fields missing", "fields": ["9", 10, 1.212],},], + "metadata": [ + { + "summary": "Mandatory fields missing", + "fields": ["9", 10, 1.212], + }, + ], "archive": [ - {"summary": "Invalid archive", "fields": ["3"],}, - {"summary": "Unsupported archive", "fields": [2],}, + { + "summary": "Invalid archive", + "fields": ["3"], + }, + { + "summary": "Unsupported archive", + "fields": [2], + }, ], } def test_deposit_list( partial_deposit_with_metadata, partial_deposit_only_metadata, partial_deposit, authenticated_client, ): - """Deposit list api should return all deposits in a paginated way - - """ + """Deposit list api should return all deposits in a paginated way""" partial_deposit_with_metadata.status_detail = STATUS_DETAIL partial_deposit_with_metadata.save() deposit1 = partial_deposit_with_metadata deposit2 = partial_deposit_only_metadata deposit2.type = DEPOSIT_METADATA_ONLY deposit2.save() deposit3 = partial_deposit main_url = reverse(PRIVATE_LIST_DEPOSITS) url = f"{main_url}?page_size=1" response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data_p1 = response.json() assert data_p1["count"] == 3 # total nb of deposits expected_next_p1 = f"{main_url}?page=2&page_size=1" assert data_p1["next"].endswith(expected_next_p1) is True assert data_p1["previous"] is None assert len(data_p1["results"]) == 1 # page of size 1 deposit_d = data_p1["results"][0] assert deposit_d["id"] == deposit1.id assert deposit_d["status"] == deposit1.status expected_status_detail = convert_status_detail(STATUS_DETAIL) assert deposit_d["status_detail"] == expected_status_detail assert deposit_d["raw_metadata"] is not None assert deposit_d["type"] == DEPOSIT_CODE assert ( deposit_d["raw_metadata"] == deposit1.depositrequest_set.filter(type="metadata")[0].raw_metadata ) # then 2nd page response2 = authenticated_client.get(data_p1["next"]) assert response2.status_code == status.HTTP_200_OK 
data_p2 = response2.json() assert data_p2["count"] == 3 # total nb of deposits expected_next_p2 = f"{main_url}?page=3&page_size=1" assert data_p2["next"].endswith(expected_next_p2) assert data_p2["previous"].endswith(url) assert len(data_p2["results"]) == 1 # page of size 1 deposit2_d = data_p2["results"][0] assert deposit2_d["id"] == deposit2.id assert deposit2_d["status"] == deposit2.status assert deposit2_d["raw_metadata"] is not None assert deposit2_d["type"] == DEPOSIT_METADATA_ONLY assert ( deposit2_d["raw_metadata"] == deposit2.depositrequest_set.filter(type="metadata")[0].raw_metadata ) # then 3rd (and last) page response3 = authenticated_client.get(data_p2["next"]) assert response3.status_code == status.HTTP_200_OK data_p3 = response3.json() assert data_p3["count"] == 3 # total nb of deposits assert data_p3["next"] is None, "No more page beyond that point" assert data_p3["previous"] == data_p1["next"] assert len(data_p3["results"]) == 1 # page of size 1 deposit3_d = data_p3["results"][0] assert deposit3_d["id"] == deposit3.id assert deposit3_d["status"] == deposit3.status assert deposit3_d["type"] == DEPOSIT_CODE assert not deposit3.depositrequest_set.filter( type="metadata" ), "No metadata type request for that deposit" # hence no raw metadata set for that deposit assert deposit3_d["raw_metadata"] is None, "no raw metadata for that deposit" def test_deposit_list_exclude(partial_deposit, deposited_deposit, authenticated_client): - """Exclusion pattern on external_id should be respected - - """ + """Exclusion pattern on external_id should be respected""" partial_deposit.status_detail = STATUS_DETAIL partial_deposit.save() main_url = reverse(PRIVATE_LIST_DEPOSITS) # Testing exclusion pattern exclude_pattern = "external-id" assert partial_deposit.external_id.startswith(exclude_pattern) assert deposited_deposit.external_id.startswith(exclude_pattern) url = f"{main_url}?page_size=1&exclude=external-id" response = authenticated_client.get(url) assert 
response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 0 url = "%s?page_size=1&exclude=dummy" % main_url # that won't exclude anything response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 2 def test_deposit_list_for_username( authenticated_client, deposit_another_collection, completed_deposit, deposit_user, deposit_another_user, ): # create a new deposit with a user different from deposit_user, # the one that created completed_deposit internal_create_deposit( client=deposit_another_user, collection=deposit_another_collection, external_id="external-id-bar", status=DEPOSIT_STATUS_LOAD_SUCCESS, ) for user in (deposit_user, deposit_another_user): # check deposit filtering by username url = f"{reverse(PRIVATE_LIST_DEPOSITS)}?username={user.username}" json_response = authenticated_client.get(url).json() assert len(json_response["results"]) == 1 deposit_client = DepositClient.objects.all().get( id=json_response["results"][0]["client"] ) assert deposit_client.username == user.username diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py index 3ec0c186..85eeb5bb 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py @@ -1,106 +1,102 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os.path import exists, join import tarfile from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.api.private.deposit_read import aggregate_tarballs from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT from swh.deposit.tests.common import 
create_arborescence_archive PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]), reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id]), ] def test_access_to_existing_deposit_with_one_archive( authenticated_client, deposit_collection, complete_deposit, sample_archive, tmp_path, ): - """Access to deposit should stream a 200 response with its raw content - - """ + """Access to deposit should stream a 200 response with its raw content""" deposit = complete_deposit for i, url in enumerate(private_get_raw_url_endpoints(deposit_collection, deposit)): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/tar" # write the response stream in a temporary archive archive_path = join(tmp_path, f"archive_{i}.tar") with open(archive_path, "wb") as f: for chunk in response.streaming_content: f.write(chunk) # to check its properties are correct tfile = tarfile.open(archive_path) assert set(tfile.getnames()) == {".", "./file1"} assert tfile.extractfile("./file1").read() == b"some content in file" def test_access_to_existing_deposit_with_multiple_archives( tmp_path, authenticated_client, deposit_collection, partial_deposit, sample_archive ): - """Access to deposit should stream a 200 response with its raw contents - - """ + """Access to deposit should stream a 200 response with its raw contents""" deposit = partial_deposit archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # Add a second archive to deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], 
HTTP_SLUG=deposit.external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED for i, url in enumerate(private_get_raw_url_endpoints(deposit_collection, deposit)): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/tar" # write the response stream in a temporary archive archive_path = join(tmp_path, f"archive_{i}.tar") with open(archive_path, "wb") as f: for chunk in response.streaming_content: f.write(chunk) # to check its properties are correct tfile = tarfile.open(archive_path) assert set(tfile.getnames()) == {".", "./file1", "./file2"} assert tfile.extractfile("./file1").read() == b"some content in file" assert tfile.extractfile("./file2").read() == b"some other content in file" def test_aggregate_tarballs_with_strange_archive(datadir, tmp_path): archive = join(datadir, "archives", "single-artifact-package.tar.gz") with aggregate_tarballs(tmp_path, [archive]) as tarball_path: assert exists(tarball_path) diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py index 7e8879b6..8428266a 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -1,473 +1,475 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit import __version__ from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SE_IRI, SWH_PERSON from 
swh.deposit.models import Deposit PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" deposit_id = deposit if isinstance(deposit, int) else deposit.id return [ reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]), reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]), ] def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata): # update deposit's metadata response = authenticated_client.post( reverse(SE_IRI, args=[collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=metadata, HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED return deposit def test_read_missing_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Private metadata read api to existing deposit should return metadata - - """ + """Private metadata read api to existing deposit should return metadata""" deposit = partial_deposit deposit.external_id = "some-external-id" deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "raw_metadata": None, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": None, 
"author_date": None, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], "release_notes": None, }, } def test_read_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Private metadata read api to existing deposit should return metadata - - """ + """Private metadata read api to existing deposit should return metadata""" deposit = partial_deposit deposit.external_id = "some-external-id" deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}" deposit.save() metadata_xml_raw = atom_dataset["entry-data2"] deposit = update_deposit_with_metadata( - authenticated_client, deposit_collection, deposit, metadata_xml_raw, + authenticated_client, + deposit_collection, + deposit, + metadata_xml_raw, ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "raw_metadata": metadata_xml_raw, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], "release_notes": "This is the release of October 7th, 2017.", }, } def test_read_metadata_revision_with_parent( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Private read metadata to a deposit 
(with parent) returns metadata - - """ + """Private read metadata to a deposit (with parent) returns metadata""" deposit = partial_deposit deposit.external_id = "some-external-id" deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}" deposit.save() metadata_xml_raw = atom_dataset["entry-data2"] deposit = update_deposit_with_metadata( - authenticated_client, deposit_collection, deposit, metadata_xml_raw, + authenticated_client, + deposit_collection, + deposit, + metadata_xml_raw, ) rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa" swhid = "swh:1:rev:%s" % rev_id fake_parent = Deposit( swhid=swhid, client=deposit.client, collection=deposit.collection ) fake_parent.save() deposit.parent = fake_parent deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "raw_metadata": metadata_xml_raw, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], "release_notes": "This is the release of October 7th, 2017.", }, } def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """date(Created|Published) provided, uses author/committer date - - """ + 
"""date(Created|Published) provided, uses author/committer date""" deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}" deposit.save() metadata_xml_raw = atom_dataset["entry-data3"] update_deposit_with_metadata( - authenticated_client, deposit_collection, deposit, metadata_xml_raw, + authenticated_client, + deposit_collection, + deposit, + metadata_xml_raw, ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "raw_metadata": metadata_xml_raw, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], "release_notes": "This is the release of October 7th, 2017.", }, } def test_read_metadata_4( authenticated_client, deposit_collection, atom_dataset, partial_deposit ): - """dateCreated/datePublished not provided, revision uses complete_date - - """ + """dateCreated/datePublished not provided, revision uses complete_date""" deposit = partial_deposit codemeta_entry_data = atom_dataset["metadata"] % "" deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) # will use the 
deposit completed date as fallback date deposit.complete_date = "2016-04-06" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { - "origin": {"type": "deposit", "url": None,}, + "origin": { + "type": "deposit", + "url": None, + }, "raw_metadata": codemeta_entry_data, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "author_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], "release_notes": None, }, } def test_read_metadata_5( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """dateCreated/datePublished provided, revision uses author/committer date If multiple dateCreated provided, the first occurrence (of dateCreated) is selected. If multiple datePublished provided, the first occurrence (of datePublished) is selected. 
""" deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "raw_metadata": codemeta_entry_data, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1428332927}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], "release_notes": None, }, } def test_access_to_nonexisting_deposit_returns_404_response( - authenticated_client, deposit_collection, + authenticated_client, + deposit_collection, ): - """Read unknown collection should return a 404 response - - """ + """Read unknown collection should return a 404 response""" unknown_id = 999 try: Deposit.objects.get(pk=unknown_id) except Deposit.DoesNotExist: assert True for url in private_get_raw_url_endpoints(deposit_collection, unknown_id): response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND msg = "Deposit %s does not 
exist" % unknown_id assert msg in response.content.decode("utf-8") def test_read_metadata_multiple_release_notes( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Private metadata read api to existing deposit should return metadata - - """ + """Private metadata read api to existing deposit should return metadata""" deposit = partial_deposit deposit.external_id = "some-external-id" deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}" deposit.save() metadata_xml_raw = atom_dataset["entry-data-multiple-release-notes"] deposit = update_deposit_with_metadata( - authenticated_client, deposit_collection, deposit, metadata_xml_raw, + authenticated_client, + deposit_collection, + deposit, + metadata_xml_raw, ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response["content-type"] == "application/json" actual_data = response.json() assert actual_data == { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "raw_metadata": metadata_xml_raw, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], "release_notes": ( "This is the release of October 7th, 2017.\n\n" "It fixes some bugs." 
), }, } diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py index 7ac6974b..e859635e 100644 --- a/swh/deposit/tests/api/test_deposit_private_update_status.py +++ b/swh/deposit/tests/api/test_deposit_private_update_status.py @@ -1,198 +1,190 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.api.private.deposit_update_status import MANDATORY_KEYS from swh.deposit.config import ( DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, PRIVATE_PUT_DEPOSIT, ) from swh.deposit.models import Deposit PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_PUT_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]), ] def test_update_deposit_status_success_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): - """Update deposit with load success should require all information to succeed - - """ + """Update deposit with load success should require all information to succeed""" deposit = ready_deposit_verified expected_status = DEPOSIT_STATUS_LOAD_SUCCESS status_detail = "it works!" 
origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" release_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" full_body_info = { "status": DEPOSIT_STATUS_LOAD_SUCCESS, "status_detail": status_detail, "release_id": release_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): expected_swhid = "swh:1:dir:%s" % directory_id expected_swhid_context = ( f"{expected_swhid}" f";origin={origin_url}" f";visit=swh:1:snp:{snapshot_id}" f";anchor=swh:1:rel:{release_id}" f";path=/" ) response = authenticated_client.put( - url, content_type="application/json", data=json.dumps(full_body_info), + url, + content_type="application/json", + data=json.dumps(full_body_info), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == expected_status assert deposit.status_detail == status_detail assert deposit.swhid == expected_swhid assert deposit.swhid_context == expected_swhid_context # Reset deposit deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_rejected_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): - """Update deposit with rejected status needs few information to succeed - - """ + """Update deposit with rejected status needs few information to succeed""" deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_FAILURE}), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE assert deposit.swhid is None assert deposit.swhid_context is None # Reset status deposit = ready_deposit_verified 
deposit.save() def test_update_deposit_status_success_with_incomplete_data( authenticated_client, deposit_collection, ready_deposit_verified ): - """Update deposit status with status success and incomplete information should fail - - """ + """Update deposit status with status success and incomplete information should fail""" deposit = ready_deposit_verified origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" release_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" new_status = DEPOSIT_STATUS_LOAD_SUCCESS full_body_info = { "status": new_status, "release_id": release_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): for key in MANDATORY_KEYS: # Crafting body with missing information so that it raises body = copy.deepcopy(full_body_info) body.pop(key) # make the body incomplete response = authenticated_client.put( - url, content_type="application/json", data=json.dumps(body), + url, + content_type="application/json", + data=json.dumps(body), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( f"deposit status to {new_status} requires information {key}" in response.content.decode("utf-8") ) def test_update_deposit_status_will_fail_with_unknown_status( authenticated_client, deposit_collection, ready_deposit_verified ): - """Unknown status for update should return a 400 response - - """ + """Unknown status for update should return a 400 response""" deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": "unknown"}) ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Possible status in " in response.content def test_update_deposit_status_will_fail_with_no_status_key( authenticated_client, 
deposit_collection, ready_deposit_verified ): - """No status provided for update should return a 400 response - - """ + """No status provided for update should return a 400 response""" deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"something": "something"}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"The status key is mandatory with possible values" in response.content def test_update_deposit_status_success_without_swhid_fail( authenticated_client, deposit_collection, ready_deposit_verified ): - """Providing successful status without swhid should return a 400 - - """ + """Providing successful status without swhid should return a 400""" deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_SUCCESS}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( b"Updating deposit status to done requires information" in response.content ) diff --git a/swh/deposit/tests/api/test_deposit_schedule.py b/swh/deposit/tests/api/test_deposit_schedule.py index caf64d9a..7c617e0f 100644 --- a/swh/deposit/tests/api/test_deposit_schedule.py +++ b/swh/deposit/tests/api/test_deposit_schedule.py @@ -1,139 +1,138 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import datetime from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, SE_IRI, ) from swh.deposit.parsers import parse_xml from 
swh.deposit.utils import NAMESPACES @pytest.fixture() def deposit_config(deposit_config): """Overrides the `deposit_config` fixture define in swh/deposit/tests/conftest.py to re-enable the checks.""" config_d = copy.deepcopy(deposit_config) config_d["checks"] = True return config_d def now() -> datetime.datetime: return datetime.datetime.now(tz=datetime.timezone.utc) def assert_task_for_deposit( swh_scheduler, deposit_id, timestamp_before_call, timestamp_after_call ): tasks = swh_scheduler.grab_ready_tasks("check-deposit") assert len(tasks) == 1 task = tasks[0] assert timestamp_before_call <= task.pop("next_run") <= timestamp_after_call assert task["arguments"] == { "args": [], - "kwargs": {"collection": "test", "deposit_id": deposit_id,}, + "kwargs": { + "collection": "test", + "deposit_id": deposit_id, + }, } assert task["policy"] == "oneshot" assert task["type"] == "check-deposit" assert task["retries_left"] == 3 def test_add_deposit_schedules_check( authenticated_client, deposit_collection, sample_archive, swh_scheduler ): - """Posting deposit by POST Col-IRI creates a checker task - - """ + """Posting deposit by POST Col-IRI creates a checker task""" tasks = swh_scheduler.grab_ready_tasks("check-deposit") assert len(tasks) == 0 external_id = "external-id-schedules-check" url = reverse(COL_IRI, args=[deposit_collection.name]) timestamp_before_call = now() response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) timestamp_after_call = now() assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) actual_state = response_content.findtext( "swh:deposit_status", namespaces=NAMESPACES 
) assert actual_state == DEPOSIT_STATUS_DEPOSITED deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert_task_for_deposit( swh_scheduler, deposit_id, timestamp_before_call, timestamp_after_call ) def test_update_deposit_schedules_check( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, swh_scheduler, ): - """Updating deposit by POST SE-IRI creates a checker task - - """ + """Updating deposit by POST SE-IRI creates a checker task""" deposit = partial_deposit_with_metadata assert deposit.status == DEPOSIT_STATUS_PARTIAL tasks = swh_scheduler.grab_ready_tasks("check-deposit") assert len(tasks) == 0 update_uri = reverse(SE_IRI, args=[deposit_collection.name, deposit.id]) timestamp_before_call = now() response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data="", size=0, HTTP_IN_PROGRESS=False, ) timestamp_after_call = now() assert response.status_code == status.HTTP_200_OK response_content = parse_xml(response.content) actual_state = response_content.findtext( "swh:deposit_status", namespaces=NAMESPACES ) assert actual_state == DEPOSIT_STATUS_DEPOSITED assert deposit.id == int( response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) ) assert_task_for_deposit( swh_scheduler, deposit.id, timestamp_before_call, timestamp_after_call ) diff --git a/swh/deposit/tests/api/test_deposit_state.py b/swh/deposit/tests/api/test_deposit_state.py index beaafdee..f9cf861f 100644 --- a/swh/deposit/tests/api/test_deposit_state.py +++ b/swh/deposit/tests/api/test_deposit_state.py @@ -1,151 +1,141 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from xml.etree import ElementTree from django.urls import reverse_lazy as reverse from rest_framework import status 
from swh.deposit.config import ( DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, STATE_IRI, ) from swh.deposit.models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.utils import NAMESPACES def test_post_deposit_with_status_check(authenticated_client, deposited_deposit): - """Successful but not loaded deposit should have a status 'deposited' - - """ + """Successful but not loaded deposit should have a status 'deposited'""" deposit = deposited_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # check status status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_200_OK r = ElementTree.fromstring(status_response.content) assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id assert ( r.findtext("swh:deposit_status", namespaces=NAMESPACES) == DEPOSIT_STATUS_DEPOSITED ) assert ( r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED] ) assert ( r.findtext("swh:deposit_external_id", namespaces=NAMESPACES) == deposit.external_id ) assert ( r.findtext("swh:deposit_origin_url", namespaces=NAMESPACES) == deposit.origin_url ) def test_status_unknown_deposit(authenticated_client, deposit_collection): - """Unknown deposit status should return 404 response - - """ + """Unknown deposit status should return 404 response""" unknown_deposit_id = 999 status_url = reverse(STATE_IRI, args=[deposit_collection.name, unknown_deposit_id]) status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_404_NOT_FOUND def test_status_unknown_collection(authenticated_client, deposited_deposit): """Unknown collection status should return 404 response""" deposit = deposited_deposit unknown_collection = "something-unknown" status_url = reverse(STATE_IRI, args=[unknown_collection, deposit.id]) status_response = authenticated_client.get(status_url) assert 
status_response.status_code == status.HTTP_404_NOT_FOUND def test_status_deposit_rejected(authenticated_client, rejected_deposit): - """Rejected deposit status should be 'rejected' with detailed summary - - """ + """Rejected deposit status should be 'rejected' with detailed summary""" deposit = rejected_deposit # _status_detail = {'url': {'summary': 'Wrong url'}} url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert status_response.status_code == status.HTTP_200_OK r = ElementTree.fromstring(status_response.content) assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id assert ( r.findtext("swh:deposit_status", namespaces=NAMESPACES) == DEPOSIT_STATUS_REJECTED ) assert ( r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == "Deposit failed the checks" ) if deposit.swhid: assert r.findtext("swh:deposit_swhid", namespaces=NAMESPACES) == deposit.swhid def test_status_with_http_accept_header_should_not_break( authenticated_client, partial_deposit ): - """Asking deposit status with Accept header should return 200 - - """ + """Asking deposit status with Accept header should return 200""" deposit = partial_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) response = authenticated_client.get(status_url) assert response.status_code == status.HTTP_200_OK response = authenticated_client.get( status_url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8" ) assert response.status_code == status.HTTP_200_OK def test_status_complete_deposit(authenticated_client, complete_deposit): - """Successful and loaded deposit should be 'done' and have detailed swh ids - - """ + """Successful and loaded deposit should be 'done' and have detailed swh ids""" deposit = complete_deposit url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert 
status_response.status_code == status.HTTP_200_OK r = ElementTree.fromstring(status_response.content) assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id assert ( r.findtext("swh:deposit_status", namespaces=NAMESPACES) == DEPOSIT_STATUS_LOAD_SUCCESS ) assert ( r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS] ) assert deposit.swhid is not None assert r.findtext("swh:deposit_swh_id", namespaces=NAMESPACES) == deposit.swhid assert deposit.swhid_context is not None assert ( r.findtext("swh:deposit_swh_id_context", namespaces=NAMESPACES) == deposit.swhid_context ) assert ( r.findtext("swh:deposit_origin_url", namespaces=NAMESPACES) == deposit.origin_url ) diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py index 828c9d8a..ed0248c7 100644 --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,140 +1,144 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests updates on SE-IRI.""" from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import ( DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, EDIT_IRI, SE_IRI, ) from swh.deposit.models import Deposit, DepositRequest from swh.deposit.tests.common import post_atom, post_multipart, put_multipart def test_add_both_archive_and_metadata_to_deposit( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, sample_archive, deposit_user, ): """Scenario: Add both a new archive and new metadata to a partial deposit is ok Response: 201 """ deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id requests = 
DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 data_atom_entry = atom_dataset["entry-data1"] response = post_multipart( authenticated_client, reverse(SE_IRI, args=[deposit_collection.name, deposit.id]), sample_archive, data_atom_entry, ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 1 + 1, "New deposit request archive got added" expected_raw_meta0 = atom_dataset["entry-data0"] % origin_url # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == data_atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 + 1, "New deposit request metadata got added" def test_post_metadata_empty_post_finalize_deposit_ok( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, ): """Empty atom post entry with header in-progress to false transitions deposit to 'deposited' status Response: 200 """ deposit = partial_deposit_with_metadata assert deposit.status == DEPOSIT_STATUS_PARTIAL update_uri = reverse(SE_IRI, args=[deposit_collection.name, deposit.id]) response = post_atom( - authenticated_client, update_uri, data="", size=0, HTTP_IN_PROGRESS=False, + authenticated_client, + update_uri, + data="", + size=0, + HTTP_IN_PROGRESS=False, ) assert response.status_code == status.HTTP_200_OK deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED def test_put_update_metadata_and_archive_deposit_partial_nominal( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, sample_archive, deposit_user, ): """Scenario: Replace metadata and 
archive(s) with new ones should be ok Response: 204 """ # given deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id raw_metadata0 = atom_dataset["entry-data0"] % origin_url requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 data_atom_entry = atom_dataset["entry-data1"] response = put_multipart( authenticated_client, reverse(EDIT_IRI, args=[deposit_collection.name, deposit.id]), sample_archive, data_atom_entry, ) assert response.status_code == status.HTTP_204_NO_CONTENT # check we updated the metadata part requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == data_atom_entry assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # and the archive part requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) != set(requests_archive1) diff --git a/swh/deposit/tests/api/test_deposit_update_atom.py b/swh/deposit/tests/api/test_deposit_update_atom.py index 4a243cf9..4809d982 100644 --- a/swh/deposit/tests/api/test_deposit_update_atom.py +++ b/swh/deposit/tests/api/test_deposit_update_atom.py @@ -1,592 +1,594 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.api.common import 
ACCEPT_ARCHIVE_CONTENT_TYPES from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, EDIT_IRI, EM_IRI, SE_IRI, APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom, put_atom from swh.deposit.utils import NAMESPACES from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) from swh.model.swhids import CoreSWHID, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult def test_post_deposit_atom_entry_multiple_steps( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """After initial deposit, updating a deposit should return a 201 - - """ + """After initial deposit, updating a deposit should return a 201""" # given origin_url = deposit_user.provider_url + "2225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): deposit = Deposit.objects.get(origin_url=origin_url) # when response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data1"], HTTP_IN_PROGRESS="True", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url is None # not provided yet assert deposit.status == "partial" # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 1 atom_entry_data = atom_dataset["entry-only-create-origin"] % (origin_url) se_iri = response_content.find( "atom:link[@rel='http://purl.org/net/sword/terms/add']", namespaces=NAMESPACES ).attrib["href"] # when updating the first deposit post response = 
post_atom( - authenticated_client, se_iri, data=atom_entry_data, HTTP_IN_PROGRESS="False", + authenticated_client, + se_iri, + data=atom_entry_data, + HTTP_IN_PROGRESS="False", ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() response_content = parse_xml(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.origin_url == origin_url assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert len(Deposit.objects.all()) == 1 # now 2 associated requests to a same deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id") assert len(deposit_requests) == 2 atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ atom_entry_data1, atom_entry_data, ] for i, deposit_request in enumerate(deposit_requests): assert deposit_request.raw_metadata == expected_meta[i] assert bool(deposit_request.archive) is False def test_replace_metadata_to_deposit_is_possible( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, atom_dataset, deposit_user, ): - """Replace all metadata with another one should return a 204 response - - """ + """Replace all metadata with another one should return a 204 response""" # given deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id raw_metadata0 = atom_dataset["entry-data0"] % origin_url requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, deposit.id]) response = put_atom( - authenticated_client, update_uri, 
data=atom_dataset["entry-data1"], + authenticated_client, + update_uri, + data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_204_NO_CONTENT requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_deposit_is_possible( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, deposit_user, ): - """Add metadata with another one should return a 204 response - - """ + """Add metadata with another one should return a 204 response""" deposit = partial_deposit_with_metadata origin_url = deposit_user.provider_url + deposit.external_id requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(SE_IRI, args=[deposit_collection.name, deposit.id]) atom_entry = atom_dataset["entry-data1"] response = post_atom(authenticated_client, update_uri, data=atom_entry) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 2 expected_raw_meta0 = atom_dataset["entry-data0"] % origin_url # a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert 
len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_unknown_deposit( deposit_collection, authenticated_client, atom_dataset ): - """Replacing metadata to unknown deposit should return a 404 response - - """ + """Replacing metadata to unknown deposit should return a 404 response""" unknown_deposit_id = 1000 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(SE_IRI, args=[deposit_collection, unknown_deposit_id]) - response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) + response = post_atom( + authenticated_client, + url, + data=atom_dataset["entry-data1"], + ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Deposit 1000 does not exist" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) def test_add_metadata_to_unknown_collection( partial_deposit, authenticated_client, atom_dataset ): - """Replacing metadata to unknown deposit should return a 404 response - - """ + """Replacing metadata to unknown deposit should return a 404 response""" deposit = partial_deposit unknown_collection_name = "unknown-collection" try: DepositCollection.objects.get(name=unknown_collection_name) except DepositCollection.DoesNotExist: assert True url = reverse(SE_IRI, args=[unknown_collection_name, deposit.id]) - response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) + response = post_atom( + authenticated_client, + url, + data=atom_dataset["entry-data1"], + ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content.findtext( "atom:summary", namespaces=NAMESPACES ) def test_replace_metadata_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): - """Adding metadata to unknown deposit should return a 404 response - - """ + 
"""Adding metadata to unknown deposit should return a 404 response""" unknown_deposit_id = 998 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_IRI, args=[deposit_collection.name, unknown_deposit_id]) - response = put_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) + response = put_atom( + authenticated_client, + url, + data=atom_dataset["entry-data1"], + ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( response_content.findtext("atom:summary", namespaces=NAMESPACES) == "Deposit %s does not exist" % unknown_deposit_id ) def test_post_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Update (POST) archive with wrong content type should return 400 - - """ + """Update (POST) archive with wrong content type should return 400""" deposit = partial_deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/x-gtar-compressed", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Packaging format supported is restricted" in response.content for supported_format in ACCEPT_ARCHIVE_CONTENT_TYPES: assert supported_format.encode() in response.content def test_put_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): - """Update (PUT) archive with wrong content type should return 400 - - """ + """Update (PUT) archive with wrong content type should return 400""" # given deposit = partial_deposit # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = put_atom( - authenticated_client, update_uri, data=atom_dataset["entry-data1"], + authenticated_client, + update_uri, + data=atom_dataset["entry-data1"], ) # then assert response.status_code == 
status.HTTP_400_BAD_REQUEST assert b"Packaging format supported is restricted" in response.content for supported_format in ACCEPT_ARCHIVE_CONTENT_TYPES: assert supported_format.encode() in response.content def test_put_update_metadata_done_deposit_nominal( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, sample_data, swh_storage, ): """Nominal scenario, client send an update of metadata on a deposit with status "done" - with an existing swhid. Such swhid has its metadata updated accordingly both in - the deposit backend and in the metadata storage. + with an existing swhid. Such swhid has its metadata updated accordingly both in + the deposit backend and in the metadata storage. - Response: 204 + Response: 204 """ deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid) assert deposit_swhid.object_type == ObjectType.DIRECTORY directory_id = hash_to_bytes(deposit_swhid.object_id) # directory targeted by the complete_deposit does not exist in the storage assert list(swh_storage.directory_missing([directory_id])) == [directory_id] # so let's create a directory reference in the storage (current deposit targets an # unknown swhid) existing_directory = sample_data.directory swh_storage.directory_add([existing_directory]) assert list(swh_storage.directory_missing([existing_directory.id])) == [] # and patch one complete deposit swhid so it targets said reference complete_deposit.swhid = str(existing_directory.swhid()) complete_deposit.save() actual_existing_requests_archive = DepositRequest.objects.filter( deposit=complete_deposit, type="archive" ) nb_archives = len(actual_existing_requests_archive) actual_existing_requests_metadata = DepositRequest.objects.filter( deposit=complete_deposit, type="metadata" ) nb_metadata = len(actual_existing_requests_metadata) update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], 
HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_204_NO_CONTENT new_requests_meta = DepositRequest.objects.filter( deposit=complete_deposit, type="metadata" ) assert len(new_requests_meta) == nb_metadata + 1 request_meta1 = new_requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter( deposit=complete_deposit, type="archive" ) assert len(requests_archive1) == nb_archives assert set(actual_existing_requests_archive) == set(requests_archive1) # Ensure metadata stored in the metadata storage is consistent metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=complete_deposit.client.provider_url, ) actual_authority = swh_storage.metadata_authority_get( MetadataAuthorityType.DEPOSIT_CLIENT, url=complete_deposit.client.provider_url ) assert actual_authority == metadata_authority config = APIConfig() metadata_fetcher = MetadataFetcher( - name=config.tool["name"], version=config.tool["version"], + name=config.tool["name"], + version=config.tool["version"], ) actual_fetcher = swh_storage.metadata_fetcher_get( config.tool["name"], config.tool["version"] ) assert actual_fetcher == metadata_fetcher directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid) page_results = swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority ) assert page_results == PagedResult( results=[ RawExtrinsicMetadata( target=directory_swhid, discovery_date=request_meta1.date, authority=metadata_authority, fetcher=metadata_fetcher, format="sword-v2-atom-codemeta", metadata=raw_metadata1.encode(), origin=complete_deposit.origin_url, ) ], next_page_token=None, ) def test_put_update_metadata_done_deposit_failure_mismatched_swhid( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates 
metadata on deposit with SWHID not matching the deposit's. - Response: 400 + Response: 400 """ incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" assert complete_deposit.swhid != incorrect_swhid update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data1"], HTTP_X_CHECK_SWHID=incorrect_swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Mismatched provided SWHID" in response.content def test_put_update_metadata_done_deposit_failure_malformed_xml( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done with a malformed xml - Response: 400 + Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, data=atom_dataset["entry-data-ko"], HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Malformed xml metadata" in response.content def test_put_update_metadata_done_deposit_failure_empty_xml( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done with an empty xml. 
- Response: 400 + Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) atom_content = atom_dataset["entry-data-empty-body"] response = put_atom( authenticated_client, update_uri, data=atom_content, HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Empty body request is not supported" in response.content def test_put_update_metadata_done_deposit_failure_functional_checks( tmp_path, authenticated_client, complete_deposit, deposit_collection, atom_dataset, swh_storage, ): """failure: client updates metadata on deposit done without required incomplete metadata - Response: 400 + Response: 400 """ update_uri = reverse(EDIT_IRI, args=[deposit_collection.name, complete_deposit.id]) response = put_atom( authenticated_client, update_uri, # no title, nor author, nor name fields data=atom_dataset["entry-data-fail-metadata-functional-checks"], HTTP_X_CHECK_SWHID=complete_deposit.swhid, ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert b"Functional metadata checks failure" in response.content # detail on the errors msg = ( b"- Mandatory fields are missing (" b"atom:name or atom:title or codemeta:name, " b"atom:author or codemeta:author)" ) assert msg in response.content def test_put_atom_with_create_origin_and_external_identifier( authenticated_client, deposit_collection, atom_dataset, deposit_user ): """ was deprecated before was introduced, clients should get an error when trying to use both """ external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) response = post_atom( authenticated_client, url, data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) edit_iri = response_content.find( "atom:link[@rel='edit']", namespaces=NAMESPACES ).attrib["href"] # when 
response = put_atom( authenticated_client, edit_iri, data=atom_dataset["error-with-external-identifier"] % external_id, HTTP_IN_PROGRESS="false", ) assert b"<external_identifier> is deprecated" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_put_atom_with_create_origin_and_reference( authenticated_client, deposit_collection, atom_dataset, deposit_user ): - """ and are mutually exclusive - - """ + """ and are mutually exclusive""" external_id = "foobar" origin_url = deposit_user.provider_url + external_id url = reverse(COL_IRI, args=[deposit_collection.name]) response = post_atom( authenticated_client, url, data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) edit_iri = response_content.find( "atom:link[@rel='edit']", namespaces=NAMESPACES ).attrib["href"] # when response = put_atom( authenticated_client, edit_iri, data=atom_dataset["entry-data-with-origin-reference"].format(url=origin_url), HTTP_IN_PROGRESS="false", ) assert b"only one may be used on a given deposit" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_update_binary.py b/swh/deposit/tests/api/test_deposit_update_binary.py index 22e97240..e1c7a73f 100644 --- a/swh/deposit/tests/api/test_deposit_update_binary.py +++ b/swh/deposit/tests/api/test_deposit_update_binary.py @@ -1,428 +1,425 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests updates on EM-IRI""" from io import BytesIO import xml.etree.ElementTree as ET from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse_lazy as reverse from rest_framework import status from 
swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, EM_IRI, SE_IRI from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( check_archive, create_arborescence_archive, post_archive, post_atom, put_archive, put_atom, ) from swh.deposit.utils import NAMESPACES def test_post_deposit_binary_and_post_to_add_another_archive( authenticated_client, deposit_collection, sample_archive, tmp_path ): - """Updating a deposit should return a 201 with receipt - - """ + """Updating a deposit should return a 201 with receipt""" tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = post_archive( authenticated_client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="true", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit assert deposit_request.type == "archive" check_archive(sample_archive["name"], deposit_request.archive.name) # 2nd archive to upload archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # uri to update the content update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) # adding another archive for the deposit and finalizing it response = post_archive( - authenticated_client, update_uri, archive2, HTTP_SLUG=external_id, + authenticated_client, + update_uri, + archive2, + HTTP_SLUG=external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = 
parse_xml(response.content) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_requests = list( DepositRequest.objects.filter(deposit=deposit).order_by("id") ) # 2 deposit requests for the same deposit assert len(deposit_requests) == 2 assert deposit_requests[0].deposit == deposit assert deposit_requests[0].type == "archive" check_archive(sample_archive["name"], deposit_requests[0].archive.name) assert deposit_requests[1].deposit == deposit assert deposit_requests[1].type == "archive" check_archive(archive2["name"], deposit_requests[1].archive.name) # only 1 deposit in db deposits = Deposit.objects.all() assert len(deposits) == 1 def test_replace_archive_to_deposit_is_possible( tmp_path, partial_deposit, deposit_collection, authenticated_client, sample_archive, atom_dataset, ): - """Replace all archive with another one should return a 204 response - - """ + """Replace all archive with another one should return a 204 response""" tmp_path = str(tmp_path) # given deposit = partial_deposit requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(sample_archive["name"], requests[0].archive.name) # we have no metadata for that deposit requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 0 response = post_atom( authenticated_client, reverse(SE_IRI, args=[deposit_collection.name, deposit.id]), data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" 
) response = put_archive( authenticated_client, update_uri, archive2, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_204_NO_CONTENT requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(archive2["name"], requests[0].archive.name) # check we did not touch the other parts requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 def test_add_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): - """Adding metadata to unknown deposit should return a 404 response - - """ + """Adding metadata to unknown deposit should return a 404 response""" unknown_deposit_id = 997 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( response_content.findtext("atom:summary", namespaces=NAMESPACES) == "Deposit %s does not exist" % unknown_deposit_id ) def test_replace_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): - """Replacing archive to unknown deposit should return a 404 response - - """ + """Replacing archive to unknown deposit should return a 404 response""" unknown_deposit_id = 996 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( response_content.findtext("atom:summary", namespaces=NAMESPACES) 
== "Deposit %s does not exist" % unknown_deposit_id ) def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, deposit_collection, partial_deposit_with_metadata, sample_archive, ): - """Add another archive to a deposit return a 201 response - - """ + """Add another archive to a deposit return a 201 response""" tmp_path = str(tmp_path) deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests) == 1 check_archive(sample_archive["name"], requests[0].archive.name) requests_meta0 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta0) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = post_archive( authenticated_client, update_uri, archive2, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="archive").order_by( "id" ) assert len(requests) == 2 # first archive still exists check_archive(sample_archive["name"], requests[0].archive.name) # a new one was added check_archive(archive2["name"], requests[1].archive.name) # check we did not touch the other parts requests_meta1 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta1) == 1 assert set(requests_meta0) == set(requests_meta1) def test_post_deposit_then_update_refused( authenticated_client, deposit_collection, sample_archive, atom_dataset, tmp_path ): - """Updating a deposit with status 'ready' should return a 400 - - """ + """Updating a deposit with status 'ready' should return a 400""" tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = post_archive( authenticated_client, url, 
sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swhid is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit check_archive(sample_archive["name"], deposit_request.archive.name) # updating/adding is forbidden # uri to update the content edit_iri = reverse("edit_iri", args=[deposit_collection.name, deposit_id]) se_iri = reverse("se_iri", args=[deposit_collection.name, deposit_id]) em_iri = reverse("em_iri", args=[deposit_collection.name, deposit_id]) # Testing all update/add endpoint should fail # since the status is ready archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file 2" ) # replacing file is no longer possible since the deposit's # status is ready r = put_archive( authenticated_client, em_iri, archive2, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) # adding file is no longer possible since the deposit's status # is ready r = post_archive( authenticated_client, em_iri, archive2, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) # replacing metadata is no longer possible since the deposit's # status is ready r = put_atom( authenticated_client, edit_iri, 
data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) # adding new metadata is no longer possible since the # deposit's status is ready r = post_atom( authenticated_client, se_iri, data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/zip", size=len(archive_content), charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(atom_dataset["entry-data-deposit-binary"].encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(atom_dataset["entry-data-deposit-binary"]), charset="utf-8", ) # replacing multipart metadata is no longer possible since the # deposit's status is ready r = authenticated_client.put( edit_iri, format="multipart", - data={"archive": archive, "atom_entry": atom_entry,}, + data={ + "archive": archive, + "atom_entry": atom_entry, + }, ) assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) # adding new metadata is no longer possible since the # deposit's status is ready r = authenticated_client.post( se_iri, format="multipart", - data={"archive": archive, "atom_entry": atom_entry,}, + data={ + "archive": archive, + "atom_entry": atom_entry, + }, ) 
assert r.status_code == status.HTTP_400_BAD_REQUEST assert ( ET.fromstring(r.content).findtext("atom:summary", namespaces=NAMESPACES) == "You can only act on deposit with status 'partial'" ) diff --git a/swh/deposit/tests/api/test_exception.py b/swh/deposit/tests/api/test_exception.py index f1ea1e8c..8d106887 100644 --- a/swh/deposit/tests/api/test_exception.py +++ b/swh/deposit/tests/api/test_exception.py @@ -1,52 +1,48 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.db.utils import OperationalError from rest_framework.exceptions import APIException from rest_framework.response import Response from swh.deposit.exception import custom_exception_handler def test_custom_exception_handler_operational_error(mocker): - """Operation error are translated to service unavailable - - """ + """Operation error are translated to service unavailable""" fake_exception = OperationalError("Fake internal error", 503) response = custom_exception_handler(fake_exception, {}) assert response is not None assert response.status_code == 503 status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." 
assert ( response.content.decode("utf-8") == f""" {status} {detail} """ ) def test_custom_exception_handler_default_behavior_maintained(mocker): - """Other internal errors are transmitted as is - - """ + """Other internal errors are transmitted as is""" fake_exception = APIException("Fake internal error", 500) fake_response = Response( exception=fake_exception, status=fake_exception.status_code ) mock_exception_handler = mocker.patch("rest_framework.views.exception_handler") mock_exception_handler.return_value = fake_response response = custom_exception_handler(fake_exception, {}) assert response is not None assert response == fake_response diff --git a/swh/deposit/tests/api/test_get_file.py b/swh/deposit/tests/api/test_get_file.py index dbaf64d3..186fc2ba 100644 --- a/swh/deposit/tests/api/test_get_file.py +++ b/swh/deposit/tests/api/test_get_file.py @@ -1,67 +1,63 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Tests 'GET File-IRI'.""" import datetime from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import CONT_FILE_IRI from swh.deposit.models import DEPOSIT_STATUS_DETAIL from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES def test_api_deposit_content_nominal( authenticated_client, complete_deposit, partial_deposit_only_metadata ): - """Retrieve information on deposit should return 200 response - - """ + """Retrieve information on deposit should return 200 response""" now = datetime.datetime.now(tz=datetime.timezone.utc) for deposit in [complete_deposit, partial_deposit_only_metadata]: url = reverse(CONT_FILE_IRI, args=[deposit.collection.name, deposit.id]) response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK actual_deposit = 
parse_xml(response.content) assert actual_deposit.findtext("swh:deposit_id", namespaces=NAMESPACES) == str( deposit.id ) assert ( actual_deposit.findtext("swh:deposit_status", namespaces=NAMESPACES) == deposit.status ) assert ( actual_deposit.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == DEPOSIT_STATUS_DETAIL[deposit.status] ) assert ( now - datetime.timedelta(hours=1) <= datetime.datetime.fromisoformat( actual_deposit.findtext("swh:deposit_date", namespaces=NAMESPACES) ) <= now ) def test_api_deposit_content_unknown( authenticated_client, complete_deposit, deposit_collection ): - """Retrieve information on unknown deposit or collection should return 404 - - """ + """Retrieve information on unknown deposit or collection should return 404""" unknown_deposit_id = 999 unknown_collection = "unknown" for collection, deposit_id in [ (deposit_collection.name, unknown_deposit_id), (unknown_collection, complete_deposit.id), (complete_deposit.collection.name, complete_deposit.id + 10), ]: url = reverse(CONT_FILE_IRI, args=[collection, deposit_id]) response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND diff --git a/swh/deposit/tests/api/test_keycloak_auth.py b/swh/deposit/tests/api/test_keycloak_auth.py index cdb834e9..5278d402 100644 --- a/swh/deposit/tests/api/test_keycloak_auth.py +++ b/swh/deposit/tests/api/test_keycloak_auth.py @@ -1,35 +1,33 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from django.urls import reverse_lazy as reverse import pytest from swh.auth.keycloak import KeycloakError from swh.deposit.config import SD_IRI from swh.deposit.tests.conftest import mock_keycloakopenidconnect @pytest.fixture def mock_keycloakopenidconnect_ko(mocker, keycloak_mock_auth_failure): error = { "error": 
"unknown_error", # does not help much but that can happen } error_message = json.dumps(error).encode() keycloak_mock_auth_failure.login.side_effect = KeycloakError( error_message=error_message, response_code=401 ) return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_failure) def test_keycloak_failure_service_document(unauthorized_client): - """With authentication failure without detail, exception is returned correctly - - """ + """With authentication failure without detail, exception is returned correctly""" url = reverse(SD_IRI) response = unauthorized_client.get(url) assert response.status_code == 401 assert b"unknown_error" in response.content diff --git a/swh/deposit/tests/api/test_service_document.py b/swh/deposit/tests/api/test_service_document.py index 0c5d69cb..1a2bccee 100644 --- a/swh/deposit/tests/api/test_service_document.py +++ b/swh/deposit/tests/api/test_service_document.py @@ -1,82 +1,74 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.config import SD_IRI def test_service_document_no_auth_fails(client): - """Without authentication, service document endpoint should return 401 - - """ + """Without authentication, service document endpoint should return 401""" url = reverse(SD_IRI) response = client.get(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED def test_service_document_no_auth_with_http_auth_should_not_break(client): - """Without auth, sd endpoint through browser should return 401 - - """ + """Without auth, sd endpoint through browser should return 401""" url = reverse(SD_IRI) response = client.get(url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8") assert response.status_code == status.HTTP_401_UNAUTHORIZED def 
test_service_document(authenticated_client): - """With authentication, service document list user's collection - - """ + """With authentication, service document list user's collection""" url = reverse(SD_IRI) response = authenticated_client.get(url) check_response(response, authenticated_client.deposit_client.username) def test_service_document_with_http_accept_header(authenticated_client): - """With authentication, with browser, sd list user's collection - - """ + """With authentication, with browser, sd list user's collection""" url = reverse(SD_IRI) response = authenticated_client.get( url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8" ) check_response(response, authenticated_client.deposit_client.username) def check_response(response, username): assert response.status_code == status.HTTP_200_OK, f"Response: {response.content}" assert ( response.content.decode("utf-8") == """ 2.0 %s The Software Heritage (SWH) Archive %s Software Collection application/zip application/x-tar Collection Policy Software Heritage Archive Collect, Preserve, Share false false http://purl.org/net/sword/package/SimpleZip http://testserver/1/%s/ %s """ % (5000, username, username, username, username) ) # noqa diff --git a/swh/deposit/tests/cli/test_admin.py b/swh/deposit/tests/cli/test_admin.py index 69860ba9..af576c69 100644 --- a/swh/deposit/tests/cli/test_admin.py +++ b/swh/deposit/tests/cli/test_admin.py @@ -1,319 +1,353 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.deposit.cli.admin import admin as cli from swh.deposit.config import ( DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_VERIFIED, ) from swh.deposit.models import DepositClient, DepositCollection from swh.scheduler.utils import create_oneshot_task_dict 
@pytest.fixture(autouse=True) def enable_db_access_for_all_tests(db): pass def test_cli_admin_user_list_nothing(cli_runner): - result = cli_runner.invoke(cli, ["user", "list",]) + result = cli_runner.invoke( + cli, + [ + "user", + "list", + ], + ) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == "Empty user list\n" def test_cli_admin_user_list_with_users(cli_runner, deposit_user): - result = cli_runner.invoke(cli, ["user", "list",]) + result = cli_runner.invoke( + cli, + [ + "user", + "list", + ], + ) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == f"{deposit_user.username}\n" # only 1 user def test_cli_admin_collection_list_nothing(cli_runner): - result = cli_runner.invoke(cli, ["collection", "list",]) + result = cli_runner.invoke( + cli, + [ + "collection", + "list", + ], + ) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == "Empty collection list\n" def test_cli_admin_collection_list_with_collections(cli_runner, deposit_collection): from swh.deposit.tests.conftest import create_deposit_collection new_collection = create_deposit_collection("something") - result = cli_runner.invoke(cli, ["collection", "list",]) + result = cli_runner.invoke( + cli, + [ + "collection", + "list", + ], + ) assert result.exit_code == 0, f"Unexpected output: {result.output}" collections = "\n".join([deposit_collection.name, new_collection.name]) assert result.output == f"{collections}\n" def test_cli_admin_user_exists_unknown(cli_runner): result = cli_runner.invoke(cli, ["user", "exists", "unknown"]) assert result.exit_code == 1, f"Unexpected output: {result.output}" assert result.output == "User unknown does not exist.\n" def test_cli_admin_user_exists(cli_runner, deposit_user): result = cli_runner.invoke(cli, ["user", "exists", deposit_user.username]) assert result.exit_code == 0, f"Unexpected output: {result.output}" assert result.output == f"User 
{deposit_user.username} exists.\n" def test_cli_admin_create_collection(cli_runner): collection_name = "something" try: DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: pass result = cli_runner.invoke( - cli, ["collection", "create", "--name", collection_name,] + cli, + [ + "collection", + "create", + "--name", + collection_name, + ], ) assert result.exit_code == 0, f"Unexpected output: {result.output}" collection = DepositCollection.objects.get(name=collection_name) assert collection is not None assert ( result.output == f"""Create collection '{collection_name}'. Collection '{collection_name}' created. """ ) result2 = cli_runner.invoke( - cli, ["collection", "create", "--name", collection_name,] + cli, + [ + "collection", + "create", + "--name", + collection_name, + ], ) assert result2.exit_code == 0, f"Unexpected output: {result.output}" assert ( result2.output == f"""Collection '{collection_name}' exists, skipping. """ ) def test_cli_admin_user_create(cli_runner): user_name = "user" collection_name = user_name try: DepositClient.objects.get(username=user_name) except DepositClient.DoesNotExist: pass try: DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: pass result = cli_runner.invoke( - cli, ["user", "create", "--username", user_name, "--password", "password",] + cli, + [ + "user", + "create", + "--username", + user_name, + "--password", + "password", + ], ) assert result.exit_code == 0, f"Unexpected output: {result.output}" user = DepositClient.objects.get(username=user_name) assert user is not None collection = DepositCollection.objects.get(name=collection_name) assert collection is not None assert ( result.output == f"""Create collection '{user_name}'. Collection '{collection_name}' created. Create user '{user_name}'. User '{user_name}' created. 
""" ) assert collection.name == collection_name assert user.username == user_name first_password = user.password assert first_password is not None assert user.collections == [collection.id] assert user.is_active is True assert user.domain == "" assert user.provider_url == "" assert user.email == "" assert user.first_name == "" assert user.last_name == "" # create a user that already exists result2 = cli_runner.invoke( cli, [ "user", "create", "--username", "user", "--password", "another-password", # changing password "--collection", collection_name, # specifying the collection this time "--firstname", "User", "--lastname", "no one", "--email", "user@org.org", "--provider-url", "http://some-provider.org", "--domain", "domain", ], ) assert result2.exit_code == 0, f"Unexpected output: {result2.output}" user = DepositClient.objects.get(username=user_name) assert user is not None assert user.username == user_name assert user.collections == [collection.id] assert user.is_active is True second_password = user.password assert second_password is not None # For the transition period, we can choose either basic or keycloak so we need to be # able to still define a password (basic), so there it's updated. assert second_password != first_password, "Password changed" assert user.domain == "domain" assert user.provider_url == "http://some-provider.org" assert user.email == "user@org.org" assert user.first_name == "User" assert user.last_name == "no one" assert ( result2.output == f"""Collection '{collection_name}' exists, skipping. Update user '{user_name}'. User '{user_name}' updated. 
""" ) def test_cli_admin_reschedule_unknown_deposit(cli_runner): - """Rescheduling unknown deposit should report failure - - """ + """Rescheduling unknown deposit should report failure""" unknown_deposit_id = 666 from swh.deposit.models import Deposit try: Deposit.objects.get(id=unknown_deposit_id) except Deposit.DoesNotExist: pass result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", unknown_deposit_id] ) assert result.output == f"Deposit {unknown_deposit_id} does not exist.\n" assert result.exit_code == 1 def test_cli_admin_reschedule_verified_deposit(cli_runner, complete_deposit): - """Rescheduling verified deposit should do nothing but report - - """ + """Rescheduling verified deposit should do nothing but report""" deposit = complete_deposit deposit.status = "verified" deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == f"Deposit {deposit.id} already set for rescheduling.\n" assert result.exit_code == 0 @pytest.mark.parametrize( "status_to_check", [DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_DEPOSITED] ) def test_cli_admin_reschedule_unaccepted_deposit_status( status_to_check, cli_runner, complete_deposit ): - """Rescheduling verified deposit should do nothing but report - - """ + """Rescheduling verified deposit should do nothing but report""" deposit = complete_deposit deposit.status = status_to_check # not accepted status will fail the check deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == ( f"Deposit {deposit.id} cannot be rescheduled (status: {deposit.status}).\n" "Rescheduling deposit is only accepted for deposit with status: done, failed.\n" ) assert result.exit_code == 1 def test_cli_admin_reschedule_missing_task_id(cli_runner, complete_deposit): - """Rescheduling deposit with no load_task_id cannot work. 
- - """ + """Rescheduling deposit with no load_task_id cannot work.""" deposit = complete_deposit deposit.load_task_id = "" # drop the load-task-id so it fails the check deposit.save() result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.output == ( f"Deposit {deposit.id} cannot be rescheduled. It misses the " "associated scheduler task id (field load_task_id).\n" ) assert result.exit_code == 1 def test_cli_admin_reschedule_nominal(cli_runner, complete_deposit, swh_scheduler): - """Rescheduling deposit with no load_task_id cannot work. - - """ + """Rescheduling deposit with no load_task_id cannot work.""" deposit = complete_deposit from swh.deposit.models import Deposit # create a task to keep a reference on it task = create_oneshot_task_dict( "load-deposit", url=deposit.origin_url, deposit_id=deposit.id, retries_left=3 ) scheduled_task = swh_scheduler.create_tasks([task])[0] # disable it swh_scheduler.set_status_tasks([scheduled_task["id"]], status="disabled") # Now update the deposit state with some swhid and relevant load_task_id deposit = complete_deposit deposit.load_task_id = scheduled_task["id"] deposit.swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74" deposit.swhid_context = f"{deposit.swhid};origin=https://url/external-id" deposit.save() # Reschedule it result = cli_runner.invoke( cli, ["deposit", "reschedule", "--deposit-id", deposit.id] ) assert result.exit_code == 0 # Now, ensure the deposit and the associated task are in the right shape deposit = Deposit.objects.get(id=deposit.id) # got reset to a state which allows rescheduling assert deposit.id assert deposit.swhid is None assert deposit.swhid_context is None assert deposit.status == DEPOSIT_STATUS_VERIFIED task = swh_scheduler.search_tasks(task_id=deposit.load_task_id)[0] assert task["status"] == "next_run_not_scheduled" diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index fd753b7d..a1699601 100644 
--- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,1176 +1,1170 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import ast import contextlib import json import logging import os from typing import Optional from unittest.mock import MagicMock from xml.etree import ElementTree import pytest import yaml from swh.deposit.api.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, ) from swh.deposit.cli import deposit as cli from swh.deposit.cli.client import InputError, _collection, _url, generate_metadata from swh.deposit.client import ( BaseDepositClient, MaintenanceError, PublicApiDepositClient, ServiceDocumentDepositClient, ) from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES from swh.model.exceptions import ValidationError from ..conftest import TEST_USER def generate_slug() -> str: - """Generate a slug (sample purposes). 
- - """ + """Generate a slug (sample purposes).""" import uuid return str(uuid.uuid4()) @pytest.fixture def datadir(request): """Override default datadir to target main test datadir""" return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def slug(): return generate_slug() @pytest.fixture def patched_tmp_path(tmp_path, mocker): mocker.patch( "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) return tmp_path @pytest.fixture def client_mock_api_down(mocker, slug): - """A mock client whose connection with api fails due to maintenance issue - - """ + """A mock client whose connection with api fails due to maintenance issue""" mock_client = MagicMock() mocker.patch("swh.deposit.client.PublicApiDepositClient", return_value=mock_client) mock_client.service_document.side_effect = MaintenanceError( "Database backend maintenance: Temporarily unavailable, try again later." ) return mock_client def test_cli_url(): assert _url("http://deposit") == "http://deposit/1" assert _url("https://other/1") == "https://other/1" def test_cli_collection_error(): mock_client = MagicMock() mock_client.service_document.return_value = {"error": "something went wrong"} with pytest.raises(InputError) as e: _collection(mock_client) assert "Service document retrieval: something went wrong" == str(e.value) def test_cli_collection_ok(requests_mock_datadir): client = PublicApiDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) collection_name = _collection(client) assert collection_name == "test" def test_cli_collection_ko_because_downtime(): mock_client = MagicMock() mock_client.service_document.side_effect = MaintenanceError("downtime") with pytest.raises(MaintenanceError, match="downtime"): _collection(mock_client) def test_cli_upload_conflictual_flags( - datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, + datadir, + requests_mock_datadir, + cli_runner, + atom_dataset, + tmp_path, ): - """Post 
metadata-only deposit through cli with invalid swhid raises - - """ + """Post metadata-only deposit through cli with invalid swhid raises""" api_url_basename = "deposit.test.metadataonly" metadata = atom_dataset["entry-data-minimal"] metadata_path = os.path.join(tmp_path, "entry-data-minimal.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(InputError, match="both with different values"): # fmt: off cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--slug", "some-slug", # deprecated flag "--create-origin", "some-other-slug", # conflictual value, so raise "--format", "json", ], catch_exceptions=False, ) # fmt: on def test_cli_deposit_with_server_down_for_maintenance( sample_archive, caplog, client_mock_api_down, slug, patched_tmp_path, cli_runner ): - """ Deposit failure due to maintenance down time should be explicit - - """ + """Deposit failure due to maintenance down time should be explicit""" # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" down_for_maintenance_log_record = ( "swh.deposit.cli.client", logging.ERROR, "Database backend maintenance: Temporarily unavailable, try again later.", ) assert down_for_maintenance_log_record in caplog.record_tuples client_mock_api_down.service_document.assert_called_once_with() def test_cli_client_generate_metadata_ok(slug): - """Generated metadata is well formed and pass service side metadata checks - - """ + """Generated metadata is well formed and pass service side metadata checks""" actual_metadata_xml = generate_metadata( "deposit-client", "project-name", authors=["some", 
"authors"], external_id="http://example.org/external-id", create_origin="origin-url", metadata_provenance_url="meta-prov-url", ) actual_metadata = parse_xml(actual_metadata_xml) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == "deposit-client" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" ) assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "project-name" ) assert ( actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) == "http://example.org/external-id" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 2 assert authors[0].text == "some" assert authors[1].text == "authors" assert ( actual_metadata.find( "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES ).attrib["url"] == "origin-url" ) assert ( actual_metadata.findtext( "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES ) == "meta-prov-url" ) checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail is None def test_cli_client_generate_metadata_ok2(slug): - """Generated metadata is well formed and pass service side metadata checks - - """ + """Generated metadata is well formed and pass service side metadata checks""" actual_metadata_xml = generate_metadata( - "deposit-client", "project-name", authors=["some", "authors"], + "deposit-client", + "project-name", + authors=["some", "authors"], ) actual_metadata = parse_xml(actual_metadata_xml) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == "deposit-client" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" ) assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None assert ( actual_metadata.findtext("codemeta:name", 
namespaces=NAMESPACES) == "project-name" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 2 assert authors[0].text == "some" assert authors[1].text == "authors" assert actual_metadata.find("codemeta:identifier", namespaces=NAMESPACES) is None assert actual_metadata.find("swh:deposit", namespaces=NAMESPACES) is None checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) assert checks_ok is True assert detail == { "metadata": [ {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]} ] } def test_cli_single_minimal_deposit_with_slug( - sample_archive, slug, patched_tmp_path, requests_mock_datadir, cli_runner, caplog, + sample_archive, + slug, + patched_tmp_path, + requests_mock_datadir, + cli_runner, + caplog, ): - """ This ensure a single deposit upload through the cli is fine, cf. + """This ensure a single deposit upload through the cli is fine, cf. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--metadata-provenance-url", "meta-prov-url", "--author", "Jane Doe", "--slug", slug, "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "2020-10-08T13:52:34.509655Z", } with open(metadata_path) as fd: actual_metadata = parse_xml(fd.read()) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == TEST_USER["username"] ) assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "test-project" ) assert ( 
actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None ) assert ( actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) == slug ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 1 assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: count_warnings += 1 if log_level == logging.WARNING else 0 assert ( count_warnings == 1 ), "We should have 1 warning as we are using slug instead of create_origin" def test_cli_single_minimal_deposit_with_create_origin( - sample_archive, slug, patched_tmp_path, requests_mock_datadir, cli_runner, caplog, + sample_archive, + slug, + patched_tmp_path, + requests_mock_datadir, + cli_runner, + caplog, ): - """ This ensure a single deposit upload through the cli is fine, cf. + """This ensure a single deposit upload through the cli is fine, cf. 
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") origin = slug # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--create-origin", origin, "--metadata-provenance-url", "meta-prov-url", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "2020-10-08T13:52:34.509655Z", } with open(metadata_path) as fd: actual_metadata = parse_xml(fd.read()) assert ( actual_metadata.findtext("atom:author", namespaces=NAMESPACES) == TEST_USER["username"] ) assert ( actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "test-project" ) assert ( actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None ) assert ( actual_metadata.find( "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES ).attrib["url"] == origin ) assert ( actual_metadata.findtext( "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES ) == "meta-prov-url" ) authors = actual_metadata.findall( "codemeta:author/codemeta:name", namespaces=NAMESPACES ) assert len(authors) == 1 assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: count_warnings += 1 if log_level == logging.WARNING else 0 assert ( count_warnings == 0 ), "We should have no warning as we are using create_origin" def test_cli_validation_metadata( sample_archive, caplog, patched_tmp_path, cli_runner, slug ): - """Multiple metadata flags scenario (missing, conflicts) properly fails the calls - 
- """ + """Multiple metadata flags scenario (missing, conflicts) properly fails the calls""" metadata_path = os.path.join(patched_tmp_path, "metadata.xml") with open(metadata_path, "a"): pass # creates the file for flag_title_or_name, author_or_name in [ ("--author", "no one"), ("--name", "test-project"), ]: # Test missing author then missing name # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--slug", slug, flag_title_or_name, author_or_name, ], ) # fmt: on assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided. " ), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags: Test both --metadata and --author, then --metadata and # --name # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--deposit-id", 666, "--archive", sample_archive["path"], "--slug", slug, ], ) # fmt: on assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided." 
), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags check (Test both --metadata and --author, # then --metadata and --name) # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--metadata", metadata_path, "--author", "Jane Doe", "--slug", slug, ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Using --metadata flag is incompatible with --author " "and --name and --create-origin (those are used to generate " "one metadata file)." ), ) assert expected_error_log_record in caplog.record_tuples caplog.clear() def test_cli_validation_no_actionable_command(caplog, cli_runner): - """Multiple metadata flags scenario (missing, conflicts) properly fails the calls - - """ + """Multiple metadata flags scenario (missing, conflicts) properly fails the calls""" # no actionable command # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--partial", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Please provide an actionable command. 
See --help for more information" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_validation_replace_with_no_deposit_id_fails( sample_archive, caplog, patched_tmp_path, requests_mock_datadir, datadir, cli_runner ): - """--replace flags require --deposit-id otherwise fails - - """ + """--replace flags require --deposit-id otherwise fails""" metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--archive", sample_archive["path"], "--replace", ], ) # fmt: on assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "To update an existing deposit, you must provide its id" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_single_deposit_slug_generation( sample_archive, patched_tmp_path, requests_mock_datadir, cli_runner ): """Single deposit scenario without providing the slug, it should not be generated. 
""" metadata_path = os.path.join(patched_tmp_path, "metadata.xml") # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "2020-10-08T13:52:34.509655Z", } with open(metadata_path) as fd: metadata_xml = fd.read() actual_metadata = parse_xml(metadata_xml) assert "codemeta:identifier" not in actual_metadata def test_cli_multisteps_deposit( sample_archive, datadir, slug, requests_mock_datadir, cli_runner ): - """ First deposit a partial deposit (no metadata, only archive), then update the metadata part. + """First deposit a partial deposit (no metadata, only archive), then update the metadata part. 
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit """ # noqa api_url = "https://deposit.test.metadata/1" deposit_id = 666 # Create a partial deposit with only 1 archive # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--slug", slug, "--format", "json", "--partial", ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" actual_deposit = json.loads(result.output) assert actual_deposit == { "deposit_id": str(deposit_id), "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "2020-10-08T13:52:34.509655Z", } # Update the partial deposit with only 1 archive # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--deposit-id", deposit_id, "--slug", slug, "--format", "json", "--partial", # in-progress: True, because remains the metadata to upload ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) assert actual_deposit["deposit_status"] == "partial" # Update the partial deposit with only some metadata (and then finalize it) # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") # Update deposit with metadata # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--deposit-id", deposit_id, "--slug", slug, "--format", "json", ], # this time, ^ 
we no longer flag it to partial, so the status changes to # in-progress false ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) # FIXME: should be "deposited" but current limitation in the # requests_mock_datadir_visits use, cannot find a way to make it work right now assert actual_deposit["deposit_status"] == "partial" @pytest.mark.parametrize( "output_format,parser_fn", [ ("json", json.loads), ("yaml", yaml.safe_load), ( "logging", ast.literal_eval, ), # not enough though, the caplog fixture is needed ], ) def test_cli_deposit_status_with_output_format( output_format, parser_fn, datadir, slug, requests_mock_datadir, caplog, cli_runner ): - """Check deposit status cli with all possible output formats (json, yaml, logging). - - """ + """Check deposit status cli with all possible output formats (json, yaml, logging).""" api_url_basename = "deposit.test.status" deposit_id = 1033 expected_deposit_status = { "deposit_id": str(deposit_id), "deposit_status": "done", "deposit_status_detail": ( "The deposit has been successfully loaded into the " "Software Heritage archive" ), "deposit_swh_id": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "deposit_swh_id_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa "deposit_external_id": "check-deposit-2020-10-08T13:52:34.509655", } # fmt: off result = cli_runner.invoke( cli, [ "status", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--deposit-id", deposit_id, "--format", output_format, ], ) # fmt: on assert 
result.exit_code == 0, f"unexpected output: {result.output}" if output_format == "logging": assert len(caplog.record_tuples) == 1 # format: (, , ) _, _, result_output = caplog.record_tuples[0] else: result_output = result.output actual_deposit = parser_fn(result_output) assert actual_deposit == expected_deposit_status def test_cli_update_metadata_with_swhid_on_completed_deposit( datadir, requests_mock_datadir, cli_runner ): - """Update new metadata on a completed deposit (status done) is ok - """ + """Update new metadata on a completed deposit (status done) is ok""" api_url_basename = "deposit.test.updateswhid" deposit_id = 123 expected_deposit_status = { "deposit_external_id": "check-deposit-2020-10-08T13:52:34.509655", "deposit_id": str(deposit_id), "deposit_status": "done", "deposit_status_detail": ( "The deposit has been successfully loaded into the " "Software Heritage archive" ), "deposit_swh_id": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "deposit_swh_id_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa } assert expected_deposit_status["deposit_status"] == "done" assert expected_deposit_status["deposit_swh_id"] is not None # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--author", "John Doe", "--deposit-id", deposit_id, "--swhid", expected_deposit_status["deposit_swh_id"], "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_deposit_status = json.loads(result.output) assert "error" not in actual_deposit_status assert actual_deposit_status == expected_deposit_status def test_cli_update_metadata_with_swhid_on_other_status_deposit( datadir, 
requests_mock_datadir, cli_runner ): - """Update new metadata with swhid on other deposit status is not possible - """ + """Update new metadata with swhid on other deposit status is not possible""" api_url_basename = "deposit.test.updateswhid" deposit_id = "321" # fmt: off result = cli_runner.invoke( cli, [ "upload", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--author", "John Doe", "--deposit-id", deposit_id, "--swhid", "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_result = json.loads(result.output) assert "error" in actual_result assert actual_result == { "error": "You can only update metadata on deposit with status 'done'", "detail": f"The deposit {deposit_id} has status 'partial'", "deposit_status": "partial", "deposit_id": deposit_id, } @pytest.mark.parametrize( "metadata_entry_key", ["entry-data-with-swhid", "entry-data-with-swhid-no-prov"] ) def test_cli_metadata_only_deposit_full_metadata_file( datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, metadata_entry_key, caplog, ): """Post metadata-only deposit through cli The metadata file posted by the client already contains the swhid """ api_url_basename = "deposit.test.metadataonly" swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" atom_data = atom_dataset[metadata_entry_key] if metadata_entry_key == "entry-data-with-swhid": metadata = atom_data.format( swhid=swhid, metadata_provenance_url=( "https://inria.halpreprod.archives-ouvertes.fr/hal-abcdefgh" ), ) else: metadata = atom_data.format(swhid=swhid) metadata_path = os.path.join(tmp_path, "entry-data-with-swhid.xml") with open(metadata_path, "w") as m: m.write(metadata) expected_deposit_status = { "deposit_id": "100", "deposit_status": "done", "deposit_date": "2020-10-08T13:52:34.509655Z", } assert expected_deposit_status["deposit_status"] 
== "done" # fmt: off result = cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], ) # fmt: on assert result.exit_code == 0, result.output actual_deposit_status = json.loads(result.output) assert "error" not in actual_deposit_status assert actual_deposit_status == expected_deposit_status count_warnings = 0 warning_record: Optional[str] = None for (_, log_level, msg) in caplog.record_tuples: if log_level == logging.WARNING: count_warnings += 1 warning_record = msg if "no-prov" in metadata_entry_key: assert count_warnings == 1 assert "metadata-provenance>' should be provided" in warning_record else: assert count_warnings == 0 def test_cli_metadata_only_deposit_invalid_swhid( - datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, + datadir, + requests_mock_datadir, + cli_runner, + atom_dataset, + tmp_path, ): - """Post metadata-only deposit through cli with invalid swhid raises - - """ + """Post metadata-only deposit through cli with invalid swhid raises""" api_url_basename = "deposit.test.metadataonly" invalid_swhid = "ssh:2:sth:xxx" metadata = atom_dataset["entry-data-with-swhid-no-prov"].format(swhid=invalid_swhid) metadata_path = os.path.join(tmp_path, "entry-data-with-swhid.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(ValidationError, match="Invalid"): # fmt: off cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], catch_exceptions=False, ) # fmt: on def test_cli_metadata_only_deposit_no_swhid( - datadir, requests_mock_datadir, cli_runner, atom_dataset, tmp_path, + datadir, + requests_mock_datadir, + cli_runner, + atom_dataset, + tmp_path, ): - """Post metadata-only deposit through cli with invalid swhid 
raises - - """ + """Post metadata-only deposit through cli with invalid swhid raises""" api_url_basename = "deposit.test.metadataonly" metadata = atom_dataset["entry-data-minimal"] metadata_path = os.path.join(tmp_path, "entry-data-minimal.xml") with open(metadata_path, "w") as f: f.write(metadata) with pytest.raises(InputError, match="SWHID must be provided"): # fmt: off cli_runner.invoke( cli, [ "metadata-only", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--format", "json", ], catch_exceptions=False, ) # fmt: on @pytest.mark.parametrize( "metadata_entry_key", ["entry-data-with-add-to-origin", "entry-only-create-origin"] ) def test_cli_deposit_warning_missing_origin( metadata_entry_key, tmp_path, atom_dataset, caplog, cli_runner, requests_mock_datadir, ): - """Deposit cli should warn when provided metadata xml is missing 'origins' tags - - """ + """Deposit cli should warn when provided metadata xml is missing 'origins' tags""" # For the next deposit, no warning should be logged as either or # are provided, and is always # provided. 
raw_metadata = atom_dataset[metadata_entry_key] % "some-url" metadata_path = os.path.join(tmp_path, "metadata-with-origin-tag-to-deposit.xml") with open(metadata_path, "w") as f: f.write(raw_metadata) # fmt: off cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) # fmt: on for (_, log_level, _) in caplog.record_tuples: # all messages are info or below messages so everything is fine assert log_level < logging.WARNING def test_cli_deposit_warning_missing_provenance_url( - tmp_path, atom_dataset, caplog, cli_runner, requests_mock_datadir, + tmp_path, + atom_dataset, + caplog, + cli_runner, + requests_mock_datadir, ): - """Deposit cli should warn when no metadata provenance is provided - - """ + """Deposit cli should warn when no metadata provenance is provided""" atom_template = atom_dataset["entry-data-with-add-to-origin-no-prov"] raw_metadata = atom_template % "some-url" metadata_path = os.path.join(tmp_path, "metadata-with-missing-prov-url.xml") with open(metadata_path, "w") as f: f.write(raw_metadata) # fmt: off cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) # fmt: on count_warnings = sum( 1 for (_, log_level, _) in caplog.record_tuples if log_level == logging.WARNING ) assert count_warnings == 1 def test_cli_failure_should_be_parseable(atom_dataset, mocker): summary = "Cannot load metadata" verbose_description = ( "Cannot load metadata on swh:1:dir:0eda267e7d3c2e37b3f6a78e542b16190ac4574e, " "this directory object does not exist in the archive (yet?)." 
) error_xml = atom_dataset["error-cli"].format( summary=summary, verboseDescription=verbose_description ) api_call = BaseDepositClient(url="https://somewhere.org/") actual_error = api_call.parse_result_error(error_xml) assert actual_error == { "summary": summary, "detail": "", "sword:verboseDescription": verbose_description, } def test_cli_service_document_failure(atom_dataset, mocker): - """Ensure service document failures are properly served - - """ + """Ensure service document failures are properly served""" summary = "Invalid user credentials" error_xml = atom_dataset["error-cli"].format(summary=summary, verboseDescription="") api_call = ServiceDocumentDepositClient(url="https://somewhere.org/") actual_error = api_call.parse_result_error(error_xml) assert actual_error == {"error": summary} @pytest.mark.parametrize( "output_format,parser_fn", [ ("json", json.loads), ("yaml", yaml.safe_load), ( "logging", ast.literal_eval, ), # not enough though, the caplog fixture is needed ], ) def test_cli_deposit_collection_list( output_format, parser_fn, datadir, slug, requests_mock_datadir, caplog, cli_runner ): - """Check deposit status cli with all possible output formats (json, yaml, logging). 
- - """ + """Check deposit status cli with all possible output formats (json, yaml, logging).""" api_url_basename = "deposit.test.list" expected_deposits = { "count": "3", "deposits": [ { "external_id": "check-deposit-2020-10-09T13:10:00.000000", "id": "1031", "status": "rejected", "status_detail": "Deposit without archive", }, { "external_id": "check-deposit-2020-10-10T13:20:00.000000", "id": "1032", "status": "rejected", "status_detail": "Deposit without archive", }, { "complete_date": "2020-10-08T13:52:34.509655", "external_id": "check-deposit-2020-10-08T13:52:34.509655", "id": "1033", "reception_date": "2020-10-08T13:50:30", "status": "done", "status_detail": "The deposit has been successfully loaded into " "the Software Heritage archive", "swhid": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", "swhid_context": "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/", # noqa }, ], } # fmt: off result = cli_runner.invoke( cli, [ "list", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--page", 1, "--page-size", 10, "--format", output_format, ], ) # fmt: on assert result.exit_code == 0, f"unexpected output: {result.output}" if output_format == "logging": assert len(caplog.record_tuples) == 1 # format: (, , ) _, _, result_output = caplog.record_tuples[0] else: result_output = result.output actual_deposit = parser_fn(result_output) assert actual_deposit == expected_deposits diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py index 2e9008e0..2651f158 100644 --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -1,215 +1,217 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # 
License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO import os import re import tarfile import tempfile from django.core.files.uploadedfile import InMemoryUploadedFile from swh.core import tarball def compute_info(archive_path): - """Given a path, compute information on path. - - """ + """Given a path, compute information on path.""" with open(archive_path, "rb") as f: length = 0 sha1sum = hashlib.sha1() md5sum = hashlib.md5() data = b"" for chunk in f: sha1sum.update(chunk) md5sum.update(chunk) length += len(chunk) data += chunk return { "dir": os.path.dirname(archive_path), "name": os.path.basename(archive_path), "path": archive_path, "length": length, "sha1sum": sha1sum.hexdigest(), "md5sum": md5sum.hexdigest(), "data": data, } def _compress(path, extension, dir_path): - """Compress path according to extension - - """ + """Compress path according to extension""" if extension == "zip" or extension == "tar": return tarball.compress(path, extension, dir_path) elif "." in extension: split_ext = extension.split(".") if split_ext[0] != "tar": raise ValueError( "Development error, only zip or tar archive supported, " "%s not supported" % extension ) # deal with specific tar mode = split_ext[1] supported_mode = ["xz", "gz", "bz2"] if mode not in supported_mode: raise ValueError( "Development error, only %s supported, %s not supported" % (supported_mode, mode) ) files = tarball._ls(dir_path) with tarfile.open(path, "w:%s" % mode) as t: for fpath, fname in files: t.add(fpath, arcname=fname, recursive=False) return path def create_arborescence_archive( root_path, archive_name, filename, content, up_to_size=None, extension="zip" ): """Build an archive named archive_name in the root_path. This archive contains one file named filename with the content content. 
Args: root_path (str): Location path of the archive to create archive_name (str): Archive's name (without extension) filename (str): Archive's content is only one filename content (bytes): Content of the filename up_to_size (int | None): Fill in the blanks size to oversize or complete an archive's size extension (str): Extension of the archive to write (default is zip) Returns: dict with the keys: - dir: the directory of that archive - path: full path to the archive - sha1sum: archive's sha1sum - length: archive's length """ os.makedirs(root_path, exist_ok=True) archive_path_dir = tempfile.mkdtemp(dir=root_path) dir_path = os.path.join(archive_path_dir, archive_name) os.mkdir(dir_path) filepath = os.path.join(dir_path, filename) _length = len(content) count = 0 batch_size = 128 with open(filepath, "wb") as f: f.write(content) if up_to_size: # fill with blank content up to a given size count += _length while count < up_to_size: f.write(b"0" * batch_size) count += batch_size _path = "%s.%s" % (dir_path, extension) _path = _compress(_path, extension, dir_path) return compute_info(_path) def create_archive_with_archive(root_path, name, archive): - """Create an archive holding another. - - """ + """Create an archive holding another.""" invalid_archive_path = os.path.join(root_path, name) with tarfile.open(invalid_archive_path, "w:gz") as _archive: _archive.add(archive["path"], arcname=archive["name"]) return compute_info(invalid_archive_path) def check_archive(archive_name: str, archive_name_to_check: str): """Helper function to ensure archive_name is present within the archive_name_to_check. Raises: AssertionError if archive_name is not present within archive_name_to_check """ ARCHIVE_FILEPATH_PATTERN = re.compile( r"client_[0-9].*/[0-9]{8}-[0-9]{6}\.[0-9]{6}/[a-zA-Z0-9.].*" ) assert ARCHIVE_FILEPATH_PATTERN.match(archive_name_to_check) if "." 
in archive_name: filename, extension = archive_name.split(".") pattern = re.compile(".*/%s.*\\.%s" % (filename, extension)) else: pattern = re.compile(".*/%s" % archive_name) assert pattern.match(archive_name_to_check) is not None def _post_or_put_archive(f, url, archive, slug=None, in_progress=None, **kwargs): default_kwargs = dict( content_type="application/zip", CONTENT_LENGTH=archive["length"], HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive["name"],), HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", ) kwargs = {**default_kwargs, **kwargs} - return f(url, data=archive["data"], HTTP_CONTENT_MD5=archive["md5sum"], **kwargs,) + return f( + url, + data=archive["data"], + HTTP_CONTENT_MD5=archive["md5sum"], + **kwargs, + ) def post_archive(authenticated_client, *args, **kwargs): return _post_or_put_archive(authenticated_client.post, *args, **kwargs) def put_archive(authenticated_client, *args, **kwargs): return _post_or_put_archive(authenticated_client.put, *args, **kwargs) def post_atom(authenticated_client, url, data, **kwargs): return authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=data, **kwargs ) def put_atom(authenticated_client, url, data, **kwargs): return authenticated_client.put( url, content_type="application/atom+xml;type=entry", data=data, **kwargs ) def _post_or_put_multipart(f, url, archive, atom_entry, **kwargs): archive = InMemoryUploadedFile( BytesIO(archive["data"]), field_name=archive["name"], name=archive["name"], content_type="application/x-tar", size=archive["length"], charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(atom_entry), charset="utf-8", ) return f( url, format="multipart", - data={"archive": archive, "atom_entry": atom_entry,}, + data={ + "archive": archive, + "atom_entry": atom_entry, + }, **kwargs, ) def 
post_multipart(authenticated_client, *args, **kwargs): return _post_or_put_multipart(authenticated_client.post, *args, **kwargs) def put_multipart(authenticated_client, *args, **kwargs): return _post_or_put_multipart(authenticated_client.put, *args, **kwargs) diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 04fae35e..be3b947f 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,611 +1,592 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from copy import deepcopy from functools import partial import os import re from typing import TYPE_CHECKING, Dict, Mapping from xml.etree import ElementTree from django.test.utils import setup_databases # type: ignore from django.urls import reverse_lazy as reverse import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import pytest from rest_framework import status from rest_framework.test import APIClient import yaml from swh.auth.pytest_plugin import keycloak_mock_factory from swh.core.config import read from swh.core.pytest_plugin import get_response_cb from swh.deposit.auth import DEPOSIT_PERMISSION from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, SE_IRI, setup_django_for, ) from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, post_atom, ) from swh.deposit.utils import NAMESPACES from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler if TYPE_CHECKING: from swh.deposit.models import Deposit, DepositClient, DepositCollection # mypy is asked to ignore the import 
statement above because setup_databases # is not part of the d.t.utils.__all__ variable. USERNAME = "test" EMAIL = "test@example.org" COLLECTION = "test" TEST_USER = { "username": USERNAME, "password": "pass", "email": EMAIL, "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": COLLECTION}, } USER_INFO = { "name": USERNAME, "email": EMAIL, "email_verified": False, "family_name": "", "given_name": "", "groups": [], "preferred_username": USERNAME, "sub": "ffffffff-bbbb-4444-aaaa-14f61e6b7200", } USERNAME2 = "test2" EMAIL2 = "test@example.org" COLLECTION2 = "another-collection" TEST_USER2 = { "username": USERNAME2, "password": "", "email": EMAIL2, "provider_url": "https://hal-test.archives-ouvertes.example/", "domain": "archives-ouvertes.example/", "collection": {"name": COLLECTION2}, } KEYCLOAK_SERVER_URL = "https://auth.swh.org/SWHTest" KEYCLOAK_REALM_NAME = "SWHTest" CLIENT_ID = "swh-deposit" keycloak_mock_auth_success = keycloak_mock_factory( server_url=KEYCLOAK_SERVER_URL, realm_name=KEYCLOAK_REALM_NAME, client_id=CLIENT_ID, auth_success=True, user_info=USER_INFO, client_permissions=[DEPOSIT_PERMISSION], ) keycloak_mock_auth_failure = keycloak_mock_factory( server_url=KEYCLOAK_SERVER_URL, realm_name=KEYCLOAK_REALM_NAME, client_id=CLIENT_ID, auth_success=False, ) def pytest_configure(): setup_django_for("testing") @pytest.fixture def requests_mock_datadir(datadir, requests_mock_datadir): - """Override default behavior to deal with put/post methods - - """ + """Override default behavior to deal with put/post methods""" cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.put(re.compile("https://"), body=cb) requests_mock_datadir.post(re.compile("https://"), body=cb) return requests_mock_datadir @pytest.fixture def common_deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 5000, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": 
False, - "scheduler": {"cls": "local", **swh_scheduler_config,}, + "scheduler": { + "cls": "local", + **swh_scheduler_config, + }, "storage": swh_storage_backend_config, "storage_metadata": swh_storage_backend_config, "swh_authority_url": "http://deposit.softwareheritage.example/", } @pytest.fixture() def deposit_config(common_deposit_config): return { **common_deposit_config, "authentication_provider": "keycloak", "keycloak": { "server_url": KEYCLOAK_SERVER_URL, "realm_name": KEYCLOAK_REALM_NAME, }, } @pytest.fixture() def deposit_config_path(tmp_path, monkeypatch, deposit_config): conf_path = os.path.join(tmp_path, "deposit.yml") with open(conf_path, "w") as f: f.write(yaml.dump(deposit_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) return conf_path @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path): """Enforce config for deposit classes inherited from APIConfig.""" cfg = read(deposit_config_path) if "scheduler" in cfg: # scheduler setup: require the check-deposit and load-deposit tasks scheduler = get_scheduler(**cfg["scheduler"]) task_types = [ { "type": "check-deposit", "backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk", "description": "Check deposit metadata/archive before loading", "num_retries": 3, }, { "type": "load-deposit", "backend_name": "swh.loader.package.deposit.tasks.LoadDeposit", "description": "Loading deposit archive into swh archive", "num_retries": 3, }, ] for task_type in task_types: scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) 
def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside - connection takes place. + connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): - """Create a deposit collection with name collection_name - - """ + """Create a deposit collection with name collection_name""" from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory(COLLECTION) deposit_another_collection = deposit_collection_factory(COLLECTION2) def _create_deposit_user( collection: "DepositCollection", user_data: Dict ) -> "DepositClient": """Create/Return the test_user "test" For basic authentication, this will save a password. This is not required for keycloak authentication scheme. 
""" from swh.deposit.models import DepositClient user_data_d = deepcopy(user_data) user_data_d.pop("collection", None) passwd = user_data_d.pop("password", None) user, _ = DepositClient.objects.get_or_create( # type: ignore username=user_data_d["username"], defaults={**user_data_d, "collections": [collection.id]}, ) if passwd: user.set_password(passwd) user.save() return user @pytest.fixture def deposit_user(db, deposit_collection): return _create_deposit_user(deposit_collection, TEST_USER) @pytest.fixture def deposit_another_user(db, deposit_another_collection): return _create_deposit_user(deposit_another_collection, TEST_USER2) @pytest.fixture def anonymous_client(): - """Create an anonymous client (no credentials during queries to the deposit) - - """ + """Create an anonymous client (no credentials during queries to the deposit)""" return APIClient() # <- drf's client def mock_keycloakopenidconnect(mocker, keycloak_mock): - """Mock swh.deposit.auth.KeycloakOpenIDConnect to return the keycloak_mock - - """ + """Mock swh.deposit.auth.KeycloakOpenIDConnect to return the keycloak_mock""" mock = mocker.patch("swh.deposit.auth.KeycloakOpenIDConnect") mock.from_configfile.return_value = keycloak_mock return mock @pytest.fixture def mock_keycloakopenidconnect_ok(mocker, keycloak_mock_auth_success): """Mock keycloak so it always accepts connection for user with the right - permissions + permissions """ return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) @pytest.fixture def mock_keycloakopenidconnect_ko(mocker, keycloak_mock_auth_failure): """Mock keycloak so it always refuses connections.""" return mock_keycloakopenidconnect(mocker, keycloak_mock_auth_failure) def _create_authenticated_client(client, user, password=None): """Return a client whose credentials will be proposed to the deposit server. This also patched the client instance to keep a reference on the associated deposit_user. 
""" if not password: password = "irrelevant-if-not-set" _token = "%s:%s" % (user.username, password) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) client.deposit_client = user yield client client.logout() @pytest.fixture def basic_authenticated_client(anonymous_client, deposit_user): yield from _create_authenticated_client( anonymous_client, deposit_user, password=TEST_USER["password"] ) @pytest.fixture def authenticated_client(mock_keycloakopenidconnect_ok, anonymous_client, deposit_user): yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def unauthorized_client(mock_keycloakopenidconnect_ko, anonymous_client, deposit_user): - """Create an unauthorized client (will see their authentication fail) - - """ + """Create an unauthorized client (will see their authentication fail)""" yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def insufficient_perm_client( mocker, keycloak_mock_auth_success, anonymous_client, deposit_user ): """keycloak accepts connection but client returned has no deposit permission, so access - is not allowed. + is not allowed. """ keycloak_mock_auth_success.client_permissions = [] mock_keycloakopenidconnect(mocker, keycloak_mock_auth_success) yield from _create_authenticated_client(anonymous_client, deposit_user) @pytest.fixture def sample_archive(tmp_path): - """Returns a sample archive - - """ + """Returns a sample archive""" tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. 
Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def internal_create_deposit( client: "DepositClient", collection: "DepositCollection", external_id: str, status: str, ) -> "Deposit": - """Create a deposit for a given collection with internal tool - - """ + """Create a deposit for a given collection with internal tool""" from swh.deposit.models import Deposit deposit = Deposit( client=client, external_id=external_id, status=status, collection=collection ) deposit.save() return deposit def create_deposit( client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False, ): - """Create a skeleton shell deposit - - """ + """Create a skeleton shell deposit""" url = reverse(COL_IRI, args=[collection_name]) # when response = post_archive( client, url, sample_archive, HTTP_SLUG=external_id, HTTP_IN_PROGRESS=str(in_progress).lower(), ) # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() from swh.deposit.models import Deposit response_content = ElementTree.fromstring(response.content) deposit_id = int( response_content.findtext("swh:deposit_id", "", namespaces=NAMESPACES) ) deposit = Deposit._default_manager.get(id=deposit_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, **kwargs, ): """Create a deposit with both metadata and archive set. Then alters its status - to `deposit_status`. + to `deposit_status`. 
""" deposit = create_deposit( authenticated_client, collection_name, deposit_status=DEPOSIT_STATUS_PARTIAL, **kwargs, ) origin_url = deposit.client.provider_url + deposit.external_id response = post_atom( authenticated_client, reverse(SE_IRI, args=[collection_name, deposit.id]), data=atom_dataset["entry-data0"] % origin_url, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED, in_progress=False): - """Build deposit with a specific status - - """ + """Build deposit with a specific status""" @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, in_progress=in_progress, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory( deposit_status=DEPOSIT_STATUS_PARTIAL, in_progress=True ) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): - """Returns deposit with archive and metadata provided, status 'partial' - - """ + """Returns deposit with archive and metadata provided, status 'partial'""" return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive=sample_archive, 
external_id="external-id-partial", in_progress=True, deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = post_atom( authenticated_client, reverse(COL_IRI, args=[deposit_collection.name]), data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = ElementTree.fromstring(response.content) deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): - """Returns a completed deposit (load success) - - """ + """Returns a completed deposit (load success)""" deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" release_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10") snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0") deposit.swhid = f"swh:1:dir:{directory_id}" deposit.swhid_context = str( QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), anchor=CoreSWHID(object_type=ObjectType.RELEASE, object_id=release_id), path=b"/", ) ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py index fd466f0c..942429ba 100644 --- 
a/swh/deposit/tests/loader/common.py +++ b/swh/deposit/tests/loader/common.py @@ -1,139 +1,137 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Dict, Optional from swh.deposit.client import PrivateApiDepositClient from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_all_branches CLIENT_TEST_CONFIG = { "url": "http://nowhere:9000/", "auth": {}, # no authentication in test scenario } class SWHDepositTestClient(PrivateApiDepositClient): """Deposit test client to permit overriding the default request - client. + client. """ def __init__(self, client, config): super().__init__(config=config) self.client = client def archive_get(self, archive_update_url, archive_path, log=None): r = self.client.get(archive_update_url) with open(archive_path, "wb") as f: for chunk in r.streaming_content: f.write(chunk) return archive_path def metadata_get(self, metadata_url, log=None): r = self.client.get(metadata_url) return json.loads(r.content.decode("utf-8")) def status_update( self, update_status_url, status, release_id=None, directory_id=None, origin_url=None, ): payload = {"status": status} if release_id: payload["release_id"] = release_id if directory_id: payload["directory_id"] = directory_id if origin_url: payload["origin_url"] = origin_url self.client.put( update_status_url, content_type="application/json", data=json.dumps(payload) ) def check(self, check_url): r = self.client.get(check_url) data = json.loads(r.content.decode("utf-8")) return data["status"] def get_stats(storage) -> Dict: """Adaptation utils to unify the stats counters across storage - implementation. + implementation. 
""" storage.refresh_stat_counters() stats = storage.stat_counters() keys = [ "content", "directory", "origin", "origin_visit", "person", "release", "revision", "skipped_content", "snapshot", ] return {k: stats.get(k) for k in keys} def decode_target(branch: Optional[SnapshotBranch]) -> Optional[Dict]: - """Test helper to ease readability in test - - """ + """Test helper to ease readability in test""" if not branch: return None target_type = branch.target_type if target_type == TargetType.ALIAS: decoded_target = branch.target.decode("utf-8") else: decoded_target = hash_to_hex(branch.target) return {"target": decoded_target, "target_type": target_type} def check_snapshot(expected_snapshot, storage): """Check for snapshot match. Provide the hashes as hexadecimal, the conversion is done within the method. Args: expected_snapshot (dict): full snapshot with hex ids storage (Storage): expected storage """ expected_snapshot_id = expected_snapshot["id"] expected_branches = expected_snapshot["branches"] snap = snapshot_get_all_branches(hash_to_bytes(expected_snapshot_id)) if snap is None: # display known snapshots instead if possible if hasattr(storage, "_snapshots"): # in-mem storage from pprint import pprint for snap_id, (_snap, _) in storage._snapshots.items(): snapd = _snap.to_dict() snapd["id"] = hash_to_hex(snapd["id"]) branches = { branch.decode("utf-8"): decode_target(target) for branch, target in snapd["branches"].items() } snapd["branches"] = branches pprint(snapd) raise AssertionError("Snapshot is not found") branches = { branch.decode("utf-8"): decode_target(branch) for branch_name, branch in snap["branches"].items() } assert expected_branches == branches diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py index 60d451ef..56189517 100644 --- a/swh/deposit/tests/loader/test_checker.py +++ b/swh/deposit/tests/loader/test_checker.py @@ -1,32 +1,26 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the 
AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch def test_checker_deposit_ready(requests_mock_datadir, deposit_checker): - """Check on a valid 'deposited' deposit should result in 'verified' - - """ + """Check on a valid 'deposited' deposit should result in 'verified'""" actual_result = deposit_checker.check(collection="test", deposit_id=1) assert actual_result == {"status": "eventful"} def test_checker_deposit_rejected(requests_mock_datadir, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected' - - """ + """Check on invalid 'deposited' deposit should result in 'rejected'""" actual_result = deposit_checker.check(collection="test", deposit_id=2) assert actual_result == {"status": "failed"} @patch("swh.deposit.client.requests.get") def test_checker_deposit_rejected_exception(mock_requests, deposit_checker): - """Check on invalid 'deposited' deposit should result in 'rejected' - - """ + """Check on invalid 'deposited' deposit should result in 'rejected'""" mock_requests.side_effect = ValueError("simulated problem when checking") actual_result = deposit_checker.check(collection="test", deposit_id=3) assert actual_result == {"status": "failed"} diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py index 7745a009..e9949f01 100644 --- a/swh/deposit/tests/loader/test_client.py +++ b/swh/deposit/tests/loader/test_client.py @@ -1,246 +1,226 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from typing import Any, Callable, Optional from urllib.parse import urlparse import pytest from requests import Session from 
swh.deposit.client import PrivateApiDepositClient from swh.deposit.config import DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS CLIENT_TEST_CONFIG = { "url": "https://nowhere.org/", "auth": {}, # no authentication in test scenario } @pytest.fixture def deposit_config(): return CLIENT_TEST_CONFIG def test_client_config(deposit_config_path): for client in [ # config passed as constructor parameter PrivateApiDepositClient(config=CLIENT_TEST_CONFIG), # config loaded from environment PrivateApiDepositClient(), ]: assert client.base_url == CLIENT_TEST_CONFIG["url"] assert client.auth is None def build_expected_path(datadir, base_url: str, api_url: str) -> str: - """Build expected path from api to served file - - """ + """Build expected path from api to served file""" url = urlparse(base_url) dirname = "%s_%s" % (url.scheme, url.hostname) if api_url.endswith("/"): api_url = api_url[:-1] if api_url.startswith("/"): api_url = api_url[1:] suffix_path = api_url.replace("/", "_") return os.path.join(datadir, dirname, suffix_path) def test_build_expected_path(datadir): actual_path = build_expected_path(datadir, "http://example.org", "/hello/you/") assert actual_path == os.path.join(datadir, "http_example.org", "hello_you") def read_served_path( datadir, base_url: str, api_url: str, convert_fn: Optional[Callable[[str], Any]] = None, ) -> bytes: - """Read served path - - """ + """Read served path""" archive_path = build_expected_path(datadir, base_url, api_url) with open(archive_path, "rb") as f: content = f.read() if convert_fn: content = convert_fn(content.decode("utf-8")) return content def test_read_served_path(datadir): actual_content = read_served_path(datadir, "http://example.org", "/hello/you/") assert actual_content == b"hello people\n" actual_content2 = read_served_path( datadir, "http://example.org", "/hello.json", convert_fn=json.loads ) assert actual_content2 == {"a": [1, 3]} # private api to retrieve archive def test_archive_get(tmp_path, datadir, 
requests_mock_datadir): - """Retrieving archive data through private api should stream data - - """ + """Retrieving archive data through private api should stream data""" api_url = "/1/private/test/1/raw/" client = PrivateApiDepositClient(CLIENT_TEST_CONFIG) expected_content = read_served_path(datadir, client.base_url, api_url) archive_path = os.path.join(tmp_path, "test.archive") archive_path = client.archive_get(api_url, archive_path) assert os.path.exists(archive_path) is True with open(archive_path, "rb") as f: actual_content = f.read() assert actual_content == expected_content assert client.base_url == CLIENT_TEST_CONFIG["url"] assert client.auth is None def test_archive_get_auth(tmp_path, datadir, requests_mock_datadir): - """Retrieving archive data through private api should stream data - - """ + """Retrieving archive data through private api should stream data""" api_url = "/1/private/test/1/raw/" config = CLIENT_TEST_CONFIG.copy() config["auth"] = { # add authentication setup "username": "user", "password": "pass", } client = PrivateApiDepositClient(config) expected_content = read_served_path(datadir, client.base_url, api_url) archive_path = os.path.join(tmp_path, "test.archive") archive_path = client.archive_get(api_url, archive_path) assert os.path.exists(archive_path) is True with open(archive_path, "rb") as f: actual_content = f.read() assert actual_content == expected_content assert client.base_url == CLIENT_TEST_CONFIG["url"] assert client.auth == ("user", "pass") def test_archive_get_ko(tmp_path, datadir, requests_mock_datadir): - """Reading archive can fail for some reasons - - """ + """Reading archive can fail for some reasons""" unknown_api_url = "/1/private/unknown/deposit-id/raw/" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when retrieving deposit"): client.archive_get(unknown_api_url, "some/path") # private api read metadata def test_metadata_get(datadir, requests_mock_datadir): - 
"""Reading archive should write data in temporary directory - - """ + """Reading archive should write data in temporary directory""" api_url = "/1/private/test/1/metadata" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) actual_metadata = client.metadata_get(api_url) assert isinstance(actual_metadata, str) is False expected_content = read_served_path( datadir, client.base_url, api_url, convert_fn=json.loads ) assert actual_metadata == expected_content def test_metadata_get_ko(requests_mock_datadir): - """Reading metadata can fail for some reasons - - """ + """Reading metadata can fail for some reasons""" unknown_api_url = "/1/private/unknown/deposit-id/metadata/" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when retrieving metadata"): client.metadata_get(unknown_api_url) # private api check def test_check(requests_mock_datadir): - """When check ok, this should return the deposit's status - - """ + """When check ok, this should return the deposit's status""" api_url = "/1/private/test/1/check" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) r = client.check(api_url) assert r == "something" def test_check_fails(requests_mock_datadir): - """Checking deposit can fail for some reason - - """ + """Checking deposit can fail for some reason""" unknown_api_url = "/1/private/test/10/check" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when checking deposit"): client.check(unknown_api_url) # private api update status def test_status_update(mocker): - """Update status - - """ + """Update status""" mocked_put = mocker.patch.object(Session, "request") deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) deposit_client.status_update( "/update/status", DEPOSIT_STATUS_LOAD_SUCCESS, release_id="some-release-id", status_detail="foo bar", ) mocked_put.assert_called_once_with( "put", "https://nowhere.org/update/status", json={ 
"status": DEPOSIT_STATUS_LOAD_SUCCESS, "status_detail": "foo bar", "release_id": "some-release-id", }, ) def test_status_update_with_no_release_id(mocker): - """Reading metadata can fail for some reasons - - """ + """Reading metadata can fail for some reasons""" mocked_put = mocker.patch.object(Session, "request") deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) deposit_client.status_update("/update/status/fail", DEPOSIT_STATUS_LOAD_FAILURE) mocked_put.assert_called_once_with( "put", "https://nowhere.org/update/status/fail", - json={"status": DEPOSIT_STATUS_LOAD_FAILURE,}, + json={ + "status": DEPOSIT_STATUS_LOAD_FAILURE, + }, ) diff --git a/swh/deposit/tests/loader/test_tasks.py b/swh/deposit/tests/loader/test_tasks.py index 5f85ebcd..068444d4 100644 --- a/swh/deposit/tests/loader/test_tasks.py +++ b/swh/deposit/tests/loader/test_tasks.py @@ -1,75 +1,69 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest @pytest.mark.db def test_task_check_eventful( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - """Successful check should make the check succeed - - """ + """Successful check should make the check succeed""" client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.return_value = "verified" collection = "collection" deposit_id = 42 res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "eventful"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") @pytest.mark.db def test_task_check_failure( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - """Unverified check status 
should make the check fail - - """ + """Unverified check status should make the check fail""" client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.return_value = "not-verified" # will make the status "failed" collection = "collec" deposit_id = 666 res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "failed"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") @pytest.mark.db def test_task_check_3( mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - """Unexpected failures should fail the check - - """ + """Unexpected failures should fail the check""" client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.side_effect = ValueError("unexpected failure will make it fail") collection = "another-collection" deposit_id = 999 res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "failed"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") diff --git a/swh/deposit/tests/test_backend.py b/swh/deposit/tests/test_backend.py index ca31d40a..b647ea78 100644 --- a/swh/deposit/tests/test_backend.py +++ b/swh/deposit/tests/test_backend.py @@ -1,71 +1,69 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from rest_framework.exceptions import AuthenticationFailed from swh.deposit.auth import KeycloakBasicAuthentication from swh.deposit.tests.conftest import TEST_USER REQUEST_OBJECT = "request-unused" PASSWORD = "some-deposit-pass" @pytest.fixture def 
backend_success(mock_keycloakopenidconnect_ok, deposit_config, db): """Backend whose connection to keycloak will systematically succeed.""" return KeycloakBasicAuthentication() @pytest.fixture def backend_failure(mock_keycloakopenidconnect_ko, deposit_config): """Backend whose connection to keycloak will systematically fail.""" return KeycloakBasicAuthentication() def test_backend_authentication_refused(backend_failure): with pytest.raises(AuthenticationFailed): backend_failure.authenticate_credentials( TEST_USER["username"], PASSWORD, REQUEST_OBJECT ) def test_backend_authentication_db_misconfigured(backend_success): """Keycloak configured ok, backend db misconfigured (missing user), this raises""" with pytest.raises(AuthenticationFailed, match="Unknown"): backend_success.authenticate_credentials( TEST_USER["username"], PASSWORD, REQUEST_OBJECT ) def test_backend_authentication_user_inactive(backend_success, deposit_user): """Keycloak configured ok, backend db configured, user inactive, this raises""" deposit_user.is_active = False deposit_user.save() with pytest.raises(AuthenticationFailed, match="Deactivated"): backend_success.authenticate_credentials( deposit_user.username, PASSWORD, REQUEST_OBJECT ) def test_backend_authentication_ok(backend_success, deposit_user): - """Keycloak configured ok, backend db configured ok, user logs in - - """ + """Keycloak configured ok, backend db configured ok, user logs in""" user0, _ = backend_success.authenticate_credentials( deposit_user.username, PASSWORD, REQUEST_OBJECT ) assert user0 is not None # A second authentication call should leverage the django cache feature. 
user1, _ = backend_success.authenticate_credentials( deposit_user.username, PASSWORD, REQUEST_OBJECT ) assert user1 is not None assert user0 == user1, "Should have been retrieved from the cache" diff --git a/swh/deposit/tests/test_client_module.py b/swh/deposit/tests/test_client_module.py index b6a5f9f0..d52b2c79 100644 --- a/swh/deposit/tests/test_client_module.py +++ b/swh/deposit/tests/test_client_module.py @@ -1,215 +1,221 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Ensure the gist of the BaseDepositClient.execute works as expected in corner cases The # following tests uses the ServiceDocumentDepositClient and StatusDepositClient because # they are BaseDepositClient subclasses. We could have used other classes but those ones # got elected as they are fairly simple ones. import pytest from swh.deposit.client import ( CollectionListDepositClient, MaintenanceError, PublicApiDepositClient, ServiceDocumentDepositClient, StatusDepositClient, ) from swh.deposit.utils import to_header_link def test_client_read_data_ok(requests_mock_datadir): client = ServiceDocumentDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) result = client.execute() assert isinstance(result, dict) collection = result["app:service"]["app:workspace"][0]["app:collection"] assert collection["sword:name"] == "test" def test_client_read_data_fails(mocker): mock = mocker.patch("swh.deposit.client.BaseDepositClient.do_execute") mock.side_effect = ValueError("here comes trouble") client = ServiceDocumentDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) result = client.execute() assert isinstance(result, dict) assert "error" in result assert mock.called def test_client_read_data_no_result(requests_mock): url = "https://deposit.swh.test/1" 
requests_mock.get(f"{url}/servicedocument/", status_code=204) client = ServiceDocumentDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) result = client.execute() assert isinstance(result, dict) assert result == {"status": 204} def test_client_read_data_collection_error_503(requests_mock, atom_dataset): error_content = atom_dataset["error-cli"].format( - summary="forbidden", verboseDescription="Access restricted", + summary="forbidden", + verboseDescription="Access restricted", ) url = "https://deposit.swh.test/1" requests_mock.get(f"{url}/servicedocument/", status_code=503, text=error_content) client = ServiceDocumentDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) result = client.execute() assert isinstance(result, dict) assert result == { "error": "forbidden", "status": 503, "collection": None, } def test_client_read_data_status_error_503(requests_mock, atom_dataset): error_content = atom_dataset["error-cli"].format( - summary="forbidden", verboseDescription="Access restricted", + summary="forbidden", + verboseDescription="Access restricted", ) collection = "test" deposit_id = 1 url = "https://deposit.swh.test/1" requests_mock.get( f"{url}/{collection}/{deposit_id}/status/", status_code=503, text=error_content ) client = StatusDepositClient( url="https://deposit.swh.test/1", auth=("test", "test") ) with pytest.raises(MaintenanceError, match="forbidden"): client.execute(collection, deposit_id) EXPECTED_DEPOSIT = { "id": "1031", "external_id": "check-deposit-2020-10-09T13:10:00.000000", "status": "rejected", "status_detail": "Deposit without archive", } EXPECTED_DEPOSIT2 = { "id": "1032", "external_id": "check-deposit-2020-10-10T13:20:00.000000", "status": "rejected", "status_detail": "Deposit without archive", } EXPECTED_DEPOSIT3 = { "id": "1033", "external_id": "check-deposit-2020-10-08T13:52:34.509655", "status": "done", "status_detail": ( "The deposit has been successfully loaded into the Software " "Heritage 
def test_client_collection_list(requests_mock, atom_dataset):
    """Both list clients return the same un-paginated deposit listing."""
    base_url = "https://deposit.test.list/1"
    collection = "test"
    listing_url = f"{base_url}/{collection}/"
    credentials = ("test", "test")
    requests_mock.get(
        listing_url, status_code=200, text=atom_dataset["entry-list-deposits"]
    )
    expected_listing = {
        "count": "3",
        "deposits": [EXPECTED_DEPOSIT, EXPECTED_DEPOSIT2, EXPECTED_DEPOSIT3],
    }

    # Dedicated client, called without any pagination parameter.
    dedicated = CollectionListDepositClient(url=base_url, auth=credentials)
    assert dedicated.execute(collection) == expected_listing

    # The main public client should work the same way.
    public = PublicApiDepositClient(url=base_url, auth=credentials)
    assert public.deposit_list(collection) == expected_listing

    # One HTTP request per client, both against the same listing URL.
    assert requests_mock.called
    requested_urls = [request.url for request in requests_mock.request_history]
    assert requested_urls == [listing_url, listing_url]
def test_post_fork_default():
    """With the environment left untouched, ``post_fork`` must not call ``sentry_sdk.init``."""
    init_patcher = patch("sentry_sdk.init")
    with init_patcher as fake_init:
        gunicorn_config.post_fork(None, None)

    fake_init.assert_not_called()
def test_post_fork_debug():
    """With ``SWH_SENTRY_DEBUG=1`` set, ``post_fork`` initializes Sentry with ``debug=True``."""
    django_integration = object()  # unique object to check for equality
    sentry_env = {
        "SWH_SENTRY_DSN": "test_dsn",
        "SWH_SENTRY_DEBUG": "1",
        "SWH_SENTRY_ENVIRONMENT": "test",
    }
    integration_patch = patch(
        "swh.deposit.gunicorn_config.DjangoIntegration", new=lambda: django_integration
    )
    init_patch = patch("sentry_sdk.init")
    env_patch = patch.dict(os.environ, sentry_env)

    # Patches are entered left to right, i.e. in the same order as the
    # equivalent nested ``with`` statements.
    with integration_patch, init_patch as sentry_sdk_init, env_patch:
        gunicorn_config.post_fork(None, None)

    sentry_sdk_init.assert_called_once_with(
        dsn="test_dsn",
        integrations=[django_integration],
        environment="test",
        debug=True,
        release=None,
    )
def now() -> datetime:
    """Return the current time as a timezone-aware datetime in UTC."""
    utc_now = datetime.now(timezone.utc)
    return utc_now
migrator.apply_initial_migration( ("deposit", "0021_deposit_origin_url_20201124_1438") ) old_deposit = old_state.apps.get_model("deposit", "Deposit") collection = DepositCollection.objects.create(name="hello") client = DepositClient.objects.create(username="name", collections=[collection.id]) # Create old deposits to make sure they are migrated properly deposit1 = old_deposit.objects.create( status="partial", client_id=client.id, collection_id=collection.id ) deposit2 = old_deposit.objects.create( status="verified", client_id=client.id, collection_id=collection.id ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" release_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10") snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0") date_now = now() # metadata deposit deposit3 = old_deposit.objects.create( status=DEPOSIT_STATUS_LOAD_SUCCESS, client_id=client.id, collection_id=collection.id, swhid=CoreSWHID( - object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), + object_type=ObjectType.DIRECTORY, + object_id=hash_to_bytes(directory_id), ), swhid_context=QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id), origin=origin, visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), anchor=CoreSWHID(object_type=ObjectType.RELEASE, object_id=release_id), path=b"/", ), ) # work around (complete date is installed on creation) deposit3.complete_date = date_now deposit3.reception_date = date_now deposit3.save() assert hasattr(old_deposit, "type") is False # Migrate to the latest schema new_state = migrator.apply_tested_migration(("deposit", "0022_auto_20220223_1542")) new_deposit = new_state.apps.get_model("deposit", "Deposit") assert hasattr(new_deposit, "type") is True assert Deposit().type == DEPOSIT_CODE all_deposits = Deposit.objects.all() assert len(all_deposits) == 3 for deposit in all_deposits: if deposit.id in 
def test_normalize_date_0():
    """When date is a list, the first element is the one that gets normalized"""
    candidates = ["2017-10-12", "date1"]
    expected = {
        "timestamp": {"microseconds": 0, "seconds": 1507766400},
        "offset": 0,
    }

    assert utils.normalize_date(candidates) == expected
@pytest.mark.parametrize(
    "swhid",
    [
        "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/",  # noqa
        "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
        "swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
        "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
        "swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba",  # noqa
        "swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
    ],
)
def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
    """A valid SWHID reference is parsed into the equal QualifiedSWHID"""
    xml_data = xml_with_swhid.format(
        swhid=swhid,
    )
    metadata = ElementTree.fromstring(xml_data)

    actual_swhid = utils.parse_swh_reference(metadata)
    assert actual_swhid is not None

    expected_swhid = QualifiedSWHID.from_string(swhid)
    assert actual_swhid == expected_swhid


@pytest.mark.parametrize(
    "invalid_swhid",
    [
        # incorrect length
        # NOTE: the trailing comma matters. Without it this literal is
        # implicitly concatenated with the next one and the two cases collapse
        # into a single (still invalid) string, so neither is tested on its own.
        "swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235",
        # visit qualifier should be a core SWHID with type snp
        "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a",  # noqa
        # anchor qualifier should be a core SWHID with type one of
        # snapshot, revision, release, directory
        "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/",  # noqa
        # anchor targeting a snapshot while visit is also provided
        "swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04",  # noqa
    ],
)
def test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid):
    """Unparsable swhid should raise"""
    xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
    metadata = ElementTree.fromstring(xml_invalid_swhid)

    with pytest.raises(ValidationError):
        utils.parse_swh_reference(metadata)
def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]:
    """Build the metadata context dict described by a SWHID object.

    Without any qualifier the context is ``{"origin": None}``. Otherwise it
    holds the ``origin`` and ``path`` qualifiers, plus, each only when set:

    - ``snapshot``: the value of the ``visit`` qualifier
    - an entry keyed by the lower-cased object type name of the ``anchor``
      qualifier, whose value is the anchor itself
    """
    if not swhid_reference.qualifiers():
        return {"origin": None}

    context: Dict[str, Any] = {
        "origin": swhid_reference.origin,
        "path": swhid_reference.path,
    }

    visit = swhid_reference.visit
    if visit:
        context["snapshot"] = visit

    anchor = swhid_reference.anchor
    if anchor:
        context[anchor.object_type.name.lower()] = anchor

    return context
def parse_swh_reference(
    metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
    """Parse ``<swh:reference>`` within the metadata document, if any.

    The reference is either an origin:

    .. code-block:: xml

       <swh:deposit>
         <swh:reference>
           <swh:origin url="https://example.org/repo"/>
         </swh:reference>
       </swh:deposit>

    or an object designated by a (possibly qualified) SWHID:

    .. code-block:: xml

       <swh:deposit>
         <swh:reference>
           <swh:object swhid="swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49"/>
         </swh:reference>
       </swh:deposit>

    An origin reference takes precedence over an object reference.

    Args:
        metadata: result of parsing an Atom document

    Raises:
        ValidationError in case the swhid referenced (if any) is invalid

    Returns:
        Either swhid or origin reference if any. None otherwise.

    """
    ref_origin = metadata.find(
        "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES
    )
    if ref_origin is not None:
        return ref_origin.attrib["url"]

    ref_object = metadata.find(
        "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES
    )
    if ref_object is None:
        return None
    swhid = ref_object.attrib["swhid"]
    if not swhid:
        # An empty swhid attribute is treated like a missing reference.
        return None
    swhid_reference = QualifiedSWHID.from_string(swhid)

    if swhid_reference.qualifiers():
        anchor = swhid_reference.anchor
        if anchor and anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
            error_msg = (
                "anchor qualifier should be a core SWHID with type one of "
                f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}"
            )
            raise ValidationError(error_msg)

        visit = swhid_reference.visit
        if visit and visit.object_type != ObjectType.SNAPSHOT:
            raise ValidationError(
                f"visit qualifier should be a core SWHID with type snp, "
                f"not {visit.object_type.value}"
            )

        if (
            visit
            and anchor
            and visit.object_type == ObjectType.SNAPSHOT
            and anchor.object_type == ObjectType.SNAPSHOT
        ):
            # Logger.warn is a deprecated alias (removed in Python 3.13);
            # use Logger.warning with lazy %-style arguments instead.
            logger.warning(
                "SWHID use of both anchor and visit targeting a snapshot: %s",
                swhid_reference,
            )
            raise ValidationError(
                "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
            )

    return swhid_reference


def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID:
    """Used to get the target of a metadata object from a ``<swh:reference>``,
    as the latter uses a QualifiedSWHID.

    Only the core part of the SWHID (everything before the first ``;``) is kept.
    """
    return ExtendedSWHID.from_string(str(swhid).split(";")[0])


def to_header_link(link: str, link_name: str) -> str:
    """Build a single header link.

    >>> link_next = to_header_link("next-url", "next")
    >>> link_next
    '<next-url>; rel="next"'
    >>> ','.join([link_next, to_header_link("prev-url", "prev")])
    '<next-url>; rel="next",<prev-url>; rel="prev"'

    """
    return f'<{link}>; rel="{link_name}"'