diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py --- a/swh/deposit/__init__.py +++ b/swh/deposit/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.deposit").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py --- a/swh/deposit/api/__init__.py +++ b/swh/deposit/api/__init__.py @@ -2,10 +2,3 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -import pkg_resources - -try: - __version__ = pkg_resources.get_distribution("swh.deposit").version -except pkg_resources.DistributionNotFound: - __version__ = "devel" diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -114,6 +114,8 @@ on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") + swhid = meta.get("X_Check_SWHID") + return { "content-type": content_type, "content-length": content_length, @@ -124,6 +126,7 @@ "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, + "swhid": swhid, } def _compute_md5(self, filehandler) -> bytes: @@ -610,7 +613,7 @@ """Atom entry deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client @@ -777,6 +780,8 @@ f"Client {username} cannot access collection {collection_name}", ) + headers = self._read_headers(request) + if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) @@ -785,11 +790,11 @@ NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) - checks = self.restrict_access(request, deposit) + assert deposit is not None + checks = self.restrict_access(request, headers, deposit) if checks: return checks - headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") @@ -800,17 +805,20 @@ return {"headers": headers} def restrict_access( - self, request: Request, deposit: Optional[Deposit] = None + self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: - if deposit: - if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: - summary = "You can only act on deposit with status '%s'" % ( - DEPOSIT_STATUS_PARTIAL, - ) - description = f"This deposit has status '{deposit.status}'" - return make_error_dict( - BAD_REQUEST, summary=summary, verbose_description=description - ) + """Add further checks on a deposit to prevent an update on a deposit not in partial + state. + + """ + if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: + summary = "You can only act on deposit with status '%s'" % ( + DEPOSIT_STATUS_PARTIAL, + ) + description = f"This deposit has status '{deposit.status}'" + return make_error_dict( + BAD_REQUEST, summary=summary, verbose_description=description + ) return {} def _basic_not_allowed_method(self, request: Request, method: str): diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -3,12 +3,36 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timezone +import json from typing import Any, Dict, Optional, Tuple from rest_framework import status - -from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import BAD_REQUEST, make_error_dict +from rest_framework.request import Request + +from swh.deposit.models import Deposit +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage import get_storage +from swh.storage.interface import StorageInterface + +from ..config import ( + CONT_FILE_IRI, + DEPOSIT_STATUS_LOAD_SUCCESS, + EDIT_SE_IRI, + EM_IRI, + METADATA_KEY, + RAW_METADATA_KEY, + SWH_METADATA_AUTHORITY, +) +from ..errors import BAD_REQUEST, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, @@ -103,8 +127,29 @@ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) + def __init__(self): + super().__init__() + self.storage: StorageInterface = get_storage(**self.config["storage"]) + + def restrict_access( + self, request: Request, headers: Dict, deposit: Deposit + ) -> Dict[str, Any]: + """For the metadata-only update of a deposit, we need to allow some checks on "done" + deposits. This overrides allow for such case to occur. + + """ + if ( + request.method == "PUT" + and headers["swhid"] is not None + and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + ): + # Allow update metadata on deposit already ingested + return {} + # otherwise, let the standard access restriction check occur + return super().restrict_access(request, headers, deposit) + def process_put( - self, req, headers: Dict, collection_name: str, deposit_id: int + self, request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Replace existing deposit's metadata/archive with new ones. @@ -116,19 +161,113 @@ 204 No content """ - if req.content_type.startswith("multipart/"): + if request.content_type.startswith("multipart/"): return self._multipart_upload( - req, + request, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) - return self._atom_entry( - req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True + swhid = headers.get("swhid") + if swhid is None: + return self._atom_entry( + request, + headers, + collection_name, + deposit_id=deposit_id, + replace_metadata=True, + ) + + # Update metadata on a deposit already ingested + # Only write to the metadata storage + + try: + raw_metadata, metadata = self._read_metadata(request.data) + except ParserError: + return make_error_dict( + BAD_REQUEST, + "Malformed xml metadata", + "The xml received is malformed. " + "Please ensure your metadata file is correctly formatted.", + ) + if not metadata: + return make_error_dict( + BAD_REQUEST, + "Empty body request is not supported", + "Atom entry deposit is supposed to send for metadata. " + "If the body is empty, there is no metadata.", + ) + + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=self.provider["provider_url"], + metadata={ + "name": self.provider["provider_name"], + **(self.provider["metadata"] or {}), + }, + ) + + metadata_fetcher = MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + metadata=self.tool["configuration"], + ) + + deposit_swhid = parse_swhid(swhid) + + assert deposit_swhid.object_type == "directory" + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # check the swhid exists in the archive + directories_missing = list(self.storage.directory_missing([directory_id])) + if len(directories_missing) > 0: + return make_error_dict( + BAD_REQUEST, + f"Unknown directory SWHID {swhid} reference", + "The SWHID provided is not a known directory SWHID in SWH archive. " + "Please provide an existing SWHID.", + ) + + discovery_date = datetime.now(tz=timezone.utc) + + # QUESTION: Do we also put the raw xml? + # A priori, no since we don't do this originally in the loader. + metadata_object = RawExtrinsicMetadata( + type=MetadataTargetType.DIRECTORY, + id=deposit_swhid, + discovery_date=discovery_date, + authority=SWH_METADATA_AUTHORITY, + fetcher=metadata_fetcher, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(metadata).encode(), + ) + + # write to metadata storage + self.storage.metadata_authority_add([metadata_authority]) + self.storage.metadata_fetcher_add([metadata_fetcher]) + self.storage.raw_extrinsic_metadata_add([metadata_object]) + + # replace metadata within the deposit backend + deposit = Deposit.objects.get(pk=deposit_id) + + deposit_request_data = { + METADATA_KEY: metadata, + RAW_METADATA_KEY: raw_metadata, + } + + self._deposit_request_put( + deposit, deposit_request_data, replace_metadata=True, ) + return { + "deposit_id": deposit_id, + "deposit_date": discovery_date, + "status": deposit.status, + "archive": None, + } + def process_post( self, request, diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -12,7 +12,6 @@ from rest_framework import status from swh.core import tarball -from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date from swh.model import identifiers @@ -104,15 +103,6 @@ """ - def __init__(self): - super().__init__() - self.provider = self.config["provider"] - self.tool = { - "name": "swh-deposit", - "version": __version__, - "configuration": {"sword_version": "2"}, - } - def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. diff --git a/swh/deposit/config.py b/swh/deposit/config.py --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -7,6 +7,8 @@ from typing import Any, Dict from swh.core import config +from swh.deposit import __version__ +from swh.model.model import MetadataAuthority, MetadataAuthorityType from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface @@ -53,6 +55,12 @@ "checks": True, } +SWH_METADATA_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + def setup_django_for(platform=None, config_file=None): """Setup function for command line tools (swh.deposit.create_user) to @@ -97,3 +105,9 @@ conf = config.read_raw_config(config.config_basepath(config_file)) self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) + self.provider = self.config["provider"] + self.tool = { + "name": "swh-deposit", + "version": __version__, + "configuration": {"sword_version": "2"}, + } diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -6,7 +6,7 @@ from django.urls import reverse from rest_framework import status -from swh.deposit.api import __version__ +from swh.deposit import __version__ from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,6 +10,8 @@ from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, create_arborescence_archive +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid, swhid def test_replace_archive_to_deposit_is_possible( @@ -124,6 +126,103 @@ assert set(requests_archive0) == set(requests_archive1) +def test_put_update_metadata_on_done_deposit( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + sample_data, + swh_storage, +): + """Update all metadata on a deposit whose status is done is possible, 204 response + + """ + deposit = complete_deposit + + directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id) + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id])) == [directory_id] + + # so let's create a directory reference in the storage (current deposit targets an + # unknown swhid) + directory = sample_data.directory + swh_storage.directory_add([directory]) + assert list(swh_storage.directory_missing([directory.id])) == [] + # and patch one complete deposit swh_id so it targets said reference + deposit.swh_id = swhid("directory", directory.id) + deposit.save() + + raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") + + requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive0) == 1 + + update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + X_Check_SWHID=deposit.swh_id, + ) + + assert response.status_code == status.HTTP_204_NO_CONTENT + + requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") + + assert len(requests_meta) == 1 + request_meta1 = requests_meta[0] + raw_metadata1 = request_meta1.raw_metadata + assert raw_metadata1 == atom_dataset["entry-data1"] + assert raw_metadata0 != raw_metadata1 + + # check we did not touch the other parts + requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive1) == 1 + assert set(requests_archive0) == set(requests_archive1) + + # FIXME: Check the metadata storage information created is consistent + pass + + +def test_put_update_metadata_on_done_deposit_failure( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """Update all metadata on a deposit whose status is done, wrong swhid is impossible + + """ + deposit = complete_deposit + + directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id) + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id]))[0] is not None + + requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive0) == 1 + + update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + X_Check_SWHID=deposit.swh_id, + ) + + # so the deposit update is rejected + assert response.status_code == status.HTTP_400_BAD_REQUEST + + response_str = response.content.decode() + + assert f"Unknown directory SWHID {deposit.swh_id} reference" in response_str + + def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -16,6 +16,7 @@ from rest_framework.test import APIClient import yaml +from swh.core.config import read from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, @@ -25,12 +26,14 @@ DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, EDIT_SE_IRI, + SWH_METADATA_AUTHORITY, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from swh.scheduler import get_scheduler +from swh.storage import get_storage # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. @@ -51,7 +54,7 @@ @pytest.fixture() -def deposit_config(swh_scheduler_config): +def deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", @@ -63,6 +66,7 @@ "metadata": {}, }, "scheduler": {"cls": "local", "args": swh_scheduler_config,}, + "storage": swh_storage_backend_config, } @@ -78,14 +82,24 @@ @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path, swh_scheduler_config): """Enforce config for deposit classes inherited from APIConfig.""" + cfg = read(deposit_config_path) + + if "scheduler" in cfg: + # scheduler setup: require the load-deposit tasks (already existing in + # production) + scheduler = get_scheduler(**cfg["scheduler"]) + task_type = { + "type": "load-deposit", + "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", + "description": "Load deposit task", + } + scheduler.create_task_type(task_type) - scheduler = get_scheduler("local", swh_scheduler_config) - task_type = { - "type": "load-deposit", - "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", - "description": "Load deposit task", - } - scheduler.create_task_type(task_type) + if "storage" in cfg: + # `deposit_update` tests requires the SWH_METADATA_AUTHORITY populated (matching + # production) + storage = get_storage(**cfg["storage"]) + storage.metadata_authority_add([SWH_METADATA_AUTHORITY]) @pytest.fixture(scope="session") diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py --- a/swh/deposit/tests/test_init.py +++ b/swh/deposit/tests/test_init.py @@ -5,6 +5,6 @@ def test_version(): - from swh.deposit.api import __version__ + from swh.deposit import __version__ assert __version__ is not None