diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py --- a/swh/deposit/__init__.py +++ b/swh/deposit/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.deposit").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py --- a/swh/deposit/api/__init__.py +++ b/swh/deposit/api/__init__.py @@ -2,10 +2,3 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -import pkg_resources - -try: - __version__ = pkg_resources.get_distribution("swh.deposit").version -except pkg_resources.DistributionNotFound: - __version__ = "devel" diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -114,6 +114,8 @@ on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") + swhid = meta.get("HTTP_X_CHECK_SWHID") + return { "content-type": content_type, "content-length": content_length, @@ -124,6 +126,7 @@ "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, + "swhid": swhid, } def _compute_md5(self, filehandler) -> bytes: @@ -222,7 +225,7 @@ deposit_request_data: Dict[str, Any], replace_metadata: bool = False, replace_archives: bool = False, - ) -> None: + ) -> DepositRequest: """Save a deposit request with metadata attached to a deposit. Args: @@ -235,7 +238,7 @@ archives to existing deposit Returns: - None + the DepositRequest object stored in the backend """ if replace_metadata: @@ -265,6 +268,7 @@ deposit_request.save() assert deposit_request is not None + return deposit_request def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict: """Delete archive references from the deposit id. @@ -610,7 +614,7 @@ """Atom entry deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client @@ -777,6 +781,8 @@ f"Client {username} cannot access collection {collection_name}", ) + headers = self._read_headers(request) + if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) @@ -785,11 +791,11 @@ NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) - checks = self.restrict_access(request, deposit) + assert deposit is not None + checks = self.restrict_access(request, headers, deposit) if checks: return checks - headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") @@ -800,17 +806,19 @@ return {"headers": headers} def restrict_access( - self, request: Request, deposit: Optional[Deposit] = None + self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: - if deposit: - if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: - summary = "You can only act on deposit with status '%s'" % ( - DEPOSIT_STATUS_PARTIAL, - ) - description = f"This deposit has status '{deposit.status}'" - return make_error_dict( - BAD_REQUEST, summary=summary, verbose_description=description - ) + """Allow modifications on deposit with status 'partial' only, reject the rest. + + """ + if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: + summary = "You can only act on deposit with status '%s'" % ( + DEPOSIT_STATUS_PARTIAL, + ) + description = f"This deposit has status '{deposit.status}'" + return make_error_dict( + BAD_REQUEST, summary=summary, verbose_description=description + ) return {} def _basic_not_allowed_method(self, request: Request, method: str): diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -6,9 +6,30 @@ from typing import Any, Dict, Optional, Tuple from rest_framework import status - -from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import BAD_REQUEST, make_error_dict +from rest_framework.request import Request + +from swh.deposit.models import Deposit +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage import get_storage +from swh.storage.interface import StorageInterface + +from ..config import ( + CONT_FILE_IRI, + DEPOSIT_STATUS_LOAD_SUCCESS, + EDIT_SE_IRI, + EM_IRI, + METADATA_KEY, + RAW_METADATA_KEY, +) +from ..errors import BAD_REQUEST, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, @@ -103,32 +124,169 @@ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) + def __init__(self): + super().__init__() + self.storage_metadata: StorageInterface = get_storage( + **self.config["storage_metadata"] + ) + + def restrict_access( + self, request: Request, headers: Dict, deposit: Deposit + ) -> Dict[str, Any]: + """Relax restriction access to allow metadata update on deposit with status "done" when + a swhid is provided. + + """ + if ( + request.method == "PUT" + and headers["swhid"] is not None + and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + ): + # Allow metadata update on deposit with status "done" when swhid provided + return {} + # otherwise, let the standard access restriction check occur + return super().restrict_access(request, headers, deposit) + def process_put( - self, req, headers: Dict, collection_name: str, deposit_id: int + self, request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: - """Replace existing deposit's metadata/archive with new ones. + """This allows the following scenarios: + + - multipart: replace all the deposit (status partial) metadata and archive + with the provided ones. + - atom: replace all the deposit (status partial) metadata with the + provided ones. + - with swhid, atom: Add new metatada to deposit (status done) with provided ones + and push such metadata to the metadata storage directly. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart + Raises: + 400 if any of the following occur: + - the swhid provided and the deposit swhid do not match + - the provided metadata xml file is malformed + - the provided xml atom entry is empty + - the provided swhid does not exist in the archive + Returns: 204 No content """ # noqa - if req.content_type.startswith("multipart/"): - return self._multipart_upload( - req, + swhid = headers.get("swhid") + if swhid is None: + if request.content_type.startswith("multipart/"): + return self._multipart_upload( + request, + headers, + collection_name, + deposit_id=deposit_id, + replace_archives=True, + replace_metadata=True, + ) + # standard metadata update (replace all metadata already provided to the + # deposit by the new ones) + return self._atom_entry( + request, headers, collection_name, deposit_id=deposit_id, - replace_archives=True, replace_metadata=True, ) - return self._atom_entry( - req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True + + # Update metadata on a deposit already ingested + # Write to the metadata storage (and the deposit backend) + # no ingestion triggered + + deposit = Deposit.objects.get(pk=deposit_id) + assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + + if swhid != deposit.swh_id: + return make_error_dict( + BAD_REQUEST, + f"Mismatched provided SWHID {swhid} with deposit's {deposit.swh_id}.", + "The provided SWHID does not match the deposit to update. " + "Please ensure you send the correct deposit SWHID.", + ) + + try: + raw_metadata, metadata = self._read_metadata(request.data) + except ParserError: + return make_error_dict( + BAD_REQUEST, + "Malformed xml metadata", + "The xml received is malformed. " + "Please ensure your metadata file is correctly formatted.", + ) + + if not metadata: + return make_error_dict( + BAD_REQUEST, + "Empty body request is not supported", + "Atom entry deposit is supposed to send for metadata. " + "If the body is empty, there is no metadata.", + ) + + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=deposit.client.provider_url, + metadata={"name": deposit.client.last_name}, ) + metadata_fetcher = MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + metadata=self.tool["configuration"], + ) + + deposit_swhid = parse_swhid(swhid) + + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # check the swhid exists in the archive + directories_missing = list( + self.storage_metadata.directory_missing([directory_id]) + ) + if len(directories_missing) > 0: + return make_error_dict( + BAD_REQUEST, + f"Unknown directory SWHID {swhid} reference", + "The SWHID provided is not a known directory SWHID in SWH archive. " + "Please provide an existing SWHID.", + ) + + # replace metadata within the deposit backend + deposit_request_data = { + METADATA_KEY: metadata, + RAW_METADATA_KEY: raw_metadata, + } + + # actually add the metadata to the completed deposit + deposit_request = self._deposit_request_put(deposit, deposit_request_data) + # store that metadata to the metadata storage + metadata_object = RawExtrinsicMetadata( + type=MetadataTargetType.DIRECTORY, + id=deposit_swhid, + discovery_date=deposit_request.date, + authority=metadata_authority, + fetcher=metadata_fetcher, + format="sword-v2-atom-codemeta", + metadata=raw_metadata, + ) + + # write to metadata storage + self.storage_metadata.metadata_authority_add([metadata_authority]) + self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) + self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) + + return { + "deposit_id": deposit_id, + "deposit_date": deposit_request.date, + "status": deposit.status, + "archive": None, + } + def process_post( self, request, @@ -138,12 +296,18 @@ ) -> Tuple[int, str, Dict]: """Add new metadata/archive to existing deposit. + This allows the following scenarios to occur: + + - multipart: Add new metadata and archive to a deposit in status partial with + the provided ones. + + - empty atom: Allows to finalize a deposit in status partial (transition to + deposited). + source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart - - This also deals with an empty post corner case to finalize a - deposit. + - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#continueddeposit_complete Returns: In optimal case for a multipart and atom-entry update, a @@ -164,8 +328,6 @@ content_length = headers["content-length"] or 0 if content_length == 0 and headers["in-progress"] is False: # check for final empty post - # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html - # #continueddeposit_complete data = self._empty_post(request, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -12,9 +12,9 @@ from rest_framework import status from swh.core import tarball -from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date from swh.model import identifiers +from swh.model.model import MetadataAuthorityType from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, SWH_PERSON @@ -104,15 +104,6 @@ """ - def __init__(self): - super().__init__() - self.provider = self.config["provider"] - self.tool = { - "name": "swh-deposit", - "version": __version__, - "configuration": {"sword_version": "2"}, - } - def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. @@ -155,10 +146,6 @@ # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} - # metadata provider - self.provider["provider_name"] = deposit.client.last_name - self.provider["provider_url"] = deposit.client.provider_url - author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: @@ -170,7 +157,13 @@ parents = [] data["origin_metadata"] = { - "provider": self.provider, + # metadata provider + "provider": { + "provider_name": deposit.client.last_name, + "provider_url": deposit.client.provider_url, + "provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value, + "metadata": {}, + }, "tool": self.tool, "metadata": metadata, } diff --git a/swh/deposit/config.py b/swh/deposit/config.py --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -7,6 +7,7 @@ from typing import Any, Dict from swh.core import config +from swh.deposit import __version__ from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface @@ -97,3 +98,8 @@ conf = config.read_raw_config(config.config_basepath(config_file)) self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) + self.tool = { + "name": "swh-deposit", + "version": __version__, + "configuration": {"sword_version": "2"}, + } diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -54,10 +54,8 @@ reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-empty-body"], - HTTP_SLUG="something", ) assert response.status_code == status.HTTP_400_BAD_REQUEST - assert b"Empty body request is not supported" in response.content def test_post_deposit_atom_400_badly_formatted_atom( diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -6,7 +6,7 @@ from django.urls import reverse from rest_framework import status -from swh.deposit.api import __version__ +from swh.deposit import __version__ from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,6 +18,8 @@ from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, create_arborescence_archive +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid, swhid def test_replace_archive_to_deposit_is_possible( @@ -561,3 +563,199 @@ requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) != set(requests_archive1) + + +def test_put_update_metadata_done_deposit_nominal( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + sample_data, + swh_storage, +): + """Nominal scenario, client send an update of metadata on a deposit with status "done" + with an existing swhid. Such swhid has its metadata updated accordingly both in + the deposit backend and in the metadata storage. + + Response: 204 + + """ + deposit_swhid = parse_swhid(complete_deposit.swh_id) + assert deposit_swhid.object_type == "directory" + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id])) == [directory_id] + + # so let's create a directory reference in the storage (current deposit targets an + # unknown swhid) + existing_directory = sample_data.directory + swh_storage.directory_add([existing_directory]) + assert list(swh_storage.directory_missing([existing_directory.id])) == [] + + # and patch one complete deposit swh_id so it targets said reference + complete_deposit.swh_id = swhid("directory", existing_directory.id) + complete_deposit.save() + + actual_existing_requests_archive = DepositRequest.objects.filter( + deposit=complete_deposit, type="archive" + ) + nb_archives = len(actual_existing_requests_archive) + actual_existing_requests_metadata = DepositRequest.objects.filter( + deposit=complete_deposit, type="metadata" + ) + nb_metadata = len(actual_existing_requests_metadata) + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_204_NO_CONTENT + + new_requests_meta = DepositRequest.objects.filter( + deposit=complete_deposit, type="metadata" + ) + assert len(new_requests_meta) == nb_metadata + 1 + request_meta1 = new_requests_meta[0] + raw_metadata1 = request_meta1.raw_metadata + assert raw_metadata1 == atom_dataset["entry-data1"] + + # check we did not touch the other parts + requests_archive1 = DepositRequest.objects.filter( + deposit=complete_deposit, type="archive" + ) + assert len(requests_archive1) == nb_archives + assert set(actual_existing_requests_archive) == set(requests_archive1) + + # FIXME: Check the metadata storage information created is consistent + pass + + +def test_put_update_metadata_done_deposit_failure_swhid_unknown( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """Failure: client updates metadata with a SWHID matching the deposit's. Said SWHID does + not exist in the archive somehow. + + This should not happen though, it is still technically possible so it's + covered... + + Response: 400 + + """ + # directory targeted by the complete_deposit does not exist in the storage + missing_directory_id = hash_to_bytes(parse_swhid(complete_deposit.swh_id).object_id) + assert list(swh_storage.directory_missing([missing_directory_id])) == [ + missing_directory_id + ] + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Unknown directory SWHID" in response.content + + +def test_put_update_metadata_done_deposit_failure_mismatched_swhid( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit with SWHID not matching the deposit's. + + Response: 400 + + """ + incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" + assert complete_deposit.swh_id != incorrect_swhid + + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_X_CHECK_SWHID=incorrect_swhid, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Mismatched provided SWHID" in response.content + + +def test_put_update_metadata_done_deposit_failure_malformed_xml( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit done with a malformed xml + + Response: 400 + + """ + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data-ko"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Malformed xml metadata" in response.content + + +def test_put_update_metadata_done_deposit_failure_empty_xml( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """failure: client updates metadata on deposit done with an empty xml. + + Response: 400 + + """ + update_uri = reverse( + EDIT_SE_IRI, args=[deposit_collection.name, complete_deposit.id] + ) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data-empty-body"], + HTTP_X_CHECK_SWHID=complete_deposit.swh_id, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert b"Empty body request is not supported" in response.content diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -16,6 +16,7 @@ from rest_framework.test import APIClient import yaml +from swh.core.config import read from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, @@ -51,18 +52,13 @@ @pytest.fixture() -def deposit_config(swh_scheduler_config): +def deposit_config(swh_scheduler_config, swh_storage_backend_config): return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, - "provider": { - "provider_name": "", - "provider_type": "deposit_client", - "provider_url": "", - "metadata": {}, - }, "scheduler": {"cls": "local", "args": swh_scheduler_config,}, + "storage_metadata": swh_storage_backend_config, } @@ -78,14 +74,18 @@ @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path, swh_scheduler_config): """Enforce config for deposit classes inherited from APIConfig.""" - - scheduler = get_scheduler("local", swh_scheduler_config) - task_type = { - "type": "load-deposit", - "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", - "description": "Load deposit task", - } - scheduler.create_task_type(task_type) + cfg = read(deposit_config_path) + + if "scheduler" in cfg: + # scheduler setup: require the load-deposit tasks (already existing in + # production) + scheduler = get_scheduler(**cfg["scheduler"]) + task_type = { + "type": "load-deposit", + "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", + "description": "Load deposit task", + } + scheduler.create_task_type(task_type) @pytest.fixture(scope="session") diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py --- a/swh/deposit/tests/test_init.py +++ b/swh/deposit/tests/test_init.py @@ -5,6 +5,6 @@ def test_version(): - from swh.deposit.api import __version__ + from swh.deposit import __version__ assert __version__ is not None