diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py --- a/swh/deposit/__init__.py +++ b/swh/deposit/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.deposit").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py --- a/swh/deposit/api/__init__.py +++ b/swh/deposit/api/__init__.py @@ -2,10 +2,3 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -import pkg_resources - -try: - __version__ = pkg_resources.get_distribution("swh.deposit").version -except pkg_resources.DistributionNotFound: - __version__ = "devel" diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -114,6 +114,8 @@ on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") + swhid = meta.get("HTTP_CHECK_SWHID") + return { "content-type": content_type, "content-length": content_length, @@ -124,6 +126,7 @@ "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, + "swhid": swhid, } def _compute_md5(self, filehandler) -> bytes: @@ -610,7 +613,7 @@ """Atom entry deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db headers: request headers formatted collection_name: the associated client @@ -785,6 +788,7 @@ NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) + assert deposit is not None checks = self.restrict_access(request, deposit) if checks: return checks @@ -799,18 +803,19 @@ return {"headers": headers} - def restrict_access( - self, request: Request, deposit: Optional[Deposit] = None - ) -> Dict[str, Any]: - if deposit: - if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: - summary = "You can only act on deposit with status '%s'" % ( - DEPOSIT_STATUS_PARTIAL, - ) - description = f"This deposit has status '{deposit.status}'" - return make_error_dict( - BAD_REQUEST, summary=summary, verbose_description=description - ) + def restrict_access(self, request: Request, deposit: Deposit) -> Dict[str, Any]: + """Add further checks on a deposit to prevent an update on a deposit not in partial + state. + + """ + if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: + summary = "You can only act on deposit with status '%s'" % ( + DEPOSIT_STATUS_PARTIAL, + ) + description = f"This deposit has status '{deposit.status}'" + return make_error_dict( + BAD_REQUEST, summary=summary, verbose_description=description + ) return {} def _basic_not_allowed_method(self, request: Request, method: str): diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -3,12 +3,34 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timezone +import json from typing import Any, Dict, Optional, Tuple from rest_framework import status +from rest_framework.request import Request + +from swh.deposit.models import Deposit +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) -from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import BAD_REQUEST, make_error_dict +from ..config import ( + CONT_FILE_IRI, + DEPOSIT_STATUS_LOAD_SUCCESS, + EDIT_SE_IRI, + EM_IRI, + METADATA_KEY, + RAW_METADATA_KEY, + SWH_METADATA_AUTHORITY, +) +from ..errors import BAD_REQUEST, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, SWHFileUploadTarParser, @@ -103,8 +125,24 @@ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) + def restrict_access(self, request: Request, deposit: Deposit) -> Dict[str, Any]: + """For the metadata-only update of a deposit, we need to allow some checks on "done" + deposits. This overrides allow for such case to occur. + + """ + headers = self._read_headers(request) + if ( + request.method == "PUT" + and headers["swhid"] is not None + and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS + ): + # Update only metadata is authorized for deposit ingested + return {} + # otherwise, let the standard access restriction occur + return super().restrict_access(request, deposit) + def process_put( - self, req, headers: Dict, collection_name: str, deposit_id: int + self, request, headers: Dict, collection_name: str, deposit_id: int ) -> Dict[str, Any]: """Replace existing deposit's metadata/archive with new ones. @@ -116,19 +154,113 @@ 204 No content """ - if req.content_type.startswith("multipart/"): + if request.content_type.startswith("multipart/"): return self._multipart_upload( - req, + request, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) - return self._atom_entry( - req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True + swhid = headers.get("swhid") + if swhid is None: + return self._atom_entry( + request, + headers, + collection_name, + deposit_id=deposit_id, + replace_metadata=True, + ) + + # Update metadata on a deposit already ingested + # Only write to the metadata storage + + try: + raw_metadata, metadata = self._read_metadata(request.data) + except ParserError: + return make_error_dict( + BAD_REQUEST, + "Malformed xml metadata", + "The xml received is malformed. " + "Please ensure your metadata file is correctly formatted.", + ) + if not metadata: + return make_error_dict( + BAD_REQUEST, + "Empty body request is not supported", + "Atom entry deposit is supposed to send for metadata. " + "If the body is empty, there is no metadata.", + ) + + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url=self.provider["provider_url"], + metadata={ + "name": self.provider["provider_name"], + **(self.provider["metadata"] or {}), + }, ) + metadata_fetcher = MetadataFetcher( + name=self.tool["name"], + version=self.tool["version"], + metadata=self.tool["configuration"], + ) + + deposit_swhid = parse_swhid(swhid) + + assert deposit_swhid.object_type == "directory" + directory_id = hash_to_bytes(deposit_swhid.object_id) + + # check the swhid exists in the archive + directories_missing = list(self.storage.directory_missing([directory_id])) + if len(directories_missing) > 0: + return make_error_dict( + BAD_REQUEST, + f"Unknown directory SWHID {swhid} reference", + "The SWHID provided is not a know directory SWHID in SWH archive. " + "Please provide an existing SWHID.", + ) + + discovery_date = datetime.now(tz=timezone.utc) + + # QUESTION: Do we also put the raw xml? + # A priori, no since we don't do this originally in the loader. + metadata_object = RawExtrinsicMetadata( + type=MetadataTargetType.DIRECTORY, + id=deposit_swhid, + discovery_date=discovery_date, + authority=SWH_METADATA_AUTHORITY, + fetcher=metadata_fetcher, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(metadata).encode(), + ) + + # write to metadata storage + self.storage.metadata_authority_add([metadata_authority]) + self.storage.metadata_fetcher_add([metadata_fetcher]) + self.storage.raw_extrinsic_metadata_add([metadata_object]) + + # replace metadata within the deposit backend + deposit = Deposit.objects.get(pk=deposit_id) + + deposit_request_data = { + METADATA_KEY: metadata, + RAW_METADATA_KEY: raw_metadata, + } + + self._deposit_request_put( + deposit, deposit_request_data, replace_metadata=True, + ) + + return { + "deposit_id": deposit_id, + "deposit_date": discovery_date, + "status": deposit.status, + "archive": None, + } + def process_post( self, request, diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -12,7 +12,6 @@ from rest_framework import status from swh.core import tarball -from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date from swh.model import identifiers @@ -106,12 +105,6 @@ def __init__(self): super().__init__() - self.provider = self.config["provider"] - self.tool = { - "name": "swh-deposit", - "version": __version__, - "configuration": {"sword_version": "2"}, - } def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date diff --git a/swh/deposit/config.py b/swh/deposit/config.py --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -7,8 +7,12 @@ from typing import Any, Dict from swh.core import config +from swh.deposit import __version__ +from swh.model.model import MetadataAuthority, MetadataAuthorityType from swh.scheduler import get_scheduler from swh.scheduler.interface import SchedulerInterface +from swh.storage import get_storage +from swh.storage.interface import StorageInterface # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = "edit_se_iri" @@ -53,6 +57,12 @@ "checks": True, } +SWH_METADATA_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + def setup_django_for(platform=None, config_file=None): """Setup function for command line tools (swh.deposit.create_user) to @@ -97,3 +107,10 @@ conf = config.read_raw_config(config.config_basepath(config_file)) self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf) self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"]) + self.storage: StorageInterface = get_storage(**self.config["storage"]) + self.provider = self.config["provider"] + self.tool = { + "name": "swh-deposit", + "version": __version__, + "configuration": {"sword_version": "2"}, + } diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py --- a/swh/deposit/settings/production.py +++ b/swh/deposit/settings/production.py @@ -58,7 +58,7 @@ "disable_existing_loggers": False, "formatters": { "standard": { - "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa + "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", "datefmt": "%d/%b/%Y %H:%M:%S", }, }, diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -6,7 +6,7 @@ from django.urls import reverse from rest_framework import status -from swh.deposit.api import __version__ +from swh.deposit import __version__ from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,6 +10,8 @@ from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, create_arborescence_archive +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid, swhid def test_replace_archive_to_deposit_is_possible( @@ -124,6 +126,103 @@ assert set(requests_archive0) == set(requests_archive1) +def test_put_update_metadata_on_done_deposit( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + sample_data, + swh_storage, +): + """Update all metadata on a deposit whose status is done is possible, 204 response + + """ + deposit = complete_deposit + + directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id) + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id])) == [directory_id] + + # so let's create a directory reference in the storage (current deposit targets an + # unknown swhid) + directory = sample_data.directory + swh_storage.directory_add([directory]) + assert list(swh_storage.directory_missing([directory.id])) == [] + # and patch one complete deposit swh_id so it targets said reference + deposit.swh_id = swhid("directory", directory.id) + deposit.save() + + raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") + + requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive0) == 1 + + update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_CHECK_SWHID=deposit.swh_id, + ) + + assert response.status_code == status.HTTP_204_NO_CONTENT + + requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") + + assert len(requests_meta) == 1 + request_meta1 = requests_meta[0] + raw_metadata1 = request_meta1.raw_metadata + assert raw_metadata1 == atom_dataset["entry-data1"] + assert raw_metadata0 != raw_metadata1 + + # check we did not touch the other parts + requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive1) == 1 + assert set(requests_archive0) == set(requests_archive1) + + # FIXME: Check the metadata storage information created is consistent + pass + + +def test_put_update_metadata_on_done_deposit_failure( + tmp_path, + authenticated_client, + complete_deposit, + deposit_collection, + atom_dataset, + swh_storage, +): + """Update all metadata on a deposit whose status is done, wrong swhid is impossible + + """ + deposit = complete_deposit + + directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id) + # directory targeted by the complete_deposit does not exist in the storage + assert list(swh_storage.directory_missing([directory_id]))[0] is not None + + requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") + assert len(requests_archive0) == 1 + + update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) + + response = authenticated_client.put( + update_uri, + content_type="application/atom+xml;type=entry", + data=atom_dataset["entry-data1"], + HTTP_CHECK_SWHID=deposit.swh_id, + ) + + # so the deposit update is rejected + assert response.status_code == status.HTTP_400_BAD_REQUEST + + response_str = response.content.decode() + + assert f"Unknown directory SWHID {deposit.swh_id} reference" in response_str + + def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -16,6 +16,7 @@ from rest_framework.test import APIClient import yaml +from swh.core.config import read from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, @@ -25,12 +26,14 @@ DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, EDIT_SE_IRI, + SWH_METADATA_AUTHORITY, setup_django_for, ) from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid from swh.scheduler import get_scheduler +from swh.storage import get_storage # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. @@ -51,7 +54,11 @@ @pytest.fixture() -def deposit_config(swh_scheduler_config): +def deposit_config(swh_scheduler_config, swh_storage_backend_config): + # storage_cfg = deepcopy(swh_storage_backend_config) + # # actual side-track # otherwise: OSError: storage check config failed + # storage_cfg["check_config"] = {"check_write": False} + return { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", @@ -63,6 +70,7 @@ "metadata": {}, }, "scheduler": {"cls": "local", "args": swh_scheduler_config,}, + "storage": swh_storage_backend_config, } @@ -78,8 +86,10 @@ @pytest.fixture(autouse=True) def deposit_autoconfig(deposit_config_path, swh_scheduler_config): """Enforce config for deposit classes inherited from APIConfig.""" + cfg = read(deposit_config_path) - scheduler = get_scheduler("local", swh_scheduler_config) + # scheduler setup: require the load-deposit tasks (already existing in production) + scheduler = get_scheduler(**cfg["scheduler"]) task_type = { "type": "load-deposit", "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", @@ -87,6 +97,10 @@ } scheduler.create_task_type(task_type) + # storage setup: require the metadata authority swh (already existing in production) + storage = get_storage(**cfg["storage"]) + storage.metadata_authority_add([SWH_METADATA_AUTHORITY]) + @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py --- a/swh/deposit/tests/test_init.py +++ b/swh/deposit/tests/test_init.py @@ -5,6 +5,6 @@ def test_version(): - from swh.deposit.api import __version__ + from swh.deposit import __version__ assert __version__ is not None