def _store_metadata_deposit(
    self,
    deposit: Deposit,
    swhid_reference: Union[str, SWHID],
    metadata: Dict,
    raw_metadata: bytes,
    with_deposit_origin: bool = False,
) -> Dict:
    """Record a metadata-only deposit and push it to the metadata storage.

    The parsed metadata must be non-empty and pass ``check_metadata``;
    otherwise an error dict built by ``make_error_dict`` is returned and
    nothing is written.

    Args:
        deposit: Deposit the metadata is attached to. Its client provides the
            authority url and name.
        swhid_reference: Target of the metadata. A ``SWHID`` instance is
            stored without its qualifiers (its ``origin`` and ``path``
            qualifiers are passed along as context); any other value is
            stored as-is with the origin target type.
        metadata: Parsed metadata, stored in the deposit request.
        raw_metadata: Raw metadata bytes, stored in the deposit request and
            in the metadata storage.
        with_deposit_origin: When true, the origin context is overridden
            with ``deposit.origin_url`` (metadata update on a completed
            deposit).

    Returns:
        An error dict on validation failure, otherwise a dict with the keys
        ``deposit_id``, ``deposit_date``, ``status`` and ``archive``.

    """
    if not metadata:
        return make_error_dict(
            BAD_REQUEST,
            "Empty body request is not supported",
            "Atom entry deposit is supposed to send for metadata. "
            "If the body is empty, there is no metadata.",
        )

    checks_passed, failure_details = check_metadata(metadata)
    if not checks_passed:
        assert failure_details, "Details should be set when a failure occurs"
        return make_error_dict(
            BAD_REQUEST,
            "Functional metadata checks failure",
            convert_status_detail(failure_details),
        )

    client = deposit.client
    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=client.provider_url,
        metadata={"name": client.last_name},
    )

    tool = self.tool
    fetcher = MetadataFetcher(
        name=tool["name"], version=tool["version"], metadata=tool["configuration"],
    )

    # Attach the metadata (parsed and raw) to the deposit in the deposit backend
    deposit_request = self._deposit_request_put(
        deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
    )

    target_type_by_name = {
        "content": MetadataTargetType.CONTENT,
        "directory": MetadataTargetType.DIRECTORY,
        "revision": MetadataTargetType.REVISION,
        "release": MetadataTargetType.RELEASE,
        "snapshot": MetadataTargetType.SNAPSHOT,
        "origin": MetadataTargetType.ORIGIN,
    }

    context = {"origin": None}
    target = swhid_reference
    if not isinstance(swhid_reference, SWHID):
        target_type = MetadataTargetType.ORIGIN
    else:
        target_type = target_type_by_name[swhid_reference.object_type]
        qualifiers = swhid_reference.metadata
        if qualifiers:
            raw_path = qualifiers.get("path")
            # Only the origin and path qualifiers are forwarded as context
            context = {
                "origin": qualifiers.get("origin"),
                "path": raw_path.encode() if raw_path else None,
            }
            # The stored target is the core swhid, stripped of its qualifiers
            target = attr.evolve(swhid_reference, metadata={})

    if with_deposit_origin:  # metadata deposit update on completed deposit
        context["origin"] = deposit.origin_url

    raw_extrinsic_metadata = RawExtrinsicMetadata(
        type=target_type,
        target=target,
        discovery_date=deposit_request.date,
        authority=authority,
        fetcher=fetcher,
        format="sword-v2-atom-codemeta",
        metadata=raw_metadata,
        **context
    )

    # Authority and fetcher are registered before the metadata object itself
    storage = self.storage_metadata
    storage.metadata_authority_add([authority])
    storage.metadata_fetcher_add([fetcher])
    storage.raw_extrinsic_metadata_add([raw_extrinsic_metadata])

    return {
        "deposit_id": deposit.id,
        "deposit_date": deposit_request.date,
        "status": deposit.status,
        "archive": None,
    }
+ external_id = metadata.get("external_identifier", headers["slug"]) deposit = self._deposit_put( request, @@ -675,6 +795,9 @@ external_id=external_id, ) + if swhid is not None: + return self._store_metadata_deposit(deposit, swhid, metadata, raw_metadata) + self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -8,28 +8,10 @@ from rest_framework import status from rest_framework.request import Request -from swh.deposit.api.checks import check_metadata -from swh.deposit.api.converters import convert_status_detail from swh.deposit.models import Deposit from swh.model.identifiers import parse_swhid -from swh.model.model import ( - MetadataAuthority, - MetadataAuthorityType, - MetadataFetcher, - MetadataTargetType, - RawExtrinsicMetadata, -) -from swh.storage import get_storage -from swh.storage.interface import StorageInterface - -from ..config import ( - CONT_FILE_IRI, - DEPOSIT_STATUS_LOAD_SUCCESS, - EDIT_SE_IRI, - EM_IRI, - METADATA_KEY, - RAW_METADATA_KEY, -) + +from ..config import CONT_FILE_IRI, DEPOSIT_STATUS_LOAD_SUCCESS, EDIT_SE_IRI, EM_IRI from ..errors import BAD_REQUEST, ParserError, make_error_dict from ..parsers import ( SWHAtomEntryParser, @@ -125,12 +107,6 @@ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) - def __init__(self): - super().__init__() - self.storage_metadata: StorageInterface = get_storage( - **self.config["storage_metadata"] - ) - def restrict_access( self, request: Request, headers: Dict, deposit: Deposit ) -> Dict[str, Any]: @@ -221,69 +197,14 @@ "Please ensure your metadata file is correctly formatted.", ) - if not metadata: - return make_error_dict( - BAD_REQUEST, - "Empty body request is not supported", - "Atom entry deposit is supposed to send for metadata. 
" - "If the body is empty, there is no metadata.", - ) - - metadata_ok, error_details = check_metadata(metadata) - if not metadata_ok: - assert error_details, "Details should be set when a failure occurs" - return make_error_dict( - BAD_REQUEST, - "Functional metadata checks failure", - convert_status_detail(error_details), - ) - - metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.DEPOSIT_CLIENT, - url=deposit.client.provider_url, - metadata={"name": deposit.client.last_name}, + return self._store_metadata_deposit( + deposit, + parse_swhid(swhid), + metadata, + raw_metadata, + with_deposit_origin=True, ) - metadata_fetcher = MetadataFetcher( - name=self.tool["name"], - version=self.tool["version"], - metadata=self.tool["configuration"], - ) - - deposit_swhid = parse_swhid(swhid) - - # replace metadata within the deposit backend - deposit_request_data = { - METADATA_KEY: metadata, - RAW_METADATA_KEY: raw_metadata, - } - - # actually add the metadata to the completed deposit - deposit_request = self._deposit_request_put(deposit, deposit_request_data) - # store that metadata to the metadata storage - metadata_object = RawExtrinsicMetadata( - type=MetadataTargetType.DIRECTORY, - target=deposit_swhid, - discovery_date=deposit_request.date, - authority=metadata_authority, - fetcher=metadata_fetcher, - format="sword-v2-atom-codemeta", - metadata=raw_metadata, - origin=deposit.origin_url, - ) - - # write to metadata storage - self.storage_metadata.metadata_authority_add([metadata_authority]) - self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) - self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) - - return { - "deposit_id": deposit_id, - "deposit_date": deposit_request.date, - "status": deposit.status, - "archive": None, - } - def process_post( self, request, diff --git a/swh/deposit/config.py b/swh/deposit/config.py --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -10,6 +10,8 @@ from swh.deposit import __version__ 
@pytest.mark.parametrize(
    "swhid,target_type",
    [
        (
            "swh:1:cnt:01b5c8cc985d190b5a7ef4878128ebfdc2358f49",
            MetadataTargetType.CONTENT,
        ),
        (
            "swh:1:dir:11b5c8cc985d190b5a7ef4878128ebfdc2358f49",
            MetadataTargetType.DIRECTORY,
        ),
        (
            "swh:1:rev:21b5c8cc985d190b5a7ef4878128ebfdc2358f49",
            MetadataTargetType.REVISION,
        ),
        (
            "swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
            MetadataTargetType.RELEASE,
        ),
        (
            "swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49",
            MetadataTargetType.SNAPSHOT,
        ),
        (
            "swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
            MetadataTargetType.CONTENT,
        ),
        (
            "swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/",  # noqa
            MetadataTargetType.DIRECTORY,
        ),
        (
            "swh:1:rev:71b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
            MetadataTargetType.REVISION,
        ),
        (
            "swh:1:rel:81b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
            MetadataTargetType.RELEASE,
        ),
        (
            "swh:1:snp:91b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=h://g.c/o/repo",
            MetadataTargetType.SNAPSHOT,
        ),
    ],
)
def test_deposit_metadata_swhid(
    swhid,
    target_type,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage

    The authority, the fetcher and the raw extrinsic metadata object read back
    from the storage must match what the posted atom entry describes, for
    core swhids as well as for swhids carrying qualifiers.

    """
    swhid_reference = parse_swhid(swhid)
    # The storage is queried with the core swhid (qualifiers stripped)
    swhid_core = attr.evolve(swhid_reference, metadata={})

    xml_data = atom_dataset["entry-data-with-swhid"].format(swhid=swhid)
    deposit_client = authenticated_client.deposit_client

    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=xml_data,
        HTTP_SLUG="external-id",
    )

    assert response.status_code == status.HTTP_201_CREATED

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
        metadata={"name": deposit_client.last_name},
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url
    )
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
        metadata=config.tool["configuration"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(
        config.tool["name"], config.tool["version"]
    )
    assert actual_fetcher == metadata_fetcher

    page_results = swh_storage.raw_extrinsic_metadata_get(
        target_type, swhid_core, metadata_authority
    )

    # Check the shape of the page before indexing into it, so that a missing
    # result fails with a readable assertion instead of an IndexError
    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    # The discovery date is set server-side; reuse it in the expected object
    discovery_date = page_results.results[0].discovery_date

    metadata_dict = {}
    if swhid_reference.metadata:
        path = swhid_reference.metadata.get("path")
        # Only the origin and path qualifiers are expected as context; the
        # visit and anchor qualifiers are left out of the expected object
        metadata_dict = {
            "origin": swhid_reference.metadata.get("origin"),
            "path": path.encode() if path else None,
        }

    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                type=target_type,
                target=swhid_core,
                discovery_date=discovery_date,
                authority=attr.evolve(metadata_authority, metadata=None),
                fetcher=attr.evolve(metadata_fetcher, metadata=None),
                format="sword-v2-atom-codemeta",
                metadata=xml_data.encode(),
                **metadata_dict
            )
        ],
        next_page_token=None,
    )