Page MenuHomeSoftware Heritage

D4013.id14237.diff
No OneTemporary

D4013.id14237.diff

diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py
--- a/swh/deposit/__init__.py
+++ b/swh/deposit/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pkg_resources
+
+try:
+ __version__ = pkg_resources.get_distribution("swh.deposit").version
+except pkg_resources.DistributionNotFound:
+ __version__ = "devel"
diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py
--- a/swh/deposit/api/__init__.py
+++ b/swh/deposit/api/__init__.py
@@ -2,10 +2,3 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-import pkg_resources
-
-try:
- __version__ = pkg_resources.get_distribution("swh.deposit").version
-except pkg_resources.DistributionNotFound:
- __version__ = "devel"
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -114,6 +114,8 @@
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
+ swhid = meta.get("HTTP_X_CHECK_SWHID")
+
return {
"content-type": content_type,
"content-length": content_length,
@@ -124,6 +126,7 @@
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
+ "swhid": swhid,
}
def _compute_md5(self, filehandler) -> bytes:
@@ -222,7 +225,7 @@
deposit_request_data: Dict[str, Any],
replace_metadata: bool = False,
replace_archives: bool = False,
- ) -> None:
+ ) -> DepositRequest:
"""Save a deposit request with metadata attached to a deposit.
Args:
@@ -235,7 +238,7 @@
archives to existing deposit
Returns:
- None
+ the DepositRequest object stored in the backend
"""
if replace_metadata:
@@ -265,6 +268,7 @@
deposit_request.save()
assert deposit_request is not None
+ return deposit_request
def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete archive references from the deposit id.
@@ -610,7 +614,7 @@
"""Atom entry deposit.
Args:
- request (Request): the request holding information to parse
+ request: the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
@@ -777,6 +781,8 @@
f"Client {username} cannot access collection {collection_name}",
)
+ headers = self._read_headers(request)
+
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
@@ -785,11 +791,11 @@
NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
- checks = self.restrict_access(request, deposit)
+ assert deposit is not None
+ checks = self.restrict_access(request, headers, deposit)
if checks:
return checks
- headers = self._read_headers(request)
if headers["on-behalf-of"]:
return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")
@@ -800,17 +806,19 @@
return {"headers": headers}
def restrict_access(
- self, request: Request, deposit: Optional[Deposit] = None
+ self, request: Request, headers: Dict, deposit: Deposit
) -> Dict[str, Any]:
- if deposit:
- if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
- summary = "You can only act on deposit with status '%s'" % (
- DEPOSIT_STATUS_PARTIAL,
- )
- description = f"This deposit has status '{deposit.status}'"
- return make_error_dict(
- BAD_REQUEST, summary=summary, verbose_description=description
- )
+ """Allow modifications on deposit with status 'partial' only, reject the rest.
+
+ """
+ if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
+ summary = "You can only act on deposit with status '%s'" % (
+ DEPOSIT_STATUS_PARTIAL,
+ )
+ description = f"This deposit has status '{deposit.status}'"
+ return make_error_dict(
+ BAD_REQUEST, summary=summary, verbose_description=description
+ )
return {}
def _basic_not_allowed_method(self, request: Request, method: str):
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -3,12 +3,36 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import json
from typing import Any, Dict, Optional, Tuple
from rest_framework import status
-
-from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI
-from ..errors import BAD_REQUEST, make_error_dict
+from rest_framework.request import Request
+
+from swh.deposit.models import Deposit
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+from swh.storage import get_storage
+from swh.storage.interface import StorageInterface
+
+from ..config import (
+ CONT_FILE_IRI,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
+ EDIT_SE_IRI,
+ EM_IRI,
+ METADATA_KEY,
+ RAW_METADATA_KEY,
+ SWH_METADATA_AUTHORITY,
+)
+from ..errors import BAD_REQUEST, ParserError, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
@@ -103,32 +127,172 @@
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
+ def __init__(self):
+ super().__init__()
+ self.storage: StorageInterface = get_storage(**self.config["storage"])
+
+ def restrict_access(
+ self, request: Request, headers: Dict, deposit: Deposit
+ ) -> Dict[str, Any]:
+ """Relax restriction access to allow metadata update on deposit with status "done" when
+ a swhid is provided.
+
+ """
+ if (
+ request.method == "PUT"
+ and headers["swhid"] is not None
+ and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+ ):
+ # Allow metadata update on deposit with status "done" when swhid provided
+ return {}
+ # otherwise, let the standard access restriction check occur
+ return super().restrict_access(request, headers, deposit)
+
def process_put(
- self, req, headers: Dict, collection_name: str, deposit_id: int
+ self, request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
- """Replace existing deposit's metadata/archive with new ones.
+ """This allows the following update metadata scenarios to occur:
+
+ - multipart: replace all the deposit (status partial) metadata and archive
+ with the provided ones.
+
+ - no swhid, atom: replace all the deposit (status partial) metadata with the
+ provided ones.
+
+ - with swhid, atom: replace all the deposit (status done) metadata with the
+ provided ones and push such metadata to the metadata storage directly.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa
+ Raises:
+ 400 if any of the following occur:
+ - the swhid provided and the deposit swhid do not match
+ - the provided metadata xml file is malformed
+ - the provided xml atom entry is empty
+ - the provided swhid does not exist in the archive
+
Returns:
204 No content
"""
- if req.content_type.startswith("multipart/"):
+ if request.content_type.startswith("multipart/"):
return self._multipart_upload(
- req,
+ request,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
- return self._atom_entry(
- req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True
+ deposit = Deposit.objects.get(pk=deposit_id)
+ swhid = headers.get("swhid")
+ if swhid is None:
+ # standard metadata update (replace all metadata already provided to the
+ # deposit by the new ones)
+ assert deposit.status == DEPOSIT_STATUS_PARTIAL
+ return self._atom_entry(
+ request,
+ headers,
+ collection_name,
+ deposit_id=deposit_id,
+ replace_metadata=True,
+ )
+
+ # Update metadata on a deposit already ingested
+ # Only write to the metadata storage
+
+ assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+
+ if swhid != deposit.swh_id:
+ return make_error_dict(
+ BAD_REQUEST,
+ f"Mismatched provided SWHID {swhid} with deposit's {deposit.swh_id}.",
+ "The provided SWHID does not match the deposit to update. "
+ "Please ensure you send the correct deposit SWHID.",
+ )
+
+ try:
+ raw_metadata, metadata = self._read_metadata(request.data)
+ except ParserError:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Malformed xml metadata",
+ "The xml received is malformed. "
+ "Please ensure your metadata file is correctly formatted.",
+ )
+ if not metadata:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Empty body request is not supported",
+ "Atom entry deposit is supposed to send for metadata. "
+ "If the body is empty, there is no metadata.",
+ )
+
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=self.provider["provider_url"],
+ metadata={
+ "name": self.provider["provider_name"],
+ **(self.provider["metadata"] or {}),
+ },
)
+ metadata_fetcher = MetadataFetcher(
+ name=self.tool["name"],
+ version=self.tool["version"],
+ metadata=self.tool["configuration"],
+ )
+
+ deposit_swhid = parse_swhid(swhid)
+
+ directory_id = hash_to_bytes(deposit_swhid.object_id)
+
+ # check the swhid exists in the archive
+ directories_missing = list(self.storage.directory_missing([directory_id]))
+ if len(directories_missing) > 0:
+ return make_error_dict(
+ BAD_REQUEST,
+ f"Unknown directory SWHID {swhid} reference",
+ "The SWHID provided is not a known directory SWHID in SWH archive. "
+ "Please provide an existing SWHID.",
+ )
+
+ # replace metadata within the deposit backend
+ deposit_request_data = {
+ METADATA_KEY: metadata,
+ RAW_METADATA_KEY: raw_metadata,
+ }
+
+ deposit_request = self._deposit_request_put(
+ deposit, deposit_request_data, replace_metadata=True,
+ )
+
+ # QUESTION: Do we also put the raw xml?
+ # A priori, no since we don't do this originally in the loader.
+ metadata_object = RawExtrinsicMetadata(
+ type=MetadataTargetType.DIRECTORY,
+ id=deposit_swhid,
+ discovery_date=deposit_request.date,
+ authority=SWH_METADATA_AUTHORITY,
+ fetcher=metadata_fetcher,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(metadata).encode(),
+ )
+
+ # write to metadata storage
+ self.storage.metadata_authority_add([metadata_authority])
+ self.storage.metadata_fetcher_add([metadata_fetcher])
+ self.storage.raw_extrinsic_metadata_add([metadata_object])
+
+ return {
+ "deposit_id": deposit_id,
+ "deposit_date": deposit_request.date,
+ "status": deposit.status,
+ "archive": None,
+ }
+
def process_post(
self,
request,
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -12,7 +12,6 @@
from rest_framework import status
from swh.core import tarball
-from swh.deposit.api import __version__
from swh.deposit.utils import normalize_date
from swh.model import identifiers
@@ -104,15 +103,6 @@
"""
- def __init__(self):
- super().__init__()
- self.provider = self.config["provider"]
- self.tool = {
- "name": "swh-deposit",
- "version": __version__,
- "configuration": {"sword_version": "2"},
- }
-
def _normalize_dates(self, deposit, metadata):
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -7,6 +7,8 @@
from typing import Any, Dict
from swh.core import config
+from swh.deposit import __version__
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
from swh.scheduler import get_scheduler
from swh.scheduler.interface import SchedulerInterface
@@ -53,6 +55,12 @@
"checks": True,
}
+SWH_METADATA_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
def setup_django_for(platform=None, config_file=None):
"""Setup function for command line tools (swh.deposit.create_user) to
@@ -97,3 +105,9 @@
conf = config.read_raw_config(config.config_basepath(config_file))
self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf)
self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
+ self.provider = self.config["provider"]
+ self.tool = {
+ "name": "swh-deposit",
+ "version": __version__,
+ "configuration": {"sword_version": "2"},
+ }
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -6,7 +6,7 @@
from django.urls import reverse
from rest_framework import status
-from swh.deposit.api import __version__
+from swh.deposit import __version__
from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON
from swh.deposit.models import Deposit
diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py
--- a/swh/deposit/tests/api/test_deposit_update.py
+++ b/swh/deposit/tests/api/test_deposit_update.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,6 +10,8 @@
from swh.deposit.models import Deposit, DepositCollection, DepositRequest
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import check_archive, create_arborescence_archive
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid, swhid
def test_replace_archive_to_deposit_is_possible(
@@ -124,6 +126,141 @@
assert set(requests_archive0) == set(requests_archive1)
+def test_put_update_metadata_done_deposit_nominal(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ sample_data,
+ swh_storage,
+):
+ """Nominal scenario, client send an update of metadata on a deposit with status "done"
+ with an existing swhid. Such swhid has its metadata updated accordingly both in
+ the deposit backend and in the metadata storage.
+
+ Response: 204
+
+ """
+ deposit = complete_deposit
+
+ directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ # directory targeted by the complete_deposit does not exist in the storage
+ assert list(swh_storage.directory_missing([directory_id])) == [directory_id]
+
+ # so let's create a directory reference in the storage (current deposit targets an
+ # unknown swhid)
+ directory = sample_data.directory
+ swh_storage.directory_add([directory])
+ assert list(swh_storage.directory_missing([directory.id])) == []
+ # and patch one complete deposit swh_id so it targets said reference
+ deposit.swh_id = swhid("directory", directory.id)
+ deposit.save()
+
+ raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8")
+
+ requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive0) == 1
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=deposit.swh_id,
+ )
+
+ assert response.status_code == status.HTTP_204_NO_CONTENT
+
+ requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata")
+
+ assert len(requests_meta) == 1
+ request_meta1 = requests_meta[0]
+ raw_metadata1 = request_meta1.raw_metadata
+ assert raw_metadata1 == atom_dataset["entry-data1"]
+ assert raw_metadata0 != raw_metadata1
+
+ # check we did not touch the other parts
+ requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive1) == 1
+ assert set(requests_archive0) == set(requests_archive1)
+
+ # FIXME: Check the metadata storage information created is consistent
+ pass
+
+
+def test_put_update_metadata_done_deposit_swhid_unknown(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Failure scenario: client sends a correct swhid (which matches the deposit's swhid),
+ but somehow the swhid does not exist in the archive (should not happen though,
+ still technically possible...)
+
+ Response: 400
+
+ """
+ deposit = complete_deposit # deposit with status "done"
+
+ # directory targeted by the complete_deposit does not exist in the storage
+ missing_directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ assert list(swh_storage.directory_missing([missing_directory_id])) == [
+ missing_directory_id
+ ]
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=deposit.swh_id,
+ )
+
+ # so the deposit update is rejected
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+
+ response_str = response.content.decode()
+
+ assert f"Unknown directory SWHID {deposit.swh_id} reference" in response_str
+
+
+def test_put_update_metadata_done_deposit_mismatched_swhid(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Failure scenario: client sends a swhid which does not match the deposit swhid.
+
+ Response: 400
+
+ """
+ deposit = complete_deposit # deposit with status "done"
+ incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+ assert deposit.swh_id != incorrect_swhid
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=incorrect_swhid,
+ )
+
+ # so the deposit update is rejected
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+ response_str = response.content.decode()
+ assert f"Mismatched provided SWHID {incorrect_swhid}" in response_str
+
+
def test_add_archive_to_deposit_is_possible(
tmp_path,
authenticated_client,
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -16,6 +16,7 @@
from rest_framework.test import APIClient
import yaml
+from swh.core.config import read
from swh.deposit.config import (
COL_IRI,
DEPOSIT_STATUS_DEPOSITED,
@@ -25,12 +26,14 @@
DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_VERIFIED,
EDIT_SE_IRI,
+ SWH_METADATA_AUTHORITY,
setup_django_for,
)
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive
from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
from swh.scheduler import get_scheduler
+from swh.storage import get_storage
# mypy is asked to ignore the import statement above because setup_databases
# is not part of the d.t.utils.__all__ variable.
@@ -51,7 +54,7 @@
@pytest.fixture()
-def deposit_config(swh_scheduler_config):
+def deposit_config(swh_scheduler_config, swh_storage_backend_config):
return {
"max_upload_size": 500,
"extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
@@ -63,6 +66,7 @@
"metadata": {},
},
"scheduler": {"cls": "local", "args": swh_scheduler_config,},
+ "storage": swh_storage_backend_config,
}
@@ -78,14 +82,24 @@
@pytest.fixture(autouse=True)
def deposit_autoconfig(deposit_config_path, swh_scheduler_config):
"""Enforce config for deposit classes inherited from APIConfig."""
+ cfg = read(deposit_config_path)
+
+ if "scheduler" in cfg:
+ # scheduler setup: require the load-deposit tasks (already existing in
+ # production)
+ scheduler = get_scheduler(**cfg["scheduler"])
+ task_type = {
+ "type": "load-deposit",
+ "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
+ "description": "Load deposit task",
+ }
+ scheduler.create_task_type(task_type)
- scheduler = get_scheduler("local", swh_scheduler_config)
- task_type = {
- "type": "load-deposit",
- "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
- "description": "Load deposit task",
- }
- scheduler.create_task_type(task_type)
+ if "storage" in cfg:
+ # `deposit_update` tests requires the SWH_METADATA_AUTHORITY populated (matching
+ # production)
+ storage = get_storage(**cfg["storage"])
+ storage.metadata_authority_add([SWH_METADATA_AUTHORITY])
@pytest.fixture(scope="session")
diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py
--- a/swh/deposit/tests/test_init.py
+++ b/swh/deposit/tests/test_init.py
@@ -5,6 +5,6 @@
def test_version():
- from swh.deposit.api import __version__
+ from swh.deposit import __version__
assert __version__ is not None

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 7:58 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233174

Event Timeline