Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122854
D4013.id14237.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
D4013.id14237.diff
View Options
diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py
--- a/swh/deposit/__init__.py
+++ b/swh/deposit/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pkg_resources
+
+try:
+ __version__ = pkg_resources.get_distribution("swh.deposit").version
+except pkg_resources.DistributionNotFound:
+ __version__ = "devel"
diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py
--- a/swh/deposit/api/__init__.py
+++ b/swh/deposit/api/__init__.py
@@ -2,10 +2,3 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-import pkg_resources
-
-try:
- __version__ = pkg_resources.get_distribution("swh.deposit").version
-except pkg_resources.DistributionNotFound:
- __version__ = "devel"
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -114,6 +114,8 @@
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
+ swhid = meta.get("HTTP_X_CHECK_SWHID")
+
return {
"content-type": content_type,
"content-length": content_length,
@@ -124,6 +126,7 @@
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
+ "swhid": swhid,
}
def _compute_md5(self, filehandler) -> bytes:
@@ -222,7 +225,7 @@
deposit_request_data: Dict[str, Any],
replace_metadata: bool = False,
replace_archives: bool = False,
- ) -> None:
+ ) -> DepositRequest:
"""Save a deposit request with metadata attached to a deposit.
Args:
@@ -235,7 +238,7 @@
archives to existing deposit
Returns:
- None
+ the DepositRequest object stored in the backend
"""
if replace_metadata:
@@ -265,6 +268,7 @@
deposit_request.save()
assert deposit_request is not None
+ return deposit_request
def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete archive references from the deposit id.
@@ -610,7 +614,7 @@
"""Atom entry deposit.
Args:
- request (Request): the request holding information to parse
+ request: the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
@@ -777,6 +781,8 @@
f"Client {username} cannot access collection {collection_name}",
)
+ headers = self._read_headers(request)
+
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
@@ -785,11 +791,11 @@
NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
- checks = self.restrict_access(request, deposit)
+ assert deposit is not None
+ checks = self.restrict_access(request, headers, deposit)
if checks:
return checks
- headers = self._read_headers(request)
if headers["on-behalf-of"]:
return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")
@@ -800,17 +806,19 @@
return {"headers": headers}
def restrict_access(
- self, request: Request, deposit: Optional[Deposit] = None
+ self, request: Request, headers: Dict, deposit: Deposit
) -> Dict[str, Any]:
- if deposit:
- if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
- summary = "You can only act on deposit with status '%s'" % (
- DEPOSIT_STATUS_PARTIAL,
- )
- description = f"This deposit has status '{deposit.status}'"
- return make_error_dict(
- BAD_REQUEST, summary=summary, verbose_description=description
- )
+ """Allow modifications on deposit with status 'partial' only, reject the rest.
+
+ """
+ if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
+ summary = "You can only act on deposit with status '%s'" % (
+ DEPOSIT_STATUS_PARTIAL,
+ )
+ description = f"This deposit has status '{deposit.status}'"
+ return make_error_dict(
+ BAD_REQUEST, summary=summary, verbose_description=description
+ )
return {}
def _basic_not_allowed_method(self, request: Request, method: str):
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -3,12 +3,36 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import json
from typing import Any, Dict, Optional, Tuple
from rest_framework import status
-
-from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI
-from ..errors import BAD_REQUEST, make_error_dict
+from rest_framework.request import Request
+
+from swh.deposit.models import Deposit
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+from swh.storage import get_storage
+from swh.storage.interface import StorageInterface
+
+from ..config import (
+ CONT_FILE_IRI,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
+ EDIT_SE_IRI,
+ EM_IRI,
+ METADATA_KEY,
+ RAW_METADATA_KEY,
+ SWH_METADATA_AUTHORITY,
+)
+from ..errors import BAD_REQUEST, ParserError, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
@@ -103,32 +127,172 @@
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
+ def __init__(self):
+ super().__init__()
+ self.storage: StorageInterface = get_storage(**self.config["storage"])
+
+ def restrict_access(
+ self, request: Request, headers: Dict, deposit: Deposit
+ ) -> Dict[str, Any]:
+ """Relax restriction access to allow metadata update on deposit with status "done" when
+ a swhid is provided.
+
+ """
+ if (
+ request.method == "PUT"
+ and headers["swhid"] is not None
+ and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+ ):
+ # Allow metadata update on deposit with status "done" when swhid provided
+ return {}
+ # otherwise, let the standard access restriction check occur
+ return super().restrict_access(request, headers, deposit)
+
def process_put(
- self, req, headers: Dict, collection_name: str, deposit_id: int
+ self, request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
- """Replace existing deposit's metadata/archive with new ones.
+ """This allows the following update metadata scenarios to occur:
+
+ - multipart: replace all the deposit (status partial) metadata and archive
+ with the provided ones.
+
+ - no swhid, atom: replace all the deposit (status partial) metadata with the
+ provided ones.
+
+ - with swhid, atom: replace all the deposit (status done) metadata with the
+ provided ones and push such metadata to the metadata storage directly.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa
+ Raises:
+ 400 if any of the following occur:
+ - the swhid provided and the deposit swhid do not match
+ - the provided metadata xml file is malformed
+ - the provided xml atom entry is empty
+ - the provided swhid does not exist in the archive
+
Returns:
204 No content
"""
- if req.content_type.startswith("multipart/"):
+ if request.content_type.startswith("multipart/"):
return self._multipart_upload(
- req,
+ request,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
- return self._atom_entry(
- req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True
+ deposit = Deposit.objects.get(pk=deposit_id)
+ swhid = headers.get("swhid")
+ if swhid is None:
+ # standard metadata update (replace all metadata already provided to the
+ # deposit by the new ones)
+ assert deposit.status == DEPOSIT_STATUS_PARTIAL
+ return self._atom_entry(
+ request,
+ headers,
+ collection_name,
+ deposit_id=deposit_id,
+ replace_metadata=True,
+ )
+
+ # Update metadata on a deposit already ingested
+ # Only write to the metadata storage
+
+ assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+
+ if swhid != deposit.swh_id:
+ return make_error_dict(
+ BAD_REQUEST,
+ f"Mismatched provided SWHID {swhid} with deposit's {deposit.swh_id}.",
+ "The provided SWHID does not match the deposit to update. "
+ "Please ensure you send the correct deposit SWHID.",
+ )
+
+ try:
+ raw_metadata, metadata = self._read_metadata(request.data)
+ except ParserError:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Malformed xml metadata",
+ "The xml received is malformed. "
+ "Please ensure your metadata file is correctly formatted.",
+ )
+ if not metadata:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Empty body request is not supported",
+ "Atom entry deposit is supposed to send for metadata. "
+ "If the body is empty, there is no metadata.",
+ )
+
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=self.provider["provider_url"],
+ metadata={
+ "name": self.provider["provider_name"],
+ **(self.provider["metadata"] or {}),
+ },
)
+ metadata_fetcher = MetadataFetcher(
+ name=self.tool["name"],
+ version=self.tool["version"],
+ metadata=self.tool["configuration"],
+ )
+
+ deposit_swhid = parse_swhid(swhid)
+
+ directory_id = hash_to_bytes(deposit_swhid.object_id)
+
+ # check the swhid exists in the archive
+ directories_missing = list(self.storage.directory_missing([directory_id]))
+ if len(directories_missing) > 0:
+ return make_error_dict(
+ BAD_REQUEST,
+ f"Unknown directory SWHID {swhid} reference",
+ "The SWHID provided is not a known directory SWHID in SWH archive. "
+ "Please provide an existing SWHID.",
+ )
+
+ # replace metadata within the deposit backend
+ deposit_request_data = {
+ METADATA_KEY: metadata,
+ RAW_METADATA_KEY: raw_metadata,
+ }
+
+ deposit_request = self._deposit_request_put(
+ deposit, deposit_request_data, replace_metadata=True,
+ )
+
+ # QUESTION: Do we also put the raw xml?
+ # A priori, no since we don't do this originally in the loader.
+ metadata_object = RawExtrinsicMetadata(
+ type=MetadataTargetType.DIRECTORY,
+ id=deposit_swhid,
+ discovery_date=deposit_request.date,
+ authority=SWH_METADATA_AUTHORITY,
+ fetcher=metadata_fetcher,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(metadata).encode(),
+ )
+
+ # write to metadata storage
+ self.storage.metadata_authority_add([metadata_authority])
+ self.storage.metadata_fetcher_add([metadata_fetcher])
+ self.storage.raw_extrinsic_metadata_add([metadata_object])
+
+ return {
+ "deposit_id": deposit_id,
+ "deposit_date": deposit_request.date,
+ "status": deposit.status,
+ "archive": None,
+ }
+
def process_post(
self,
request,
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -12,7 +12,6 @@
from rest_framework import status
from swh.core import tarball
-from swh.deposit.api import __version__
from swh.deposit.utils import normalize_date
from swh.model import identifiers
@@ -104,15 +103,6 @@
"""
- def __init__(self):
- super().__init__()
- self.provider = self.config["provider"]
- self.tool = {
- "name": "swh-deposit",
- "version": __version__,
- "configuration": {"sword_version": "2"},
- }
-
def _normalize_dates(self, deposit, metadata):
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -7,6 +7,8 @@
from typing import Any, Dict
from swh.core import config
+from swh.deposit import __version__
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
from swh.scheduler import get_scheduler
from swh.scheduler.interface import SchedulerInterface
@@ -53,6 +55,12 @@
"checks": True,
}
+SWH_METADATA_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
def setup_django_for(platform=None, config_file=None):
"""Setup function for command line tools (swh.deposit.create_user) to
@@ -97,3 +105,9 @@
conf = config.read_raw_config(config.config_basepath(config_file))
self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf)
self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
+ self.provider = self.config["provider"]
+ self.tool = {
+ "name": "swh-deposit",
+ "version": __version__,
+ "configuration": {"sword_version": "2"},
+ }
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -6,7 +6,7 @@
from django.urls import reverse
from rest_framework import status
-from swh.deposit.api import __version__
+from swh.deposit import __version__
from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON
from swh.deposit.models import Deposit
diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py
--- a/swh/deposit/tests/api/test_deposit_update.py
+++ b/swh/deposit/tests/api/test_deposit_update.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,6 +10,8 @@
from swh.deposit.models import Deposit, DepositCollection, DepositRequest
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import check_archive, create_arborescence_archive
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid, swhid
def test_replace_archive_to_deposit_is_possible(
@@ -124,6 +126,141 @@
assert set(requests_archive0) == set(requests_archive1)
+def test_put_update_metadata_done_deposit_nominal(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ sample_data,
+ swh_storage,
+):
+ """Nominal scenario, client send an update of metadata on a deposit with status "done"
+ with an existing swhid. Such swhid has its metadata updated accordingly both in
+ the deposit backend and in the metadata storage.
+
+ Response: 204
+
+ """
+ deposit = complete_deposit
+
+ directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ # directory targeted by the complete_deposit does not exist in the storage
+ assert list(swh_storage.directory_missing([directory_id])) == [directory_id]
+
+ # so let's create a directory reference in the storage (current deposit targets an
+ # unknown swhid)
+ directory = sample_data.directory
+ swh_storage.directory_add([directory])
+ assert list(swh_storage.directory_missing([directory.id])) == []
+ # and patch one complete deposit swh_id so it targets said reference
+ deposit.swh_id = swhid("directory", directory.id)
+ deposit.save()
+
+ raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8")
+
+ requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive0) == 1
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=deposit.swh_id,
+ )
+
+ assert response.status_code == status.HTTP_204_NO_CONTENT
+
+ requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata")
+
+ assert len(requests_meta) == 1
+ request_meta1 = requests_meta[0]
+ raw_metadata1 = request_meta1.raw_metadata
+ assert raw_metadata1 == atom_dataset["entry-data1"]
+ assert raw_metadata0 != raw_metadata1
+
+ # check we did not touch the other parts
+ requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive1) == 1
+ assert set(requests_archive0) == set(requests_archive1)
+
+ # FIXME: Check the metadata storage information created is consistent
+ pass
+
+
+def test_put_update_metadata_done_deposit_swhid_unknown(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Failure scenario: client sends a correct swhid (which matches the deposit's swhid),
+ but somehow the swhid does not exist in the archive (should not happen though,
+ still technically possible...)
+
+ Response: 400
+
+ """
+ deposit = complete_deposit # deposit with status "done"
+
+ # directory targeted by the complete_deposit does not exist in the storage
+ missing_directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ assert list(swh_storage.directory_missing([missing_directory_id])) == [
+ missing_directory_id
+ ]
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=deposit.swh_id,
+ )
+
+ # so the deposit update is rejected
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+
+ response_str = response.content.decode()
+
+ assert f"Unknown directory SWHID {deposit.swh_id} reference" in response_str
+
+
+def test_put_update_metadata_done_deposit_mismatched_swhid(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Failure scenario: client sends a swhid which does not match the deposit swhid.
+
+ Response: 400
+
+ """
+ deposit = complete_deposit # deposit with status "done"
+ incorrect_swhid = "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+ assert deposit.swh_id != incorrect_swhid
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ HTTP_X_CHECK_SWHID=incorrect_swhid,
+ )
+
+ # so the deposit update is rejected
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+ response_str = response.content.decode()
+ assert f"Mismatched provided SWHID {incorrect_swhid}" in response_str
+
+
def test_add_archive_to_deposit_is_possible(
tmp_path,
authenticated_client,
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -16,6 +16,7 @@
from rest_framework.test import APIClient
import yaml
+from swh.core.config import read
from swh.deposit.config import (
COL_IRI,
DEPOSIT_STATUS_DEPOSITED,
@@ -25,12 +26,14 @@
DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_VERIFIED,
EDIT_SE_IRI,
+ SWH_METADATA_AUTHORITY,
setup_django_for,
)
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive
from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
from swh.scheduler import get_scheduler
+from swh.storage import get_storage
# mypy is asked to ignore the import statement above because setup_databases
# is not part of the d.t.utils.__all__ variable.
@@ -51,7 +54,7 @@
@pytest.fixture()
-def deposit_config(swh_scheduler_config):
+def deposit_config(swh_scheduler_config, swh_storage_backend_config):
return {
"max_upload_size": 500,
"extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
@@ -63,6 +66,7 @@
"metadata": {},
},
"scheduler": {"cls": "local", "args": swh_scheduler_config,},
+ "storage": swh_storage_backend_config,
}
@@ -78,14 +82,24 @@
@pytest.fixture(autouse=True)
def deposit_autoconfig(deposit_config_path, swh_scheduler_config):
"""Enforce config for deposit classes inherited from APIConfig."""
+ cfg = read(deposit_config_path)
+
+ if "scheduler" in cfg:
+ # scheduler setup: require the load-deposit tasks (already existing in
+ # production)
+ scheduler = get_scheduler(**cfg["scheduler"])
+ task_type = {
+ "type": "load-deposit",
+ "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
+ "description": "Load deposit task",
+ }
+ scheduler.create_task_type(task_type)
- scheduler = get_scheduler("local", swh_scheduler_config)
- task_type = {
- "type": "load-deposit",
- "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
- "description": "Load deposit task",
- }
- scheduler.create_task_type(task_type)
+ if "storage" in cfg:
+ # `deposit_update` tests requires the SWH_METADATA_AUTHORITY populated (matching
+ # production)
+ storage = get_storage(**cfg["storage"])
+ storage.metadata_authority_add([SWH_METADATA_AUTHORITY])
@pytest.fixture(scope="session")
diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py
--- a/swh/deposit/tests/test_init.py
+++ b/swh/deposit/tests/test_init.py
@@ -5,6 +5,6 @@
def test_version():
- from swh.deposit.api import __version__
+ from swh.deposit import __version__
assert __version__ is not None
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 17 2024, 7:58 AM (4 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233174
Attached To
D4013: Allow deposit metadata update on deposit already complete
Event Timeline
Log In to Comment