Page MenuHomeSoftware Heritage

D4013.id14193.diff
No OneTemporary

D4013.id14193.diff

diff --git a/swh/deposit/__init__.py b/swh/deposit/__init__.py
--- a/swh/deposit/__init__.py
+++ b/swh/deposit/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pkg_resources
+
+try:
+ __version__ = pkg_resources.get_distribution("swh.deposit").version
+except pkg_resources.DistributionNotFound:
+ __version__ = "devel"
diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py
--- a/swh/deposit/api/__init__.py
+++ b/swh/deposit/api/__init__.py
@@ -2,10 +2,3 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-import pkg_resources
-
-try:
- __version__ = pkg_resources.get_distribution("swh.deposit").version
-except pkg_resources.DistributionNotFound:
- __version__ = "devel"
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -114,6 +114,8 @@
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
+ swhid = meta.get("X_Check_SWHID")
+
return {
"content-type": content_type,
"content-length": content_length,
@@ -124,6 +126,7 @@
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
+ "swhid": swhid,
}
def _compute_md5(self, filehandler) -> bytes:
@@ -610,7 +613,7 @@
"""Atom entry deposit.
Args:
- request (Request): the request holding information to parse
+ request: the request holding information to parse
and inject in db
headers: request headers formatted
collection_name: the associated client
@@ -785,6 +788,7 @@
NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
+ assert deposit is not None
checks = self.restrict_access(request, deposit)
if checks:
return checks
@@ -799,18 +803,19 @@
return {"headers": headers}
- def restrict_access(
- self, request: Request, deposit: Optional[Deposit] = None
- ) -> Dict[str, Any]:
- if deposit:
- if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
- summary = "You can only act on deposit with status '%s'" % (
- DEPOSIT_STATUS_PARTIAL,
- )
- description = f"This deposit has status '{deposit.status}'"
- return make_error_dict(
- BAD_REQUEST, summary=summary, verbose_description=description
- )
+ def restrict_access(self, request: Request, deposit: Deposit) -> Dict[str, Any]:
+ """Add further checks on a deposit to prevent an update on a deposit not in partial
+ state.
+
+ """
+ if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
+ summary = "You can only act on deposit with status '%s'" % (
+ DEPOSIT_STATUS_PARTIAL,
+ )
+ description = f"This deposit has status '{deposit.status}'"
+ return make_error_dict(
+ BAD_REQUEST, summary=summary, verbose_description=description
+ )
return {}
def _basic_not_allowed_method(self, request: Request, method: str):
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -3,12 +3,36 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timezone
+import json
from typing import Any, Dict, Optional, Tuple
from rest_framework import status
-
-from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI
-from ..errors import BAD_REQUEST, make_error_dict
+from rest_framework.request import Request
+
+from swh.deposit.models import Deposit
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+from swh.storage import get_storage
+from swh.storage.interface import StorageInterface
+
+from ..config import (
+ CONT_FILE_IRI,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ EDIT_SE_IRI,
+ EM_IRI,
+ METADATA_KEY,
+ RAW_METADATA_KEY,
+ SWH_METADATA_AUTHORITY,
+)
+from ..errors import BAD_REQUEST, ParserError, make_error_dict
from ..parsers import (
SWHAtomEntryParser,
SWHFileUploadTarParser,
@@ -103,8 +127,28 @@
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
+ def __init__(self):
+ super().__init__()
+ self.storage: StorageInterface = get_storage(**self.config["storage"])
+
+ def restrict_access(self, request: Request, deposit: Deposit) -> Dict[str, Any]:
+ """For the metadata-only update of a deposit, we need to allow some checks on "done"
+ deposits. This overrides allow for such case to occur.
+
+ """
+ headers = self._read_headers(request)
+ if (
+ request.method == "PUT"
+ and headers["swhid"] is not None
+ and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
+ ):
+ # Update only metadata is authorized for deposit ingested
+ return {}
+ # otherwise, let the standard access restriction occur
+ return super().restrict_access(request, deposit)
+
def process_put(
- self, req, headers: Dict, collection_name: str, deposit_id: int
+ self, request, headers: Dict, collection_name: str, deposit_id: int
) -> Dict[str, Any]:
"""Replace existing deposit's metadata/archive with new ones.
@@ -116,19 +160,113 @@
204 No content
"""
- if req.content_type.startswith("multipart/"):
+ if request.content_type.startswith("multipart/"):
return self._multipart_upload(
- req,
+ request,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
- return self._atom_entry(
- req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True
+ swhid = headers.get("swhid")
+ if swhid is None:
+ return self._atom_entry(
+ request,
+ headers,
+ collection_name,
+ deposit_id=deposit_id,
+ replace_metadata=True,
+ )
+
+ # Update metadata on a deposit already ingested
+ # Only write to the metadata storage
+
+ try:
+ raw_metadata, metadata = self._read_metadata(request.data)
+ except ParserError:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Malformed xml metadata",
+ "The xml received is malformed. "
+ "Please ensure your metadata file is correctly formatted.",
+ )
+ if not metadata:
+ return make_error_dict(
+ BAD_REQUEST,
+ "Empty body request is not supported",
+ "Atom entry deposit is supposed to send for metadata. "
+ "If the body is empty, there is no metadata.",
+ )
+
+ metadata_authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url=self.provider["provider_url"],
+ metadata={
+ "name": self.provider["provider_name"],
+ **(self.provider["metadata"] or {}),
+ },
+ )
+
+ metadata_fetcher = MetadataFetcher(
+ name=self.tool["name"],
+ version=self.tool["version"],
+ metadata=self.tool["configuration"],
)
+ deposit_swhid = parse_swhid(swhid)
+
+ assert deposit_swhid.object_type == "directory"
+ directory_id = hash_to_bytes(deposit_swhid.object_id)
+
+ # check the swhid exists in the archive
+ directories_missing = list(self.storage.directory_missing([directory_id]))
+ if len(directories_missing) > 0:
+ return make_error_dict(
+ BAD_REQUEST,
+ f"Unknown directory SWHID {swhid} reference",
+ "The SWHID provided is not a know directory SWHID in SWH archive. "
+ "Please provide an existing SWHID.",
+ )
+
+ discovery_date = datetime.now(tz=timezone.utc)
+
+ # QUESTION: Do we also put the raw xml?
+ # A priori, no since we don't do this originally in the loader.
+ metadata_object = RawExtrinsicMetadata(
+ type=MetadataTargetType.DIRECTORY,
+ id=deposit_swhid,
+ discovery_date=discovery_date,
+ authority=SWH_METADATA_AUTHORITY,
+ fetcher=metadata_fetcher,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(metadata).encode(),
+ )
+
+ # write to metadata storage
+ self.storage.metadata_authority_add([metadata_authority])
+ self.storage.metadata_fetcher_add([metadata_fetcher])
+ self.storage.raw_extrinsic_metadata_add([metadata_object])
+
+ # replace metadata within the deposit backend
+ deposit = Deposit.objects.get(pk=deposit_id)
+
+ deposit_request_data = {
+ METADATA_KEY: metadata,
+ RAW_METADATA_KEY: raw_metadata,
+ }
+
+ self._deposit_request_put(
+ deposit, deposit_request_data, replace_metadata=True,
+ )
+
+ return {
+ "deposit_id": deposit_id,
+ "deposit_date": discovery_date,
+ "status": deposit.status,
+ "archive": None,
+ }
+
def process_post(
self,
request,
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -12,7 +12,6 @@
from rest_framework import status
from swh.core import tarball
-from swh.deposit.api import __version__
from swh.deposit.utils import normalize_date
from swh.model import identifiers
@@ -106,12 +105,6 @@
def __init__(self):
super().__init__()
- self.provider = self.config["provider"]
- self.tool = {
- "name": "swh-deposit",
- "version": __version__,
- "configuration": {"sword_version": "2"},
- }
def _normalize_dates(self, deposit, metadata):
"""Normalize the date to use as a tuple of author date, committer date
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -7,6 +7,8 @@
from typing import Any, Dict
from swh.core import config
+from swh.deposit import __version__
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
from swh.scheduler import get_scheduler
from swh.scheduler.interface import SchedulerInterface
@@ -53,6 +55,12 @@
"checks": True,
}
+SWH_METADATA_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
def setup_django_for(platform=None, config_file=None):
"""Setup function for command line tools (swh.deposit.create_user) to
@@ -97,3 +105,9 @@
conf = config.read_raw_config(config.config_basepath(config_file))
self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf)
self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
+ self.provider = self.config["provider"]
+ self.tool = {
+ "name": "swh-deposit",
+ "version": __version__,
+ "configuration": {"sword_version": "2"},
+ }
diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py
--- a/swh/deposit/settings/production.py
+++ b/swh/deposit/settings/production.py
@@ -58,7 +58,7 @@
"disable_existing_loggers": False,
"formatters": {
"standard": {
- "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa
+ "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s",
"datefmt": "%d/%b/%Y %H:%M:%S",
},
},
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -6,7 +6,7 @@
from django.urls import reverse
from rest_framework import status
-from swh.deposit.api import __version__
+from swh.deposit import __version__
from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON
from swh.deposit.models import Deposit
diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py
--- a/swh/deposit/tests/api/test_deposit_update.py
+++ b/swh/deposit/tests/api/test_deposit_update.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,6 +10,8 @@
from swh.deposit.models import Deposit, DepositCollection, DepositRequest
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import check_archive, create_arborescence_archive
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import parse_swhid, swhid
def test_replace_archive_to_deposit_is_possible(
@@ -124,6 +126,103 @@
assert set(requests_archive0) == set(requests_archive1)
+def test_put_update_metadata_on_done_deposit(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ sample_data,
+ swh_storage,
+):
+ """Update all metadata on a deposit whose status is done is possible, 204 response
+
+ """
+ deposit = complete_deposit
+
+ directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ # directory targeted by the complete_deposit does not exist in the storage
+ assert list(swh_storage.directory_missing([directory_id])) == [directory_id]
+
+ # so let's create a directory reference in the storage (current deposit targets an
+ # unknown swhid)
+ directory = sample_data.directory
+ swh_storage.directory_add([directory])
+ assert list(swh_storage.directory_missing([directory.id])) == []
+ # and patch one complete deposit swh_id so it targets said reference
+ deposit.swh_id = swhid("directory", directory.id)
+ deposit.save()
+
+ raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8")
+
+ requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive0) == 1
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ X_Check_SWHID=deposit.swh_id,
+ )
+
+ assert response.status_code == status.HTTP_204_NO_CONTENT
+
+ requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata")
+
+ assert len(requests_meta) == 1
+ request_meta1 = requests_meta[0]
+ raw_metadata1 = request_meta1.raw_metadata
+ assert raw_metadata1 == atom_dataset["entry-data1"]
+ assert raw_metadata0 != raw_metadata1
+
+ # check we did not touch the other parts
+ requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive1) == 1
+ assert set(requests_archive0) == set(requests_archive1)
+
+ # FIXME: Check the metadata storage information created is consistent
+ pass
+
+
+def test_put_update_metadata_on_done_deposit_failure(
+ tmp_path,
+ authenticated_client,
+ complete_deposit,
+ deposit_collection,
+ atom_dataset,
+ swh_storage,
+):
+ """Update all metadata on a deposit whose status is done, wrong swhid is impossible
+
+ """
+ deposit = complete_deposit
+
+ directory_id = hash_to_bytes(parse_swhid(deposit.swh_id).object_id)
+ # directory targeted by the complete_deposit does not exist in the storage
+ assert list(swh_storage.directory_missing([directory_id]))[0] is not None
+
+ requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
+ assert len(requests_archive0) == 1
+
+ update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
+
+ response = authenticated_client.put(
+ update_uri,
+ content_type="application/atom+xml;type=entry",
+ data=atom_dataset["entry-data1"],
+ X_Check_SWHID=deposit.swh_id,
+ )
+
+ # so the deposit update is rejected
+ assert response.status_code == status.HTTP_400_BAD_REQUEST
+
+ response_str = response.content.decode()
+
+ assert f"Unknown directory SWHID {deposit.swh_id} reference" in response_str
+
+
def test_add_archive_to_deposit_is_possible(
tmp_path,
authenticated_client,
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -16,6 +16,7 @@
from rest_framework.test import APIClient
import yaml
+from swh.core.config import read
from swh.deposit.config import (
COL_IRI,
DEPOSIT_STATUS_DEPOSITED,
@@ -25,12 +26,14 @@
DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_VERIFIED,
EDIT_SE_IRI,
+ SWH_METADATA_AUTHORITY,
setup_django_for,
)
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive
from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
from swh.scheduler import get_scheduler
+from swh.storage import get_storage
# mypy is asked to ignore the import statement above because setup_databases
# is not part of the d.t.utils.__all__ variable.
@@ -51,7 +54,11 @@
@pytest.fixture()
-def deposit_config(swh_scheduler_config):
+def deposit_config(swh_scheduler_config, swh_storage_backend_config):
+ # storage_cfg = deepcopy(swh_storage_backend_config)
+ # # actual side-track # otherwise: OSError: storage check config failed
+ # storage_cfg["check_config"] = {"check_write": False}
+
return {
"max_upload_size": 500,
"extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
@@ -63,6 +70,7 @@
"metadata": {},
},
"scheduler": {"cls": "local", "args": swh_scheduler_config,},
+ "storage": swh_storage_backend_config,
}
@@ -78,14 +86,24 @@
@pytest.fixture(autouse=True)
def deposit_autoconfig(deposit_config_path, swh_scheduler_config):
"""Enforce config for deposit classes inherited from APIConfig."""
+ cfg = read(deposit_config_path)
+
+ if "scheduler" in cfg:
+ # scheduler setup: require the load-deposit tasks (already existing in
+ # production)
+ scheduler = get_scheduler(**cfg["scheduler"])
+ task_type = {
+ "type": "load-deposit",
+ "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
+ "description": "Load deposit task",
+ }
+ scheduler.create_task_type(task_type)
- scheduler = get_scheduler("local", swh_scheduler_config)
- task_type = {
- "type": "load-deposit",
- "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
- "description": "Load deposit task",
- }
- scheduler.create_task_type(task_type)
+ if "storage" in cfg:
+ # `deposit_update` tests requires the SWH_METADATA_AUTHORITY populated (matching
+ # production)
+ storage = get_storage(**cfg["storage"])
+ storage.metadata_authority_add([SWH_METADATA_AUTHORITY])
@pytest.fixture(scope="session")
diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py
--- a/swh/deposit/tests/test_init.py
+++ b/swh/deposit/tests/test_init.py
@@ -5,6 +5,6 @@
def test_version():
- from swh.deposit.api import __version__
+ from swh.deposit import __version__
assert __version__ is not None

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 3:29 AM (2 d, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218171

Event Timeline