diff --git a/requirements-swh-server.txt b/requirements-swh-server.txt index 86a85993..5e81fabe 100644 --- a/requirements-swh-server.txt +++ b/requirements-swh-server.txt @@ -1,4 +1,4 @@ swh.core[http] swh.loader.core >= 0.0.71 swh.scheduler >= 0.0.39 -swh.model >= 0.1.0 +swh.model >= 0.3.8 diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index a387fc7f..91ec2e3d 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,218 +1,216 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import shutil import tempfile from contextlib import contextmanager from django.http import FileResponse from rest_framework import status from swh.core import tarball from swh.model import identifiers from swh.deposit.utils import normalize_date from . import DepositReadMixin, SWHPrivateAPIView from ...config import SWH_PERSON, ARCHIVE_TYPE from ..common import SWHGetDepositAPI from ...models import Deposit @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ ADDITIONAL_CONFIG = { "extraction_dir": ("str", "/tmp/swh-deposit/archive/"), } def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def process_get(self, request, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] with aggregate_tarballs(self.extraction_dir, archive_paths) as path: return FileResponse( open(path, "rb"), status=status.HTTP_200_OK, content_type="application/zip", ) class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. """ ADDITIONAL_CONFIG = { "provider": ( "dict", { # 'provider_name': '', # those are not set since read from the # 'provider_url': '', # deposit's client "provider_type": "deposit_client", "metadata": {}, }, ), "tool": ( "dict", { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, }, ), } def __init__(self): super().__init__() self.provider = self.config["provider"] self.tool = self.config["tool"] def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. """ metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} # metadata provider self.provider["provider_name"] = deposit.client.last_name self.provider["provider_url"] = deposit.client.provider_url author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id - persistent_identifier = identifiers.parse_persistent_identifier( - swh_persistent_id - ) - parent_revision = persistent_identifier.object_id + swhid = identifiers.parse_swhid(swh_persistent_id) + parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { "provider": self.provider, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data def process_get(self, request, collection_name, deposit_id): deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) d = {} if data: d = json.dumps(data) return status.HTTP_200_OK, d, "application/json" diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index 67fa99f0..c7b7e63a 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,105 +1,105 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT +from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT from . import SWHPrivateAPIView from ..common import SWHPutDepositAPI from ...errors import make_error_dict, BAD_REQUEST from ...models import Deposit, DEPOSIT_STATUS_DETAIL from ...models import DEPOSIT_STATUS_LOAD_SUCCESS MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] class SWHUpdateStatusDeposit(SWHPrivateAPIView, SWHPutDepositAPI): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks(self, request, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) return make_error_dict(BAD_REQUEST, msg) return {} def process_put(self, request, headers, collection_name, deposit_id): """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data deposit = Deposit.objects.get(pk=deposit_id) status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] - dir_id = persistent_identifier(DIRECTORY, directory_id) - snp_id = persistent_identifier(SNAPSHOT, data["snapshot_id"]) - rev_id = persistent_identifier(REVISION, revision_id) + dir_id = swhid(DIRECTORY, directory_id) + snp_id = swhid(SNAPSHOT, data["snapshot_id"]) + rev_id = swhid(REVISION, revision_id) deposit.swh_id = dir_id # new id with contextual information - deposit.swh_id_context = persistent_identifier( + deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin_url, "visit": snp_id, "anchor": rev_id, "path": "/", }, ) else: # rejected deposit.status = status deposit.save() return {} diff --git a/swh/deposit/migrations/0018_migrate_swhids.py b/swh/deposit/migrations/0018_migrate_swhids.py index ebac5f14..2c1b5ecd 100644 --- a/swh/deposit/migrations/0018_migrate_swhids.py +++ b/swh/deposit/migrations/0018_migrate_swhids.py @@ -1,363 +1,363 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals import os import logging from django.db import migrations from typing import Any, Dict, Optional, Tuple from swh.core import config from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.identifiers import ( - parse_persistent_identifier, - persistent_identifier, + parse_swhid, + swhid, DIRECTORY, REVISION, SNAPSHOT, ) from swh.storage import get_storage as get_storage_client SWH_PROVIDER_URL = "https://www.softwareheritage.org" logger = logging.getLogger(__name__) swh_storage = None def get_storage() -> Optional[Any]: """Instantiate a storage client """ settings = os.environ.get("DJANGO_SETTINGS_MODULE") if settings != "swh.deposit.settings.production": # Bypass for now return None global swh_storage if not swh_storage: config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( "Production: SWH_CONFIG_FILENAME must be set to the" " configuration file needed!" ) if not os.path.exists(config_file): raise ValueError( "Production: configuration file %s does not exist!" % (config_file,) ) conf = config.load_named_config(config_file) if not conf: raise ValueError( "Production: configuration %s does not exist." % (config_file,) ) storage_config = conf.get("storage") if not storage_config: raise ValueError( "Production: invalid configuration; missing 'storage' config entry." ) swh_storage = get_storage_client(**storage_config) return swh_storage def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]: """Retrieve the snapshot targeting the revision_id for the given origin. """ all_visits = storage.origin_visit_get(origin) for visit in all_visits: if not visit["snapshot"]: continue detail_snapshot = storage.snapshot_get(visit["snapshot"]) if not detail_snapshot: continue for branch_name, branch in detail_snapshot["branches"].items(): if branch["target_type"] == "revision": revision = branch["target"] if hash_to_hex(revision) == revision_id: # Found the snapshot return hash_to_hex(visit["snapshot"]) return None def migrate_deposit_swhid_context_not_null(apps, schema_editor): """Migrate deposit SWHIDs to the new format. Migrate deposit SWHIDs to the new format. Only deposit with status done and swh_id_context not null are concerned. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): - obj_dir = parse_persistent_identifier(deposit.swh_id_context) + obj_dir = parse_swhid(deposit.swh_id_context) assert obj_dir.object_type == DIRECTORY - obj_rev = parse_persistent_identifier(deposit.swh_anchor_id) + obj_rev = parse_swhid(deposit.swh_anchor_id) assert obj_rev.object_type == REVISION if set(obj_dir.metadata.keys()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Starting migration dir_id = obj_dir.object_id origin = obj_dir.metadata["origin"] check_origin = storage.origin_get({"url": origin}) if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue rev_id = obj_rev.object_id # Find the snapshot targeting the revision snp_id = get_snapshot(storage, origin, rev_id) if not snp_id: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update - deposit.swh_id_context = persistent_identifier( + deposit.swh_id_context = swhid( DIRECTORY, dir_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snp_id), - "anchor": persistent_identifier(REVISION, rev_id), + "visit": swhid(SNAPSHOT, snp_id), + "anchor": swhid(REVISION, rev_id), "path": "/", }, ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id == deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert old_swh_anchor_id == deposit.swh_anchor_id logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert old_swh_anchor_id_context == deposit.swh_anchor_id_context # Commit deposit.save() def resolve_origin(deposit_id: int, provider_url: str, external_id: str) -> str: """Resolve the origin from provider-url and external-id For some edge case, only the external_id is used as there is some old inconsistency from testing which exists. """ map_edge_case_origin: Dict[Tuple[int, str], str] = { ( 76, "hal-01588782", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782", ( 87, "hal-01588927", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927", (89, "hal-01588935"): "https://hal-preprod.archives-ouvertes.fr/hal-01588935", ( 88, "hal-01588928", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588928", ( 90, "hal-01588942", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942", (143, "hal-01592430"): "https://hal-preprod.archives-ouvertes.fr/hal-01592430", ( 75, "hal-01588781", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781", } origin = map_edge_case_origin.get((deposit_id, external_id)) if origin: return origin # Some simpler origin edge cases (mostly around the initial deposits) map_origin = { ( SWH_PROVIDER_URL, "je-suis-gpl", ): "https://forge.softwareheritage.org/source/jesuisgpl/", ( SWH_PROVIDER_URL, "external-id", ): "https://hal.archives-ouvertes.fr/external-id", } key = (provider_url, external_id) return map_origin.get(key, f"{provider_url.rstrip('/')}/{external_id}") def migrate_deposit_swhid_context_null(apps, schema_editor): """Migrate deposit SWHIDs to the new format. Migrate deposit whose swh_id_context is not set (initial deposits not migrated at the time). Only deposit with status done and swh_id_context null are concerned. Note: Those deposits have their swh_id being the SWHPIDs of the revision! So we can align them as well. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True ): - obj_rev = parse_persistent_identifier(deposit.swh_id) + obj_rev = parse_swhid(deposit.swh_id) if obj_rev.object_type == DIRECTORY: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Ensuring Migration not done assert obj_rev.object_type == REVISION assert deposit.swh_id is not None assert deposit.swh_id_context is None assert deposit.swh_anchor_id is None assert deposit.swh_anchor_id_context is None rev_id = obj_rev.object_id revisions = list(storage.revision_get([hash_to_bytes(rev_id)])) if not revisions: logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id) continue revision = revisions[0] provider_url = deposit.client.provider_url external_id = deposit.external_id origin = resolve_origin(deposit.id, provider_url, external_id) check_origin = storage.origin_get({"url": origin}) if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue dir_id = hash_to_hex(revision["directory"]) # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # retrieve the snapshot from the archive snp_id = get_snapshot(storage, origin, rev_id) if not snp_id: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # New SWHIDs ids - deposit.swh_id = persistent_identifier(DIRECTORY, dir_id) - deposit.swh_id_context = persistent_identifier( + deposit.swh_id = swhid(DIRECTORY, dir_id) + deposit.swh_id_context = swhid( DIRECTORY, dir_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snp_id), - "anchor": persistent_identifier(REVISION, rev_id), + "visit": swhid(SNAPSHOT, snp_id), + "anchor": swhid(REVISION, rev_id), "path": "/", }, ) # Realign the remaining deposit SWHIDs fields - deposit.swh_anchor_id = persistent_identifier(REVISION, rev_id) - deposit.swh_anchor_id_context = persistent_identifier( + deposit.swh_anchor_id = swhid(REVISION, rev_id) + deposit.swh_anchor_id_context = swhid( REVISION, rev_id, metadata={"origin": origin,} ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id != deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context assert deposit.swh_id_context is not None logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert deposit.swh_anchor_id == old_swh_id assert deposit.swh_anchor_id is not None logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert deposit.swh_anchor_id_context is not None deposit.save() class Migration(migrations.Migration): dependencies = [ ("deposit", "0017_auto_20190925_0906"), ] operations = [ # Migrate and make the operations possibly reversible # https://docs.djangoproject.com/en/3.0/ref/migration-operations/#django.db.migrations.operations.RunPython.noop # noqa migrations.RunPython( migrate_deposit_swhid_context_not_null, reverse_code=migrations.RunPython.noop, ), migrations.RunPython( migrate_deposit_swhid_context_null, reverse_code=migrations.RunPython.noop ), ] diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py index ba07f0bc..fce57539 100644 --- a/swh/deposit/tests/api/test_deposit_private_update_status.py +++ b/swh/deposit/tests/api/test_deposit_private_update_status.py @@ -1,194 +1,194 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json from django.urls import reverse from rest_framework import status -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT +from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT from swh.deposit.api.private.deposit_update_status import MANDATORY_KEYS from swh.deposit.models import Deposit from swh.deposit.config import ( PRIVATE_PUT_DEPOSIT, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, ) PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_PUT_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]), ] def test_update_deposit_status_success_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with load success should require all information to succeed """ deposit = ready_deposit_verified expected_status = DEPOSIT_STATUS_LOAD_SUCCESS origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" full_body_info = { "status": DEPOSIT_STATUS_LOAD_SUCCESS, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): - dir_id = persistent_identifier(DIRECTORY, directory_id) - rev_id = persistent_identifier(REVISION, revision_id) - snp_id = persistent_identifier(SNAPSHOT, snapshot_id) + dir_id = swhid(DIRECTORY, directory_id) + rev_id = swhid(REVISION, revision_id) + snp_id = swhid(SNAPSHOT, snapshot_id) expected_swh_id = "swh:1:dir:%s" % directory_id expected_swh_id_context = ( f"{dir_id};origin={origin_url};" + f"visit={snp_id};anchor={rev_id};path=/" ) response = authenticated_client.put( url, content_type="application/json", data=json.dumps(full_body_info), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == expected_status assert deposit.swh_id == expected_swh_id assert deposit.swh_id_context == expected_swh_id_context # Reset deposit deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_rejected_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with rejected status needs few information to succeed """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_FAILURE}), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE assert deposit.swh_id is None assert deposit.swh_id_context is None # Reset status deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_success_with_incomplete_data( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit status with status success and incomplete information should fail """ deposit = ready_deposit_verified origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" new_status = DEPOSIT_STATUS_LOAD_SUCCESS full_body_info = { "status": new_status, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): for key in MANDATORY_KEYS: # Crafting body with missing information so that it raises body = copy.deepcopy(full_body_info) body.pop(key) # make the body incomplete response = authenticated_client.put( url, content_type="application/json", data=json.dumps(body), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( f"deposit status to {new_status} requires information {key}" in response.content.decode("utf-8") ) def test_update_deposit_status_will_fail_with_unknown_status( authenticated_client, deposit_collection, ready_deposit_verified ): """Unknown status for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": "unknown"}) ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_will_fail_with_no_status_key( authenticated_client, deposit_collection, ready_deposit_verified ): """No status provided for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"something": "something"}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_success_without_swh_id_fail( authenticated_client, deposit_collection, ready_deposit_verified ): """Providing successful status without swh_id should return a 400 """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_SUCCESS}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 5bbe064e..5158c4e7 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,428 +1,428 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import base64 import pytest import psycopg2 from django.urls import reverse from django.test.utils import setup_databases # type: ignore # mypy is asked to ignore the import statement above because setup_databases # is not part of the d.t.utils.__all__ variable. from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT from rest_framework import status from rest_framework.test import APIClient from typing import Mapping from swh.scheduler import get_scheduler from swh.scheduler.tests.conftest import * # noqa -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT +from swh.model.identifiers import DIRECTORY, swhid, REVISION, SNAPSHOT from swh.deposit.config import setup_django_for from swh.deposit.parsers import parse_xml from swh.deposit.config import SWHDefaultConfig from swh.deposit.config import ( COL_IRI, EDIT_SE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_LOAD_FAILURE, ) from swh.deposit.tests.common import create_arborescence_archive TEST_USER = { "username": "test", "password": "password", "email": "test@example.org", "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": "test"}, } TEST_CONFIG = { "max_upload_size": 500, "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", "checks": False, "provider": { "provider_name": "", "provider_type": "deposit_client", "provider_url": "", "metadata": {}, }, "tool": { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, }, } def pytest_configure(): setup_django_for("testing") @pytest.fixture() def deposit_config(): return TEST_CONFIG @pytest.fixture(autouse=True) def deposit_autoconfig(monkeypatch, deposit_config, swh_scheduler_config): """Enforce config for deposit classes inherited from SWHDefaultConfig.""" def mock_parse_config(*args, **kw): config = deposit_config.copy() config["scheduler"] = { "cls": "local", "args": swh_scheduler_config, } return config monkeypatch.setattr(SWHDefaultConfig, "parse_config_file", mock_parse_config) scheduler = get_scheduler("local", swh_scheduler_config) task_type = { "type": "load-deposit", "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", "description": "why does this have not-null constraint?", } scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory() deposit_another_collection = deposit_collection_factory("another-collection") @pytest.fixture def deposit_user(db, deposit_collection): """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient try: user = DepositClient._default_manager.get(username=TEST_USER["username"]) except DepositClient.DoesNotExist: user = DepositClient._default_manager.create_user( username=TEST_USER["username"], email=TEST_USER["email"], password=TEST_USER["password"], provider_url=TEST_USER["provider_url"], domain=TEST_USER["domain"], ) user.collections = [deposit_collection.id] user.save() return user @pytest.fixture def client(): """Override pytest-django one which does not work for djangorestframework. """ return APIClient() # <- drf's client @pytest.yield_fixture def authenticated_client(client, deposit_user): """Returned a logged client """ _token = "%s:%s" % (deposit_user.username, TEST_USER["password"]) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) yield client client.logout() @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def create_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) # then assert response.status_code == status.HTTP_201_CREATED from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(external_id=external_id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. """ deposit = create_deposit( authenticated_client, collection_name, sample_archive, external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL, ) response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"), HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-partial", deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10" snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0" - deposit.swh_id = persistent_identifier(DIRECTORY, directory_id) - deposit.swh_id_context = persistent_identifier( + deposit.swh_id = swhid(DIRECTORY, directory_id) + deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snapshot_id), - "anchor": persistent_identifier(REVISION, revision_id), + "visit": swhid(SNAPSHOT, snapshot_id), + "anchor": swhid(REVISION, revision_id), "path": "/", }, ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version