diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index ab6647b1..563f1a80 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,187 +1,191 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from itertools import chain import re from shutil import get_unpack_formats import tarfile from typing import Dict, Optional, Tuple import zipfile from rest_framework import status from rest_framework.request import Request from swh.scheduler.utils import create_oneshot_task_dict from . import APIPrivateView, DepositReadMixin from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED from ...models import Deposit, DepositRequest from ..checks import check_metadata from ..common import APIGet MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) class APIChecks(APIPrivateView, APIGet, DepositReadMixin): - """Dedicated class to read a deposit's raw archives content. + """Dedicated class to trigger the deposit checks on deposit archives and metadata. Only GET is supported. """ def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns - tuple (status, error_detail): True, None if all archives + tuple (status, details): True, None if all archives are ok, (False, ) otherwise. """ requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} def _check_archive( self, archive_request: DepositRequest ) -> Tuple[bool, Optional[str]]: """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. """ archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): with zipfile.ZipFile(archive_path) as zipfile_: files = zipfile_.namelist() elif tarfile.is_tarfile(archive_path): with tarfile.open(archive_path) as tarfile_: files = tarfile_.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None def process_get( self, req: Request, collection_name: str, deposit: Deposit ) -> Tuple[int, Dict, str]: - """Build a unique tarball from the multiple received and stream that - content to the client. + """Trigger the checks on the deposit archives and then on the deposit metadata. + If any problems (or warnings) are raised, the deposit status and status detail + are updated accordingly. If all checks are ok, the deposit status is updated to + the 'verified' status (details updated with warning if any) and a loading task + is scheduled for the deposit to be ingested. Otherwise, the deposit is marked as + 'rejected' with the error details. A json response is returned to the caller + with the deposit checks. Args: req: Client request collection_name: Collection owning the deposit deposit: Deposit concerned by the reading Returns: - Tuple status, stream of content, content-type + Tuple (status, json response, content-type) """ metadata, _ = self._metadata_get(deposit) - problems: Dict = {} + details_dict: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors - archives_status, error_detail = self._check_deposit_archives(deposit) - if not archives_status: - assert error_detail is not None - problems.update(error_detail) - - metadata_status, error_detail = check_metadata(metadata) - if not metadata_status: - assert error_detail is not None - problems.update(error_detail) - - deposit_status = archives_status and metadata_status - - # if any problems arose, the deposit is rejected - if not deposit_status: - deposit.status = DEPOSIT_STATUS_REJECTED - deposit.status_detail = problems - response = { - "status": deposit.status, - "details": deposit.status_detail, - } - else: - deposit.status = DEPOSIT_STATUS_VERIFIED - response = { - "status": deposit.status, - } - if not deposit.load_task_id and self.config["checks"]: - url = deposit.origin_url - task = create_oneshot_task_dict( - "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 - ) - load_task_id = self.scheduler.create_tasks([task])[0]["id"] - deposit.load_task_id = load_task_id + archives_status_ok, details = self._check_deposit_archives(deposit) + if not archives_status_ok: + assert details is not None + details_dict.update(details) + + metadata_status_ok, details = check_metadata(metadata) + # Ensure in case of error, we do have the rejection details + assert metadata_status_ok or (not metadata_status_ok and details is not None) + # we can have warnings even if checks are ok (e.g. missing suggested field) + details_dict.update(details or {}) + + deposit_status_ok = archives_status_ok and metadata_status_ok + # if any details_dict arose, the deposit is rejected + deposit.status = ( + DEPOSIT_STATUS_VERIFIED if deposit_status_ok else DEPOSIT_STATUS_REJECTED + ) + response: Dict = { + "status": deposit.status, + } + if details_dict: + deposit.status_detail = details_dict + response["details"] = details_dict + + # Deposit ok, then we schedule the deposit loading task (if not already done) + if deposit_status_ok and not deposit.load_task_id and self.config["checks"]: + url = deposit.origin_url + task = create_oneshot_task_dict( + "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 + ) + load_task_id = self.scheduler.create_tasks([task])[0]["id"] + deposit.load_task_id = load_task_id deposit.save() return status.HTTP_200_OK, response, "application/json" diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py index d4fd5bd7..9456e094 100644 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -1,218 +1,226 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse import pytest from rest_framework import status from swh.deposit.api.checks import ( MANDATORY_FIELDS_MISSING, METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, ) from swh.deposit.api.private.deposit_check import ( MANDATORY_ARCHIVE_INVALID, MANDATORY_ARCHIVE_MISSING, MANDATORY_ARCHIVE_UNSUPPORTED, ) from swh.deposit.config import ( COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, create_archive_with_archive, ) PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), ] @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_ok( authenticated_client, deposit_collection, ready_deposit_ok, extension ): """Proper deposit should succeed the checks (-> status ready) """ deposit = ready_deposit_ok for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED + # Deposit is ok but it's missing suggested fields in its metadata detected by + # the checks + status_detail = deposit.status_detail["metadata"] + assert len(status_detail) == 1 + suggested = status_detail[0] + assert suggested["summary"] == SUGGESTED_FIELDS_MISSING + assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) + deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_invalid_tarball( tmp_path, authenticated_client, deposit_collection, extension ): """Deposit with tarball (of 1 tarball) should fail the checks: rejected """ deposit = create_deposit_archive_with_archive( tmp_path, extension, authenticated_client, deposit_collection.name ) for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED def test_deposit_ko_missing_tarball( authenticated_client, deposit_collection, ready_deposit_only_metadata ): """Deposit without archive should fail the checks: rejected """ deposit = ready_deposit_only_metadata assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_deposit_ko_unsupported_tarball( tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive ): """Deposit with an unsupported tarball should fail the checks: rejected """ deposit = ready_deposit_invalid_archive assert DEPOSIT_STATUS_DEPOSITED == deposit.status for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED # metadata check failure assert len(details["metadata"]) == 2 mandatory = details["metadata"][0] assert mandatory["summary"] == MANDATORY_FIELDS_MISSING assert set(mandatory["fields"]) == set( [ "atom:author or codemeta:author", "atom:name or atom:title or codemeta:name", ] ) suggested = details["metadata"][1] assert suggested["summary"] == SUGGESTED_FIELDS_MISSING assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY]) deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_deposit_metadata_ok( authenticated_client, deposit_collection, ready_deposit_ok ): """Proper deposit should succeed the checks (-> status ready) with all **MUST** metadata using the codemeta metadata test set """ deposit = ready_deposit_ok assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def create_deposit_archive_with_archive( root_path, archive_extension, client, collection_name ): # we create the holding archive to a given extension archive = create_arborescence_archive( root_path, "archive1", "file1", b"some content in file", extension=archive_extension, ) # now we create an archive holding the first created archive invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) # we deposit it response = client.post( reverse(COL_IRI, args=[collection_name]), content_type="application/x-tar", data=invalid_archive["data"], CONTENT_LENGTH=invalid_archive["length"], HTTP_MD5SUM=invalid_archive["md5sum"], HTTP_SLUG="external-id", HTTP_IN_PROGRESS=False, HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_status = response_content["swh:deposit_status"] assert deposit_status == DEPOSIT_STATUS_DEPOSITED deposit_id = int(response_content["swh:deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status return deposit