diff --git a/docs/endpoints/status.rst b/docs/endpoints/status.rst --- a/docs/endpoints/status.rst +++ b/docs/endpoints/status.rst @@ -3,15 +3,14 @@ .. http:get:: /1/// - Display deposit's status in regards to loading. - + Returns deposit's status. The different statuses: - **partial**: multipart deposit is still ongoing - - **deposited**: deposit completed + - **deposited**: deposit completed, ready for checks - **rejected**: deposit failed the checks - - **verified**: content and metadata verified + - **verified**: content and metadata verified, ready for loading - **loading**: loading in-progress - **done**: loading completed successfully - **failed**: the deposit loading has failed @@ -24,6 +23,51 @@ :statuscode 404: access to an unknown deposit +Rejected deposit +~~~~~~~~~~~~~~~~ + +It so happens that deposit could be rejected. In that case, the +`deposit_status_detail` entry will explain failed checks. + +Many reasons are possibles, here are some: + +- Deposit without software archive (main goal of the deposit is to + deposit software source code) + +- Deposit with malformed software archive (i.e archive within archive) + +- Deposit with invalid software archive (corrupted archive, although, + this one should happen during upload and not during checks) + +- Deposit with unsupported archive format + +- Deposit with missing metadata + Sample response ~~~~~~~~~~~~~~~ + + Successful deposit: + + .. code:: xml + + + 150 + done + The deposit has been successfully loaded into the Software Heritage archive + swh:1:rev:c648730299c2a4f4df3c1fe6e527ef3681f9527e + + + Rejected deposit: + + .. code:: xml + + + 148 + rejected + - At least one url field must be compatible with the client's domain name (codemeta:url) + diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py --- a/swh/deposit/api/deposit_status.py +++ b/swh/deposit/api/deposit_status.py @@ -26,10 +26,10 @@ 'summary': , 'fields': , }], - 'archive': { + 'archive': [{ 'summary': , - 'fields': [], - } + 'fields': , + }] } @@ -45,19 +45,23 @@ return None msg = [] - if 'metadata' in status_detail: - for data in status_detail['metadata']: - fields = ', '.join(data['fields']) - msg.append('- %s (%s)\n' % (data['summary'], fields)) - - for key in ['url', 'archive']: - if key in status_detail: - _detail = status_detail[key] - fields = _detail.get('fields') - suffix_msg = '' - if fields: - suffix_msg = ' (%s)' % ', '.join(fields) - msg.append('- %s%s\n' % (_detail['summary'], suffix_msg)) + for key in ['metadata', 'archive']: + _detail = status_detail.get(key) + if _detail: + for data in _detail: + suffix_msg = '' + fields = data.get('fields') + if fields: + suffix_msg = ' (%s)' % ', '.join(fields) + msg.append('- %s%s\n' % (data['summary'], suffix_msg)) + + _detail = status_detail.get('url') + if _detail: + fields = _detail.get('fields') + suffix_msg = '' + if fields: + suffix_msg = ' (%s)' % ', '.join(fields) + msg.append('- %s%s\n' % (_detail['summary'], suffix_msg)) if not msg: return None diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -4,10 +4,13 @@ # See top-level LICENSE file for more information import json -import patoolib +import re +import tarfile +import zipfile from rest_framework import status + from . import DepositReadMixin from ..common import SWHGetDepositAPI, SWHPrivateAPIView from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED @@ -16,10 +19,19 @@ MANDATORY_FIELDS_MISSING = 'Mandatory fields are missing' ALTERNATE_FIELDS_MISSING = 'Mandatory alternate fields are missing' - -MANDATORY_ARCHIVE_UNREADABLE = 'Deposit was rejected because at least one of its associated archives was not readable' # noqa -MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected' INCOMPATIBLE_URL_FIELDS = "At least one url field must be compatible with the client's domain name" # noqa +MANDATORY_ARCHIVE_UNREADABLE = 'At least one of its associated archives is not readable' # noqa +MANDATORY_ARCHIVE_INVALID = 'Mandatory archive is invalid (i.e contains only one archive)' # noqa +MANDATORY_ARCHIVE_UNSUPPORTED = 'Mandatory archive type is not supported' +MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected' + +ARCHIVE_EXTENSIONS = [ + 'zip', 'tar', 'tar.gz', 'xz', 'tar.xz', 'bz2', + 'tar.bz2', 'Z', 'tar.Z', 'tgz', '7z' +] + +PATTERN_ARCHIVE_EXTENSION = re.compile( + r'.*\.(%s)$' % '|'.join(ARCHIVE_EXTENSIONS)) class SWHChecksDeposit(SWHGetDepositAPI, SWHPrivateAPIView, DepositReadMixin): @@ -43,42 +55,62 @@ deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, { - 'archive': { + 'archive': [{ 'summary': MANDATORY_ARCHIVE_MISSING, - } + }] } - rejected_dr_ids = [] - for dr in requests: - _path = dr.archive.path - check = self._check_archive(_path) + errors = [] + for archive_request in requests: + check, error_message = self._check_archive(archive_request) if not check: - rejected_dr_ids.append(dr.id) + errors.append({ + 'summary': error_message, + 'fields': [archive_request.id] + }) - if rejected_dr_ids: - return False, { - 'archive': { - 'summary': MANDATORY_ARCHIVE_UNREADABLE, - 'fields': rejected_dr_ids, - }} - return True, None + if not errors: + return True, None + return False, { + 'archive': errors + } + + def _check_archive(self, archive_request): + """Check that a deposit associated archive is ok: + - readable + - supported archive format + - valid content: the archive does not contain a single archive file - def _check_archive(self, archive_path): - """Check that a given archive is actually ok for reading. + If any of those checks are not ok, return the corresponding + failing check. Args: - archive_path (str): Archive to check + archive_path (DepositRequest): Archive to check Returns: - True if archive is successfully read, False otherwise. + (True, None) if archive is check compliant, (False, + ) otherwise. """ + archive_path = archive_request.archive.path try: - patoolib.test_archive(archive_path, verbosity=-1) + if zipfile.is_zipfile(archive_path): + with zipfile.ZipFile(archive_path) as f: + files = f.namelist() + elif tarfile.is_tarfile(archive_path): + with tarfile.open(archive_path) as f: + files = f.getnames() + else: + return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: - return False - else: - return True + return False, MANDATORY_ARCHIVE_UNREADABLE + if len(files) > 1: + return True, None + element = files[0] + if PATTERN_ARCHIVE_EXTENSION.match(element): + # archive in archive! + return False, MANDATORY_ARCHIVE_INVALID + return True, None def _check_metadata(self, metadata): """Check to execute on all metadata for mandatory field presence. @@ -98,7 +130,7 @@ } alternate_fields = { ('name', 'title'): False, # alternate field, at least one - # of them must be present + # of them must be present } for field, value in metadata.items(): diff --git a/swh/deposit/models.py b/swh/deposit/models.py --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -13,9 +13,11 @@ from django.db import models from django.utils.timezone import now -from .config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED -from .config import DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS -from .config import DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_REJECTED +from .config import ( + DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_REJECTED +) class Dbversion(models.Model): @@ -43,7 +45,7 @@ ('expired', 'expired'), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), - ('rejected', 'rejected'), + (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ('loading', 'loading'), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), @@ -60,7 +62,7 @@ '(tarball ok, metadata, etc...)', DEPOSIT_STATUS_VERIFIED: 'Deposit is fully received, checked, and ' 'ready for loading', - 'rejected': 'Deposit failed the checks', + DEPOSIT_STATUS_REJECTED: 'Deposit failed the checks', 'loading': "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: 'The deposit has been successfully ' 'loaded into the Software Heritage archive', diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py --- a/swh/deposit/tests/api/test_deposit_binary.py +++ b/swh/deposit/tests/api/test_deposit_binary.py @@ -16,7 +16,7 @@ from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml -from ..common import BasicTestCase, WithAuthTestCase, create_arborescence_zip +from ..common import BasicTestCase, WithAuthTestCase, create_arborescence_archive from ..common import FileSystemCreationRoutine @@ -329,7 +329,7 @@ # given url = reverse(COL_IRI, args=[self.collection.name]) - archive = create_arborescence_zip( + archive = create_arborescence_archive( self.root_path, 'archive2', 'file2', b'some content in file', up_to_size=TEST_CONFIG['max_upload_size']) @@ -457,7 +457,7 @@ self.assertRegex(deposit_request.archive.name, self.archive['name']) # 2nd archive to upload - archive2 = create_arborescence_zip( + archive2 = create_arborescence_archive( self.root_path, 'archive2', 'file2', b'some other content in file') # uri to update the content @@ -555,7 +555,7 @@ # Testing all update/add endpoint should fail # since the status is ready - archive2 = create_arborescence_zip( + archive2 = create_arborescence_archive( self.root_path, 'archive2', 'file2', b'some content in file 2') # replacing file is no longer possible since the deposit's diff --git a/swh/deposit/tests/api/test_deposit_check.py b/swh/deposit/tests/api/test_deposit_check.py --- a/swh/deposit/tests/api/test_deposit_check.py +++ b/swh/deposit/tests/api/test_deposit_check.py @@ -17,9 +17,10 @@ DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED ) from swh.deposit.api.private.deposit_check import ( - SWHChecksDeposit, + SWHChecksDeposit, MANDATORY_ARCHIVE_INVALID, MANDATORY_FIELDS_MISSING, INCOMPATIBLE_URL_FIELDS, - MANDATORY_ARCHIVE_UNREADABLE, ALTERNATE_FIELDS_MISSING + MANDATORY_ARCHIVE_UNSUPPORTED, ALTERNATE_FIELDS_MISSING, + MANDATORY_ARCHIVE_MISSING ) from swh.deposit.models import Deposit @@ -61,8 +62,62 @@ self.assertEquals(deposit.status, DEPOSIT_STATUS_VERIFIED) @istest - def deposit_ko(self): - """Invalid deposit should fail the checks (-> status rejected) + def deposit_invalid_tarball(self): + """Deposit with tarball (of 1 tarball) should fail the checks: rejected + + """ + for archive_extension in ['zip', 'tar', 'tar.gz', 'tar.bz2', 'tar.xz']: + deposit_id = self.create_deposit_archive_with_archive( + archive_extension) + + deposit = Deposit.objects.get(pk=deposit_id) + self.assertEquals(DEPOSIT_STATUS_DEPOSITED, deposit.status) + + url = reverse(PRIVATE_CHECK_DEPOSIT, + args=[self.collection.name, deposit.id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + data = json.loads(response.content.decode('utf-8')) + self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED) + details = data['details'] + # archive checks failure + self.assertEqual(len(details['archive']), 1) + self.assertEqual(details['archive'][0]['summary'], + MANDATORY_ARCHIVE_INVALID) + + deposit = Deposit.objects.get(pk=deposit.id) + self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED) + + @istest + def deposit_ko_missing_tarball(self): + """Deposit without archive should fail the checks: rejected + + """ + deposit_id = self.create_deposit_ready() # no archive, only atom + deposit = Deposit.objects.get(pk=deposit_id) + self.assertEquals(DEPOSIT_STATUS_DEPOSITED, deposit.status) + + url = reverse(PRIVATE_CHECK_DEPOSIT, + args=[self.collection.name, deposit.id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + data = json.loads(response.content.decode('utf-8')) + self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED) + details = data['details'] + # archive checks failure + self.assertEqual(len(details['archive']), 1) + self.assertEqual(details['archive'][0]['summary'], + MANDATORY_ARCHIVE_MISSING) + deposit = Deposit.objects.get(pk=deposit.id) + self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED) + + @istest + def deposit_ko_unsupported_tarball(self): + """Deposit with an unsupported tarball should fail the checks: rejected """ deposit_id = self.create_deposit_with_invalid_archive() @@ -80,9 +135,9 @@ self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED) details = data['details'] # archive checks failure - self.assertEqual(len(details['archive']['fields']), 1) - self.assertEqual(details['archive']['summary'], - MANDATORY_ARCHIVE_UNREADABLE) + self.assertEqual(len(details['archive']), 1) + self.assertEqual(details['archive'][0]['summary'], + MANDATORY_ARCHIVE_UNSUPPORTED) # metadata check failure self.assertEqual(len(details['metadata']), 2) mandatory = details['metadata'][0] diff --git a/swh/deposit/tests/api/test_deposit_read_archive.py b/swh/deposit/tests/api/test_deposit_read_archive.py --- a/swh/deposit/tests/api/test_deposit_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_read_archive.py @@ -17,7 +17,7 @@ from swh.deposit.tests import TEST_CONFIG from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine -from ..common import FileSystemCreationRoutine, create_arborescence_zip +from ..common import FileSystemCreationRoutine, create_arborescence_archive @attr('fs') @@ -27,7 +27,7 @@ def setUp(self): super().setUp() - self.archive2 = create_arborescence_zip( + self.archive2 = create_arborescence_archive( self.root_path, 'archive2', 'file2', b'some other content in file') self.workdir = os.path.join(self.root_path, 'workdir') diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py --- a/swh/deposit/tests/api/test_deposit_status.py +++ b/swh/deposit/tests/api/test_deposit_status.py @@ -143,16 +143,16 @@ 'fields': ['name or title', 'url or badurl'] } ], - 'archive': { + 'archive': [{ 'summary': 'Unreadable archive', - 'fields': ['1', '2'], - }, + 'fields': ['1'], + }], } expected_status_detail = '''- Mandatory fields missing (url, title) - Alternate fields missing (name or title, url or badurl) +- Unreadable archive (1) - At least one url field must be compatible with the client's domain name. The following url fields failed the check (blahurl, testurl) -- Unreadable archive (1, 2) ''' # noqa actual_status_detail = convert_status_detail(status_detail) @@ -172,9 +172,21 @@ 'fields': ['name'], }, ], + 'archive': [ + { + 'summary': 'Invalid archive', + 'fields': ['2'], + }, + { + 'summary': 'Unsupported archive', + 'fields': ['1'], + } + ], } expected_status_detail = '''- Mandatory fields missing (name) +- Invalid archive (2) +- Unsupported archive (1) - At least one compatible url field. Failed (testurl) ''' diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -12,7 +12,7 @@ from swh.deposit.config import EDIT_SE_IRI, EM_IRI from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine -from ..common import FileSystemCreationRoutine, create_arborescence_zip +from ..common import FileSystemCreationRoutine, create_arborescence_archive class DepositUpdateOrReplaceExistingDataTest( @@ -34,7 +34,7 @@ bar """ - self.archive2 = create_arborescence_zip( + self.archive2 = create_arborescence_archive( self.root_path, 'archive2', 'file2', b'some other content in file') @istest diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -7,6 +7,7 @@ import hashlib import os import shutil +import tarfile import tempfile from django.core.urlresolvers import reverse @@ -28,11 +29,74 @@ from swh.core import tarball -def create_arborescence_zip(root_path, archive_name, filename, content, - up_to_size=None): +def compute_info(archive_path): + """Given a path, compute information on path. + + """ + with open(archive_path, 'rb') as f: + length = 0 + sha1sum = hashlib.sha1() + md5sum = hashlib.md5() + data = b'' + for chunk in f: + sha1sum.update(chunk) + md5sum.update(chunk) + length += len(chunk) + data += chunk + + return { + 'dir': os.path.dirname(archive_path), + 'name': os.path.basename(archive_path), + 'path': archive_path, + 'length': length, + 'sha1sum': sha1sum.hexdigest(), + 'md5sum': md5sum.hexdigest(), + 'data': data + } + + +def _compress(path, extension, dir_path): + """Compress path according to extension + + """ + if extension == 'zip' or extension == 'tar': + return tarball.compress(path, extension, dir_path) + elif '.' in extension: + split_ext = extension.split('.') + if split_ext[0] != 'tar': + raise ValueError( + 'Development error, only zip or tar archive supported, ' + '%s not supported' % extension) + + # deal with specific tar + mode = split_ext[1] + supported_mode = ['xz', 'gz', 'bz2'] + if mode not in supported_mode: + raise ValueError( + 'Development error, only %s supported, %s not supported' % ( + supported_mode, mode)) + files = tarball._ls(dir_path) + with tarfile.open(path, 'w:%s' % mode) as t: + for fpath, fname in files: + t.add(fpath, arcname=fname, recursive=False) + + return path + + +def create_arborescence_archive(root_path, archive_name, filename, content, + up_to_size=None, extension='zip'): """Build an archive named archive_name in the root_path. This archive contains one file named filename with the content content. + Args: + root_path (str): Location path of the archive to create + archive_name (str): Archive's name (without extension) + filename (str): Archive's content is only one filename + content (bytes): Content of the filename + up_to_size (int | None): Fill in the blanks size to oversize + or complete an archive's size + extension (str): Extension of the archive to write (default is zip) + Returns: dict with the keys: - dir: the directory of that archive @@ -59,29 +123,19 @@ f.write(b'0'*batch_size) count += batch_size - zip_path = dir_path + '.zip' - zip_path = tarball.compress(zip_path, 'zip', dir_path) + _path = '%s.%s' % (dir_path, extension) + _path = _compress(_path, extension, dir_path) + return compute_info(_path) - with open(zip_path, 'rb') as f: - length = 0 - sha1sum = hashlib.sha1() - md5sum = hashlib.md5() - data = b'' - for chunk in f: - sha1sum.update(chunk) - md5sum.update(chunk) - length += len(chunk) - data += chunk - return { - 'dir': archive_path_dir, - 'name': archive_name, - 'data': data, - 'path': zip_path, - 'sha1sum': sha1sum.hexdigest(), - 'md5sum': md5sum.hexdigest(), - 'length': length, - } +def create_archive_with_archive(root_path, name, archive): + """Create an archive holding another. + + """ + invalid_archive_path = os.path.join(root_path, name) + with tarfile.open(invalid_archive_path, 'w:gz') as _archive: + _archive.add(archive['path'], arcname=archive['name']) + return compute_info(invalid_archive_path) @attr('fs') @@ -95,7 +149,7 @@ self.root_path = '/tmp/swh-deposit/test/build-zip/' os.makedirs(self.root_path, exist_ok=True) - self.archive = create_arborescence_zip( + self.archive = create_arborescence_archive( self.root_path, 'archive1', 'file1', b'some content in file') self.atom_entry = b""" @@ -158,6 +212,36 @@ deposit_id = int(response_content['deposit_id']) return deposit_id + def create_deposit_archive_with_archive(self, archive_extension): + # we create the holding archive to a given extension + archive = create_arborescence_archive( + self.root_path, 'archive1', 'file1', b'some content in file', + extension=archive_extension) + + # now we create an archive holding the first created archive + invalid_archive = create_archive_with_archive( + self.root_path, 'invalid.tar.gz', archive) + + # we deposit it + response = self.client.post( + reverse(COL_IRI, args=[self.collection.name]), + content_type='application/x-tar', + data=invalid_archive['data'], + CONTENT_LENGTH=invalid_archive['length'], + HTTP_MD5SUM=invalid_archive['md5sum'], + HTTP_SLUG='external-id', + HTTP_IN_PROGRESS=False, + HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( + invalid_archive['name'], )) + + # then + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + response_content = parse_xml(BytesIO(response.content)) + _status = response_content['deposit_status'] + self.assertEqual(_status, DEPOSIT_STATUS_DEPOSITED) + deposit_id = int(response_content['deposit_id']) + return deposit_id + def update_binary_deposit(self, deposit_id, status_partial=False): # update existing deposit with atom entry metadata response = self.client.post(