Page MenuHomeSoftware Heritage

D380.id1203.diff
No OneTemporary

D380.id1203.diff

diff --git a/docs/endpoints/status.rst b/docs/endpoints/status.rst
--- a/docs/endpoints/status.rst
+++ b/docs/endpoints/status.rst
@@ -3,15 +3,14 @@
.. http:get:: /1/<collection-name>/<deposit-id>/
- Display deposit's status in regards to loading.
-
+ Returns deposit's status.
The different statuses:
- **partial**: multipart deposit is still ongoing
- - **deposited**: deposit completed
+ - **deposited**: deposit completed, ready for checks
- **rejected**: deposit failed the checks
- - **verified**: content and metadata verified
+ - **verified**: content and metadata verified, ready for loading
- **loading**: loading in-progress
- **done**: loading completed successfully
- **failed**: the deposit loading has failed
@@ -24,6 +23,51 @@
:statuscode 404: access to an unknown deposit
+Rejected deposit
+~~~~~~~~~~~~~~~~
+
+It so happens that deposit could be rejected. In that case, the
+`deposit_status_detail` entry will explain failed checks.
+
+Many reasons are possibles, here are some:
+
+- Deposit without software archive (main goal of the deposit is to
+ deposit software source code)
+
+- Deposit with malformed software archive (i.e archive within archive)
+
+- Deposit with invalid software archive (corrupted archive, although,
+ this one should happen during upload and not during checks)
+
+- Deposit with unsupported archive format
+
+- Deposit with missing metadata
+
Sample response
~~~~~~~~~~~~~~~
+
+ Successful deposit:
+
+ .. code:: xml
+
+ <entry xmlns="http://www.w3.org/2005/Atom"
+ xmlns:sword="http://purl.org/net/sword/"
+ xmlns:dcterms="http://purl.org/dc/terms/">
+ <deposit_id>150</deposit_id>
+ <deposit_status>done</deposit_status>
+ <deposit_status_detail>The deposit has been successfully loaded into the Software Heritage archive</deposit_status_detail>
+ <deposit_swh_id>swh:1:rev:c648730299c2a4f4df3c1fe6e527ef3681f9527e</deposit_swh_id>
+ </entry>
+
+ Rejected deposit:
+
+ .. code:: xml
+
+ <entry xmlns="http://www.w3.org/2005/Atom"
+ xmlns:sword="http://purl.org/net/sword/"
+ xmlns:dcterms="http://purl.org/dc/terms/">
+ <deposit_id>148</deposit_id>
+ <deposit_status>rejected</deposit_status>
+ <deposit_status_detail>- At least one url field must be compatible with the client&#39;s domain name (codemeta:url)</deposit_status_detail>
+ </entry>
diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py
--- a/swh/deposit/api/deposit_status.py
+++ b/swh/deposit/api/deposit_status.py
@@ -26,10 +26,10 @@
'summary': <summary-string>,
'fields': <impacted-fields-list>,
}],
- 'archive': {
+ 'archive': [{
'summary': <summary-string>,
- 'fields': [],
- }
+ 'fields': <impacted-fields-list>,
+ }]
}
@@ -45,19 +45,23 @@
return None
msg = []
- if 'metadata' in status_detail:
- for data in status_detail['metadata']:
- fields = ', '.join(data['fields'])
- msg.append('- %s (%s)\n' % (data['summary'], fields))
-
- for key in ['url', 'archive']:
- if key in status_detail:
- _detail = status_detail[key]
- fields = _detail.get('fields')
- suffix_msg = ''
- if fields:
- suffix_msg = ' (%s)' % ', '.join(fields)
- msg.append('- %s%s\n' % (_detail['summary'], suffix_msg))
+ for key in ['metadata', 'archive']:
+ _detail = status_detail.get(key)
+ if _detail:
+ for data in _detail:
+ suffix_msg = ''
+ fields = data.get('fields')
+ if fields:
+ suffix_msg = ' (%s)' % ', '.join(fields)
+ msg.append('- %s%s\n' % (data['summary'], suffix_msg))
+
+ _detail = status_detail.get('url')
+ if _detail:
+ fields = _detail.get('fields')
+ suffix_msg = ''
+ if fields:
+ suffix_msg = ' (%s)' % ', '.join(fields)
+ msg.append('- %s%s\n' % (_detail['summary'], suffix_msg))
if not msg:
return None
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -4,7 +4,9 @@
# See top-level LICENSE file for more information
import json
-import patoolib
+import re
+import tarfile
+import zipfile
from rest_framework import status
@@ -16,10 +18,11 @@
MANDATORY_FIELDS_MISSING = 'Mandatory fields are missing'
ALTERNATE_FIELDS_MISSING = 'Mandatory alternate fields are missing'
-
-MANDATORY_ARCHIVE_UNREADABLE = 'Deposit was rejected because at least one of its associated archives was not readable' # noqa
-MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected'
INCOMPATIBLE_URL_FIELDS = "At least one url field must be compatible with the client's domain name" # noqa
+MANDATORY_ARCHIVE_UNREADABLE = 'At least one of its associated archives is not readable' # noqa
+MANDATORY_ARCHIVE_INVALID = 'Mandatory archive is invalid (i.e contains only one archive)' # noqa
+MANDATORY_ARCHIVE_UNSUPPORTED = 'Mandatory archive type is not supported'
+MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected'
class SWHChecksDeposit(SWHGetDepositAPI, SWHPrivateAPIView, DepositReadMixin):
@@ -43,42 +46,63 @@
deposit, request_type=ARCHIVE_TYPE))
if len(requests) == 0: # no associated archive is refused
return False, {
- 'archive': {
+ 'archive': [{
'summary': MANDATORY_ARCHIVE_MISSING,
- }
+ }]
}
- rejected_dr_ids = []
- for dr in requests:
- _path = dr.archive.path
- check = self._check_archive(_path)
+ errors = []
+ for archive_request in requests:
+ check, error_message = self._check_archive(archive_request)
if not check:
- rejected_dr_ids.append(dr.id)
+ errors.append({
+ 'summary': error_message,
+ 'fields': [archive_request.id]
+ })
- if rejected_dr_ids:
- return False, {
- 'archive': {
- 'summary': MANDATORY_ARCHIVE_UNREADABLE,
- 'fields': rejected_dr_ids,
- }}
- return True, None
+ if not errors:
+ return True, None
+ return False, {
+ 'archive': errors
+ }
+
+ def _check_archive(self, archive_request):
+ """Check that a deposit associated archive is ok:
+ - readable
+ - supported archive format
+ - content of the archive is not a single archive
- def _check_archive(self, archive_path):
- """Check that a given archive is actually ok for reading.
+ If any of those checks are not ok, return the corresponding
+ failing check.
Args:
- archive_path (str): Archive to check
+ archive_path (DepositRequest): Archive to check
Returns:
- True if archive is successfully read, False otherwise.
+ (True, None) if archive is check compliant, (False,
+ <detail-error>) otherwise.
"""
+ archive_path = archive_request.archive.path
try:
- patoolib.test_archive(archive_path, verbosity=-1)
+ if zipfile.is_zipfile(archive_path):
+ with zipfile.ZipFile(archive_path) as f:
+ files = f.namelist()
+ elif tarfile.is_tarfile(archive_path):
+ with tarfile.open(archive_path) as f:
+ files = f.getnames()
+ else:
+ return False, MANDATORY_ARCHIVE_UNSUPPORTED
except Exception:
- return False
- else:
- return True
+ return False, MANDATORY_ARCHIVE_UNREADABLE
+ if len(files) > 1:
+ return True, None
+ element = files[0]
+ pattern = re.compile(
+ r'.*\.(zip|tar|tar.gz|.xz|tar.xz|Z|.tar.Z|bz2|tar.bz2)$')
+ if pattern.match(element): # invalid archive in archive
+ return False, MANDATORY_ARCHIVE_INVALID
+ return True, None
def _check_metadata(self, metadata):
"""Check to execute on all metadata for mandatory field presence.
diff --git a/swh/deposit/models.py b/swh/deposit/models.py
--- a/swh/deposit/models.py
+++ b/swh/deposit/models.py
@@ -13,9 +13,11 @@
from django.db import models
from django.utils.timezone import now
-from .config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED
-from .config import DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS
-from .config import DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_REJECTED
+from .config import (
+ DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL,
+ DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE,
+ DEPOSIT_STATUS_REJECTED
+)
class Dbversion(models.Model):
@@ -43,7 +45,7 @@
('expired', 'expired'),
(DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED),
(DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED),
- ('rejected', 'rejected'),
+ (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED),
('loading', 'loading'),
(DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS),
(DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE),
@@ -60,7 +62,7 @@
'(tarball ok, metadata, etc...)',
DEPOSIT_STATUS_VERIFIED: 'Deposit is fully received, checked, and '
'ready for loading',
- 'rejected': 'Deposit failed the checks',
+ DEPOSIT_STATUS_REJECTED: 'Deposit failed the checks',
'loading': "Loading is ongoing on swh's side",
DEPOSIT_STATUS_LOAD_SUCCESS: 'The deposit has been successfully '
'loaded into the Software Heritage archive',
diff --git a/swh/deposit/tests/api/test_deposit_check.py b/swh/deposit/tests/api/test_deposit_check.py
--- a/swh/deposit/tests/api/test_deposit_check.py
+++ b/swh/deposit/tests/api/test_deposit_check.py
@@ -17,9 +17,10 @@
DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED
)
from swh.deposit.api.private.deposit_check import (
- SWHChecksDeposit,
+ SWHChecksDeposit, MANDATORY_ARCHIVE_INVALID,
MANDATORY_FIELDS_MISSING, INCOMPATIBLE_URL_FIELDS,
- MANDATORY_ARCHIVE_UNREADABLE, ALTERNATE_FIELDS_MISSING
+ MANDATORY_ARCHIVE_UNSUPPORTED, ALTERNATE_FIELDS_MISSING,
+ MANDATORY_ARCHIVE_MISSING
)
from swh.deposit.models import Deposit
@@ -61,8 +62,71 @@
self.assertEquals(deposit.status, DEPOSIT_STATUS_VERIFIED)
@istest
- def deposit_ko(self):
- """Invalid deposit should fail the checks (-> status rejected)
+ def deposit_invalid_tarball(self):
+ """Deposit with tarball (of 1 tarball) should fail the checks: rejected
+
+ """
+ deposit_id = self.create_deposit_archive_with_archive()
+
+ deposit = Deposit.objects.get(pk=deposit_id)
+ self.assertEquals(DEPOSIT_STATUS_DEPOSITED, deposit.status)
+
+ url = reverse(PRIVATE_CHECK_DEPOSIT,
+ args=[self.collection.name, deposit.id])
+
+ response = self.client.get(url)
+
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ data = json.loads(response.content.decode('utf-8'))
+ self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED)
+ details = data['details']
+ # archive checks failure
+ self.assertEqual(len(details['archive']), 1)
+ self.assertEqual(details['archive'][0]['summary'],
+ MANDATORY_ARCHIVE_INVALID)
+ # metadata check failure
+ self.assertEqual(len(details['metadata']), 2)
+ mandatory = details['metadata'][0]
+ self.assertEqual(mandatory['summary'], MANDATORY_FIELDS_MISSING)
+ self.assertEqual(set(mandatory['fields']),
+ set(['url', 'external_identifier', 'author']))
+ alternate = details['metadata'][1]
+ self.assertEqual(alternate['summary'], ALTERNATE_FIELDS_MISSING)
+ self.assertEqual(alternate['fields'], ['name or title'])
+ # url check failure
+ self.assertEqual(details['url']['summary'], INCOMPATIBLE_URL_FIELDS)
+
+ deposit = Deposit.objects.get(pk=deposit.id)
+ self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED)
+
+ @istest
+ def deposit_ko_missing_tarball(self):
+ """Deposit without archive should fail the checks: rejected
+
+ """
+ deposit_id = self.create_deposit_ready() # no archive, only atom
+ deposit = Deposit.objects.get(pk=deposit_id)
+ self.assertEquals(DEPOSIT_STATUS_DEPOSITED, deposit.status)
+
+ url = reverse(PRIVATE_CHECK_DEPOSIT,
+ args=[self.collection.name, deposit.id])
+
+ response = self.client.get(url)
+
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ data = json.loads(response.content.decode('utf-8'))
+ self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED)
+ details = data['details']
+ # archive checks failure
+ self.assertEqual(len(details['archive']), 1)
+ self.assertEqual(details['archive'][0]['summary'],
+ MANDATORY_ARCHIVE_MISSING)
+ deposit = Deposit.objects.get(pk=deposit.id)
+ self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED)
+
+ @istest
+ def deposit_ko_unsupported_tarball(self):
+ """Deposit with an unsupported tarball should fail the checks: rejected
"""
deposit_id = self.create_deposit_with_invalid_archive()
@@ -80,9 +144,9 @@
self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED)
details = data['details']
# archive checks failure
- self.assertEqual(len(details['archive']['fields']), 1)
- self.assertEqual(details['archive']['summary'],
- MANDATORY_ARCHIVE_UNREADABLE)
+ self.assertEqual(len(details['archive']), 1)
+ self.assertEqual(details['archive'][0]['summary'],
+ MANDATORY_ARCHIVE_UNSUPPORTED)
# metadata check failure
self.assertEqual(len(details['metadata']), 2)
mandatory = details['metadata'][0]
diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py
--- a/swh/deposit/tests/api/test_deposit_status.py
+++ b/swh/deposit/tests/api/test_deposit_status.py
@@ -143,16 +143,16 @@
'fields': ['name or title', 'url or badurl']
}
],
- 'archive': {
+ 'archive': [{
'summary': 'Unreadable archive',
- 'fields': ['1', '2'],
- },
+ 'fields': ['1'],
+ }],
}
expected_status_detail = '''- Mandatory fields missing (url, title)
- Alternate fields missing (name or title, url or badurl)
+- Unreadable archive (1)
- At least one url field must be compatible with the client's domain name. The following url fields failed the check (blahurl, testurl)
-- Unreadable archive (1, 2)
''' # noqa
actual_status_detail = convert_status_detail(status_detail)
@@ -172,9 +172,21 @@
'fields': ['name'],
},
],
+ 'archive': [
+ {
+ 'summary': 'Invalid archive',
+ 'fields': ['2'],
+ },
+ {
+ 'summary': 'Unsupported archive',
+ 'fields': ['1'],
+ }
+ ],
}
expected_status_detail = '''- Mandatory fields missing (name)
+- Invalid archive (2)
+- Unsupported archive (1)
- At least one compatible url field. Failed (testurl)
'''
diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py
--- a/swh/deposit/tests/common.py
+++ b/swh/deposit/tests/common.py
@@ -7,6 +7,7 @@
import hashlib
import os
import shutil
+import tarfile
import tempfile
from django.core.urlresolvers import reverse
@@ -28,6 +29,32 @@
from swh.core import tarball
+def compute_info(archive_path):
+ """Given a path, compute information on path.
+
+ """
+ with open(archive_path, 'rb') as f:
+ length = 0
+ sha1sum = hashlib.sha1()
+ md5sum = hashlib.md5()
+ data = b''
+ for chunk in f:
+ sha1sum.update(chunk)
+ md5sum.update(chunk)
+ length += len(chunk)
+ data += chunk
+
+ return {
+ 'dir': os.path.dirname(archive_path),
+ 'name': os.path.basename(archive_path),
+ 'path': archive_path,
+ 'length': length,
+ 'sha1sum': sha1sum.hexdigest(),
+ 'md5sum': md5sum.hexdigest(),
+ 'data': data
+ }
+
+
def create_arborescence_zip(root_path, archive_name, filename, content,
up_to_size=None):
"""Build an archive named archive_name in the root_path.
@@ -61,27 +88,17 @@
zip_path = dir_path + '.zip'
zip_path = tarball.compress(zip_path, 'zip', dir_path)
+ return compute_info(zip_path)
- with open(zip_path, 'rb') as f:
- length = 0
- sha1sum = hashlib.sha1()
- md5sum = hashlib.md5()
- data = b''
- for chunk in f:
- sha1sum.update(chunk)
- md5sum.update(chunk)
- length += len(chunk)
- data += chunk
- return {
- 'dir': archive_path_dir,
- 'name': archive_name,
- 'data': data,
- 'path': zip_path,
- 'sha1sum': sha1sum.hexdigest(),
- 'md5sum': md5sum.hexdigest(),
- 'length': length,
- }
+def create_archive_with_archive(root_path, name, archive):
+ """Create an archive holding another.
+
+ """
+ invalid_archive_path = os.path.join(root_path, name)
+ with tarfile.open(invalid_archive_path, 'w:gz') as _archive:
+ _archive.add(archive['path'], arcname=archive['name'])
+ return compute_info(invalid_archive_path)
@attr('fs')
@@ -158,6 +175,29 @@
deposit_id = int(response_content['deposit_id'])
return deposit_id
+ def create_deposit_archive_with_archive(self):
+ invalid_archive = create_archive_with_archive(
+ self.root_path, 'invalid.tar.gz', self.archive)
+
+ response = self.client.post(
+ reverse(COL_IRI, args=[self.collection.name]),
+ content_type='application/x-tar',
+ data=invalid_archive['data'],
+ CONTENT_LENGTH=invalid_archive['length'],
+ HTTP_MD5SUM=invalid_archive['md5sum'],
+ HTTP_SLUG='external-id',
+ HTTP_IN_PROGRESS=False,
+ HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
+ invalid_archive['name'], ))
+
+ # then
+ self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+ response_content = parse_xml(BytesIO(response.content))
+ _status = response_content['deposit_status']
+ self.assertEqual(_status, DEPOSIT_STATUS_DEPOSITED)
+ deposit_id = int(response_content['deposit_id'])
+ return deposit_id
+
def update_binary_deposit(self, deposit_id, status_partial=False):
# update existing deposit with atom entry metadata
response = self.client.post(

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 8:01 AM (13 h, 58 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228226

Event Timeline