Changeset View
Changeset View
Standalone View
Standalone View
swh/deposit/api/private/deposit_check.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import patoolib | import re | ||||
import tarfile | |||||
import zipfile | |||||
from rest_framework import status | from rest_framework import status | ||||
from . import DepositReadMixin | from . import DepositReadMixin | ||||
from ..common import SWHGetDepositAPI, SWHPrivateAPIView | from ..common import SWHGetDepositAPI, SWHPrivateAPIView | ||||
from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED | from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED | ||||
from ...config import ARCHIVE_TYPE | from ...config import ARCHIVE_TYPE | ||||
from ...models import Deposit | from ...models import Deposit | ||||
MANDATORY_FIELDS_MISSING = 'Mandatory fields are missing' | MANDATORY_FIELDS_MISSING = 'Mandatory fields are missing' | ||||
ALTERNATE_FIELDS_MISSING = 'Mandatory alternate fields are missing' | ALTERNATE_FIELDS_MISSING = 'Mandatory alternate fields are missing' | ||||
MANDATORY_ARCHIVE_UNREADABLE = 'Deposit was rejected because at least one of its associated archives was not readable' # noqa | MANDATORY_ARCHIVE_UNREADABLE = 'Deposit was rejected because at least one of its associated archives was not readable' # noqa | ||||
MANDATORY_ARCHIVE_INVALID = 'Mandatory archive is invalid (e.g contains an archive)' # noqa | |||||
MANDATORY_ARCHIVE_UNSUPPORTED = 'Mandatory archive type is not supported' | |||||
MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected' | MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected' | ||||
INCOMPATIBLE_URL_FIELDS = "At least one url field must be compatible with the client's domain name" # noqa | INCOMPATIBLE_URL_FIELDS = "At least one url field must be compatible with the client's domain name" # noqa | ||||
moranegg: I like this new approach | |||||
class SWHChecksDeposit(SWHGetDepositAPI, SWHPrivateAPIView, DepositReadMixin): | class SWHChecksDeposit(SWHGetDepositAPI, SWHPrivateAPIView, DepositReadMixin): | ||||
"""Dedicated class to read a deposit's raw archives content. | """Dedicated class to read a deposit's raw archives content. | ||||
Only GET is supported. | Only GET is supported. | ||||
""" | """ | ||||
def _check_deposit_archives(self, deposit): | def _check_deposit_archives(self, deposit): | ||||
"""Given a deposit, check each deposit request of type archive. | """Given a deposit, check each deposit request of type archive. | ||||
Args: | Args: | ||||
The deposit to check archives for | The deposit to check archives for | ||||
Returns | Returns | ||||
tuple (status, error_detail): True, None if all archives | tuple (status, error_detail): True, None if all archives | ||||
are ok, (False, <detailed-error>) otherwise. | are ok, (False, <detailed-error>) otherwise. | ||||
""" | """ | ||||
requests = list(self._deposit_requests( | requests = list(self._deposit_requests( | ||||
deposit, request_type=ARCHIVE_TYPE)) | deposit, request_type=ARCHIVE_TYPE)) | ||||
if len(requests) == 0: # no associated archive is refused | if len(requests) == 0: # no associated archive is refused | ||||
return False, { | return False, { | ||||
'archive': { | 'archive': [{ | ||||
'summary': MANDATORY_ARCHIVE_MISSING, | 'summary': MANDATORY_ARCHIVE_MISSING, | ||||
} | }] | ||||
} | } | ||||
rejected_dr_ids = [] | errors = [] | ||||
for dr in requests: | for archive_request in requests: | ||||
_path = dr.archive.path | check, error_message = self._check_archive(archive_request) | ||||
check = self._check_archive(_path) | |||||
if not check: | if not check: | ||||
rejected_dr_ids.append(dr.id) | errors.append({ | ||||
'summary': error_message, | |||||
'fields': [archive_request.id] | |||||
}) | |||||
if rejected_dr_ids: | if not errors: | ||||
return False, { | |||||
'archive': { | |||||
'summary': MANDATORY_ARCHIVE_UNREADABLE, | |||||
'fields': rejected_dr_ids, | |||||
}} | |||||
return True, None | return True, None | ||||
return False, { | |||||
'archive': errors | |||||
} | |||||
def _check_archive(self, archive_path): | def _check_archive(self, archive_request): | ||||
"""Check that a given archive is actually ok for reading. | """Check that a deposit associated archive is ok: | ||||
- readable | |||||
- supported archive format | |||||
- content of the archive is not a single archive | |||||
If any of those checks are not ok, return the corresponding | |||||
Done Inline Actions
moranegg: Yes, that's good.
On the third check I would add the name "valid" (as the opposite of invalid)… | |||||
failing check. | |||||
Args: | Args: | ||||
archive_path (str): Archive to check | archive_request (DepositRequest): Archive to check | ||||
Returns: | Returns: | ||||
True if archive is successfully read, False otherwise. | (True, None) if the archive passes all checks, (False, | ||||
<detail-error>) otherwise. | |||||
""" | """ | ||||
archive_path = archive_request.archive.path | |||||
try: | try: | ||||
patoolib.test_archive(archive_path, verbosity=-1) | if zipfile.is_zipfile(archive_path): | ||||
except Exception: | with zipfile.ZipFile(archive_path) as f: | ||||
return False | files = f.namelist() | ||||
elif tarfile.is_tarfile(archive_path): | |||||
with tarfile.open(archive_path) as f: | |||||
files = f.getnames() | |||||
else: | else: | ||||
return True | return False, MANDATORY_ARCHIVE_UNSUPPORTED | ||||
except Exception: | |||||
return False, MANDATORY_ARCHIVE_UNREADABLE | |||||
if len(files) > 1: | |||||
return True, None | |||||
element = files[0] | |||||
pattern = re.compile( | |||||
r'.*\.(zip|tar|tar.gz|.xz|tar.xz|Z|.tar.Z|bz2|tar.bz2)$') | |||||
if pattern.match(element): # invalid archive in archive | |||||
return False, MANDATORY_ARCHIVE_INVALID | |||||
return True, None | |||||
def _check_metadata(self, metadata): | def _check_metadata(self, metadata): | ||||
"""Check to execute on all metadata for mandatory field presence. | """Check to execute on all metadata for mandatory field presence. | ||||
Args: | Args: | ||||
metadata (dict): Metadata dictionary to check for mandatory fields | metadata (dict): Metadata dictionary to check for mandatory fields | ||||
Returns: | Returns: | ||||
▲ Show 20 Lines • Show All 125 Lines • Show Last 20 Lines |
I like this new approach