Changeset View
Changeset View
Standalone View
Standalone View
swh/deposit/api/private/deposit_check.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from itertools import chain | from itertools import chain | ||||
import json | |||||
import re | import re | ||||
from shutil import get_unpack_formats | from shutil import get_unpack_formats | ||||
import tarfile | import tarfile | ||||
from typing import Dict, Optional, Tuple | |||||
import zipfile | import zipfile | ||||
from rest_framework import status | from rest_framework import status | ||||
from swh.scheduler.utils import create_oneshot_task_dict | from swh.scheduler.utils import create_oneshot_task_dict | ||||
from . import APIPrivateView, DepositReadMixin | from . import APIPrivateView, DepositReadMixin | ||||
from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED | from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED | ||||
from ...models import Deposit | from ...models import Deposit, DepositRequest | ||||
from ..common import APIGet | from ..common import APIGet | ||||
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" | MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" | ||||
ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" | ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" | ||||
MANDATORY_ARCHIVE_UNREADABLE = ( | MANDATORY_ARCHIVE_UNREADABLE = ( | ||||
"At least one of its associated archives is not readable" # noqa | "At least one of its associated archives is not readable" # noqa | ||||
) | ) | ||||
MANDATORY_ARCHIVE_INVALID = ( | MANDATORY_ARCHIVE_INVALID = ( | ||||
Show All 27 Lines | |||||
class APIChecks(APIPrivateView, APIGet, DepositReadMixin): | class APIChecks(APIPrivateView, APIGet, DepositReadMixin): | ||||
"""Dedicated class to read a deposit's raw archives content. | """Dedicated class to read a deposit's raw archives content. | ||||
Only GET is supported. | Only GET is supported. | ||||
""" | """ | ||||
def _check_deposit_archives(self, deposit): | def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: | ||||
"""Given a deposit, check each deposit request of type archive. | """Given a deposit, check each deposit request of type archive. | ||||
Args: | Args: | ||||
The deposit to check archives for | The deposit to check archives for | ||||
Returns | Returns | ||||
tuple (status, error_detail): True, None if all archives | tuple (status, error_detail): True, None if all archives | ||||
are ok, (False, <detailed-error>) otherwise. | are ok, (False, <detailed-error>) otherwise. | ||||
Show All 10 Lines | def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: | ||||
errors.append( | errors.append( | ||||
{"summary": error_message, "fields": [archive_request.id]} | {"summary": error_message, "fields": [archive_request.id]} | ||||
) | ) | ||||
if not errors: | if not errors: | ||||
return True, None | return True, None | ||||
return False, {"archive": errors} | return False, {"archive": errors} | ||||
def _check_archive(self, archive_request): | def _check_archive( | ||||
self, archive_request: DepositRequest | |||||
) -> Tuple[bool, Optional[str]]: | |||||
"""Check that a deposit associated archive is ok: | """Check that a deposit associated archive is ok: | ||||
- readable | - readable | ||||
- supported archive format | - supported archive format | ||||
- valid content: the archive does not contain a single archive file | - valid content: the archive does not contain a single archive file | ||||
If any of those checks are not ok, return the corresponding | If any of those checks are not ok, return the corresponding | ||||
failing check. | failing check. | ||||
Args: | Args: | ||||
archive_path (DepositRequest): Archive to check | archive_path (DepositRequest): Archive to check | ||||
Returns: | Returns: | ||||
(True, None) if archive is check compliant, (False, | (True, None) if archive is check compliant, (False, | ||||
<detail-error>) otherwise. | <detail-error>) otherwise. | ||||
""" | """ | ||||
archive_path = archive_request.archive.path | archive_path = archive_request.archive.path | ||||
if not known_archive_format(archive_path): | if not known_archive_format(archive_path): | ||||
return False, MANDATORY_ARCHIVE_UNSUPPORTED | return False, MANDATORY_ARCHIVE_UNSUPPORTED | ||||
try: | try: | ||||
if zipfile.is_zipfile(archive_path): | if zipfile.is_zipfile(archive_path): | ||||
with zipfile.ZipFile(archive_path) as f: | with zipfile.ZipFile(archive_path) as zipfile_: | ||||
files = f.namelist() | files = zipfile_.namelist() | ||||
elif tarfile.is_tarfile(archive_path): | elif tarfile.is_tarfile(archive_path): | ||||
with tarfile.open(archive_path) as f: | with tarfile.open(archive_path) as tarfile_: | ||||
files = f.getnames() | files = tarfile_.getnames() | ||||
else: | else: | ||||
return False, MANDATORY_ARCHIVE_UNSUPPORTED | return False, MANDATORY_ARCHIVE_UNSUPPORTED | ||||
except Exception: | except Exception: | ||||
return False, MANDATORY_ARCHIVE_UNREADABLE | return False, MANDATORY_ARCHIVE_UNREADABLE | ||||
if len(files) > 1: | if len(files) > 1: | ||||
return True, None | return True, None | ||||
element = files[0] | element = files[0] | ||||
if PATTERN_ARCHIVE_EXTENSION.match(element): | if PATTERN_ARCHIVE_EXTENSION.match(element): | ||||
# archive in archive! | # archive in archive! | ||||
return False, MANDATORY_ARCHIVE_INVALID | return False, MANDATORY_ARCHIVE_INVALID | ||||
return True, None | return True, None | ||||
def _check_metadata(self, metadata): | def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]: | ||||
"""Check to execute on all metadata for mandatory field presence. | """Check to execute on all metadata for mandatory field presence. | ||||
Args: | Args: | ||||
metadata (dict): Metadata dictionary to check for mandatory fields | metadata (dict): Metadata dictionary to check for mandatory fields | ||||
Returns: | Returns: | ||||
tuple (status, error_detail): True, None if metadata are | tuple (status, error_detail): True, None if metadata are | ||||
ok (False, <detailed-error>) otherwise. | ok (False, <detailed-error>) otherwise. | ||||
Show All 29 Lines | def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]: | ||||
{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} | {"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} | ||||
) | ) | ||||
if optional_result != []: | if optional_result != []: | ||||
detail.append( | detail.append( | ||||
{"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} | {"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} | ||||
) | ) | ||||
return False, {"metadata": detail} | return False, {"metadata": detail} | ||||
def process_get(self, req, collection_name, deposit_id): | def process_get( | ||||
self, req, collection_name: str, deposit_id: int | |||||
) -> Tuple[int, Dict, str]: | |||||
"""Build a unique tarball from the multiple received and stream that | """Build a unique tarball from the multiple received and stream that | ||||
content to the client. | content to the client. | ||||
Args: | Args: | ||||
req (Request): | req (Request): | ||||
collection_name (str): Collection owning the deposit | collection_name (str): Collection owning the deposit | ||||
deposit_id (id): Deposit concerned by the reading | deposit_id (id): Deposit concerned by the reading | ||||
Returns: | Returns: | ||||
Tuple status, stream of content, content-type | Tuple status, stream of content, content-type | ||||
""" | """ | ||||
deposit = Deposit.objects.get(pk=deposit_id) | deposit = Deposit.objects.get(pk=deposit_id) | ||||
metadata = self._metadata_get(deposit) | metadata = self._metadata_get(deposit) | ||||
problems = {} | problems: Dict = {} | ||||
# will check each deposit's associated request (both of type | # will check each deposit's associated request (both of type | ||||
# archive and metadata) for errors | # archive and metadata) for errors | ||||
archives_status, error_detail = self._check_deposit_archives(deposit) | archives_status, error_detail = self._check_deposit_archives(deposit) | ||||
if not archives_status: | if not archives_status: | ||||
assert error_detail is not None | |||||
problems.update(error_detail) | problems.update(error_detail) | ||||
metadata_status, error_detail = self._check_metadata(metadata) | metadata_status, error_detail = self._check_metadata(metadata) | ||||
if not metadata_status: | if not metadata_status: | ||||
assert error_detail is not None | |||||
problems.update(error_detail) | problems.update(error_detail) | ||||
deposit_status = archives_status and metadata_status | deposit_status = archives_status and metadata_status | ||||
# if any problems arose, the deposit is rejected | # if any problems arose, the deposit is rejected | ||||
if not deposit_status: | if not deposit_status: | ||||
deposit.status = DEPOSIT_STATUS_REJECTED | deposit.status = DEPOSIT_STATUS_REJECTED | ||||
deposit.status_detail = problems | deposit.status_detail = problems | ||||
Show All 11 Lines | ) -> Tuple[int, Dict, str]: | ||||
task = create_oneshot_task_dict( | task = create_oneshot_task_dict( | ||||
"load-deposit", url=url, deposit_id=deposit.id, retries_left=3 | "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 | ||||
) | ) | ||||
load_task_id = self.scheduler.create_tasks([task])[0]["id"] | load_task_id = self.scheduler.create_tasks([task])[0]["id"] | ||||
deposit.load_task_id = load_task_id | deposit.load_task_id = load_task_id | ||||
deposit.save() | deposit.save() | ||||
return status.HTTP_200_OK, json.dumps(response), "application/json" | return status.HTTP_200_OK, response, "application/json" |