diff --git a/debian/control b/debian/control index dad571fc..4e82e3a3 100644 --- a/debian/control +++ b/debian/control @@ -1,40 +1,41 @@ Source: swh-deposit Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-setuptools, python3-all, python3-nose, python3-django-nose, python3-vcversioner, python3-swh.core (>= 0.0.14~), python3-swh.loader.core (>= 0.0.24~), python3-swh.loader.tar (>= 0.0.27~), python3-swh.scheduler (>= 0.0.17~), python3-django, python3-click, python3-vcversioner, python3-djangorestframework, python3-djangorestframework-xml, python3-requests Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/source/swh-deposit/ Package: python3-swh.deposit Architecture: all Depends: python3-swh.core (>= 0.0.14~), python3-swh.loader.tar (>= 0.0.27~), + python3-swh.scheduler (>= 0.0.17~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Server Package: python3-swh.deposit.injection Architecture: all Depends: python3-swh.core (>= 0.0.14~), python3-swh.loader.core (>= 0.0.24~), python3-swh.loader.tar (>= 0.0.27~), python3-swh.scheduler (>= 0.0.17~), python3-requests, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Injection diff --git a/requirements-swh.txt b/requirements-swh.txt index 6233ce5c..35b474c4 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,4 @@ swh.core >= 0.0.14 swh.loader.tar >= 0.0.27 swh.loader.core >= 0.0.24 +swh.scheduler >= 0.0.17 diff --git a/swh/deposit/config.py b/swh/deposit/config.py index 188e2931..80d27a3c 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,78 +1,81 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core.config import SWHConfig # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = 'edit_se_iri' EM_IRI = 'em_iri' CONT_FILE_IRI = 'cont_file_iri' SD_IRI = 'servicedocument' COL_IRI = 'upload' STATE_IRI = 'state_iri' PRIVATE_GET_RAW_CONTENT = 'private-download' PRIVATE_CHECK_DEPOSIT = 'check-deposit' PRIVATE_PUT_DEPOSIT = 'private-update' PRIVATE_GET_DEPOSIT_METADATA = 'private-read' ARCHIVE_KEY = 'archive' METADATA_KEY = 'metadata' ARCHIVE_TYPE = 'archive' METADATA_TYPE = 'metadata' AUTHORIZED_PLATFORMS = ['development', 'production', 'testing'] DEPOSIT_STATUS_REJECTED = 'rejected' DEPOSIT_STATUS_PARTIAL = 'partial' DEPOSIT_STATUS_READY = 'ready' DEPOSIT_STATUS_READY_FOR_CHECKS = 'ready-for-checks' def setup_django_for(platform): """Setup function for command line tools (swh.deposit.create_user, swh.deposit.scheduler.cli) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running Raises: ValueError in case of wrong platform inputs. """ if platform not in AUTHORIZED_PLATFORMS: raise ValueError('Platform should be one of %s' % AUTHORIZED_PLATFORMS) os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'swh.deposit.settings.%s' % platform) import django django.setup() class SWHDefaultConfig(SWHConfig): """Mixin intended to enrich views with SWH configuration. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'max_upload_size': ('int', 209715200), 'checks': ('bool', True), } def __init__(self, **config): super().__init__() self.config = self.parse_config_file() self.config.update(config) self.log = logging.getLogger('swh.deposit') + if self.config['checks']: + from swh.scheduler.backend import SchedulerBackend + self.scheduler = SchedulerBackend() diff --git a/swh/deposit/injection/checker.py b/swh/deposit/injection/checker.py new file mode 100644 index 00000000..7f867d5e --- /dev/null +++ b/swh/deposit/injection/checker.py @@ -0,0 +1,23 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from .client import DepositClient + + +class DepositChecker(): + """Deposit checker implementation. + + Trigger deposit's checks through the private api. + + """ + def __init__(self, client=None): + super().__init__() + if client: + self.client = client + else: + self.client = DepositClient() + + def check(self, deposit_check_url): + self.client.get(deposit_check_url) diff --git a/swh/deposit/injection/tasks.py b/swh/deposit/injection/tasks.py index 8923119a..8d9dced5 100644 --- a/swh/deposit/injection/tasks.py +++ b/swh/deposit/injection/tasks.py @@ -1,33 +1,51 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from swh.deposit.injection.loader import DepositLoader +from swh.deposit.injection.checker import DepositChecker class LoadDepositArchiveTsk(Task): """Deposit archive ingestion task described by the following steps: 1. Retrieve tarball from deposit's private api and store locally in a temporary directory 2. Trigger the ingestion 3. clean up the temporary directory 4. Update the deposit's status according to result using the deposit's private update status api """ task_queue = 'swh_deposit_archive' def run_task(self, *, archive_url, deposit_meta_url, deposit_update_url): """Import a deposit tarball into swh. Args: see :func:`DepositLoader.load`. """ loader = DepositLoader() loader.log = self.log loader.load(archive_url=archive_url, deposit_meta_url=deposit_meta_url, deposit_update_url=deposit_update_url) + + +class DepositChecksTsk(Task): + """Deposit checks task. + + """ + task_queue = 'swh_deposit_checks' + + def run_task(self, *, deposit_check_url, deposit_update_url): + """Check a deposit's status + + Args: see :func:`DepositChecker.check`. + + """ + checker = DepositChecker() + checker.log = self.log + checker.check(deposit_check_url=deposit_check_url) diff --git a/swh/deposit/signals.py b/swh/deposit/signals.py index cd2c7a9d..1d0d0f07 100644 --- a/swh/deposit/signals.py +++ b/swh/deposit/signals.py @@ -1,87 +1,74 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining some uncoupled actions on deposit. Typically, checking that the archives deposited are ok are not directly testing in the request/answer to avoid too long computations. So this is done in the deposit_on_status_ready_for_check callback. """ -import zipfile +import datetime from django.db.models.signals import post_save from django.dispatch import receiver from .models import DepositRequest from .config import SWHDefaultConfig -from .config import DEPOSIT_STATUS_READY, DEPOSIT_STATUS_REJECTED -from .config import DEPOSIT_STATUS_READY_FOR_CHECKS, ARCHIVE_TYPE - - -def checks(deposit_request): - """Additional checks to execute on the deposit request's associated - data (archive). - - Args: - The deposit request whose archive we need to check - - Returns: - True if we can at least read some content to the - request's deposit associated archive. False otherwise. - - """ - if deposit_request.type.name != ARCHIVE_TYPE: # no check for other types - return True - - try: - archive = deposit_request.archive - zf = zipfile.ZipFile(archive.path) - zf.infolist() - except Exception as e: - return False - else: - return True @receiver(post_save, sender=DepositRequest) def deposit_on_status_ready_for_check(sender, instance, created, raw, using, update_fields, **kwargs): """Check the status is ready for check. If so, try and check the associated archives. If not, move along. When Triggered when a deposit is saved. Args: sender (DepositRequest): The model class instance (DepositRequest): The actual instance being saved created (bool): True if a new record was created raw (bool): True if the model is saved exactly as presented (i.e. when loading a fixture). One should not query/modify other records in the database as the database might not be in a consistent state yet using: The database alias being used update_fields: The set of fields to update as passed to Model.save(), or None if update_fields wasn’t passed to save() """ - if not SWHDefaultConfig().config['checks']: + default_config = SWHDefaultConfig() + if not default_config.config['checks']: return - if instance.deposit.status is not DEPOSIT_STATUS_READY_FOR_CHECKS: - return - - if not checks(instance): - instance.deposit.status = DEPOSIT_STATUS_REJECTED - else: - instance.deposit.status = DEPOSIT_STATUS_READY - - instance.deposit.save() + # Schedule oneshot task for checking archives + from swh.deposit.config import PRIVATE_CHECK_DEPOSIT + from django.core.urlresolvers import reverse + + # FIXME: Generate absolute uri + args = [instance.deposit.collection.name, instance.deposit.id] + archive_check_url = reverse( + PRIVATE_CHECK_DEPOSIT, args=args) + + task = { + 'policy': 'oneshot', + 'type': 'swh-deposit-archive-checks', + 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), + 'arguments': { + 'args': [], + 'kwargs': { + 'archive_check_url': archive_check_url, + }, + } + } + + default_config.scheduler.create_tasks([task])