diff --git a/swh/deposit/apps.py b/swh/deposit/apps.py index b885be10..f45146bb 100644 --- a/swh/deposit/apps.py +++ b/swh/deposit/apps.py @@ -1,16 +1,16 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.apps import AppConfig class DepositConfig(AppConfig): name = 'swh.deposit' def ready(self): super().ready() # install the signal permitting to trigger the status' check - from .signals import deposit_on_status_ready_for_check # noqa + from .signals import post_deposit_save # noqa diff --git a/swh/deposit/injection/scheduler.py b/swh/deposit/injection/scheduler.py index 5c2fbaa4..4737f37e 100644 --- a/swh/deposit/injection/scheduler.py +++ b/swh/deposit/injection/scheduler.py @@ -1,228 +1,212 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of sending deposit loading/checking as either celery task or scheduled one-shot tasks. """ import click import logging from abc import ABCMeta, abstractmethod from celery import group from swh.core import utils from swh.core.config import SWHConfig from swh.deposit.config import setup_django_for, DEPOSIT_STATUS_READY from swh.deposit.config import DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.scheduler.utils import get_task, create_oneshot_task_dict class SWHScheduling(SWHConfig, metaclass=ABCMeta): """Base swh scheduling class to aggregate the schedule deposit injection. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'dry_run': ('bool', False), } ADDITIONAL_CONFIG = {} def __init__(self): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.log = logging.getLogger('swh.deposit.scheduling') @abstractmethod def schedule(self, deposits): """Schedule the new deposit injection. Args: data (dict): Deposit aggregated data Returns: None """ pass class SWHCeleryScheduling(SWHScheduling): """Deposit injection as Celery task scheduling. """ def __init__(self, config=None): super().__init__() if config: self.config.update(**config) self.dry_run = self.config['dry_run'] self.check = self.config['check'] if self.check: task_name = 'swh.deposit.injection.tasks.DepositChecksTsk' else: task_name = 'swh.deposit.injection.tasks.LoadDepositArchiveTsk' - from swh.scheduler import utils - self.task = utils.get_task(task_name) + self.task = get_task(task_name) def _convert(self, deposits): """Convert tuple to celery task signature. """ task = self.task for archive_url, meta_url, update_url, check_url in deposits: if self.check: yield task.s(deposit_check_url=check_url) else: yield task.s(archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) def schedule(self, deposits): """Schedule the new deposit injection directly through celery. Args: depositdata (dict): Deposit aggregated information. Returns: None """ if self.dry_run: return return group(self._convert(deposits)).delay() class SWHSchedulerScheduling(SWHScheduling): """Deposit injection through SWH's task scheduling interface. """ ADDITIONAL_CONFIG = {} def __init__(self, config=None): super().__init__() from swh.scheduler.backend import SchedulerBackend if config: self.config.update(**config) self.dry_run = self.config['dry_run'] self.scheduler = SchedulerBackend(**self.config) self.check = self.config['check'] def _convert(self, deposits): """Convert tuple to one-shot scheduling tasks. """ - import datetime for archive_url, meta_url, update_url, check_url in deposits: if self.check: - type = 'swh-deposit-archive-checks' - kwargs = { - 'deposit_check_url': check_url - } + task = create_oneshot_task_dict( + 'swh-deposit-archive-checks', + deposit_check_url=check_url) else: - type = 'swh-deposit-archive-ingestion' - kwargs = { - 'archive_url': archive_url, - 'deposit_meta_url': meta_url, - 'deposit_update_url': update_url, - } - - yield { - 'policy': 'oneshot', - 'type': type, - 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), - 'arguments': { - 'args': [], - 'kwargs': kwargs, - } - } + task = create_oneshot_task_dict( + 'swh-deposit-archive-ingestion', + archive_url=archive_url, + deposit_meta_url=meta_url, + deposit_update_url=update_url) + + yield task def schedule(self, deposits): """Schedule the new deposit injection through swh.scheduler's api. Args: deposits (dict): Deposit aggregated information. """ if self.dry_run: return self.scheduler.create_tasks(self._convert(deposits)) def get_deposit_by(status): """Filter deposit given a specific status. """ from swh.deposit.models import Deposit yield from Deposit.objects.filter(status=status) def prepare_task_arguments(check): """Convert deposit to argument for task to be executed. """ from swh.deposit.config import PRIVATE_GET_RAW_CONTENT from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA from swh.deposit.config import PRIVATE_PUT_DEPOSIT from swh.deposit.config import PRIVATE_CHECK_DEPOSIT from django.core.urlresolvers import reverse if check: status = DEPOSIT_STATUS_READY_FOR_CHECKS else: status = DEPOSIT_STATUS_READY for deposit in get_deposit_by(status): args = [deposit.collection.name, deposit.id] - archive_url = reverse( - PRIVATE_GET_RAW_CONTENT, args=args) - meta_url = reverse( - PRIVATE_GET_DEPOSIT_METADATA, args=args) - update_url = reverse( - PRIVATE_PUT_DEPOSIT, args=args) - check_url = reverse( - PRIVATE_CHECK_DEPOSIT, args=args) - + archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) + meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) + update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args) + check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) yield archive_url, meta_url, update_url, check_url @click.command( help='Schedule one-shot deposit injections') @click.option('--platform', default='development', help='development or production platform') @click.option('--scheduling-method', default='celery', help='Scheduling method') @click.option('--batch-size', default=1000, type=click.INT, help='Task batch size') @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='Dry run') @click.option('--check', is_flag=True, default=False) def main(platform, scheduling_method, batch_size, dry_run, check): setup_django_for(platform) override_config = {} if dry_run: override_config['dry_run'] = dry_run override_config['check'] = check if scheduling_method == 'celery': scheduling = SWHCeleryScheduling(override_config) elif scheduling_method == 'swh-scheduler': scheduling = SWHSchedulerScheduling(override_config) else: raise ValueError( 'Only `celery` or `swh-scheduler` values are accepted') for deposits in utils.grouper(prepare_task_arguments(check), batch_size): scheduling.schedule(deposits) if __name__ == '__main__': main() diff --git a/swh/deposit/signals.py b/swh/deposit/signals.py index 1f707a35..f70ad763 100644 --- a/swh/deposit/signals.py +++ b/swh/deposit/signals.py @@ -1,73 +1,83 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining some uncoupled actions on deposit. Typically, checking that the archives deposited are ok are not directly testing in the request/answer to avoid too long computations. So this is done in the deposit_on_status_ready_for_check callback. """ -import datetime - from django.db.models.signals import post_save from django.dispatch import receiver from .models import Deposit -from .config import SWHDefaultConfig +from .config import SWHDefaultConfig, DEPOSIT_STATUS_READY +from .config import DEPOSIT_STATUS_READY_FOR_CHECKS @receiver(post_save, sender=Deposit) -def deposit_on_status_ready_for_check(sender, instance, created, raw, using, - update_fields, **kwargs): - """Check the status is ready for check. - If so, try and check the associated archives. - If not, move along. +def post_deposit_save(sender, instance, created, raw, using, + update_fields, **kwargs): + """When a deposit is saved, check for the deposit's status change and + schedule actions accordingly. - When - Triggered when a deposit is saved. + When the status passes to ready-for-checks, schedule checks. + When the status pass to ready, schedule loading. Otherwise, do + nothing. Args: - sender (DepositRequest): The model class - instance (DepositRequest): The actual instance being saved + sender (Deposit): The model class + instance (Deposit): The actual instance being saved created (bool): True if a new record was created raw (bool): True if the model is saved exactly as presented (i.e. when loading a fixture). One should not query/modify other records in the database as the database might not be in a consistent state yet using: The database alias being used update_fields: The set of fields to update as passed to Model.save(), or None if update_fields wasn’t passed to save() """ default_config = SWHDefaultConfig() if not default_config.config['checks']: return - # Schedule oneshot task for checking archives - from swh.deposit.config import PRIVATE_CHECK_DEPOSIT - from django.core.urlresolvers import reverse + if instance.status not in {DEPOSIT_STATUS_READY_FOR_CHECKS, + DEPOSIT_STATUS_READY}: + return - args = [instance.deposit.collection.name, instance.deposit.id] - archive_check_url = reverse( - PRIVATE_CHECK_DEPOSIT, args=args) - - task = { - 'policy': 'oneshot', - 'type': 'swh-deposit-archive-checks', - 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), - 'arguments': { - 'args': [], - 'kwargs': { - 'archive_check_url': archive_check_url, - }, - } - } + from django.core.urlresolvers import reverse + from swh.scheduler.utils import create_oneshot_task_dict + + args = [instance.collection.name, instance.id] + + if instance.status == DEPOSIT_STATUS_READY_FOR_CHECKS: + # schedule archive check + from swh.deposit.config import PRIVATE_CHECK_DEPOSIT + check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) + task = create_oneshot_task_dict( + 'swh-deposit-archive-checks', + archive_check_url=check_url) + else: # instance.status == DEPOSIT_STATUS_READY: + # schedule loading + from swh.deposit.config import PRIVATE_GET_RAW_CONTENT + from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA + from swh.deposit.config import PRIVATE_PUT_DEPOSIT + archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) + meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) + update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args) + + task = create_oneshot_task_dict( + 'swh-deposit-archive-ingestion', + archive_url=archive_url, + deposit_meta_url=meta_url, + deposit_update_url=update_url) default_config.scheduler.create_tasks([task])