diff --git a/PKG-INFO b/PKG-INFO index 1b74163f..b505874f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.deposit -Version: 0.0.51 +Version: 0.0.52 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 4d986c11..fab46535 100644 --- a/debian/control +++ b/debian/control @@ -1,56 +1,56 @@ Source: swh-deposit Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-setuptools, python3-all, python3-nose, python3-django-nose, python3-vcversioner, python3-swh.core (>= 0.0.36~), python3-swh.model (>= 0.0.21~), python3-swh.loader.core (>= 0.0.32~), python3-swh.loader.tar (>= 0.0.35~), - python3-swh.scheduler (>= 0.0.19~), + python3-swh.scheduler (>= 0.0.26~), python3-django, python3-click, python3-vcversioner, python3-djangorestframework, python3-djangorestframework-xml, python3-requests, python3-lxml, patool Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/source/swh-deposit/ Package: python3-swh.deposit Architecture: all Depends: python3-swh.core (>= 0.0.36~), python3-swh.model (>= 0.0.21~), - python3-swh.scheduler (>= 0.0.19~), + python3-swh.scheduler (>= 0.0.26~), patool, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Server Package: python3-swh.deposit.client Architecture: all Depends: python3-swh.core (>= 0.0.36~), python3-swh.model (>= 0.0.21~), python3-requests, python3-lxml, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Api Client Package: python3-swh.deposit.loader Conflict: python3-swh.deposit.injection Architecture: all Depends: python3-swh.deposit.client (= ${binary:Version}), python3-swh.core (>= 0.0.36~), python3-swh.model (>= 0.0.21~), python3-swh.loader.core (>= 0.0.32~), python3-swh.loader.tar (>= 0.0.35~), - python3-swh.scheduler (>= 0.0.19~), + python3-swh.scheduler (>= 0.0.26~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Loader diff --git a/requirements-swh.txt b/requirements-swh.txt index 4a51cbcf..1f63fe7e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.36 swh.loader.tar >= 0.0.35 swh.loader.core >= 0.0.32 -swh.scheduler >= 0.0.19 +swh.scheduler >= 0.0.26 swh.model >= 0.0.21 diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 1b74163f..b505874f 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.deposit -Version: 0.0.51 +Version: 0.0.52 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt index dcc067a0..963df3d6 100644 --- a/swh.deposit.egg-info/requires.txt +++ b/swh.deposit.egg-info/requires.txt @@ -1,11 +1,11 @@ Django click djangorestframework djangorestframework-xml lxml swh.core>=0.0.36 swh.loader.core>=0.0.32 swh.loader.tar>=0.0.35 swh.model>=0.0.21 -swh.scheduler>=0.0.19 +swh.scheduler>=0.0.26 vcversioner diff --git a/swh/deposit/config.py b/swh/deposit/config.py index d3a1a8ae..999a20ae 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,93 +1,99 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core.config import SWHConfig +from swh.scheduler import get_scheduler # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = 'edit_se_iri' EM_IRI = 'em_iri' CONT_FILE_IRI = 'cont_file_iri' SD_IRI = 'servicedocument' COL_IRI = 'upload' STATE_IRI = 'state_iri' PRIVATE_GET_RAW_CONTENT = 'private-download' PRIVATE_CHECK_DEPOSIT = 'check-deposit' PRIVATE_PUT_DEPOSIT = 'private-update' PRIVATE_GET_DEPOSIT_METADATA = 'private-read' ARCHIVE_KEY = 'archive' METADATA_KEY = 'metadata' ARCHIVE_TYPE = 'archive' METADATA_TYPE = 'metadata' AUTHORIZED_PLATFORMS = ['development', 'production', 'testing'] DEPOSIT_STATUS_REJECTED = 'rejected' DEPOSIT_STATUS_PARTIAL = 'partial' DEPOSIT_STATUS_DEPOSITED = 'deposited' DEPOSIT_STATUS_VERIFIED = 'verified' DEPOSIT_STATUS_LOAD_SUCCESS = 'done' DEPOSIT_STATUS_LOAD_FAILURE = 'failed' # Revision author for deposit SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } def setup_django_for(platform): """Setup function for command line tools (swh.deposit.create_user, swh.deposit.scheduler.cli) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running Raises: ValueError in case of wrong platform inputs. """ if platform not in AUTHORIZED_PLATFORMS: raise ValueError('Platform should be one of %s' % AUTHORIZED_PLATFORMS) os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'swh.deposit.settings.%s' % platform) import django django.setup() class SWHDefaultConfig(SWHConfig): """Mixin intended to enrich views with SWH configuration. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'max_upload_size': ('int', 209715200), 'checks': ('bool', True), + 'scheduler': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://localhost:5008/' + } + }) } ADDITIONAL_CONFIG = {} def __init__(self, **config): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.config.update(config) self.log = logging.getLogger('swh.deposit') if self.config['checks']: - from swh.scheduler.backend import SchedulerBackend - self.scheduler = SchedulerBackend() + self.scheduler = get_scheduler(**self.config['scheduler']) diff --git a/swh/deposit/loader/scheduler.py b/swh/deposit/loader/scheduler.py index e5f38a15..bc2713c6 100644 --- a/swh/deposit/loader/scheduler.py +++ b/swh/deposit/loader/scheduler.py @@ -1,212 +1,219 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of sending deposit loading/checking as either celery task or scheduled one-shot tasks. """ import click import logging from abc import ABCMeta, abstractmethod from celery import group from swh.core import utils from swh.core.config import SWHConfig from swh.deposit.config import setup_django_for, DEPOSIT_STATUS_VERIFIED from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.scheduler.utils import get_task, create_oneshot_task_dict class SWHScheduling(SWHConfig, metaclass=ABCMeta): """Base swh scheduling class to aggregate the schedule deposit loading. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'dry_run': ('bool', False), } ADDITIONAL_CONFIG = {} def __init__(self): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.log = logging.getLogger('swh.deposit.scheduling') @abstractmethod def schedule(self, deposits): """Schedule the new deposit loading. Args: data (dict): Deposit aggregated data Returns: None """ pass class SWHCeleryScheduling(SWHScheduling): """Deposit loading as Celery task scheduling. """ def __init__(self, config=None): super().__init__() if config: self.config.update(**config) self.dry_run = self.config['dry_run'] self.check = self.config['check'] if self.check: task_name = 'swh.deposit.loader.tasks.ChecksDepositTsk' else: task_name = 'swh.deposit.loader.tasks.LoadDepositArchiveTsk' self.task = get_task(task_name) def _convert(self, deposits): """Convert tuple to celery task signature. """ task = self.task for archive_url, meta_url, update_url, check_url in deposits: if self.check: yield task.s(deposit_check_url=check_url) else: yield task.s(archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) def schedule(self, deposits): """Schedule the new deposit loading directly through celery. Args: depositdata (dict): Deposit aggregated information. Returns: None """ if self.dry_run: return return group(self._convert(deposits)).delay() class SWHSchedulerScheduling(SWHScheduling): """Deposit loading through SWH's task scheduling interface. """ - ADDITIONAL_CONFIG = {} + ADDITIONAL_CONFIG = { + 'scheduler': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://localhost:5008', + } + }) + } def __init__(self, config=None): super().__init__() - from swh.scheduler.backend import SchedulerBackend + from swh.scheduler import get_scheduler if config: self.config.update(**config) self.dry_run = self.config['dry_run'] - self.scheduler = SchedulerBackend(**self.config) + self.scheduler = get_scheduler(**self.config['scheduler']) self.check = self.config['check'] def _convert(self, deposits): """Convert tuple to one-shot scheduling tasks. """ for archive_url, meta_url, update_url, check_url in deposits: if self.check: task = create_oneshot_task_dict( 'swh-deposit-archive-checks', deposit_check_url=check_url) else: task = create_oneshot_task_dict( 'swh-deposit-archive-loading', archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) yield task def schedule(self, deposits): """Schedule the new deposit loading through swh.scheduler's api. Args: deposits (dict): Deposit aggregated information. """ if self.dry_run: return self.scheduler.create_tasks(self._convert(deposits)) def get_deposit_by(status): """Filter deposit given a specific status. """ from swh.deposit.models import Deposit yield from Deposit.objects.filter(status=status) def prepare_task_arguments(check): """Convert deposit to argument for task to be executed. """ from swh.deposit.config import PRIVATE_GET_RAW_CONTENT from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA from swh.deposit.config import PRIVATE_PUT_DEPOSIT from swh.deposit.config import PRIVATE_CHECK_DEPOSIT from django.core.urlresolvers import reverse if check: status = DEPOSIT_STATUS_DEPOSITED else: status = DEPOSIT_STATUS_VERIFIED for deposit in get_deposit_by(status): args = [deposit.collection.name, deposit.id] archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args) check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) yield archive_url, meta_url, update_url, check_url @click.command( help='Schedule one-shot deposit loadings') @click.option('--platform', default='development', help='development or production platform') @click.option('--scheduling-method', default='celery', help='Scheduling method') @click.option('--batch-size', default=1000, type=click.INT, help='Task batch size') @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='Dry run') @click.option('--check', is_flag=True, default=False) def main(platform, scheduling_method, batch_size, dry_run, check): setup_django_for(platform) override_config = {} if dry_run: override_config['dry_run'] = dry_run override_config['check'] = check if scheduling_method == 'celery': scheduling = SWHCeleryScheduling(override_config) elif scheduling_method == 'swh-scheduler': scheduling = SWHSchedulerScheduling(override_config) else: raise ValueError( 'Only `celery` or `swh-scheduler` values are accepted') for deposits in utils.grouper(prepare_task_arguments(check), batch_size): scheduling.schedule(deposits) if __name__ == '__main__': main() diff --git a/swh/deposit/settings/development.py b/swh/deposit/settings/development.py index bc39dca5..04823686 100644 --- a/swh/deposit/settings/development.py +++ b/swh/deposit/settings/development.py @@ -1,59 +1,59 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .common import * # noqa # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = 'development-key' # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { 'version': 1, - 'disable_existing_loggers': True, + 'disable_existing_loggers': False, 'formatters': { 'standard': { 'format': "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa 'datefmt': "%d/%b/%Y %H:%M:%S" }, }, 'handlers': { 'console': { 'level': 'DEBUG', 'class': 'logging.StreamHandler', 'formatter': 'standard' }, }, 'loggers': { 'django': { 'handlers': ['console'], 'level': 'DEBUG', 'propagate': True, }, 'django.db.backends': { 'handlers': ['console'], 'level': 'INFO', 'propagate': False, }, 'swh.deposit': { 'handlers': ['console'], 'level': 'DEBUG', }, } } # https://docs.djangoproject.com/en/1.10/ref/settings/#databases DATABASES = { 'default': { 'ENGINE': 'django.db.backends.postgresql', 'NAME': 'swh-deposit-dev', } } # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT # SECURITY WARNING: Override this in the production.py module MEDIA_ROOT = '/tmp/swh-deposit/uploads/' diff --git a/version.txt b/version.txt index f548cb09..667e1dda 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.51-0-g735b248 \ No newline at end of file +v0.0.52-0-g7e535ab \ No newline at end of file