diff --git a/swh/lister/cli.py b/swh/lister/cli.py index 9894256..365c36a 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,204 +1,126 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from copy import deepcopy -from importlib import import_module -import celery.app.task import click from sqlalchemy import create_engine from swh.core.cli import CONTEXT_SETTINGS -from swh.scheduler import get_scheduler from swh.lister import get_lister, SUPPORTED_LISTERS, LISTERS from swh.lister.core.models import initialize logger = logging.getLogger(__name__) # the key in this dict is the suffix used to match new task-type to be added. # For example for a task which function name is "list_gitlab_full', the default # value used when inserting a new task-type in the scheduler db will be the one # under the 'full' key below (because it matches xxx_full). DEFAULT_TASK_TYPE = { 'full': { # for tasks like 'list_xxx_full()' 'default_interval': '90 days', 'min_interval': '90 days', 'max_interval': '90 days', 'backoff_factor': 1 }, '*': { # value if not suffix matches 'default_interval': '1 day', 'min_interval': '1 day', 'max_interval': '1 day', 'backoff_factor': 1 }, } @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.option('--db-url', '-d', default=None, help='SQLAlchemy DB URL; see ' '') # noqa @click.pass_context def lister(ctx, config_file, db_url): '''Software Heritage Lister tools.''' from swh.core import config ctx.ensure_object(dict) if not config_file: config_file = os.environ.get('SWH_CONFIG_FILENAME') conf = config.read(config_file) if db_url: conf['lister'] = { 'cls': 'local', 'args': {'db': db_url} } ctx.obj['config'] = conf @lister.command(name='db-init', context_settings=CONTEXT_SETTINGS) @click.option('--drop-tables', '-D', is_flag=True, default=False, help='Drop tables before creating the database schema') @click.pass_context def db_init(ctx, drop_tables): """Initialize the database model for given listers. """ cfg = ctx.obj['config'] lister_cfg = cfg['lister'] if lister_cfg['cls'] != 'local': click.echo('A local lister configuration is required') ctx.exit(1) db_url = lister_cfg['args']['db'] db_engine = create_engine(db_url) registry = {} for lister, entrypoint in LISTERS.items(): logger.info('Loading lister %s', lister) registry[lister] = entrypoint.load()() logger.info('Initializing database') initialize(db_engine, drop_tables) for lister, entrypoint in LISTERS.items(): registry_entry = registry[lister] init_hook = registry_entry.get('init') if callable(init_hook): logger.info('Calling init hook for %s', lister) init_hook(db_engine) -@lister.command(name='register-task-types', context_settings=CONTEXT_SETTINGS) -@click.option('--lister', '-l', 'listers', multiple=True, - default=('all', ), show_default=True, - help='Only registers task-types for these listers', - type=click.Choice(['all'] + SUPPORTED_LISTERS)) -@click.pass_context -def register_task_types(ctx, listers): - """Insert missing task-type entries in the scheduler - - According to declared tasks in each loaded lister plugin. - """ - - cfg = ctx.obj['config'] - scheduler = get_scheduler(**cfg['scheduler']) - - for lister, entrypoint in LISTERS.items(): - if 'all' not in listers and lister not in listers: - continue - logger.info('Loading lister %s', lister) - - registry_entry = entrypoint.load()() - for task_module in registry_entry['task_modules']: - mod = import_module(task_module) - for task_name in (x for x in dir(mod) if not x.startswith('_')): - taskobj = getattr(mod, task_name) - if isinstance(taskobj, celery.app.task.Task): - task_type = task_name.replace('_', '-') - task_cfg = registry_entry.get('task_types', {}).get( - task_type, {}) - ensure_task_type(task_type, taskobj, task_cfg, scheduler) - - -def ensure_task_type(task_type, swhtask, task_config, scheduler): - """Ensure a task-type is known by the scheduler - - Args: - task_type (str): the type of the task to check/insert (correspond to - the 'type' field in the db) - swhtask (SWHTask): the SWHTask instance the task-type correspond to - task_config (dict): a dict with specific/overloaded values for the - task-type to be created - scheduler: the scheduler object used to access the scheduler db - """ - for suffix, defaults in DEFAULT_TASK_TYPE.items(): - if task_type.endswith('-' + suffix): - task_type_dict = defaults.copy() - break - else: - task_type_dict = DEFAULT_TASK_TYPE['*'].copy() - - task_type_dict['type'] = task_type - task_type_dict['backend_name'] = swhtask.name - if swhtask.__doc__: - task_type_dict['description'] = swhtask.__doc__.splitlines()[0] - - task_type_dict.update(task_config) - - current_task_type = scheduler.get_task_type(task_type) - if current_task_type: - # check some stuff - if current_task_type['backend_name'] != task_type_dict['backend_name']: - logger.warning('Existing task type %s for lister %s has a ' - 'different backend name than current ' - 'code version provides (%s vs. %s)', - task_type, - lister, - current_task_type['backend_name'], - task_type_dict['backend_name'], - ) - else: - logger.info('Create task type %s in scheduler', task_type) - logger.debug(' %s', task_type_dict) - scheduler.create_task_type(task_type_dict) - - @lister.command(name='run', context_settings=CONTEXT_SETTINGS, help='Trigger a full listing run for a particular forge ' 'instance. The output of this listing results in ' '"oneshot" tasks in the scheduler db with a priority ' 'defined by the user') @click.option('--lister', '-l', help='Lister to run', type=click.Choice(SUPPORTED_LISTERS)) @click.option('--priority', '-p', default='high', type=click.Choice(['high', 'medium', 'low']), help='Task priority for the listed repositories to ingest') @click.argument('options', nargs=-1) @click.pass_context def run(ctx, lister, priority, options): from swh.scheduler.cli.utils import parse_options config = deepcopy(ctx.obj['config']) if options: config.update(parse_options(options)[1]) config['priority'] = priority config['policy'] = 'oneshot' get_lister(lister, **config).run() if __name__ == '__main__': lister() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 2e9f9a1..3224c81 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -1,142 +1,67 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import glob import pytest -import traceback -from datetime import timedelta -import yaml - -from swh.core.utils import numfile_sortkey as sortkey -from swh.scheduler import get_scheduler -from swh.scheduler.tests.conftest import DUMP_FILES from swh.lister.core.lister_base import ListerBase -from swh.lister.cli import lister as cli, get_lister, SUPPORTED_LISTERS +from swh.lister.cli import get_lister, SUPPORTED_LISTERS from .test_utils import init_db -from click.testing import CliRunner - - -@pytest.fixture -def swh_scheduler_config(request, postgresql_proc, postgresql): - scheduler_config = { - 'db': 'postgresql://{user}@{host}:{port}/{dbname}'.format( - host=postgresql_proc.host, - port=postgresql_proc.port, - user='postgres', - dbname='tests') - } - - all_dump_files = sorted(glob.glob(DUMP_FILES), key=sortkey) - - cursor = postgresql.cursor() - for fname in all_dump_files: - with open(fname) as fobj: - cursor.execute(fobj.read()) - postgresql.commit() - - return scheduler_config def test_get_lister_wrong_input(): """Unsupported lister should raise""" with pytest.raises(ValueError) as e: get_lister('unknown', 'db-url') assert "Invalid lister" in str(e.value) def test_get_lister(): """Instantiating a supported lister should be ok """ db_url = init_db().url() for lister_name in SUPPORTED_LISTERS: lst = get_lister(lister_name, db_url) assert isinstance(lst, ListerBase) def test_get_lister_override(): """Overriding the lister configuration should populate its config """ db_url = init_db().url() listers = { 'gitlab': 'https://other.gitlab.uni/api/v4/', 'phabricator': 'https://somewhere.org/api/diffusion.repository.search', 'cgit': 'https://some.where/cgit', } # check the override ends up defined in the lister for lister_name, url in listers.items(): lst = get_lister( lister_name, db_url, **{ 'url': url, 'priority': 'high', 'policy': 'oneshot', }) assert lst.url == url assert lst.config['priority'] == 'high' assert lst.config['policy'] == 'oneshot' # check the default urls are used and not the override (since it's not # passed) for lister_name, url in listers.items(): lst = get_lister(lister_name, db_url) # no override so this does not end up in lister's configuration assert 'url' not in lst.config assert 'priority' not in lst.config assert 'oneshot' not in lst.config assert lst.url == lst.DEFAULT_URL - - -def test_task_types(swh_scheduler_config, tmp_path): - configfile = tmp_path / 'config.yml' - config = { - 'scheduler': { - 'cls': 'local', - 'args': swh_scheduler_config - } - } - configfile.write_text(yaml.dump(config)) - runner = CliRunner() - result = runner.invoke(cli, [ - '--config-file', configfile.as_posix(), - 'register-task-types']) - - assert result.exit_code == 0, traceback.print_exception(*result.exc_info) - - scheduler = get_scheduler(**config['scheduler']) - all_tasks = [ - 'list-bitbucket-full', 'list-bitbucket-incremental', - 'list-cran', - 'list-cgit', - 'list-debian-distribution', - 'list-gitlab-full', 'list-gitlab-incremental', - 'list-github-full', 'list-github-incremental', - 'list-gnu-full', - 'list-npm-full', 'list-npm-incremental', - 'list-phabricator-full', - 'list-packagist', - 'list-pypi', - ] - for task in all_tasks: - task_type_desc = scheduler.get_task_type(task) - assert task_type_desc - assert task_type_desc['type'] == task - assert task_type_desc['backoff_factor'] == 1 - - if task == 'list-npm-full': - delay = timedelta(days=7) # overloaded in the plugin registry - elif task.endswith('-full'): - delay = timedelta(days=90) # default value for 'full' lister tasks - else: - delay = timedelta(days=1) # default value for other lister tasks - assert task_type_desc['default_interval'] == delay, task