diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,3 @@ swh.core -swh.storage >= 0.0.122 -swh.storage[schemata] -swh.scheduler >= 0.0.39 +swh.storage[schemata] >= 0.0.122 +swh.scheduler >= 0.0.58 diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ -pytest<4 +pytest pytest-postgresql requests_mock testing.postgresql diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -55,6 +55,18 @@ swh-lister=swh.lister.cli:cli [swh.cli.subcommands] lister=swh.lister.cli:lister + [swh.workers] + lister.bitbucket=swh.lister.bitbucket:register + lister.cgit=swh.lister.cgit:register + lister.cran=swh.lister.cran:register + lister.debian=swh.lister.debian:register + lister.github=swh.lister.github:register + lister.gitlab=swh.lister.gitlab:register + lister.gnu=swh.lister.gnu:register + lister.npm=swh.lister.npm:register + lister.packagist=swh.lister.packagist:register + lister.phabricator=swh.lister.phabricator:register + lister.pypi=swh.lister.pypi:register ''', classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/bitbucket/__init__.py b/swh/lister/bitbucket/__init__.py --- a/swh/lister/bitbucket/__init__.py +++ b/swh/lister/bitbucket/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import BitBucketModel + from .lister import BitBucketLister + + return {'models': [BitBucketModel], + 'lister': BitBucketLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py --- a/swh/lister/cgit/__init__.py +++ b/swh/lister/cgit/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 
3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import CGitModel + from .lister import CGitLister + + return {'models': [CGitModel], + 'lister': CGitLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/cli.py b/swh/lister/cli.py --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -4,191 +4,103 @@ # See top-level LICENSE file for more information import logging +import pkg_resources +from copy import deepcopy + import click +from sqlalchemy import create_engine from swh.core.cli import CONTEXT_SETTINGS +from swh.lister.core.models import initialize logger = logging.getLogger(__name__) -SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi', - 'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist'] - - -# Base urls for most listers -DEFAULT_BASEURLS = { - 'gitlab': 'https://gitlab.com/api/v4/', - 'phabricator': 'https://forge.softwareheritage.org', -} +LISTERS = {entry_point.name.split('.', 1)[1]: entry_point + for entry_point in pkg_resources.iter_entry_points('swh.workers') + if entry_point.name.split('.', 1)[0] == 'lister'} +SUPPORTED_LISTERS = list(LISTERS) -def get_lister(lister_name, db_url, drop_tables=False, **conf): +def get_lister(lister_name, db_url=None, **conf): """Instantiate a lister given its name. Args: lister_name (str): Lister's name - db_url (str): Db's service url access - conf (dict): Extra configuration (policy, priority for example) + conf (dict): Configuration dict (lister db cnx, policy, priority...) 
Returns: Tuple (instantiated lister, drop_tables function, init schema function, insert minimum data function) """ - override_conf = { - 'lister': { - 'cls': 'local', - 'args': {'db': db_url} - }, - **conf, - } - - # To allow api_baseurl override per lister - if 'api_baseurl' in override_conf: - api_baseurl = override_conf.pop('api_baseurl') - else: - api_baseurl = DEFAULT_BASEURLS.get(lister_name) - - insert_minimum_data_fn = None - if lister_name == 'github': - from .github.models import IndexingModelBase as ModelBase - from .github.lister import GitHubLister - - _lister = GitHubLister(api_baseurl='https://api.github.com', - override_config=override_conf) - elif lister_name == 'bitbucket': - from .bitbucket.models import IndexingModelBase as ModelBase - from .bitbucket.lister import BitBucketLister - _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0', - override_config=override_conf) - - elif lister_name == 'gitlab': - from .gitlab.models import ModelBase - from .gitlab.lister import GitLabLister - _lister = GitLabLister(api_baseurl=api_baseurl, - override_config=override_conf) - elif lister_name == 'debian': - from .debian.lister import DebianLister - ModelBase = DebianLister.MODEL # noqa - _lister = DebianLister(override_config=override_conf) - - def insert_minimum_data_fn(lister_name, lister): - logger.info('Inserting minimal data for %s', lister_name) - from swh.storage.schemata.distribution import ( - Distribution, Area) - d = Distribution( - name='Debian', - type='deb', - mirror_uri='http://deb.debian.org/debian/') - lister.db_session.add(d) - - areas = [] - for distribution_name in ['stretch']: - for area_name in ['main', 'contrib', 'non-free']: - areas.append(Area( - name='%s/%s' % (distribution_name, area_name), - distribution=d, - )) - lister.db_session.add_all(areas) - lister.db_session.commit() - - elif lister_name == 'pypi': - from .pypi.models import ModelBase - from .pypi.lister import PyPILister - _lister = 
PyPILister(override_config=override_conf) - - elif lister_name == 'npm': - from .npm.models import IndexingModelBase as ModelBase - from .npm.models import NpmVisitModel - from .npm.lister import NpmLister - _lister = NpmLister(override_config=override_conf) - - def insert_minimum_data_fn(lister_name, lister): - logger.info('Inserting minimal data for %s', lister_name) - if drop_tables: - NpmVisitModel.metadata.drop_all(lister.db_engine) - NpmVisitModel.metadata.create_all(lister.db_engine) - - elif lister_name == 'phabricator': - from .phabricator.models import IndexingModelBase as ModelBase - from .phabricator.lister import PhabricatorLister - _lister = PhabricatorLister(api_baseurl=api_baseurl, - override_config=override_conf) - - elif lister_name == 'gnu': - from .gnu.models import ModelBase - from .gnu.lister import GNULister - _lister = GNULister(override_config=override_conf) - - elif lister_name == 'cran': - from .cran.models import ModelBase - from .cran.lister import CRANLister - _lister = CRANLister(override_config=override_conf) - - elif lister_name == 'cgit': - from .cgit.models import ModelBase - from .cgit.lister import CGitLister - _lister = CGitLister(url=api_baseurl, - override_config=override_conf) - - elif lister_name == 'packagist': - from .packagist.models import ModelBase # noqa - from .packagist.lister import PackagistLister - _lister = PackagistLister(override_config=override_conf) - - else: + if lister_name not in LISTERS: raise ValueError( 'Invalid lister %s: only supported listers are %s' % (lister_name, SUPPORTED_LISTERS)) - - drop_table_fn = None - if drop_tables: - def drop_table_fn(lister_name, lister): - logger.info('Dropping tables for %s', lister_name) - ModelBase.metadata.drop_all(lister.db_engine) - - def init_schema_fn(lister_name, lister): - logger.info('Creating tables for %s', lister_name) - ModelBase.metadata.create_all(lister.db_engine) - - return _lister, drop_table_fn, init_schema_fn, insert_minimum_data_fn + if db_url: 
+ conf['lister'] = {'cls': 'local', 'args': {'db': db_url}} + # Load the registry entry (a dict) returned by the lister's register() hook + registry_entry = LISTERS[lister_name].load()() + lister_cls = registry_entry['lister'] + lister = lister_cls(override_config=conf) + return lister @click.group(name='lister', context_settings=CONTEXT_SETTINGS) +@click.option('--config-file', '-C', default=None, + type=click.Path(exists=True, dir_okay=False,), + help="Configuration file.") +@click.option('--db-url', '-d', default=None, + help='SQLAlchemy DB URL; see ' + '') # noqa @click.pass_context -def lister(ctx): +def lister(ctx, config_file, db_url): '''Software Heritage Lister tools.''' - pass + from swh.core import config + ctx.ensure_object(dict) + + override_conf = {} + if db_url: + override_conf['lister'] = { + 'cls': 'local', + 'args': {'db': db_url} + } + conf = config.read(config_file, override_conf) + ctx.obj['config'] = conf + ctx.obj['override_conf'] = override_conf @lister.command(name='db-init', context_settings=CONTEXT_SETTINGS) -@click.option('--db-url', '-d', default='postgres:///lister', - help='SQLAlchemy DB URL; see ' - '') # noqa -@click.argument('listers', required=1, nargs=-1, - type=click.Choice(SUPPORTED_LISTERS + ['all'])) @click.option('--drop-tables', '-D', is_flag=True, default=False, help='Drop tables before creating the database schema') @click.pass_context -def cli(ctx, db_url, listers, drop_tables): +def db_init(ctx, drop_tables): """Initialize the database model for given listers.
""" - if 'all' in listers: - listers = SUPPORTED_LISTERS - for lister_name in listers: - logger.info('Initializing lister %s', lister_name) - lister, drop_schema_fn, init_schema_fn, insert_minimum_data_fn = \ - get_lister(lister_name, db_url, drop_tables=drop_tables) + lister_cfg = ctx.obj['config']['lister'] + if lister_cfg['cls'] != 'local': + click.echo('A local lister configuration is required') + ctx.exit(1) + + db_url = lister_cfg['args']['db'] + db_engine = create_engine(db_url) - if drop_schema_fn: - drop_schema_fn(lister_name, lister) + registry = {} + for lister, entrypoint in LISTERS.items(): + logger.info('Loading lister %s', lister) + registry[lister] = entrypoint.load()() - init_schema_fn(lister_name, lister) + logger.info('Initializing database') + initialize(db_engine, drop_tables) - if insert_minimum_data_fn: - insert_minimum_data_fn(lister_name, lister) + for lister, registry_entry in registry.items(): + init_hook = registry_entry.get('init') + if callable(init_hook): + logger.info('Calling init hook for %s', lister) + init_hook(db_engine) @lister.command(name='run', context_settings=CONTEXT_SETTINGS, @@ -196,9 +108,6 @@ 'instance.
The output of this listing results in ' '"oneshot" tasks in the scheduler db with a priority ' 'defined by the user') -@click.option('--db-url', '-d', default='postgres:///lister', - help='SQLAlchemy DB URL; see ' - '') # noqa @click.option('--lister', '-l', help='Lister to run', type=click.Choice(SUPPORTED_LISTERS)) @click.option('--priority', '-p', default='high', @@ -206,23 +115,19 @@ help='Task priority for the listed repositories to ingest') @click.argument('options', nargs=-1) @click.pass_context -def run(ctx, db_url, lister, priority, options): +def run(ctx, lister, priority, options): from swh.scheduler.cli.utils import parse_options + config = deepcopy(ctx.obj['config']) + if options: - _, kwargs = parse_options(options) - else: - kwargs = {} + config.update(parse_options(options)[1]) - override_config = { - 'priority': priority, - 'policy': 'oneshot', - **kwargs, - } + config['priority'] = priority + config['policy'] = 'oneshot' - lister, _, _, _ = get_lister(lister, db_url, **override_config) - lister.run() + get_lister(lister, **config).run() if __name__ == '__main__': - cli() + lister() diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py --- a/swh/lister/core/models.py +++ b/swh/lister/core/models.py @@ -4,12 +4,15 @@ import abc from datetime import datetime +import logging from sqlalchemy import Column, DateTime, Integer, String from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta from .abstractattribute import AbstractAttribute +logger = logging.getLogger(__name__) + SQLBase = declarative_base() @@ -46,3 +49,23 @@ # The value used for sorting, segmenting, or api query paging, # because uids aren't always sequential. indexable = AbstractAttribute('Column(, index=True)') + + +def initialize(db_engine, drop_tables=False, **kwargs): + """Default database initialization function for a lister. + + Typically called from the lister's initialization hook. 
+ + Args: + db_engine: the SQLAlchemy DB engine. + drop_tables (bool): if True, tables will be dropped before + (re)creating them. + **kwargs: accepted but not used by this function. + """ + + if drop_tables: + logger.info('Dropping tables') + SQLBase.metadata.drop_all(db_engine, checkfirst=True) + + logger.info('Creating tables') + SQLBase.metadata.create_all(db_engine, checkfirst=True) diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -1,19 +1 @@ -import pytest from swh.scheduler.tests.conftest import * # noqa - - -@pytest.fixture(scope='session') -def celery_includes(): - return [ - 'swh.lister.bitbucket.tasks', - 'swh.lister.cgit.tasks', - 'swh.lister.cran.tasks', - 'swh.lister.debian.tasks', - 'swh.lister.github.tasks', - 'swh.lister.gitlab.tasks', - 'swh.lister.gnu.tasks', - 'swh.lister.npm.tasks', - 'swh.lister.packagist.tasks', - 'swh.lister.phabricator.tasks', - 'swh.lister.pypi.tasks', - ] diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py --- a/swh/lister/cran/__init__.py +++ b/swh/lister/cran/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import CRANModel + from .lister import CRANLister + + return {'models': [CRANModel], + 'lister': CRANLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py --- a/swh/lister/debian/__init__.py +++ b/swh/lister/debian/__init__.py @@ -0,0 +1,40 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def debian_init(db_engine, override_conf=None): + from
swh.storage.schemata.distribution import ( + Distribution, Area) + from .lister import DebianLister + + lister = DebianLister(override_config=override_conf) + + if not lister.db_session\ + .query(Distribution)\ + .filter(Distribution.name == 'Debian')\ + .one_or_none(): + + d = Distribution( + name='Debian', + type='deb', + mirror_uri='http://deb.debian.org/debian/') + lister.db_session.add(d) + + areas = [] + for distribution_name in ['stretch', 'buster']: + for area_name in ['main', 'contrib', 'non-free']: + areas.append(Area( + name='%s/%s' % (distribution_name, area_name), + distribution=d, + )) + lister.db_session.add_all(areas) + lister.db_session.commit() + + +def register(): + from .lister import DebianLister + return {'models': [DebianLister.MODEL], + 'lister': DebianLister, + 'task_modules': ['%s.tasks' % __name__], + 'init': debian_init} diff --git a/swh/lister/github/__init__.py b/swh/lister/github/__init__.py --- a/swh/lister/github/__init__.py +++ b/swh/lister/github/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import GitHubModel + from .lister import GitHubLister + + return {'models': [GitHubModel], + 'lister': GitHubLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py --- a/swh/lister/gitlab/__init__.py +++ b/swh/lister/gitlab/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import GitLabModel + from .lister import GitLabLister + + return {'models': [GitLabModel], + 'lister': GitLabLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/gnu/__init__.py 
b/swh/lister/gnu/__init__.py --- a/swh/lister/gnu/__init__.py +++ b/swh/lister/gnu/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import GNUModel + from .lister import GNULister + + return {'models': [GNUModel], + 'lister': GNULister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/npm/__init__.py b/swh/lister/npm/__init__.py --- a/swh/lister/npm/__init__.py +++ b/swh/lister/npm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import NpmVisitModel, NpmModel + from .lister import NpmLister + + return {'models': [NpmVisitModel, NpmModel], + 'lister': NpmLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py --- a/swh/lister/packagist/__init__.py +++ b/swh/lister/packagist/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import PackagistModel + from .lister import PackagistLister + + return {'models': [PackagistModel], + 'lister': PackagistLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/phabricator/__init__.py b/swh/lister/phabricator/__init__.py --- a/swh/lister/phabricator/__init__.py +++ b/swh/lister/phabricator/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import 
PhabricatorModel + from .lister import PhabricatorLister + + return {'models': [PhabricatorModel], + 'lister': PhabricatorLister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py --- a/swh/lister/pypi/__init__.py +++ b/swh/lister/pypi/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .models import PyPIModel + from .lister import PyPILister + + return {'models': [PyPIModel], + 'lister': PyPILister, + 'task_modules': ['%s.tasks' % __name__], + } diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -6,7 +6,7 @@ import pytest from swh.lister.core.lister_base import ListerBase -from swh.lister.cli import get_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS +from swh.lister.cli import get_lister, SUPPORTED_LISTERS from .test_utils import init_db @@ -24,32 +24,9 @@ """ db_url = init_db().url() - supported_listers_with_init = {'npm', 'debian'} - supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init - for lister_name in supported_listers: - lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url) - + for lister_name in SUPPORTED_LISTERS: + lst = get_lister(lister_name, db_url) assert isinstance(lst, ListerBase) - assert drop_fn is None - assert init_fn is not None - assert insert_data_fn is None - - for lister_name in supported_listers_with_init: - lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url) - - assert isinstance(lst, ListerBase) - assert drop_fn is None - assert init_fn is not None - assert insert_data_fn is not None - - for lister_name in supported_listers_with_init: - lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url, - drop_tables=True) - - assert 
isinstance(lst, ListerBase) - assert drop_fn is not None - assert init_fn is not None - assert insert_data_fn is not None def test_get_lister_override(): @@ -67,9 +44,9 @@ # check the override ends up defined in the lister for lister_name, (url_key, url_value) in listers.items(): - lst, drop_fn, init_fn, insert_data_fn = get_lister( + lst = get_lister( lister_name, db_url, **{ - 'api_baseurl': url_value, + url_key: url_value, 'priority': 'high', 'policy': 'oneshot', }) @@ -81,14 +58,9 @@ # check the default urls are used and not the override (since it's not # passed) for lister_name, (url_key, url_value) in listers.items(): - lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url) + lst = get_lister(lister_name, db_url) # no override so this does not end up in lister's configuration assert url_key not in lst.config - - # then the default base url is used - default_url = DEFAULT_BASEURLS[lister_name] - - assert getattr(lst, url_key) == default_url assert 'priority' not in lst.config assert 'oneshot' not in lst.config