diff --git a/swh/lister/cli.py b/swh/lister/cli.py --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,6 +15,154 @@ 'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist'] +# Base urls for most listers +DEFAULT_BASEURLS = { + 'github': 'https://api.github.com', + 'bitbucket': 'https://api.bitbucket.org/2.0', + 'gitlab': 'https://gitlab.com/api/v4/', + 'phabricator': 'https://forge.softwareheritage.org', + 'cgit': ( + 'http://git.savannah.gnu.org/cgit/', + 'http://git.savannah.gnu.org/git/' + ), +} + + +def new_lister(lister_name, db_url, drop_tables=False, **conf): + """Instantiate a lister given its name. + + Args: + lister_name (str): Lister's name + db_url (str): Db's service url access + conf (dict): Extra configuration (policy, priority, api_baseurl...) + + Returns: + Tuple (instantiated lister, drop_tables function, init schema function, + insert minimum data function) + + """ + override_conf = { + 'lister': { + 'cls': 'local', + 'args': {'db': db_url} + }, + **conf, + } + + # Per-lister api_baseurl override; default used when missing or empty + api_baseurl = (override_conf.pop('api_baseurl', None) + or DEFAULT_BASEURLS.get(lister_name)) + + insert_minimum_data_fn = None + if lister_name == 'github': + from .github.models import IndexingModelBase as ModelBase + from .github.lister import GitHubLister + + _lister = GitHubLister(api_baseurl=api_baseurl, + override_config=override_conf) + elif lister_name == 'bitbucket': + from .bitbucket.models import IndexingModelBase as ModelBase + from .bitbucket.lister import BitBucketLister + _lister = BitBucketLister(api_baseurl=api_baseurl, + override_config=override_conf) + + elif lister_name == 'gitlab': + from .gitlab.models import 
ModelBase + from .gitlab.lister import GitLabLister + _lister = GitLabLister(api_baseurl=api_baseurl, + override_config=override_conf) + elif lister_name == 'debian': + from .debian.lister import DebianLister + ModelBase = DebianLister.MODEL # noqa + _lister = DebianLister(override_config=override_conf) + + def insert_minimum_data_fn(lister_name, lister): + logger.info('Inserting minimal data for %s', lister_name) + from swh.storage.schemata.distribution import ( + Distribution, Area) + d = Distribution( + name='Debian', + type='deb', + mirror_uri='http://deb.debian.org/debian/') + lister.db_session.add(d) + + areas = [] + for distribution_name in ['stretch']: + for area_name in ['main', 'contrib', 'non-free']: + areas.append(Area( + name='%s/%s' % (distribution_name, area_name), + distribution=d, + )) + lister.db_session.add_all(areas) + lister.db_session.commit() + + elif lister_name == 'pypi': + from .pypi.models import ModelBase + from .pypi.lister import PyPILister + _lister = PyPILister(override_config=override_conf) + + elif lister_name == 'npm': + from .npm.models import IndexingModelBase as ModelBase + from .npm.models import NpmVisitModel + from .npm.lister import NpmLister + _lister = NpmLister(override_config=override_conf) + + def insert_minimum_data_fn(lister_name, lister): + logger.info('Inserting minimal data for %s', lister_name) + if drop_tables: + NpmVisitModel.metadata.drop_all(lister.db_engine) + NpmVisitModel.metadata.create_all(lister.db_engine) + + elif lister_name == 'phabricator': + from .phabricator.models import IndexingModelBase as ModelBase + from .phabricator.lister import PhabricatorLister + _lister = PhabricatorLister(forge_url=api_baseurl, + override_config=override_conf) + + elif lister_name == 'gnu': + from .gnu.models import ModelBase + from .gnu.lister import GNULister + _lister = GNULister(override_config=override_conf) + + elif lister_name == 'cran': + from .cran.models import ModelBase + from .cran.lister import CRANLister + 
_lister = CRANLister(override_config=override_conf) + + elif lister_name == 'cgit': + from .cgit.models import ModelBase + from .cgit.lister import CGitLister + if isinstance(api_baseurl, str): + _lister = CGitLister(url=api_baseurl, + override_config=override_conf) + else: # tuple + _lister = CGitLister(url=api_baseurl[0], + url_prefix=api_baseurl[1], + override_config=override_conf) + + elif lister_name == 'packagist': + from .packagist.models import ModelBase # noqa + from .packagist.lister import PackagistLister + _lister = PackagistLister(override_config=override_conf) + + else: + raise ValueError( + 'Invalid lister %s: only supported listers are %s' % + (lister_name, SUPPORTED_LISTERS)) + + drop_table_fn = None + if drop_tables: + def drop_table_fn(lister_name, lister): + logger.info('Dropping tables for %s', lister_name) + ModelBase.metadata.drop_all(lister.db_engine) + + def init_schema_fn(lister_name, lister): + logger.info('Creating tables for %s', lister_name) + ModelBase.metadata.create_all(lister.db_engine) + + return _lister, drop_table_fn, init_schema_fn, insert_minimum_data_fn + + @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @click.pass_context def lister(ctx): @@ -36,127 +184,56 @@ """Initialize the database model for given listers. 
""" - override_conf = { - 'lister': { - 'cls': 'local', - 'args': {'db': db_url} - } - } - if 'all' in listers: listers = SUPPORTED_LISTERS - for lister in listers: - logger.info('Initializing lister %s', lister) - insert_minimum_data = None - if lister == 'github': - from .github.models import IndexingModelBase as ModelBase - from .github.lister import GitHubLister - - _lister = GitHubLister( - api_baseurl='https://api.github.com', - override_config=override_conf) - elif lister == 'bitbucket': - from .bitbucket.models import IndexingModelBase as ModelBase - from .bitbucket.lister import BitBucketLister - _lister = BitBucketLister( - api_baseurl='https://api.bitbucket.org/2.0', - override_config=override_conf) - - elif lister == 'gitlab': - from .gitlab.models import ModelBase - from .gitlab.lister import GitLabLister - _lister = GitLabLister( - api_baseurl='https://gitlab.com/api/v4/', - override_config=override_conf) - elif lister == 'debian': - from .debian.lister import DebianLister - ModelBase = DebianLister.MODEL # noqa - _lister = DebianLister(override_config=override_conf) - - def insert_minimum_data(lister): - from swh.storage.schemata.distribution import ( - Distribution, Area) - d = Distribution( - name='Debian', - type='deb', - mirror_uri='http://deb.debian.org/debian/') - lister.db_session.add(d) - - areas = [] - for distribution_name in ['stretch']: - for area_name in ['main', 'contrib', 'non-free']: - areas.append(Area( - name='%s/%s' % (distribution_name, area_name), - distribution=d, - )) - lister.db_session.add_all(areas) - lister.db_session.commit() - - elif lister == 'pypi': - from .pypi.models import ModelBase - from .pypi.lister import PyPILister - _lister = PyPILister(override_config=override_conf) - - elif lister == 'npm': - from .npm.models import IndexingModelBase as ModelBase - from .npm.models import NpmVisitModel - from .npm.lister import NpmLister - _lister = NpmLister(override_config=override_conf) - if drop_tables: - 
NpmVisitModel.metadata.drop_all(_lister.db_engine) - NpmVisitModel.metadata.create_all(_lister.db_engine) - - elif lister == 'phabricator': - from .phabricator.models import IndexingModelBase as ModelBase - from .phabricator.lister import PhabricatorLister - _lister = PhabricatorLister( - forge_url='https://forge.softwareheritage.org', - api_token='', - override_config=override_conf) - - elif lister == 'gnu': - from .gnu.models import ModelBase - from .gnu.lister import GNULister - _lister = GNULister(override_config=override_conf) - - elif lister == 'cran': - from .cran.models import ModelBase - from .cran.lister import CRANLister - _lister = CRANLister(override_config=override_conf) - - elif lister == 'cgit': - from .cgit.models import ModelBase - from .cgit.lister import CGitLister - _lister = CGitLister( - url='http://git.savannah.gnu.org/cgit/', - url_prefix='http://git.savannah.gnu.org/git/', - override_config=override_conf) - - elif lister == 'packagist': - from .packagist.models import ModelBase - from .packagist.lister import PackagistLister - _lister = PackagistLister(override_config=override_conf) - - else: - raise ValueError( - 'Invalid lister %s: only supported listers are %s' % - (lister, SUPPORTED_LISTERS)) - - if drop_tables: - logger.info('Dropping tables for %s', lister) - ModelBase.metadata.drop_all(_lister.db_engine) - - logger.info('Creating tables for %s', lister) - ModelBase.metadata.create_all(_lister.db_engine) - - if insert_minimum_data: - logger.info('Inserting minimal data for %s', lister) - try: - insert_minimum_data(_lister) - except Exception: - logger.warning( - 'Failed to insert minimum data in %s', lister) + for lister_name in listers: + logger.info('Initializing lister %s', lister_name) + lister, drop_schema_fn, init_schema_fn, insert_minimum_data_fn = \ + new_lister(lister_name, db_url, drop_tables=drop_tables) + + if drop_schema_fn: + drop_schema_fn(lister_name, lister) + + init_schema_fn(lister_name, lister) + + if 
insert_minimum_data_fn: + insert_minimum_data_fn(lister_name, lister) + + +@lister.command(name='list', context_settings=CONTEXT_SETTINGS, + help='Trigger a full listing for a particular instance. ' + 'Expected use case would be to list some small forge ' + 'instance with a policy "oneshot" and some priority.') +@click.option('--db-url', '-d', default='postgres:///lister', + help='SQLAlchemy DB URL; see ' + '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa +@click.option('--lister', '-l', help='Lister to run', required=True, + type=click.Choice(SUPPORTED_LISTERS)) +@click.option('--priority', '-p', default='high', + type=click.Choice(['high', 'medium', 'low']), + help='Task priority for the listed repositories to ingest') +@click.option('--policy', '-P', + default='oneshot', + type=click.Choice(['recurring', 'oneshot'])) +@click.argument('options', nargs=-1) +@click.pass_context +def list(ctx, db_url, lister, priority, policy, options): + from swh.scheduler.cli.utils import parse_options + + if options: + _, kwargs = parse_options(options) + else: + kwargs = {} + + override_config = { + 'priority': priority, + 'policy': policy, + **kwargs, + } + + lister, _, _, _ = new_lister(lister, db_url, **override_config) + lister.run() if __name__ == '__main__': diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py --- a/swh/lister/core/lister_base.py +++ b/swh/lister/core/lister_base.py @@ -391,8 +391,10 @@ the same information in a different form """ _type = 'load-%s' % origin_type - _policy = 'recurring' - return utils.create_task_dict(_type, _policy, origin_url) + _policy = kwargs.get('policy', 'recurring') + priority = kwargs.get('priority') + kw = {'priority': priority} if priority else {} + return utils.create_task_dict(_type, _policy, origin_url, **kw) def string_pattern_check(self, a, b, c=None): """When comparing indexable types in is_within_bounds, complex strings @@ -460,6 +462,12 @@ for m in models_list: ir = injected_repos[m['uid']] if not ir.task_id: + # Patching the model instance to add 
the policy/priority task + # scheduling + if 'policy' in self.config: + m['policy'] = self.config['policy'] + if 'priority' in self.config: + m['priority'] = self.config['priority'] task_dict = self.task_dict(**m) tasks[_task_key(task_dict)] = (ir, m, task_dict) diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -9,9 +9,9 @@ import requests_mock from sqlalchemy import create_engine -from testing.postgresql import Postgresql from swh.lister.core.abstractattribute import AbstractAttribute +from swh.lister.tests.test_utils import init_db def noop(*args, **kwargs): @@ -166,9 +166,7 @@ @requests_mock.Mocker() def test_fetch_multiple_pages_yesdb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) - initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args'] - initdb_args = ' '.join([initdb_args, '-E UTF-8']) - db = Postgresql(initdb_args=initdb_args) + db = init_db() fl = self.get_fl(override_config={ 'lister': { diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -27,7 +27,8 @@ needed for the ingestion task creation. """ return create_task_dict( - 'load-%s' % origin_type, 'recurring', + 'load-%s' % origin_type, + kwargs.get('policy', 'recurring'), kwargs.get('name'), origin_url, kwargs.get('version'), project_metadata=self.descriptions[kwargs.get('name')]) diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -32,7 +32,8 @@ needed for the ingestion task creation. 
""" return utils.create_task_dict( - 'load-%s' % origin_type, 'recurring', kwargs.get('name'), + 'load-%s' % origin_type, kwargs.get('policy', 'recurring'), + kwargs.get('name'), origin_url, tarballs=self.tarballs[kwargs.get('name')]) def get_file(self): diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -51,7 +51,8 @@ needed for the ingestion task creation. """ - return utils.create_task_dict('load-%s' % origin_type, 'recurring', + return utils.create_task_dict('load-%s' % origin_type, + kwargs.get('policy', 'recurring'), kwargs.get('name'), origin_url) def list_packages(self, response): diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -30,7 +30,7 @@ """ _type = 'load-%s' % origin_type - _policy = 'recurring' + _policy = kwargs.get('policy', 'recurring') project_name = kwargs.get('name') project_metadata_url = kwargs.get('html_url') return utils.create_task_dict( diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py new file mode 100644 --- /dev/null +++ b/swh/lister/tests/test_cli.py @@ -0,0 +1,95 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.lister.core.lister_base import ListerBase +from swh.lister.cli import new_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS + +from .test_utils import init_db + + +def test_new_lister_wrong_input(): + """Unsupported lister should raise""" + with pytest.raises(ValueError) as e: + new_lister('unknown', 'db-url') + + assert "Invalid lister" in str(e.value) + + +def test_new_lister(): + """Instantiating a supported lister should be ok + + """ + db_url = init_db().url() + supported_listers_with_init = 
{'npm', 'debian'} + supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init + for lister_name in supported_listers: + lst, drop_fn, init_fn, insert_data_fn = new_lister(lister_name, db_url) + + assert isinstance(lst, ListerBase) + assert drop_fn is None + assert init_fn is not None + assert insert_data_fn is None + + for lister_name in supported_listers_with_init: + lst, drop_fn, init_fn, insert_data_fn = new_lister(lister_name, db_url) + + assert isinstance(lst, ListerBase) + assert drop_fn is None + assert init_fn is not None + assert insert_data_fn is not None + + for lister_name in supported_listers_with_init: + lst, drop_fn, init_fn, insert_data_fn = new_lister(lister_name, db_url, + drop_tables=True) + + assert isinstance(lst, ListerBase) + assert drop_fn is not None + assert init_fn is not None + assert insert_data_fn is not None + + +def test_new_lister_override(): + """Overriding the lister configuration should populate its config + + """ + db_url = init_db().url() + + listers = { + 'gitlab': ('api_baseurl', 'https://gitlab.uni/api/v4/'), + 'phabricator': ('forge_url', 'https://somewhere.org'), + 'cgit': ('url_prefix', 'https://some-cgit.eu/'), + } + + # check the override ends up defined in the lister + for lister_name, (url_key, url_value) in listers.items(): + lst, drop_fn, init_fn, insert_data_fn = new_lister( + lister_name, db_url, **{ + 'api_baseurl': url_value, + 'priority': 'high', + 'policy': 'oneshot', + }) + + assert getattr(lst, url_key) == url_value + assert lst.config['priority'] == 'high' + assert lst.config['policy'] == 'oneshot' + + # check the default urls are used and not the override (since it's not + # passed) + for lister_name, (url_key, url_value) in listers.items(): + lst, drop_fn, init_fn, insert_data_fn = new_lister(lister_name, db_url) + + # no override so this does not end up in lister's configuration + assert url_key not in lst.config + + # then the default base url is used + default_url = 
DEFAULT_BASEURLS[lister_name] + if isinstance(default_url, tuple): # cgit implementation detail... + default_url = default_url[1] + + assert getattr(lst, url_key) == default_url + assert 'priority' not in lst.config + assert 'policy' not in lst.config diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,9 +1,11 @@ -# Copyright (C) 2018 the Software Heritage developers +# Copyright (C) 2018-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest +from testing.postgresql import Postgresql + from swh.lister import utils @@ -22,3 +24,15 @@ with self.assertRaises(TypeError): list(utils.split_range(100, None)) + + +def init_db(): + """Instantiate a Postgresql test db, adding '-E UTF-8' to initdb args + + Returns: + Postgresql instance (tests call its url() to get the db url) + + """ + initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args'] + initdb_args = ' '.join([initdb_args, '-E UTF-8']) + return Postgresql(initdb_args=initdb_args)