Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9339523
D1504.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
D1504.diff
View Options
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,3 @@
swh.core
-swh.storage >= 0.0.122
-swh.storage[schemata]
-swh.scheduler >= 0.0.39
+swh.storage[schemata] >= 0.0.122
+swh.scheduler >= 0.0.58
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,4 @@
-pytest<4
+pytest
pytest-postgresql
requests_mock
testing.postgresql
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -55,6 +55,18 @@
swh-lister=swh.lister.cli:cli
[swh.cli.subcommands]
lister=swh.lister.cli:lister
+ [swh.workers]
+ lister.bitbucket=swh.lister.bitbucket:register
+ lister.cgit=swh.lister.cgit:register
+ lister.cran=swh.lister.cran:register
+ lister.debian=swh.lister.debian:register
+ lister.github=swh.lister.github:register
+ lister.gitlab=swh.lister.gitlab:register
+ lister.gnu=swh.lister.gnu:register
+ lister.npm=swh.lister.npm:register
+ lister.packagist=swh.lister.packagist:register
+ lister.phabricator=swh.lister.phabricator:register
+ lister.pypi=swh.lister.pypi:register
''',
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/bitbucket/__init__.py b/swh/lister/bitbucket/__init__.py
--- a/swh/lister/bitbucket/__init__.py
+++ b/swh/lister/bitbucket/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import BitBucketModel
+ from .lister import BitBucketLister
+
+ return {'models': [BitBucketModel],
+ 'lister': BitBucketLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
--- a/swh/lister/cgit/__init__.py
+++ b/swh/lister/cgit/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import CGitModel
+ from .lister import CGitLister
+
+ return {'models': [CGitModel],
+ 'lister': CGitLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -4,191 +4,103 @@
# See top-level LICENSE file for more information
import logging
+import pkg_resources
+from copy import deepcopy
+
import click
+from sqlalchemy import create_engine
from swh.core.cli import CONTEXT_SETTINGS
+from swh.lister.core.models import initialize
logger = logging.getLogger(__name__)
-SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist']
-
-
-# Base urls for most listers
-DEFAULT_BASEURLS = {
- 'gitlab': 'https://gitlab.com/api/v4/',
- 'phabricator': 'https://forge.softwareheritage.org',
-}
+LISTERS = {entry_point.name.split('.', 1)[1]: entry_point
+ for entry_point in pkg_resources.iter_entry_points('swh.workers')
+ if entry_point.name.split('.', 1)[0] == 'lister'}
+SUPPORTED_LISTERS = list(LISTERS)
-def get_lister(lister_name, db_url, drop_tables=False, **conf):
+def get_lister(lister_name, db_url=None, **conf):
"""Instantiate a lister given its name.
Args:
lister_name (str): Lister's name
- db_url (str): Db's service url access
- conf (dict): Extra configuration (policy, priority for example)
+ conf (dict): Configuration dict (lister db cnx, policy, priority...)
Returns:
Tuple (instantiated lister, drop_tables function, init schema function,
insert minimum data function)
"""
- override_conf = {
- 'lister': {
- 'cls': 'local',
- 'args': {'db': db_url}
- },
- **conf,
- }
-
- # To allow api_baseurl override per lister
- if 'api_baseurl' in override_conf:
- api_baseurl = override_conf.pop('api_baseurl')
- else:
- api_baseurl = DEFAULT_BASEURLS.get(lister_name)
-
- insert_minimum_data_fn = None
- if lister_name == 'github':
- from .github.models import IndexingModelBase as ModelBase
- from .github.lister import GitHubLister
-
- _lister = GitHubLister(api_baseurl='https://api.github.com',
- override_config=override_conf)
- elif lister_name == 'bitbucket':
- from .bitbucket.models import IndexingModelBase as ModelBase
- from .bitbucket.lister import BitBucketLister
- _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0',
- override_config=override_conf)
-
- elif lister_name == 'gitlab':
- from .gitlab.models import ModelBase
- from .gitlab.lister import GitLabLister
- _lister = GitLabLister(api_baseurl=api_baseurl,
- override_config=override_conf)
- elif lister_name == 'debian':
- from .debian.lister import DebianLister
- ModelBase = DebianLister.MODEL # noqa
- _lister = DebianLister(override_config=override_conf)
-
- def insert_minimum_data_fn(lister_name, lister):
- logger.info('Inserting minimal data for %s', lister_name)
- from swh.storage.schemata.distribution import (
- Distribution, Area)
- d = Distribution(
- name='Debian',
- type='deb',
- mirror_uri='http://deb.debian.org/debian/')
- lister.db_session.add(d)
-
- areas = []
- for distribution_name in ['stretch']:
- for area_name in ['main', 'contrib', 'non-free']:
- areas.append(Area(
- name='%s/%s' % (distribution_name, area_name),
- distribution=d,
- ))
- lister.db_session.add_all(areas)
- lister.db_session.commit()
-
- elif lister_name == 'pypi':
- from .pypi.models import ModelBase
- from .pypi.lister import PyPILister
- _lister = PyPILister(override_config=override_conf)
-
- elif lister_name == 'npm':
- from .npm.models import IndexingModelBase as ModelBase
- from .npm.models import NpmVisitModel
- from .npm.lister import NpmLister
- _lister = NpmLister(override_config=override_conf)
-
- def insert_minimum_data_fn(lister_name, lister):
- logger.info('Inserting minimal data for %s', lister_name)
- if drop_tables:
- NpmVisitModel.metadata.drop_all(lister.db_engine)
- NpmVisitModel.metadata.create_all(lister.db_engine)
-
- elif lister_name == 'phabricator':
- from .phabricator.models import IndexingModelBase as ModelBase
- from .phabricator.lister import PhabricatorLister
- _lister = PhabricatorLister(api_baseurl=api_baseurl,
- override_config=override_conf)
-
- elif lister_name == 'gnu':
- from .gnu.models import ModelBase
- from .gnu.lister import GNULister
- _lister = GNULister(override_config=override_conf)
-
- elif lister_name == 'cran':
- from .cran.models import ModelBase
- from .cran.lister import CRANLister
- _lister = CRANLister(override_config=override_conf)
-
- elif lister_name == 'cgit':
- from .cgit.models import ModelBase
- from .cgit.lister import CGitLister
- _lister = CGitLister(url=api_baseurl,
- override_config=override_conf)
-
- elif lister_name == 'packagist':
- from .packagist.models import ModelBase # noqa
- from .packagist.lister import PackagistLister
- _lister = PackagistLister(override_config=override_conf)
-
- else:
+ if lister_name not in LISTERS:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
(lister_name, SUPPORTED_LISTERS))
-
- drop_table_fn = None
- if drop_tables:
- def drop_table_fn(lister_name, lister):
- logger.info('Dropping tables for %s', lister_name)
- ModelBase.metadata.drop_all(lister.db_engine)
-
- def init_schema_fn(lister_name, lister):
- logger.info('Creating tables for %s', lister_name)
- ModelBase.metadata.create_all(lister.db_engine)
-
- return _lister, drop_table_fn, init_schema_fn, insert_minimum_data_fn
+ if db_url:
+ conf['lister'] = {'cls': 'local', 'args': {'db': db_url}}
+ # Load the plugin's registry entry and instantiate its lister class
+ registry_entry = LISTERS[lister_name].load()()
+ lister_cls = registry_entry['lister']
+ lister = lister_cls(override_config=conf)
+ return lister
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
+@click.option('--config-file', '-C', default=None,
+ type=click.Path(exists=True, dir_okay=False,),
+ help="Configuration file.")
+@click.option('--db-url', '-d', default=None,
+ help='SQLAlchemy DB URL; see '
+ '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.pass_context
-def lister(ctx):
+def lister(ctx, config_file, db_url):
'''Software Heritage Lister tools.'''
- pass
+ from swh.core import config
+ ctx.ensure_object(dict)
+
+ override_conf = {}
+ if db_url:
+ override_conf['lister'] = {
+ 'cls': 'local',
+ 'args': {'db': db_url}
+ }
+ conf = config.read(config_file, override_conf)
+ ctx.obj['config'] = conf
+ ctx.obj['override_conf'] = override_conf
@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
-@click.option('--db-url', '-d', default='postgres:///lister',
- help='SQLAlchemy DB URL; see '
- '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
-@click.argument('listers', required=1, nargs=-1,
- type=click.Choice(SUPPORTED_LISTERS + ['all']))
@click.option('--drop-tables', '-D', is_flag=True, default=False,
help='Drop tables before creating the database schema')
@click.pass_context
-def cli(ctx, db_url, listers, drop_tables):
+def db_init(ctx, drop_tables):
"""Initialize the database model for given listers.
"""
- if 'all' in listers:
- listers = SUPPORTED_LISTERS
- for lister_name in listers:
- logger.info('Initializing lister %s', lister_name)
- lister, drop_schema_fn, init_schema_fn, insert_minimum_data_fn = \
- get_lister(lister_name, db_url, drop_tables=drop_tables)
+ cfg = ctx.obj['config']
+ lister_cfg = cfg['lister']
+ if lister_cfg['cls'] != 'local':
+ click.echo('A local lister configuration is required')
+ ctx.exit(1)
+
+ db_url = lister_cfg['args']['db']
+ db_engine = create_engine(db_url)
- if drop_schema_fn:
- drop_schema_fn(lister_name, lister)
+ for lister, entrypoint in LISTERS.items():
+ logger.info('Loading lister %s', lister)
+ registry_entry = entrypoint.load()()
- init_schema_fn(lister_name, lister)
+ logger.info('Initializing database')
+ initialize(db_engine, drop_tables)
- if insert_minimum_data_fn:
- insert_minimum_data_fn(lister_name, lister)
+ for lister, entrypoint in LISTERS.items():
+ init_hook = registry_entry.get('init')
+ if callable(init_hook):
+ logger.info('Calling init hook for %s', lister)
+ init_hook(db_engine)
@lister.command(name='run', context_settings=CONTEXT_SETTINGS,
@@ -196,9 +108,6 @@
'instance. The output of this listing results in '
'"oneshot" tasks in the scheduler db with a priority '
'defined by the user')
-@click.option('--db-url', '-d', default='postgres:///lister',
- help='SQLAlchemy DB URL; see '
- '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.option('--lister', '-l', help='Lister to run',
type=click.Choice(SUPPORTED_LISTERS))
@click.option('--priority', '-p', default='high',
@@ -206,23 +115,19 @@
help='Task priority for the listed repositories to ingest')
@click.argument('options', nargs=-1)
@click.pass_context
-def run(ctx, db_url, lister, priority, options):
+def run(ctx, lister, priority, options):
from swh.scheduler.cli.utils import parse_options
+ config = deepcopy(ctx.obj['config'])
+
if options:
- _, kwargs = parse_options(options)
- else:
- kwargs = {}
+ config.update(parse_options(options)[1])
- override_config = {
- 'priority': priority,
- 'policy': 'oneshot',
- **kwargs,
- }
+ config['priority'] = priority
+ config['policy'] = 'oneshot'
- lister, _, _, _ = get_lister(lister, db_url, **override_config)
- lister.run()
+ get_lister(lister, **config).run()
if __name__ == '__main__':
- cli()
+ lister()
diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py
--- a/swh/lister/core/models.py
+++ b/swh/lister/core/models.py
@@ -4,12 +4,15 @@
import abc
from datetime import datetime
+import logging
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from .abstractattribute import AbstractAttribute
+logger = logging.getLogger(__name__)
+
SQLBase = declarative_base()
@@ -46,3 +49,23 @@
# The value used for sorting, segmenting, or api query paging,
# because uids aren't always sequential.
indexable = AbstractAttribute('Column(<indexable_type>, index=True)')
+
+
+def initialize(db_engine, drop_tables=False, **kwargs):
+ """Default database initialization function for a lister.
+
+ Typically called from the lister's initialization hook.
+
+ Args:
+ **kwargs: extra keyword arguments; accepted but not used.
+ db_engine (): the SQLAlchemy DB engine.
+ drop_tables (bool): if True, tables will be dropped before
+ (re)creating them.
+ """
+
+ if drop_tables:
+ logger.info('Dropping tables')
+ SQLBase.metadata.drop_all(db_engine, checkfirst=True)
+
+ logger.info('Creating tables')
+ SQLBase.metadata.create_all(db_engine, checkfirst=True)
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -1,19 +1 @@
-import pytest
from swh.scheduler.tests.conftest import * # noqa
-
-
-@pytest.fixture(scope='session')
-def celery_includes():
- return [
- 'swh.lister.bitbucket.tasks',
- 'swh.lister.cgit.tasks',
- 'swh.lister.cran.tasks',
- 'swh.lister.debian.tasks',
- 'swh.lister.github.tasks',
- 'swh.lister.gitlab.tasks',
- 'swh.lister.gnu.tasks',
- 'swh.lister.npm.tasks',
- 'swh.lister.packagist.tasks',
- 'swh.lister.phabricator.tasks',
- 'swh.lister.pypi.tasks',
- ]
diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py
--- a/swh/lister/cran/__init__.py
+++ b/swh/lister/cran/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import CRANModel
+ from .lister import CRANLister
+
+ return {'models': [CRANModel],
+ 'lister': CRANLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py
--- a/swh/lister/debian/__init__.py
+++ b/swh/lister/debian/__init__.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def debian_init(db_engine, override_conf=None):
+ from swh.storage.schemata.distribution import (
+ Distribution, Area)
+ from .lister import DebianLister
+
+ lister = DebianLister(override_config=override_conf)
+
+ if not lister.db_session\
+ .query(Distribution)\
+ .filter(Distribution.name == 'Debian')\
+ .one_or_none():
+
+ d = Distribution(
+ name='Debian',
+ type='deb',
+ mirror_uri='http://deb.debian.org/debian/')
+ lister.db_session.add(d)
+
+ areas = []
+ for distribution_name in ['stretch', 'buster']:
+ for area_name in ['main', 'contrib', 'non-free']:
+ areas.append(Area(
+ name='%s/%s' % (distribution_name, area_name),
+ distribution=d,
+ ))
+ lister.db_session.add_all(areas)
+ lister.db_session.commit()
+
+
+def register():
+ from .lister import DebianLister
+ return {'models': [DebianLister.MODEL],
+ 'lister': DebianLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ 'init': debian_init}
diff --git a/swh/lister/github/__init__.py b/swh/lister/github/__init__.py
--- a/swh/lister/github/__init__.py
+++ b/swh/lister/github/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GitHubModel
+ from .lister import GitHubLister
+
+ return {'models': [GitHubModel],
+ 'lister': GitHubLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py
--- a/swh/lister/gitlab/__init__.py
+++ b/swh/lister/gitlab/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GitLabModel
+ from .lister import GitLabLister
+
+ return {'models': [GitLabModel],
+ 'lister': GitLabLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
--- a/swh/lister/gnu/__init__.py
+++ b/swh/lister/gnu/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GNUModel
+ from .lister import GNULister
+
+ return {'models': [GNUModel],
+ 'lister': GNULister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/npm/__init__.py b/swh/lister/npm/__init__.py
--- a/swh/lister/npm/__init__.py
+++ b/swh/lister/npm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import NpmVisitModel, NpmModel
+ from .lister import NpmLister
+
+ return {'models': [NpmVisitModel, NpmModel],
+ 'lister': NpmLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py
--- a/swh/lister/packagist/__init__.py
+++ b/swh/lister/packagist/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PackagistModel
+ from .lister import PackagistLister
+
+ return {'models': [PackagistModel],
+ 'lister': PackagistLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/phabricator/__init__.py b/swh/lister/phabricator/__init__.py
--- a/swh/lister/phabricator/__init__.py
+++ b/swh/lister/phabricator/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PhabricatorModel
+ from .lister import PhabricatorLister
+
+ return {'models': [PhabricatorModel],
+ 'lister': PhabricatorLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py
--- a/swh/lister/pypi/__init__.py
+++ b/swh/lister/pypi/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PyPIModel
+ from .lister import PyPILister
+
+ return {'models': [PyPIModel],
+ 'lister': PyPILister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -6,7 +6,7 @@
import pytest
from swh.lister.core.lister_base import ListerBase
-from swh.lister.cli import get_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS
+from swh.lister.cli import get_lister, SUPPORTED_LISTERS
from .test_utils import init_db
@@ -24,32 +24,9 @@
"""
db_url = init_db().url()
- supported_listers_with_init = {'npm', 'debian'}
- supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init
- for lister_name in supported_listers:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
-
+ for lister_name in SUPPORTED_LISTERS:
+ lst = get_lister(lister_name, db_url)
assert isinstance(lst, ListerBase)
- assert drop_fn is None
- assert init_fn is not None
- assert insert_data_fn is None
-
- for lister_name in supported_listers_with_init:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
-
- assert isinstance(lst, ListerBase)
- assert drop_fn is None
- assert init_fn is not None
- assert insert_data_fn is not None
-
- for lister_name in supported_listers_with_init:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url,
- drop_tables=True)
-
- assert isinstance(lst, ListerBase)
- assert drop_fn is not None
- assert init_fn is not None
- assert insert_data_fn is not None
def test_get_lister_override():
@@ -67,9 +44,9 @@
# check the override ends up defined in the lister
for lister_name, (url_key, url_value) in listers.items():
- lst, drop_fn, init_fn, insert_data_fn = get_lister(
+ lst = get_lister(
lister_name, db_url, **{
- 'api_baseurl': url_value,
+ url_key: url_value,
'priority': 'high',
'policy': 'oneshot',
})
@@ -81,14 +58,9 @@
# check the default urls are used and not the override (since it's not
# passed)
for lister_name, (url_key, url_value) in listers.items():
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
+ lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert url_key not in lst.config
-
- # then the default base url is used
- default_url = DEFAULT_BASEURLS[lister_name]
-
- assert getattr(lst, url_key) == default_url
assert 'priority' not in lst.config
assert 'oneshot' not in lst.config
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Jul 3 2025, 9:44 AM (5 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225124
Attached To
D1504: implement listers as plugins
Event Timeline
Log In to Comment