Page MenuHomeSoftware Heritage

D1504.diff
No OneTemporary

D1504.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,3 @@
swh.core
-swh.storage >= 0.0.122
-swh.storage[schemata]
-swh.scheduler >= 0.0.39
+swh.storage[schemata] >= 0.0.122
+swh.scheduler >= 0.0.58
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,4 @@
-pytest<4
+pytest
pytest-postgresql
requests_mock
testing.postgresql
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -55,6 +55,18 @@
swh-lister=swh.lister.cli:cli
[swh.cli.subcommands]
lister=swh.lister.cli:lister
+ [swh.workers]
+ lister.bitbucket=swh.lister.bitbucket:register
+ lister.cgit=swh.lister.cgit:register
+ lister.cran=swh.lister.cran:register
+ lister.debian=swh.lister.debian:register
+ lister.github=swh.lister.github:register
+ lister.gitlab=swh.lister.gitlab:register
+ lister.gnu=swh.lister.gnu:register
+ lister.npm=swh.lister.npm:register
+ lister.packagist=swh.lister.packagist:register
+ lister.phabricator=swh.lister.phabricator:register
+ lister.pypi=swh.lister.pypi:register
''',
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/bitbucket/__init__.py b/swh/lister/bitbucket/__init__.py
--- a/swh/lister/bitbucket/__init__.py
+++ b/swh/lister/bitbucket/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import BitBucketModel
+ from .lister import BitBucketLister
+
+ return {'models': [BitBucketModel],
+ 'lister': BitBucketLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
--- a/swh/lister/cgit/__init__.py
+++ b/swh/lister/cgit/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import CGitModel
+ from .lister import CGitLister
+
+ return {'models': [CGitModel],
+ 'lister': CGitLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -4,191 +4,103 @@
# See top-level LICENSE file for more information
import logging
+import pkg_resources
+from copy import deepcopy
+
import click
+from sqlalchemy import create_engine
from swh.core.cli import CONTEXT_SETTINGS
+from swh.lister.core.models import initialize
logger = logging.getLogger(__name__)
-SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist']
-
-
-# Base urls for most listers
-DEFAULT_BASEURLS = {
- 'gitlab': 'https://gitlab.com/api/v4/',
- 'phabricator': 'https://forge.softwareheritage.org',
-}
+LISTERS = {entry_point.name.split('.', 1)[1]: entry_point
+ for entry_point in pkg_resources.iter_entry_points('swh.workers')
+ if entry_point.name.split('.', 1)[0] == 'lister'}
+SUPPORTED_LISTERS = list(LISTERS)
-def get_lister(lister_name, db_url, drop_tables=False, **conf):
+def get_lister(lister_name, db_url=None, **conf):
"""Instantiate a lister given its name.
Args:
lister_name (str): Lister's name
- db_url (str): Db's service url access
- conf (dict): Extra configuration (policy, priority for example)
+ conf (dict): Configuration dict (lister db cnx, policy, priority...)
Returns:
Tuple (instantiated lister, drop_tables function, init schema function,
insert minimum data function)
"""
- override_conf = {
- 'lister': {
- 'cls': 'local',
- 'args': {'db': db_url}
- },
- **conf,
- }
-
- # To allow api_baseurl override per lister
- if 'api_baseurl' in override_conf:
- api_baseurl = override_conf.pop('api_baseurl')
- else:
- api_baseurl = DEFAULT_BASEURLS.get(lister_name)
-
- insert_minimum_data_fn = None
- if lister_name == 'github':
- from .github.models import IndexingModelBase as ModelBase
- from .github.lister import GitHubLister
-
- _lister = GitHubLister(api_baseurl='https://api.github.com',
- override_config=override_conf)
- elif lister_name == 'bitbucket':
- from .bitbucket.models import IndexingModelBase as ModelBase
- from .bitbucket.lister import BitBucketLister
- _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0',
- override_config=override_conf)
-
- elif lister_name == 'gitlab':
- from .gitlab.models import ModelBase
- from .gitlab.lister import GitLabLister
- _lister = GitLabLister(api_baseurl=api_baseurl,
- override_config=override_conf)
- elif lister_name == 'debian':
- from .debian.lister import DebianLister
- ModelBase = DebianLister.MODEL # noqa
- _lister = DebianLister(override_config=override_conf)
-
- def insert_minimum_data_fn(lister_name, lister):
- logger.info('Inserting minimal data for %s', lister_name)
- from swh.storage.schemata.distribution import (
- Distribution, Area)
- d = Distribution(
- name='Debian',
- type='deb',
- mirror_uri='http://deb.debian.org/debian/')
- lister.db_session.add(d)
-
- areas = []
- for distribution_name in ['stretch']:
- for area_name in ['main', 'contrib', 'non-free']:
- areas.append(Area(
- name='%s/%s' % (distribution_name, area_name),
- distribution=d,
- ))
- lister.db_session.add_all(areas)
- lister.db_session.commit()
-
- elif lister_name == 'pypi':
- from .pypi.models import ModelBase
- from .pypi.lister import PyPILister
- _lister = PyPILister(override_config=override_conf)
-
- elif lister_name == 'npm':
- from .npm.models import IndexingModelBase as ModelBase
- from .npm.models import NpmVisitModel
- from .npm.lister import NpmLister
- _lister = NpmLister(override_config=override_conf)
-
- def insert_minimum_data_fn(lister_name, lister):
- logger.info('Inserting minimal data for %s', lister_name)
- if drop_tables:
- NpmVisitModel.metadata.drop_all(lister.db_engine)
- NpmVisitModel.metadata.create_all(lister.db_engine)
-
- elif lister_name == 'phabricator':
- from .phabricator.models import IndexingModelBase as ModelBase
- from .phabricator.lister import PhabricatorLister
- _lister = PhabricatorLister(api_baseurl=api_baseurl,
- override_config=override_conf)
-
- elif lister_name == 'gnu':
- from .gnu.models import ModelBase
- from .gnu.lister import GNULister
- _lister = GNULister(override_config=override_conf)
-
- elif lister_name == 'cran':
- from .cran.models import ModelBase
- from .cran.lister import CRANLister
- _lister = CRANLister(override_config=override_conf)
-
- elif lister_name == 'cgit':
- from .cgit.models import ModelBase
- from .cgit.lister import CGitLister
- _lister = CGitLister(url=api_baseurl,
- override_config=override_conf)
-
- elif lister_name == 'packagist':
- from .packagist.models import ModelBase # noqa
- from .packagist.lister import PackagistLister
- _lister = PackagistLister(override_config=override_conf)
-
- else:
+ if lister_name not in LISTERS:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
(lister_name, SUPPORTED_LISTERS))
-
- drop_table_fn = None
- if drop_tables:
- def drop_table_fn(lister_name, lister):
- logger.info('Dropping tables for %s', lister_name)
- ModelBase.metadata.drop_all(lister.db_engine)
-
- def init_schema_fn(lister_name, lister):
- logger.info('Creating tables for %s', lister_name)
- ModelBase.metadata.create_all(lister.db_engine)
-
- return _lister, drop_table_fn, init_schema_fn, insert_minimum_data_fn
+ if db_url:
+ conf['lister'] = {'cls': 'local', 'args': {'db': db_url}}
+ # Load the lister's registry entry and instantiate its lister class
+ registry_entry = LISTERS[lister_name].load()()
+ lister_cls = registry_entry['lister']
+ lister = lister_cls(override_config=conf)
+ return lister
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
+@click.option('--config-file', '-C', default=None,
+ type=click.Path(exists=True, dir_okay=False,),
+ help="Configuration file.")
+@click.option('--db-url', '-d', default=None,
+ help='SQLAlchemy DB URL; see '
+ '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.pass_context
-def lister(ctx):
+def lister(ctx, config_file, db_url):
'''Software Heritage Lister tools.'''
- pass
+ from swh.core import config
+ ctx.ensure_object(dict)
+
+ override_conf = {}
+ if db_url:
+ override_conf['lister'] = {
+ 'cls': 'local',
+ 'args': {'db': db_url}
+ }
+ conf = config.read(config_file, override_conf)
+ ctx.obj['config'] = conf
+ ctx.obj['override_conf'] = override_conf
@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
-@click.option('--db-url', '-d', default='postgres:///lister',
- help='SQLAlchemy DB URL; see '
- '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
-@click.argument('listers', required=1, nargs=-1,
- type=click.Choice(SUPPORTED_LISTERS + ['all']))
@click.option('--drop-tables', '-D', is_flag=True, default=False,
help='Drop tables before creating the database schema')
@click.pass_context
-def cli(ctx, db_url, listers, drop_tables):
+def db_init(ctx, drop_tables):
"""Initialize the database model for given listers.
"""
- if 'all' in listers:
- listers = SUPPORTED_LISTERS
- for lister_name in listers:
- logger.info('Initializing lister %s', lister_name)
- lister, drop_schema_fn, init_schema_fn, insert_minimum_data_fn = \
- get_lister(lister_name, db_url, drop_tables=drop_tables)
+ cfg = ctx.obj['config']
+ lister_cfg = cfg['lister']
+ if lister_cfg['cls'] != 'local':
+ click.echo('A local lister configuration is required')
+ ctx.exit(1)
+
+ db_url = lister_cfg['args']['db']
+ db_engine = create_engine(db_url)
- if drop_schema_fn:
- drop_schema_fn(lister_name, lister)
+ for lister, entrypoint in LISTERS.items():
+ logger.info('Loading lister %s', lister)
+ registry_entry = entrypoint.load()()
- init_schema_fn(lister_name, lister)
+ logger.info('Initializing database')
+ initialize(db_engine, drop_tables)
- if insert_minimum_data_fn:
- insert_minimum_data_fn(lister_name, lister)
+ for lister, entrypoint in LISTERS.items():
+ init_hook = entrypoint.load()().get('init')  # this lister's own entry
+ if callable(init_hook):
+ logger.info('Calling init hook for %s', lister)
+ init_hook(db_engine)
@lister.command(name='run', context_settings=CONTEXT_SETTINGS,
@@ -196,9 +108,6 @@
'instance. The output of this listing results in '
'"oneshot" tasks in the scheduler db with a priority '
'defined by the user')
-@click.option('--db-url', '-d', default='postgres:///lister',
- help='SQLAlchemy DB URL; see '
- '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.option('--lister', '-l', help='Lister to run',
type=click.Choice(SUPPORTED_LISTERS))
@click.option('--priority', '-p', default='high',
@@ -206,23 +115,19 @@
help='Task priority for the listed repositories to ingest')
@click.argument('options', nargs=-1)
@click.pass_context
-def run(ctx, db_url, lister, priority, options):
+def run(ctx, lister, priority, options):
from swh.scheduler.cli.utils import parse_options
+ config = deepcopy(ctx.obj['config'])
+
if options:
- _, kwargs = parse_options(options)
- else:
- kwargs = {}
+ config.update(parse_options(options)[1])
- override_config = {
- 'priority': priority,
- 'policy': 'oneshot',
- **kwargs,
- }
+ config['priority'] = priority
+ config['policy'] = 'oneshot'
- lister, _, _, _ = get_lister(lister, db_url, **override_config)
- lister.run()
+ get_lister(lister, **config).run()
if __name__ == '__main__':
- cli()
+ lister()
diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py
--- a/swh/lister/core/models.py
+++ b/swh/lister/core/models.py
@@ -4,12 +4,15 @@
import abc
from datetime import datetime
+import logging
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from .abstractattribute import AbstractAttribute
+logger = logging.getLogger(__name__)
+
SQLBase = declarative_base()
@@ -46,3 +49,23 @@
# The value used for sorting, segmenting, or api query paging,
# because uids aren't always sequential.
indexable = AbstractAttribute('Column(<indexable_type>, index=True)')
+
+
+def initialize(db_engine, drop_tables=False, **kwargs):
+ """Default database initialization function for a lister.
+
+ Typically called from the lister's initialization hook.
+
+ Args:
+ db_engine (): the SQLAlchemy DB engine.
+ drop_tables (bool): if True, tables will be dropped before
+ (re)creating them.
+ **kwargs: accepted but not used by this function.
+ """
+
+ if drop_tables:
+ logger.info('Dropping tables')
+ SQLBase.metadata.drop_all(db_engine, checkfirst=True)
+
+ logger.info('Creating tables')
+ SQLBase.metadata.create_all(db_engine, checkfirst=True)
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -1,19 +1 @@
-import pytest
from swh.scheduler.tests.conftest import * # noqa
-
-
-@pytest.fixture(scope='session')
-def celery_includes():
- return [
- 'swh.lister.bitbucket.tasks',
- 'swh.lister.cgit.tasks',
- 'swh.lister.cran.tasks',
- 'swh.lister.debian.tasks',
- 'swh.lister.github.tasks',
- 'swh.lister.gitlab.tasks',
- 'swh.lister.gnu.tasks',
- 'swh.lister.npm.tasks',
- 'swh.lister.packagist.tasks',
- 'swh.lister.phabricator.tasks',
- 'swh.lister.pypi.tasks',
- ]
diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py
--- a/swh/lister/cran/__init__.py
+++ b/swh/lister/cran/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import CRANModel
+ from .lister import CRANLister
+
+ return {'models': [CRANModel],
+ 'lister': CRANLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py
--- a/swh/lister/debian/__init__.py
+++ b/swh/lister/debian/__init__.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def debian_init(db_engine, override_conf=None):
+ from swh.storage.schemata.distribution import (
+ Distribution, Area)
+ from .lister import DebianLister
+
+ lister = DebianLister(override_config=override_conf)
+
+ if not lister.db_session\
+ .query(Distribution)\
+ .filter(Distribution.name == 'Debian')\
+ .one_or_none():
+
+ d = Distribution(
+ name='Debian',
+ type='deb',
+ mirror_uri='http://deb.debian.org/debian/')
+ lister.db_session.add(d)
+
+ areas = []
+ for distribution_name in ['stretch', 'buster']:
+ for area_name in ['main', 'contrib', 'non-free']:
+ areas.append(Area(
+ name='%s/%s' % (distribution_name, area_name),
+ distribution=d,
+ ))
+ lister.db_session.add_all(areas)
+ lister.db_session.commit()
+
+
+def register():
+ from .lister import DebianLister
+ return {'models': [DebianLister.MODEL],
+ 'lister': DebianLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ 'init': debian_init}
diff --git a/swh/lister/github/__init__.py b/swh/lister/github/__init__.py
--- a/swh/lister/github/__init__.py
+++ b/swh/lister/github/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GitHubModel
+ from .lister import GitHubLister
+
+ return {'models': [GitHubModel],
+ 'lister': GitHubLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py
--- a/swh/lister/gitlab/__init__.py
+++ b/swh/lister/gitlab/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GitLabModel
+ from .lister import GitLabLister
+
+ return {'models': [GitLabModel],
+ 'lister': GitLabLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
--- a/swh/lister/gnu/__init__.py
+++ b/swh/lister/gnu/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import GNUModel
+ from .lister import GNULister
+
+ return {'models': [GNUModel],
+ 'lister': GNULister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/npm/__init__.py b/swh/lister/npm/__init__.py
--- a/swh/lister/npm/__init__.py
+++ b/swh/lister/npm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import NpmVisitModel, NpmModel
+ from .lister import NpmLister
+
+ return {'models': [NpmVisitModel, NpmModel],
+ 'lister': NpmLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py
--- a/swh/lister/packagist/__init__.py
+++ b/swh/lister/packagist/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PackagistModel
+ from .lister import PackagistLister
+
+ return {'models': [PackagistModel],
+ 'lister': PackagistLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/phabricator/__init__.py b/swh/lister/phabricator/__init__.py
--- a/swh/lister/phabricator/__init__.py
+++ b/swh/lister/phabricator/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PhabricatorModel
+ from .lister import PhabricatorLister
+
+ return {'models': [PhabricatorModel],
+ 'lister': PhabricatorLister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py
--- a/swh/lister/pypi/__init__.py
+++ b/swh/lister/pypi/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .models import PyPIModel
+ from .lister import PyPILister
+
+ return {'models': [PyPIModel],
+ 'lister': PyPILister,
+ 'task_modules': ['%s.tasks' % __name__],
+ }
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -6,7 +6,7 @@
import pytest
from swh.lister.core.lister_base import ListerBase
-from swh.lister.cli import get_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS
+from swh.lister.cli import get_lister, SUPPORTED_LISTERS
from .test_utils import init_db
@@ -24,32 +24,9 @@
"""
db_url = init_db().url()
- supported_listers_with_init = {'npm', 'debian'}
- supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init
- for lister_name in supported_listers:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
-
+ for lister_name in SUPPORTED_LISTERS:
+ lst = get_lister(lister_name, db_url)
assert isinstance(lst, ListerBase)
- assert drop_fn is None
- assert init_fn is not None
- assert insert_data_fn is None
-
- for lister_name in supported_listers_with_init:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
-
- assert isinstance(lst, ListerBase)
- assert drop_fn is None
- assert init_fn is not None
- assert insert_data_fn is not None
-
- for lister_name in supported_listers_with_init:
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url,
- drop_tables=True)
-
- assert isinstance(lst, ListerBase)
- assert drop_fn is not None
- assert init_fn is not None
- assert insert_data_fn is not None
def test_get_lister_override():
@@ -67,9 +44,9 @@
# check the override ends up defined in the lister
for lister_name, (url_key, url_value) in listers.items():
- lst, drop_fn, init_fn, insert_data_fn = get_lister(
+ lst = get_lister(
lister_name, db_url, **{
- 'api_baseurl': url_value,
+ url_key: url_value,
'priority': 'high',
'policy': 'oneshot',
})
@@ -81,14 +58,9 @@
# check the default urls are used and not the override (since it's not
# passed)
for lister_name, (url_key, url_value) in listers.items():
- lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
+ lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert url_key not in lst.config
-
- # then the default base url is used
- default_url = DEFAULT_BASEURLS[lister_name]
-
- assert getattr(lst, url_key) == default_url
assert 'priority' not in lst.config
assert 'oneshot' not in lst.config

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 9:44 AM (5 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225124

Event Timeline