diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@
 
 ``` sh
 $ createdb lister-github.com
-$ bin/ghlister --db-url postgres:///lister-github.com createdb
+$ python3 -m swh.lister.cli --db-url postgres:///lister-github.com github --createdb
 ```
 
 Configuration file samples
diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py
--- a/swh/lister/bitbucket/tasks.py
+++ b/swh/lister/bitbucket/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017 the Software Heritage developers
+# Copyright (C) 2017-2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -10,7 +10,7 @@
 
 
 class BitBucketListerTask(ListerTaskBase):
-    def new_lister(self):
+    def new_lister(self, *args, **kwargs):
         return BitBucketLister(lister_name='bitbucket.com',
                                api_baseurl='https://api.bitbucket.org/2.0')
 
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cli.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+
+
+CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
+
+
+@click.group(context_settings=CONTEXT_SETTINGS)
+@click.option(
+    '--db-url', '-d', default=None,
+    help='SQLAlchemy DB URL (default: postgres:///lister-<instance>); see '
+         '<https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>')  # noqa
+@click.pass_context
+def cli(ctx, db_url):
+    """Initialize db model according to lister.
+
+    """
+    config = {}
+    if db_url:
+        config['db_url'] = db_url
+    ctx.obj = config
+
+
+def _manage_db(ctx, lister_class, models, lister_name, api_baseurl,
+               createdb, dropdb):
+    """Drop and/or create the tables of one lister's db model.
+
+    The db url comes from --db-url when given; otherwise it defaults
+    to the database named after the lister instance, so that one
+    lister's command never silently targets another lister's database.
+    The drop is executed before the create, so passing both flags
+    drops the tables and then creates them again.
+
+    """
+    db_url = ctx.obj.get('db_url') or 'postgres:///lister-%s' % lister_name
+    lister = lister_class(lister_name=lister_name,
+                          api_baseurl=api_baseurl,
+                          override_config={'lister_db_url': db_url})
+
+    if dropdb:
+        models.ModelBase.metadata.drop_all(lister.db_engine)
+
+    if createdb:
+        models.ModelBase.metadata.create_all(lister.db_engine)
+
+
+@cli.command('github')
+@click.option('--createdb', is_flag=True, default=False,
+              help='create db')
+@click.option('--dropdb', is_flag=True, default=False,
+              help='Drop db')
+@click.pass_context
+def github(ctx, createdb, dropdb):
+    from .github import models
+    from .github.lister import GitHubLister
+
+    _manage_db(ctx, GitHubLister, models, 'github.com',
+               'https://api.github.com', createdb, dropdb)
+
+
+@cli.command('gitlab')
+@click.option('--createdb', is_flag=True, default=False,
+              help='create db')
+@click.option('--dropdb', is_flag=True, default=False,
+              help='Drop db')
+@click.pass_context
+def gitlab(ctx, createdb, dropdb):
+    from .gitlab import models
+    from .gitlab.lister import GitLabLister
+
+    _manage_db(ctx, GitLabLister, models, 'gitlab.com',
+               'https://gitlab.com/api/v4/', createdb, dropdb)
+
+
+@cli.command('bitbucket')
+@click.option('--createdb', is_flag=True, default=False,
+              help='create db')
+@click.option('--dropdb', is_flag=True, default=False,
+              help='Drop db')
+@click.pass_context
+def bitbucket(ctx, createdb, dropdb):
+    from .bitbucket import models
+    from .bitbucket.lister import BitBucketLister
+
+    _manage_db(ctx, BitBucketLister, models, 'bitbucket.com',
+               'https://api.bitbucket.org/2.0', createdb, dropdb)
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py
--- a/swh/lister/core/models.py
+++ b/swh/lister/core/models.py
@@ -42,10 +42,13 @@
 
     def __init__(self, uid=None, indexable=None, name=None, full_name=None,
                  html_url=None, origin_url=None, origin_type=None,
-                 description=None, task_id=None, origin_id=None):
+                 description=None, task_id=None, origin_id=None,
+                 instance=None):
         self.uid = uid
         self.last_seen = datetime.now()
 
+        if instance is not None:
+            self.instance = instance
         if indexable is not None:
             self.indexable = indexable
         if name is not None:
diff --git a/swh/lister/core/tasks.py b/swh/lister/core/tasks.py
--- a/swh/lister/core/tasks.py
+++ b/swh/lister/core/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017 the Software Heritage developers
+# Copyright (C) 2017-2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -39,34 +39,35 @@
     task_queue = AbstractAttribute('Celery Task queue name')
 
     @abc.abstractmethod
-    def new_lister(self):
+    def new_lister(self, *args, **kwargs):
         """Return a new lister of the appropriate type.
         """
         pass
 
     @abc.abstractmethod
-    def run_task(self):
+    def run_task(self, *args, **kwargs):
         pass
 
 
 class IndexingDiscoveryListerTask(ListerTaskBase):
-    def run_task(self):
-        lister = self.new_lister()
+    def run_task(self, *args, **kwargs):
+        lister = self.new_lister(*args, **kwargs)
         return lister.run(min_index=lister.db_last_index(), max_index=None)
 
 
 class IndexingRangeListerTask(ListerTaskBase):
-    def run_task(self, start, end):
-        lister = self.new_lister()
+    def run_task(self, start, end, *args, **kwargs):
+        lister = self.new_lister(*args, **kwargs)
         return lister.run(min_index=start, max_index=end)
 
 
 class IndexingRefreshListerTask(ListerTaskBase):
     GROUP_SPLIT = 10000
 
-    def run_task(self):
-        lister = self.new_lister()
+    def run_task(self, *args, **kwargs):
+        lister = self.new_lister(*args, **kwargs)
         ranges = lister.db_partition_indices(self.GROUP_SPLIT)
         random.shuffle(ranges)
         range_task = IndexingRangeListerTask()
-        group(range_task.s(minv, maxv) for minv, maxv in ranges)()
+        group(range_task.s(minv, maxv, *args, **kwargs)
+              for minv, maxv in ranges)()
diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py
--- a/swh/lister/debian/tasks.py
+++ b/swh/lister/debian/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017 the Software Heritage developers
+# Copyright (C) 2017-2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -10,9 +10,9 @@
 class DebianListerTask(ListerTaskBase):
     task_queue = 'swh_lister_debian'
 
-    def new_lister(self):
+    def new_lister(self, *args, **kwargs):
         return DebianLister()
 
-    def run_task(self, distribution):
-        lister = self.new_lister()
+    def run_task(self, distribution, *args, **kwargs):
+        lister = self.new_lister(*args, **kwargs)
         return lister.run(distribution)
diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py
--- a/swh/lister/github/tasks.py
+++ b/swh/lister/github/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017 the Software Heritage developers
+# Copyright (C) 2017-2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -10,7 +10,7 @@
 
 
 class GitHubListerTask(ListerTaskBase):
-    def new_lister(self):
+    def new_lister(self, *args, **kwargs):
         return GitHubLister(lister_name='github.com',
                             api_baseurl='https://api.github.com')
 
diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py
new file mode 100644
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gitlab/lister.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import re
+import time
+
+from ..core.indexing_lister import SWHIndexingHttpLister
+from .models import GitLabModel
+
+
+class GitLabLister(SWHIndexingHttpLister):
+    # Path to give and mentioning the last id for the next page
+    PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
+    # gitlab api do not have an indexable identifier so using the page
+    # id. 'page' may be the first query parameter (as in PATH_TEMPLATE)
+    # or a later one, hence [?&].
+    API_URL_INDEX_RE = re.compile(r'^.*/projects.*[?&]page=(\d+).*')
+    # The indexable field, the one we are supposed to use in the api
+    # query is not part of the lookup query. So, we cannot filter
+    # (method filter_before_inject), nor detect and disable origins
+    # (method disable_deleted_repo_tasks)
+    MODEL = GitLabModel
+
+    @property
+    def ADDITIONAL_CONFIG(self):  # noqa: N802
+        return {
+            'lister_db_url':
+                ('str', 'postgresql:///lister-%s' % self.lister_name),
+            'credentials':  # credentials is a dict
+                ('dict', {}),
+            'cache_responses':
+                ('bool', False),
+            'cache_dir':
+                ('str', '~/.cache/swh/lister/%s' % self.lister_name),
+        }
+
+    def request_params(self, identifier):
+        """Get the full parameters passed to requests given the
+        transport_request identifier.
+
+        For the gitlab lister, the 'credentials' entries is configured
+        per instance. For example:
+
+        - credentials:
+          - gitlab.com:
+            - username: user0
+              password: <password>
+            - username: user1
+              password: <password>
+            - ...
+          - other-gitlab-instance:
+            ...
+
+        An instance without any configured credentials is queried
+        anonymously.
+
+        """
+        params = {
+            'headers': self.request_headers() or {}
+        }
+        # Retrieve the credentials per instance
+        creds = self.config['credentials'] or {}
+        creds_lister = creds.get(self.lister_name)
+        if creds_lister:
+            auth = random.choice(creds_lister)
+            params['auth'] = (auth['username'], auth['password'])
+        return params
+
+    def filter_before_inject(self, models_list):
+        """We cannot filter so returns the models_list as is.
+
+        """
+        return models_list
+
+    def get_model_from_repo(self, repo):
+        return {
+            'instance': self.lister_name,
+            'uid': repo['id'],
+            'indexable': repo['id'],
+            'name': repo['name'],
+            'full_name': repo['path_with_namespace'],
+            'html_url': repo['web_url'],
+            'origin_url': repo['http_url_to_repo'],
+            'origin_type': 'git',
+            'description': repo['description'],
+        }
+
+    def transport_quota_check(self, response):
+        """Deal with rate limit
+
+        Returns (True, delay in seconds) when the quota is exhausted,
+        (False, 0) otherwise.
+
+        """
+        # TODO: need to dig further about the actual returned code
+        # (not seen yet in documentation)
+        if response.status_code != 403:
+            return False, 0
+        # The rate limit headers may be missing (e.g. a 403 unrelated
+        # to the quota), so do not require them.
+        reqs_remaining = response.headers.get('RateLimit-Remaining')
+        if reqs_remaining is None or int(reqs_remaining) != 0:
+            return False, 0
+        reset_at = int(response.headers['RateLimit-Reset'])
+        # Clamp to [0, 3600]: the reset time may already be in the past
+        delay = min(max(reset_at - time.time(), 0), 3600)
+        return True, delay
+
+    def get_next_target_from_response(self, response):
+        """Deal with pagination
+
+        """
+        if 'next' in response.links:
+            next_url = response.links['next']['url']
+            return int(self.API_URL_INDEX_RE.match(next_url).group(1))
+        return None
+
+    def transport_response_simplified(self, response):
+        repos = response.json()
+        return [self.get_model_from_repo(repo) for repo in repos]
diff --git a/swh/lister/gitlab/models.py b/swh/lister/gitlab/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gitlab/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, Integer, String
+
+from ..core.models import ModelBase
+
+
+class GitLabModel(ModelBase):
+    """a Gitlab repository"""
+    __tablename__ = 'gitlab_repo'
+
+    id = Column(Integer, primary_key=True)
+    uid = Column(Integer, index=True)
+    instance = Column(String, index=True)
+    indexable = Column(Integer, index=True)
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gitlab/tasks.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
+                                   IndexingRangeListerTask,
+                                   IndexingRefreshListerTask, ListerTaskBase)
+
+from .lister import GitLabLister
+
+
+class GitLabDotComListerTask(ListerTaskBase):
+    def new_lister(self, lister_name='gitlab.com',
+                   api_baseurl='https://gitlab.com/api/v4'):
+        return GitLabLister(
+            lister_name=lister_name, api_baseurl=api_baseurl)
+
+
+class IncrementalGitLabDotComLister(GitLabDotComListerTask,
+                                    IndexingDiscoveryListerTask):
+    task_queue = 'swh_lister_gitlab_discover'
+
+
+class RangeGitLabLister(GitLabDotComListerTask, IndexingRangeListerTask):
+    task_queue = 'swh_lister_gitlab_refresh'
+
+
+class FullGitLabRelister(GitLabDotComListerTask, IndexingRefreshListerTask):
+    task_queue = 'swh_lister_gitlab_refresh'