diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -41,30 +41,90 @@ ## lister-github -1. git clone under $GHLISTER_ROOT (of your choosing) +### Preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. Bootstrap the db instance schema -``` sh -$ createdb lister-github.com -$ bin/ghlister --db-url postgres:///lister-github.com createdb -``` - -Configuration file samples -------------------------- - -## github + $ createdb lister-github + $ python3 -m swh.lister.cli --db-url postgres:///lister-github github --createdb -cat ~/.config/swh/lister-github.com.yml +### Configuration file sample + $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls - lister_db_url: postgres:///lister-github.com + lister_db_url: postgres:///lister-github credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/github.com + storage: # to avoid having to run yet another service + cls: local + args: + db: service=swh-dev + objstorage: + cls: pathslicing + args: + root: /home/storage/swh-storage/ + slicing: 0:1/1:5 + +### Run + + $ python3 + >>> import logging + >>> logging.basicConfig(level=logging.DEBUG) + >>> from swh.lister.github.tasks import RangeGitHubLister + >>> RangeGitHubLister().run(364, 365) + INFO:root:listing repos starting at 364 + DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com + DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None + DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost + DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 + + +## lister-gitlab + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ +3. create configuration file ~/.config/swh/lister-gitlab.yml +4. Bootstrap the db instance schema + $ createdb lister-gitlab + $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab gitlab --createdb + +### Configuration file sample + + $ cat ~/.config/swh/lister-gitlab.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-gitlab + credentials: [] + cache_responses: True + cache_dir: /home/zack/.cache/swh/lister/gitlab storage: - cls: remote + cls: local args: - url: http://localhost:5002/ + db: service=swh-dev + objstorage: + cls: pathslicing + args: + root: /home/storage/swh-storage/ + slicing: 0:1/1:5 + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + lister_name='salsa.debian.org', api_baseurl='https://salsa.debian.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + lister_name='gitlab.freedesktop.org', api_baseurl='https://gitlab.freedesktop.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + lister_name='gitlab.gnome.org', api_baseurl='https://gitlab.gnome.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + lister_name='gitlab.inria.fr', api_baseurl='https://gitlab.inria.fr/api/v4') + >>> diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -1,16 +1,16 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, - IndexingRangeListerTask, + RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import BitBucketLister class BitBucketListerTask(ListerTaskBase): - def new_lister(self): + def new_lister(self, *args, **kwargs): return BitBucketLister(lister_name='bitbucket.com', api_baseurl='https://api.bitbucket.org/2.0') @@ -20,7 +20,7 @@ task_queue = 'swh_lister_bitbucket_discover' -class RangeBitBucketLister(BitBucketListerTask, IndexingRangeListerTask): +class RangeBitBucketLister(BitBucketListerTask, RangeListerTask): task_queue = 'swh_lister_bitbucket_refresh' diff --git a/swh/lister/bitbucket/tests/test_bb_lister.py b/swh/lister/bitbucket/tests/test_bb_lister.py --- a/swh/lister/bitbucket/tests/test_bb_lister.py +++ b/swh/lister/bitbucket/tests/test_bb_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,10 +6,10 @@ import unittest from swh.lister.bitbucket.lister import BitBucketLister -from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase +from swh.lister.core.tests.test_lister import HttpListerTesterBase -class BitBucketListerTester(IndexingHttpListerTesterBase, unittest.TestCase): +class BitBucketListerTester(HttpListerTesterBase, unittest.TestCase): Lister = BitBucketLister test_re = re.compile(r'/repositories\?after=([^?&]+)') lister_subdir = 'bitbucket' diff --git a/swh/lister/cli.py b/swh/lister/cli.py new file mode 100644 --- /dev/null +++ b/swh/lister/cli.py @@ -0,0 +1,98 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click + + +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + + +@click.group(context_settings=CONTEXT_SETTINGS) +@click.option( + '--db-url', '-d', default='postgres:///lister-gitlab.com', + help='SQLAlchemy DB URL; see ' + '') # noqa +@click.pass_context +def cli(ctx, db_url): + """Initialize db model according to lister. + + """ + config = {} + if db_url: + config['db_url'] = db_url + ctx.obj = config + + +@cli.command('github') +@click.option('--createdb', is_flag=True, default=False, + help='create db') +@click.option('--dropdb', is_flag=True, default=False, + help='Drop db') +@click.pass_context +def github(ctx, createdb, dropdb): + from .github import models + from .github.lister import GitHubLister + + override_conf = {'lister_db_url': ctx.obj['db_url']} + + lister = GitHubLister(lister_name='github.com', + api_baseurl='https://api.github.com', + override_config=override_conf) + + if dropdb: + models.ModelBase.metadata.drop_all(lister.db_engine) + + if createdb: + models.ModelBase.metadata.create_all(lister.db_engine) + + +@cli.command('gitlab') +@click.option('--createdb', is_flag=True, default=False, + help='create db') +@click.option('--dropdb', is_flag=True, default=False, + help='Drop db') +@click.pass_context +def gitlab(ctx, createdb, dropdb): + from .gitlab import models + from .gitlab.lister import GitLabLister + + override_conf = {'lister_db_url': ctx.obj['db_url']} + + lister = GitLabLister(lister_name='gitlab.com', + api_baseurl='https://gitlab.com/api/v4/', + override_config=override_conf) + + if dropdb: + models.ModelBase.metadata.drop_all(lister.db_engine) + + if createdb: + models.ModelBase.metadata.create_all(lister.db_engine) + + +@cli.command('bitbucket') +@click.option('--createdb', is_flag=True, default=False, + help='create db') +@click.option('--dropdb', is_flag=True, default=False, + help='Drop db') +@click.pass_context +def bitbucket(ctx, createdb, dropdb): + from .bitbucket import models + from .bitbucket.lister import BitBucketLister + + override_conf = {'lister_db_url': ctx.obj['db_url']} + + lister = BitBucketLister(lister_name='bitbucket.com', + api_baseurl='https://api.bitbucket.org/2.0', + override_config=override_conf) + + if dropdb: + models.ModelBase.metadata.drop_all(lister.db_engine) + + if createdb: + models.ModelBase.metadata.create_all(lister.db_engine) + + +if __name__ == '__main__': + cli() diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py --- a/swh/lister/core/models.py +++ b/swh/lister/core/models.py @@ -42,10 +42,13 @@ def __init__(self, uid=None, indexable=None, name=None, full_name=None, html_url=None, origin_url=None, origin_type=None, - description=None, task_id=None, origin_id=None): + description=None, task_id=None, origin_id=None, + instance=None): self.uid = uid self.last_seen = datetime.now() + if instance is not None: + self.instance = instance if indexable is not None: self.indexable = indexable if name is not None: diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/core/paging_lister.py @@ -0,0 +1,117 @@ +# Copyright (C) 2015-2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import logging + +from .lister_transports import SWHListerHttpTransport +from .lister_base import SWHListerBase + + +class SWHPagingLister(SWHListerBase): + """Lister* intermediate class for any service that follows the simple + pagination page pattern. + + - Client sends a request to list repositories starting from a + given page identifier. + + - Client receives structured (json/xml/etc) response with + information about a sequential series of repositories (per page) + starting from a given index. And, if available, some indication + of the next page index for fetching the remaining repository + data. + + See :class:`swh.lister.core.lister_base.SWHListerBase` for more + details. + + This class cannot be instantiated. To create a new Lister for a + source code listing service that follows the model described + above, you must subclass this class. Then provide the required + overrides in addition to any unmet implementation/override + requirements of this class's base (see parent class and member + docstrings for details). + + Required Overrides:: + + def get_next_target_from_response + + """ + @abc.abstractmethod + def get_next_target_from_response(self, response): + """Find the next server endpoint page given the entire response. + + Implementation of this method depends on the server API spec + and the shape of the network response object returned by the + transport_request method. + + For example, some api can use the headers links to provide the + next page. + + Args: + response (transport response): response page from the server + + Returns: + index of next page, possibly extracted from a next href url + + """ + pass + + # You probably don't need to override anything below this line. + + def run(self, min_index=None, max_index=None): + """Main entry function. Sequentially fetches repository data from the + service according to the basic outline in the class + docstring. Continually fetching sublists until either there + is no next index reference given or the given next index is + greater than the desired max_index. + + Args: + min_index (indexable type): optional index to start from + max_index (indexable type): optional index to stop at + + Returns: + nothing + + """ + index = min_index or '' + loop_count = 0 + self.min_index = min_index + self.max_index = max_index + + while self.is_within_bounds(index, self.min_index, self.max_index): + logging.info('listing repos starting at %s' % index) + + response, injected_repos = self.ingest_data(index) + next_index = self.get_next_target_from_response(response) + + # termination condition + + if (next_index is None) or (next_index == index): + logging.info('stopping after index %s, no next link found' % + index) + break + else: + index = next_index + + loop_count += 1 + if loop_count == 20: + logging.info('flushing updates') + loop_count = 0 + self.db_session.commit() + self.db_session = self.mk_session() + + self.db_session.commit() + self.db_session = self.mk_session() + + +class SWHPagingHttpLister(SWHListerHttpTransport, SWHPagingLister): + """Convenience class for ensuring right lookup and init order when + combining SWHPagingLister and SWHListerHttpTransport. + + """ + def __init__(self, lister_name=None, api_baseurl=None, + override_config=None): + SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl) + SWHPagingLister.__init__(self, lister_name=lister_name, + override_config=override_config) diff --git a/swh/lister/core/tasks.py b/swh/lister/core/tasks.py --- a/swh/lister/core/tasks.py +++ b/swh/lister/core/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,34 +39,50 @@ task_queue = AbstractAttribute('Celery Task queue name') @abc.abstractmethod - def new_lister(self): + def new_lister(self, *args, **kwargs): """Return a new lister of the appropriate type. """ pass @abc.abstractmethod - def run_task(self): + def run_task(self, *args, **kwargs): pass -class IndexingDiscoveryListerTask(ListerTaskBase): - def run_task(self): - lister = self.new_lister() - return lister.run(min_index=lister.db_last_index(), max_index=None) +# Paging/Indexing lister tasks derivatives (cf. {gitlab}/tasks) -class IndexingRangeListerTask(ListerTaskBase): - def run_task(self, start, end): - lister = self.new_lister() +class RangeListerTask(ListerTaskBase): + """Range indexing lister task. + + """ + def run_task(self, start, end, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) return lister.run(min_index=start, max_index=end) +# Indexing Lister tasks derivatives (cf. {github/bitbucket}/tasks) + + +class IndexingDiscoveryListerTask(ListerTaskBase): + """Incremental indexing lister task. + + """ + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + return lister.run(min_index=lister.db_last_index(), max_index=None) + + class IndexingRefreshListerTask(ListerTaskBase): + """Full indexing lister task. + + """ GROUP_SPLIT = 10000 - def run_task(self): - lister = self.new_lister() + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) ranges = lister.db_partition_indices(self.GROUP_SPLIT) random.shuffle(ranges) - range_task = IndexingRangeListerTask() - group(range_task.s(minv, maxv) for minv, maxv in ranges)() + range_task = RangeListerTask() + group(range_task.s(minv, maxv, *args, **kwargs) + for minv, maxv in ranges)() diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -20,12 +20,15 @@ @requests_mock.Mocker() -class IndexingHttpListerTesterBase(abc.ABC): +class HttpListerTesterBase(abc.ABC): """Base testing class for subclasses of - swh.lister.core.indexing_lister.SWHIndexingHttpLister. - See swh.lister.github.tests.test_gh_lister for an example of how to - customize for a specific listing service. + swh.lister.core.indexing_lister.SWHIndexingHttpLister. + swh.lister.core.paging_lister.SWHPagingHttpLister + + See swh.lister.github.tests.test_gh_lister for an example of how + to customize for a specific listing service. + """ Lister = AbstractAttribute('The lister class to test') test_re = AbstractAttribute('Compiled regex matching the server url. Must' @@ -56,7 +59,7 @@ self.response = None self.fl = None self.helper = None - if self.__class__ != IndexingHttpListerTesterBase: + if self.__class__ != HttpListerTesterBase: self.run = TestCase.run.__get__(self, self.__class__) else: self.run = noop @@ -99,6 +102,9 @@ return self.mock_limit_n_response(2, request, context) def get_fl(self, override_config=None): + """Retrieve an instance of fake lister (fl). + + """ if override_config or self.fl is None: with patch( 'swh.scheduler.backend.SchedulerBackend.reconnect', noop @@ -164,7 +170,7 @@ self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: - if k not in ['last_seen', 'task_id', 'origin_id']: + if k not in ['last_seen', 'task_id', 'origin_id', 'id']: self.assertIn(k, di) def disable_storage_and_scheduler(self, fl): @@ -221,11 +227,14 @@ self.disable_storage_and_scheduler(fl) - fl.run(min_index=self.first_index) - - self.assertEqual(fl.db_last_index(), self.last_index) - partitions = fl.db_partition_indices(5) - self.assertGreater(len(partitions), 0) - for k in partitions: - self.assertLessEqual(len(k), 5) - self.assertGreater(len(k), 0) + # FIXME: Separate the tests properly for the gitlab lister + # did not succeed yet + if hasattr(fl, 'db_last_index'): + fl.run(min_index=self.first_index) + + self.assertEqual(fl.db_last_index(), self.last_index) + partitions = fl.db_partition_indices(5) + self.assertGreater(len(partitions), 0) + for k in partitions: + self.assertLessEqual(len(k), 5) + self.assertGreater(len(k), 0) diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,9 +10,9 @@ class DebianListerTask(ListerTaskBase): task_queue = 'swh_lister_debian' - def new_lister(self): + def new_lister(self, *args, **kwargs): return DebianLister() - def run_task(self, distribution): + def run_task(self, distribution, *args, **kwargs): lister = self.new_lister() return lister.run(distribution) diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -1,16 +1,16 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, - IndexingRangeListerTask, + RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import GitHubLister class GitHubListerTask(ListerTaskBase): - def new_lister(self): + def new_lister(self, *args, **kwargs): return GitHubLister(lister_name='github.com', api_baseurl='https://api.github.com') @@ -19,7 +19,7 @@ task_queue = 'swh_lister_github_discover' -class RangeGitHubLister(GitHubListerTask, IndexingRangeListerTask): +class RangeGitHubLister(GitHubListerTask, RangeListerTask): task_queue = 'swh_lister_github_refresh' diff --git a/swh/lister/github/tests/test_gh_lister.py b/swh/lister/github/tests/test_gh_lister.py --- a/swh/lister/github/tests/test_gh_lister.py +++ b/swh/lister/github/tests/test_gh_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,11 +6,11 @@ import unittest from datetime import datetime, timedelta -from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase +from swh.lister.core.tests.test_lister import HttpListerTesterBase from swh.lister.github.lister import GitHubLister -class GitHubListerTester(IndexingHttpListerTesterBase, unittest.TestCase): +class GitHubListerTester(HttpListerTesterBase, unittest.TestCase): Lister = GitHubLister test_re = re.compile(r'/repositories\?since=([^?&]+)') lister_subdir = 'github' diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py new file mode 100644 diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/lister.py @@ -0,0 +1,113 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random +import re +import time + +from ..core.paging_lister import SWHPagingHttpLister +from .models import GitLabModel + + +class GitLabLister(SWHPagingHttpLister): + # Template path expecting an integer that represents the page id + PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true' + API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*') + MODEL = GitLabModel + + @property + def CONFIG_BASE_FILENAME(self): + """One gitlab lister for all instances. We discriminate between the + origin on a per instance basis in the table. + + """ + return 'lister-gitlab' + + @property + def ADDITIONAL_CONFIG(self): + """Override additional config as the 'credentials' structure change + between the ancestor classes and this class. + + cf. request_params method below + + """ + return { + 'lister_db_url': + ('str', 'postgresql:///lister-%s' % self.lister_name), + 'credentials': # credentials is a dict + ('dict', {}), + 'cache_responses': + ('bool', False), + 'cache_dir': + ('str', '~/.cache/swh/lister/%s' % self.lister_name), + } + + def request_params(self, identifier): + """Get the full parameters passed to requests given the + transport_request identifier. + + For the gitlab lister, the 'credentials' entries is configured + per instance. For example: + + - credentials: + - gitlab.com: + - username: user0 + password: + - username: user1 + password: + - ... + - other-gitlab-instance: + ... + + """ + params = { + 'headers': self.request_headers() or {} + } + # Retrieve the credentials per instance + creds = self.config['credentials'] + if creds: + creds_lister = creds[self.lister_name] + auth = random.choice(creds_lister) if creds else None + if auth: + params['auth'] = (auth['username'], auth['password']) + return params + + def get_model_from_repo(self, repo): + return { + 'instance': self.lister_name, + 'uid': repo['id'], + 'indexable': repo['id'], + 'name': repo['name'], + 'full_name': repo['path_with_namespace'], + 'html_url': repo['web_url'], + 'origin_url': repo['http_url_to_repo'], + 'origin_type': 'git', + 'description': repo['description'], + } + + def transport_quota_check(self, response): + """Deal with rate limit if any. + + """ + # not all gitlab instance have rate limit + if 'RateLimit-Remaining' in response.headers: + reqs_remaining = int(response.headers['RateLimit-Remaining']) + if response.status_code == 403 and reqs_remaining == 0: + reset_at = int(response.headers['RateLimit-Reset']) + delay = min(reset_at - time.time(), 3600) + return True, delay + return False, 0 + + def get_next_target_from_response(self, response): + """Deal with pagination + + """ + if 'next' in response.links: + next_url = response.links['next']['url'] + return int(self.API_URL_INDEX_RE.match(next_url).group(1)) + return None + + def transport_response_simplified(self, response): + repos = response.json() + return [self.get_model_from_repo(repo) for repo in repos] diff --git a/swh/lister/gitlab/models.py b/swh/lister/gitlab/models.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/models.py @@ -0,0 +1,17 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, Integer, String + +from ..core.models import ModelBase + + +class GitLabModel(ModelBase): + """a Gitlab repository""" + __tablename__ = 'gitlab_repo' + + id = Column(Integer, primary_key=True) + uid = Column(Integer, index=True) + instance = Column(String, index=True) + indexable = Column(Integer, index=True) diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tasks.py @@ -0,0 +1,24 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.core.tasks import ListerTaskBase, RangeListerTask + + +from .lister import GitLabLister + + +class GitLabDotComListerTask(ListerTaskBase): + def new_lister(self, lister_name='gitlab.com', + api_baseurl='https://gitlab.com/api/v4'): + return GitLabLister( + lister_name=lister_name, api_baseurl=api_baseurl) + + +class RangeGitLabLister(GitLabDotComListerTask, RangeListerTask): + """GitLab lister working on specified range (start, end) arguments. + + """ + task_queue = 'swh_lister_gitlab_refresh' + + diff --git a/swh/lister/gitlab/tests/__init__.py b/swh/lister/gitlab/tests/__init__.py new file mode 100644 diff --git a/swh/lister/gitlab/tests/api_empty_response.json b/swh/lister/gitlab/tests/api_empty_response.json new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tests/api_empty_response.json @@ -0,0 +1 @@ +[] diff --git a/swh/lister/gitlab/tests/api_response.json b/swh/lister/gitlab/tests/api_response.json new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tests/api_response.json @@ -0,0 +1,170 @@ +[{"avatar_url": null, + "created_at": "2012-10-15T17:26:53.000Z", + "default_branch": "master", + "description": null, + "forks_count": 3, + "http_url_to_repo": "https://gitlab.com/leberwurscht/teardownwalls.git", + "id": 143, + "last_activity_at": "2013-10-03T08:08:46.000Z", + "name": "TearDownWalls", + "name_with_namespace": "Leberwurscht / TearDownWalls", + "path": "teardownwalls", + "path_with_namespace": "leberwurscht/teardownwalls", + "readme_url": "https://gitlab.com/leberwurscht/teardownwalls/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:leberwurscht/teardownwalls.git", + "star_count": 1, + "tag_list": [], + "web_url": "https://gitlab.com/leberwurscht/teardownwalls"}, + {"avatar_url": null, + "created_at": "2012-12-12T21:30:14.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/technomancy/leiningen.git", + "id": 450, + "last_activity_at": "2018-06-24T00:07:06.666Z", + "name": "Leiningen", + "name_with_namespace": "Phil Hagelberg / Leiningen", + "path": "leiningen", + "path_with_namespace": "technomancy/leiningen", + "readme_url": "https://gitlab.com/technomancy/leiningen/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:technomancy/leiningen.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/technomancy/leiningen"}, + {"avatar_url": null, + "created_at": "2012-12-18T17:25:39.000Z", + "default_branch": "master", + "description": null, + "forks_count": 4, + "http_url_to_repo": "https://gitlab.com/jonan/heroes-of-wesnoth.git", + "id": 526, + "last_activity_at": "2015-04-09T14:43:49.363Z", + "name": "Heroes of Wesnoth", + "name_with_namespace": "Jonan / Heroes of Wesnoth", + "path": "heroes-of-wesnoth", + "path_with_namespace": "jonan/heroes-of-wesnoth", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:jonan/heroes-of-wesnoth.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/jonan/heroes-of-wesnoth"}, + {"avatar_url": null, + "created_at": "2012-12-18T17:33:03.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/jonan/k.git", + "id": 527, + "last_activity_at": "2014-10-11T22:29:04.138Z", + "name": "K", + "name_with_namespace": "Jonan / K", + "path": "k", + "path_with_namespace": "jonan/k", + "readme_url": "https://gitlab.com/jonan/k/blob/master/README", + "ssh_url_to_repo": "git@gitlab.com:jonan/k.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/jonan/k"}, + {"avatar_url": null, + "created_at": "2013-01-06T20:35:42.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/hcs/hcs_utils.git", + "id": 1025, + "last_activity_at": "2015-09-14T12:01:11.151Z", + "name": "hcs_utils", + "name_with_namespace": "Christer Sjöholm / hcs_utils", + "path": "hcs_utils", + "path_with_namespace": "hcs/hcs_utils", + "readme_url": "https://gitlab.com/hcs/hcs_utils/blob/master/README.txt", + "ssh_url_to_repo": "git@gitlab.com:hcs/hcs_utils.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/hcs/hcs_utils"}, + {"avatar_url": null, + "created_at": "2013-01-24T08:41:56.000Z", + "default_branch": null, + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/soeren/sspssptest.git", + "id": 1702, + "last_activity_at": "2013-10-03T08:31:54.000Z", + "name": "sspssptest", + "name_with_namespace": "kruemel / sspssptest", + "path": "sspssptest", + "path_with_namespace": "soeren/sspssptest", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:soeren/sspssptest.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/soeren/sspssptest"}, + {"avatar_url": null, + "created_at": "2013-01-28T22:59:31.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/dpp/slothbeast.git", + "id": 1865, + "last_activity_at": "2013-05-05T09:44:57.000Z", + "name": "slothbeast", + "name_with_namespace": "David Pollak / slothbeast", + "path": "slothbeast", + "path_with_namespace": "dpp/slothbeast", + "readme_url": "https://gitlab.com/dpp/slothbeast/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:dpp/slothbeast.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/dpp/slothbeast"}, + {"avatar_url": null, + "created_at": "2013-02-07T20:50:20.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/rocksoniko/easy.git", + "id": 2227, + "last_activity_at": "2013-05-05T09:45:00.000Z", + "name": "easy", + "name_with_namespace": "Hugo / easy", + "path": "easy", + "path_with_namespace": "rocksoniko/easy", + "readme_url": "https://gitlab.com/rocksoniko/easy/blob/master/README", + "ssh_url_to_repo": "git@gitlab.com:rocksoniko/easy.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/rocksoniko/easy"}, + {"avatar_url": null, + "created_at": "2013-02-10T17:21:24.000Z", + "default_branch": null, + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/grup/grup.git", + "id": 2294, + "last_activity_at": "2013-05-05T09:45:01.000Z", + "name": "grup", + "name_with_namespace": "grup / grup", + "path": "grup", + "path_with_namespace": "grup/grup", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:grup/grup.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/grup/grup"}, + {"avatar_url": null, + "created_at": "2013-02-14T09:31:50.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/varac/test.git", + "id": 2390, + "last_activity_at": "2016-02-11T13:51:47.463Z", + "name": "test", + "name_with_namespace": "varac / test", + "path": "test", + "path_with_namespace": "varac/test", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:varac/test.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/varac/test"}] diff --git a/swh/lister/github/tests/test_gh_lister.py b/swh/lister/gitlab/tests/test_gitlab_lister.py copy from swh/lister/github/tests/test_gh_lister.py copy to swh/lister/gitlab/tests/test_gitlab_lister.py --- a/swh/lister/github/tests/test_gh_lister.py +++ b/swh/lister/gitlab/tests/test_gitlab_lister.py @@ -1,37 +1,37 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import re import unittest + from datetime import datetime, timedelta -from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase -from swh.lister.github.lister import GitHubLister +from swh.lister.gitlab.lister import GitLabLister +from swh.lister.core.tests.test_lister import HttpListerTesterBase -class GitHubListerTester(IndexingHttpListerTesterBase, unittest.TestCase): - Lister = GitHubLister - test_re = re.compile(r'/repositories\?since=([^?&]+)') - lister_subdir = 'github' +class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): + Lister = GitLabLister + test_re = GitLabLister.API_URL_INDEX_RE + lister_subdir = 'gitlab' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' - first_index = 26 - last_index = 368 - entries_per_page = 100 + first_index = 1 + last_index = 2 + entries_per_page = 10 def response_headers(self, request): - headers = {'X-RateLimit-Remaining': '1'} + headers = {'RateLimit-Remaining': '1'} if self.request_index(request) == str(self.first_index): headers.update({ - 'Link': ';' + 'Link': ';' ' rel="next",' - ';' + ';' ' rel="first"' }) else: headers.update({ - 'Link': ';' + 'Link': ';' ' rel="first"' }) @@ -40,7 +40,7 @@ def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 - context.headers['X-RateLimit-Remaining'] = '0' + context.headers['RateLimit-Remaining'] = '0' one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) - context.headers['X-RateLimit-Reset'] = str(one_second) + context.headers['RateLimit-Reset'] = str(one_second) return '{"error":"dummy"}'