diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -41,30 +41,84 @@ ## lister-github -1. git clone under $GHLISTER_ROOT (of your choosing) +### Preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. Bootstrap the db instance schema -``` sh -$ createdb lister-github.com -$ bin/ghlister --db-url postgres:///lister-github.com createdb -``` - -Configuration file samples -------------------------- - -## github + $ createdb lister-github + $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ + --lister github \ + --create-tables -cat ~/.config/swh/lister-github.com.yml +### Configuration file sample + $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls - lister_db_url: postgres:///lister-github.com + lister_db_url: postgres:///lister-github credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/github.com + storage: + cls: remote + args: + url: http://localhost:5002/ + +### Run + + $ python3 + >>> import logging + >>> logging.basicConfig(level=logging.DEBUG) + >>> from swh.lister.github.tasks import RangeGitHubLister + >>> RangeGitHubLister().run(364, 365) + INFO:root:listing repos starting at 364 + DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com + DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None + DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost + DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 + +## lister-gitlab + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ +3. create configuration file ~/.config/swh/lister-gitlab.yml +4. 
Bootstrap the db instance schema + + $ createdb lister-gitlab + $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ + --lister gitlab \ + --create-tables + +### Configuration file sample + + $ cat ~/.config/swh/lister-gitlab.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-gitlab + credentials: [] + cache_responses: True + cache_dir: /home/zack/.cache/swh/lister/gitlab storage: cls: remote args: url: http://localhost:5002/ + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + instance='salsa.debian.org', api_baseurl='https://salsa.debian.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + instance='gitlab.freedesktop.org', api_baseurl='https://gitlab.freedesktop.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + instance='gitlab.gnome.org', api_baseurl='https://gitlab.gnome.org/api/v4') + >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, + instance='gitlab.inria.fr', api_baseurl='https://gitlab.inria.fr/api/v4') + >>> diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -11,6 +11,7 @@ class BitBucketLister(SWHIndexingHttpLister): PATH_TEMPLATE = '/repositories?after=%s' MODEL = BitBucketModel + LISTER_NAME = 'bitbucket.com' def get_model_from_repo(self, repo): return { diff --git a/swh/lister/bitbucket/models.py b/swh/lister/bitbucket/models.py --- a/swh/lister/bitbucket/models.py +++ b/swh/lister/bitbucket/models.py @@ -1,13 +1,13 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 
the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from sqlalchemy import Column, String -from swh.lister.core.models import ModelBase +from swh.lister.core.models import IndexingModelBase -class BitBucketModel(ModelBase): +class BitBucketModel(IndexingModelBase): """a BitBucket repository""" __tablename__ = 'bitbucket_repos' diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -1,9 +1,9 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, - IndexingRangeListerTask, + RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import BitBucketLister @@ -11,8 +11,7 @@ class BitBucketListerTask(ListerTaskBase): def new_lister(self): - return BitBucketLister(lister_name='bitbucket.com', - api_baseurl='https://api.bitbucket.org/2.0') + return BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0') class IncrementalBitBucketLister(BitBucketListerTask, @@ -20,7 +19,7 @@ task_queue = 'swh_lister_bitbucket_discover' -class RangeBitBucketLister(BitBucketListerTask, IndexingRangeListerTask): +class RangeBitBucketLister(BitBucketListerTask, RangeListerTask): task_queue = 'swh_lister_bitbucket_refresh' diff --git a/swh/lister/bitbucket/tests/test_bb_lister.py b/swh/lister/bitbucket/tests/test_bb_lister.py --- a/swh/lister/bitbucket/tests/test_bb_lister.py +++ b/swh/lister/bitbucket/tests/test_bb_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level 
LICENSE file for more information @@ -6,10 +6,10 @@ import unittest from swh.lister.bitbucket.lister import BitBucketLister -from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase +from swh.lister.core.tests.test_lister import HttpListerTester -class BitBucketListerTester(IndexingHttpListerTesterBase, unittest.TestCase): +class BitBucketListerTester(HttpListerTester, unittest.TestCase): Lister = BitBucketLister test_re = re.compile(r'/repositories\?after=([^?&]+)') lister_subdir = 'bitbucket' diff --git a/swh/lister/cli.py b/swh/lister/cli.py new file mode 100644 --- /dev/null +++ b/swh/lister/cli.py @@ -0,0 +1,56 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click + + +@click.command() +@click.option( + '--db-url', '-d', default='postgres:///lister-gitlab.com', + help='SQLAlchemy DB URL; see ' + 'http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls') # noqa +@click.option('--lister', required=1, + type=click.Choice(['github', 'gitlab', 'bitbucket']), + help='Lister to act upon') +@click.option('--create-tables', is_flag=True, default=False, + help='create tables') +@click.option('--drop-tables', is_flag=True, default=False, + help='Drop tables') +def cli(db_url, lister, create_tables, drop_tables): + """Initialize db model according to lister. 
+ + """ + supported_listers = ['github', 'gitlab', 'bitbucket'] + override_conf = {'lister_db_url': db_url} + + if lister == 'github': + from .github import models + from .github.lister import GitHubLister + + _lister = GitHubLister(api_baseurl='https://api.github.com', + override_config=override_conf) + elif lister == 'bitbucket': + from .bitbucket import models + from .bitbucket.lister import BitBucketLister + _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0', + override_config=override_conf) + + elif lister == 'gitlab': + from .gitlab import models + from .gitlab.lister import GitLabLister + _lister = GitLabLister(api_baseurl='https://gitlab.com/api/v4/', + override_config=override_conf) + else: + raise ValueError('Only supported listers are %s' % supported_listers) + + if drop_tables: + models.ModelBase.metadata.drop_all(_lister.db_engine) + + if create_tables: + models.ModelBase.metadata.create_all(_lister.db_engine) + + +if __name__ == '__main__': + cli() diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py --- a/swh/lister/core/indexing_lister.py +++ b/swh/lister/core/indexing_lister.py @@ -152,23 +152,23 @@ for repo in deleted_repos: repo.task_id = None - def run(self, min_index=None, max_index=None): + def run(self, min_bound=None, max_bound=None): """Main entry function. Sequentially fetches repository data from the service according to the basic outline in the class docstring, continually fetching sublists until either there is no next index reference given or the given next index is greater - than the desired max_index. + than the desired max_bound. 
Args: - min_index (indexable type): optional index to start from - max_index (indexable type): optional index to stop at + min_bound (indexable type): optional index to start from + max_bound (indexable type): optional index to stop at Returns: nothing """ - index = min_index or '' + index = min_bound or '' loop_count = 0 - self.min_index = min_index - self.max_index = max_index + self.min_index = min_bound + self.max_index = max_bound while self.is_within_bounds(index, self.min_index, self.max_index): logging.info('listing repos starting at %s' % index) @@ -205,8 +205,6 @@ class SWHIndexingHttpLister(SWHListerHttpTransport, SWHIndexingLister): """Convenience class for ensuring right lookup and init order when combining SWHIndexingLister and SWHListerHttpTransport.""" - def __init__(self, lister_name=None, api_baseurl=None, - override_config=None): + def __init__(self, api_baseurl=None, override_config=None): SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl) - SWHIndexingLister.__init__(self, lister_name=lister_name, - override_config=override_config) + SWHIndexingLister.__init__(self, override_config=override_config) diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py --- a/swh/lister/core/lister_base.py +++ b/swh/lister/core/lister_base.py @@ -64,6 +64,7 @@ MODEL = AbstractAttribute('Subclass type (not instance)' ' of swh.lister.core.models.ModelBase' ' customized for a specific service.') + LISTER_NAME = AbstractAttribute("Lister's name") @abc.abstractmethod def transport_request(self, identifier): @@ -149,6 +150,25 @@ """ return models_list + def do_additional_checks(self, models_list): + """Execute some additional checks on the model list. For example, to + check for existing repositories in the db. + + MAY BE OVERRIDDEN if an intermediate Lister class needs to + run further checks on the results before injection. + + By default no check is performed and models_list is returned as is. 
+ + Args: + models_list: list of dicts returned by + transport_response_simplified. + + Returns: + models_list with entries if checks ok, False otherwise + + """ + return models_list + def is_within_bounds(self, inner, lower=None, upper=None): """See if a sortable value is inside the range [lower,upper]. @@ -199,30 +219,27 @@ @property def CONFIG_BASE_FILENAME(self): # noqa: N802 - return 'lister-%s' % self.lister_name + return 'lister-%s' % self.LISTER_NAME @property def ADDITIONAL_CONFIG(self): # noqa: N802 return { 'lister_db_url': - ('str', 'postgresql:///lister-%s' % self.lister_name), + ('str', 'postgresql:///lister-%s' % self.LISTER_NAME), 'credentials': ('list[dict]', []), 'cache_responses': ('bool', False), 'cache_dir': - ('str', '~/.cache/swh/lister/%s' % self.lister_name), + ('str', '~/.cache/swh/lister/%s' % self.LISTER_NAME), } INITIAL_BACKOFF = 10 MAX_RETRIES = 7 CONN_SLEEP = 10 - def __init__(self, lister_name=None, override_config=None): + def __init__(self, override_config=None): self.backoff = self.INITIAL_BACKOFF - if lister_name is None: - raise NameError("Every lister must be assigned a lister_name.") - self.lister_name = lister_name # 'github?', 'bitbucket?', 'foo.com?' self.config = self.parse_config_file( base_filename=self.CONFIG_BASE_FILENAME, additional_configs=[self.ADDITIONAL_CONFIG] @@ -455,18 +472,23 @@ [self.task_dict(m['origin_type'], m['origin_url'])] )[0]['id'] - def ingest_data(self, identifier): + def ingest_data(self, identifier, checks=False): """The core data fetch sequence. Request server endpoint. Simplify and filter response list of repositories. Inject repo information into local db. Queue loader tasks for linked repositories. Args: identifier: Resource identifier. + checks (bool): Additional checks required """ # Request (partial?) 
list of repositories info response = self.safely_issue_request(identifier) models_list = self.transport_response_simplified(response) models_list = self.filter_before_inject(models_list) + if checks: + models_list = self.do_additional_checks(models_list) + if not models_list: + return response, [] # inject into local db injected = self.inject_repo_data_into_db(models_list) # queue workers diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -100,14 +100,19 @@ self.session = requests.Session() self.lister_version = __version__ - def transport_request(self, identifier): - """Implements SWHListerBase.transport_request for HTTP using Requests. + def _transport_action(self, identifier, method='get'): + """Send a request to the api using the given HTTP method ('get' by + default, 'head' on demand). + """ path = self.request_uri(identifier) params = self.request_params(identifier) try: - response = self.session.get(path, **params) + if method == 'head': + response = self.session.head(path, **params) + else: + response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: raise FetchError(e) else: @@ -115,6 +120,20 @@ raise FetchError(response) return response + def transport_head(self, identifier): + """Issue a HEAD request to the api for the given identifier. + + """ + return self._transport_action(identifier, method='head') + + def transport_request(self, identifier): + """Implements SWHListerBase.transport_request for HTTP using Requests. + + Issue a GET request to the api for the given identifier. + + """ + return self._transport_action(identifier) + def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for HTTP given Requests responses. 
diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py --- a/swh/lister/core/models.py +++ b/swh/lister/core/models.py @@ -24,10 +24,6 @@ uid = AbstractAttribute('Column(, primary_key=True)') - # The value used for sorting, segmenting, or api query paging, - # because uids aren't always sequential. - indexable = AbstractAttribute('Column(, index=True)') - name = Column(String, index=True) full_name = Column(String, index=True) html_url = Column(String) @@ -40,14 +36,12 @@ task_id = Column(Integer) origin_id = Column(Integer) - def __init__(self, uid=None, indexable=None, name=None, full_name=None, + def __init__(self, uid=None, name=None, full_name=None, html_url=None, origin_url=None, origin_type=None, description=None, task_id=None, origin_id=None): self.uid = uid self.last_seen = datetime.now() - if indexable is not None: - self.indexable = indexable if name is not None: self.name = name if full_name is not None: @@ -65,3 +59,24 @@ self.task_id = task_id if origin_id is not None: self.origin_id = origin_id + + +class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta): + __abstract__ = True + __tablename__ = AbstractAttribute + + # The value used for sorting, segmenting, or api query paging, + # because uids aren't always sequential. 
+ indexable = AbstractAttribute('Column(, index=True)') + + def __init__(self, uid=None, name=None, full_name=None, + html_url=None, origin_url=None, origin_type=None, + description=None, task_id=None, origin_id=None, + indexable=None): + super().__init__( + uid=uid, name=name, full_name=full_name, html_url=html_url, + origin_url=origin_url, origin_type=origin_type, + description=description, task_id=task_id, origin_id=origin_id) + + if indexable is not None: + self.indexable = indexable diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/core/paging_lister.py @@ -0,0 +1,157 @@ +# Copyright (C) 2015-2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import logging + +from .lister_transports import SWHListerHttpTransport +from .lister_base import SWHListerBase + + +class PageByPageLister(SWHListerBase): + """Lister* intermediate class for any service that follows the simple + pagination page pattern. + + - Client sends a request to list repositories starting from a + given page identifier. + + - Client receives structured (json/xml/etc) response with + information about a sequential series of repositories (per page) + starting from a given index. And, if available, some indication + of the next page index for fetching the remaining repository + data. + + See :class:`swh.lister.core.lister_base.SWHListerBase` for more + details. + + This class cannot be instantiated. To create a new Lister for a + source code listing service that follows the model described + above, you must subclass this class. Then provide the required + overrides in addition to any unmet implementation/override + requirements of this class's base (see parent class and member + docstrings for details). 
+ + Required Overrides:: + + def get_next_target_from_response + + """ + @abc.abstractmethod + def get_next_target_from_response(self, response): + """Find the next server endpoint page given the entire response. + + Implementation of this method depends on the server API spec + and the shape of the network response object returned by the + transport_request method. + + For example, some api can use the headers links to provide the + next page. + + Args: + response (transport response): response page from the server + + Returns: + index of next page, possibly extracted from a next href url + + """ + pass + + @abc.abstractmethod + def get_pages_information(self): + """Find the total number of pages. + + Implementation of this method depends on the server API spec + and the shape of the network response object returned by the + transport_request method. + + For example, some api can use dedicated headers: + - x-total-pages to provide the total number of pages + - x-total to provide the total number of repositories + - x-per-page to provide the number of elements per page + + Returns: + tuple (total number of repositories, total number of + pages, per_page) + + """ + pass + + # You probably don't need to override anything below this line. + + def do_additional_checks(self, models_list): + """Potentially check for existence of repositories in models_list. + + This will be called only if check_existence is flipped on in + the run method below. + + """ + for m in models_list: + sql_repo = self.db_query_equal('uid', m['uid']) + if sql_repo: + return False + return models_list + + def run(self, min_bound=None, max_bound=None, check_existence=False): + """Main entry function. Sequentially fetches repository data from the + service according to the basic outline in the class + docstring. Continually fetching sublists until either there + is no next page reference given or the given next page is + greater than the desired max_page. 
+ + Args: + min_bound: optional page to start from + max_bound: optional page to stop at + check_existence (bool): optional existence check (for + incremental lister whose sort + order is inverted) + + Returns: + nothing + + """ + page = min_bound or 0 + loop_count = 0 + + self.min_page = min_bound + self.max_page = max_bound + + while self.is_within_bounds(page, self.min_page, self.max_page): + logging.info('listing repos starting at %s' % page) + + response, injected_repos = self.ingest_data(page, + checks=check_existence) + if not injected_repos: + logging.info('Repositories already seen, stopping') + break + + next_page = self.get_next_target_from_response(response) + + # termination condition + + if (next_page is None) or (next_page == page): + logging.info('stopping after page %s, no next link found' % + page) + break + else: + page = next_page + + loop_count += 1 + if loop_count == 20: + logging.info('flushing updates') + loop_count = 0 + self.db_session.commit() + self.db_session = self.mk_session() + + self.db_session.commit() + self.db_session = self.mk_session() + + +class PageByPageHttpLister(SWHListerHttpTransport, PageByPageLister): + """Convenience class for ensuring right lookup and init order when + combining PageByPageLister and SWHListerHttpTransport. 
+ + """ + def __init__(self, api_baseurl=None, override_config=None): + SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl) + PageByPageLister.__init__(self, override_config=override_config) diff --git a/swh/lister/core/tasks.py b/swh/lister/core/tasks.py --- a/swh/lister/core/tasks.py +++ b/swh/lister/core/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,34 +39,51 @@ task_queue = AbstractAttribute('Celery Task queue name') @abc.abstractmethod - def new_lister(self): + def new_lister(self, *args, **kwargs): """Return a new lister of the appropriate type. """ pass @abc.abstractmethod - def run_task(self): + def run_task(self, *args, **kwargs): pass -class IndexingDiscoveryListerTask(ListerTaskBase): - def run_task(self): - lister = self.new_lister() - return lister.run(min_index=lister.db_last_index(), max_index=None) +# Paging/Indexing lister tasks derivatives +# (cf. {github/bitbucket/gitlab}/tasks) + + +class RangeListerTask(ListerTaskBase): + """Range lister task. + + """ + def run_task(self, start, end, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + return lister.run(min_bound=start, max_bound=end) + +# Indexing Lister tasks derivatives (cf. {github/bitbucket}/tasks) -class IndexingRangeListerTask(ListerTaskBase): - def run_task(self, start, end): - lister = self.new_lister() - return lister.run(min_index=start, max_index=end) + +class IndexingDiscoveryListerTask(ListerTaskBase): + """Incremental indexing lister task. + + """ + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + return lister.run(min_bound=lister.db_last_index(), max_bound=None) class IndexingRefreshListerTask(ListerTaskBase): + """Full indexing lister task. 
+ + """ GROUP_SPLIT = 10000 - def run_task(self): - lister = self.new_lister() + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) ranges = lister.db_partition_indices(self.GROUP_SPLIT) random.shuffle(ranges) - range_task = IndexingRangeListerTask() - group(range_task.s(minv, maxv) for minv, maxv in ranges)() + range_task = RangeListerTask() + group(range_task.s(minv, maxv, *args, **kwargs) + for minv, maxv in ranges)() diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -20,12 +20,15 @@ @requests_mock.Mocker() -class IndexingHttpListerTesterBase(abc.ABC): +class HttpListerTesterBase(abc.ABC): """Base testing class for subclasses of - swh.lister.core.indexing_lister.SWHIndexingHttpLister. - See swh.lister.github.tests.test_gh_lister for an example of how to - customize for a specific listing service. + swh.lister.core.indexing_lister.SWHIndexingHttpLister. + swh.lister.core.paging_lister.PageByPageHttpLister + + See swh.lister.github.tests.test_gh_lister for an example of how + to customize for a specific listing service. + """ Lister = AbstractAttribute('The lister class to test') test_re = AbstractAttribute('Compiled regex matching the server url. 
Must' @@ -34,8 +37,8 @@ good_api_response_file = AbstractAttribute('Example good response body') bad_api_response_file = AbstractAttribute('Example bad response body') first_index = AbstractAttribute('First index in good_api_response') - last_index = AbstractAttribute('Last index in good_api_response') entries_per_page = AbstractAttribute('Number of results in good response') + LISTER_NAME = 'fake-lister' # May need to override this if the headers are used for something def response_headers(self, request): @@ -56,7 +59,7 @@ self.response = None self.fl = None self.helper = None - if self.__class__ != IndexingHttpListerTesterBase: + if self.__class__ != HttpListerTesterBase: self.run = TestCase.run.__get__(self, self.__class__) else: self.run = noop @@ -99,12 +102,14 @@ return self.mock_limit_n_response(2, request, context) def get_fl(self, override_config=None): + """Retrieve an instance of fake lister (fl). + + """ if override_config or self.fl is None: with patch( 'swh.scheduler.backend.SchedulerBackend.reconnect', noop ): - self.fl = self.Lister(lister_name='fakelister', - api_baseurl='https://fakeurl', + self.fl = self.Lister(api_baseurl='https://fakeurl', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 @@ -164,7 +169,7 @@ self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: - if k not in ['last_seen', 'task_id', 'origin_id']: + if k not in ['last_seen', 'task_id', 'origin_id', 'id']: self.assertIn(k, di) def disable_storage_and_scheduler(self, fl): @@ -183,7 +188,7 @@ self.disable_storage_and_scheduler(fl) self.disable_db(fl) - fl.run(min_index=1, max_index=1) # stores no results + fl.run(min_bound=1, max_bound=1) # stores no results @istest def test_fetch_one_nodb(self, http_mocker): @@ -193,7 +198,7 @@ self.disable_storage_and_scheduler(fl) self.disable_db(fl) - fl.run(min_index=self.first_index, max_index=self.first_index) + fl.run(min_bound=self.first_index, 
max_bound=self.first_index) @istest def test_fetch_multiple_pages_nodb(self, http_mocker): @@ -203,12 +208,17 @@ self.disable_storage_and_scheduler(fl) self.disable_db(fl) - fl.run(min_index=self.first_index) + fl.run(min_bound=self.first_index) def init_db(self, db, model): engine = create_engine(db.url()) model.metadata.create_all(engine) + +class HttpListerTester(HttpListerTesterBase, abc.ABC): + last_index = AbstractAttribute('Last index in good_api_response') + + @requests_mock.Mocker() @istest def test_fetch_multiple_pages_yesdb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) @@ -221,7 +231,7 @@ self.disable_storage_and_scheduler(fl) - fl.run(min_index=self.first_index) + fl.run(min_bound=self.first_index) self.assertEqual(fl.db_last_index(), self.last_index) partitions = fl.db_partition_indices(5) diff --git a/swh/lister/core/tests/test_model.py b/swh/lister/core/tests/test_model.py --- a/swh/lister/core/tests/test_model.py +++ b/swh/lister/core/tests/test_model.py @@ -7,7 +7,7 @@ from nose.tools import istest from sqlalchemy import Column, Integer -from swh.lister.core.models import ModelBase +from swh.lister.core.models import ModelBase, IndexingModelBase class BadSubclass1(ModelBase): @@ -30,6 +30,27 @@ indexable = Column(Integer, index=True) +class IndexingBadSubclass(IndexingModelBase): + __abstract__ = True + pass + + +class IndexingBadSubclass2(IndexingModelBase): + __abstract__ = True + __tablename__ = 'foo' + + +class IndexingBadSubclass3(IndexingBadSubclass2): + __abstract__ = True + pass + + +class IndexingGoodSubclass(IndexingModelBase): + uid = Column(Integer, primary_key=True) + indexable = Column(Integer, index=True) + __tablename__ = 'bar' + + class TestModel(unittest.TestCase): @istest def test_model_instancing(self): @@ -46,8 +67,28 @@ BadSubclass3() self.assertIsInstance(GoodSubclass(), GoodSubclass) - gsc = GoodSubclass(uid='uid', indexable='indexable') + gsc = GoodSubclass(uid='uid') 
self.assertEqual(gsc.__tablename__, 'foo') self.assertEqual(gsc.uid, 'uid') + + @istest + def test_indexing_model_instancing(self): + with self.assertRaises(TypeError): + IndexingModelBase() + + with self.assertRaises(TypeError): + IndexingBadSubclass() + + with self.assertRaises(TypeError): + IndexingBadSubclass2() + + with self.assertRaises(TypeError): + IndexingBadSubclass3() + + self.assertIsInstance(IndexingGoodSubclass(), IndexingGoodSubclass) + gsc = IndexingGoodSubclass(uid='uid', indexable='indexable') + + self.assertEqual(gsc.__tablename__, 'bar') + self.assertEqual(gsc.uid, 'uid') self.assertEqual(gsc.indexable, 'indexable') diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -31,11 +31,11 @@ class DebianLister(SWHListerHttpTransport, SWHListerBase): MODEL = Package PATH_TEMPLATE = None + LISTER_NAME = 'debian' def __init__(self, override_config=None): SWHListerHttpTransport.__init__(self, api_baseurl="bogus") - SWHListerBase.__init__(self, lister_name='debian', - override_config=override_config) + SWHListerBase.__init__(self, override_config=override_config) def transport_request(self, identifier): """Subvert SWHListerHttpTransport.transport_request, to try several diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -13,6 +13,7 @@ PATH_TEMPLATE = '/repositories?since=%d' MODEL = GitHubModel API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)') + LISTER_NAME = 'github.com' def get_model_from_repo(self, repo): return { diff --git a/swh/lister/github/models.py b/swh/lister/github/models.py --- a/swh/lister/github/models.py +++ b/swh/lister/github/models.py @@ -1,13 +1,13 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See 
top-level LICENSE file for more information from sqlalchemy import Column, Boolean, Integer -from swh.lister.core.models import ModelBase +from swh.lister.core.models import IndexingModelBase -class GitHubModel(ModelBase): +class GitHubModel(IndexingModelBase): """a GitHub repository""" __tablename__ = 'github_repos' diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -1,9 +1,9 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, - IndexingRangeListerTask, + RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import GitHubLister @@ -11,15 +11,14 @@ class GitHubListerTask(ListerTaskBase): def new_lister(self): - return GitHubLister(lister_name='github.com', - api_baseurl='https://api.github.com') + return GitHubLister(api_baseurl='https://api.github.com') class IncrementalGitHubLister(GitHubListerTask, IndexingDiscoveryListerTask): task_queue = 'swh_lister_github_discover' -class RangeGitHubLister(GitHubListerTask, IndexingRangeListerTask): +class RangeGitHubLister(GitHubListerTask, RangeListerTask): task_queue = 'swh_lister_github_refresh' diff --git a/swh/lister/github/tests/test_gh_lister.py b/swh/lister/github/tests/test_gh_lister.py --- a/swh/lister/github/tests/test_gh_lister.py +++ b/swh/lister/github/tests/test_gh_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,11 +6,11 @@ import unittest from datetime import datetime, timedelta -from swh.lister.core.tests.test_lister import 
IndexingHttpListerTesterBase +from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.github.lister import GitHubLister -class GitHubListerTester(IndexingHttpListerTesterBase, unittest.TestCase): +class GitHubListerTester(HttpListerTester, unittest.TestCase): Lister = GitHubLister test_re = re.compile(r'/repositories\?since=([^?&]+)') lister_subdir = 'github' diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py new file mode 100644 diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/lister.py @@ -0,0 +1,124 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random +import time + +from .. import utils +from ..core.paging_lister import PageByPageHttpLister +from .models import GitLabModel + + +class GitLabLister(PageByPageHttpLister): + # Template path expecting an integer that represents the page id + PATH_TEMPLATE = '/projects?page=%d&order_by=id' + MODEL = GitLabModel + LISTER_NAME = 'gitlab' + + def __init__(self, api_baseurl=None, instance=None, + override_config=None, sort='asc'): + super().__init__(api_baseurl=api_baseurl, + override_config=override_config) + self.instance = instance + self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort) + + @property + def ADDITIONAL_CONFIG(self): + """Override additional config as the 'credentials' structure change + between the ancestor classes and this class. + + cf. request_params method below + + """ + default_config = super().ADDITIONAL_CONFIG + # 'credentials' is a dict of (instance, {username, password}) dict + default_config['credentials'] = ('dict', {}) + return default_config + + def request_params(self, identifier): + """Get the full parameters passed to requests given the + transport_request identifier. 
+ + For the gitlab lister, the 'credentials' entries is configured + per instance. For example: + + - credentials: + - gitlab.com: + - username: user0 + password: + - username: user1 + password: + - ... + - other-gitlab-instance: + ... + + """ + params = { + 'headers': self.request_headers() or {} + } + # Retrieve the credentials per instance + creds = self.config['credentials'] + if creds: + creds_lister = creds[self.instance] + auth = random.choice(creds_lister) if creds else None + if auth: + params['auth'] = (auth['username'], auth['password']) + return params + + def uid(self, repo): + return '%s/%s' % (self.instance, repo['path_with_namespace']) + + def get_model_from_repo(self, repo): + return { + 'instance': self.instance, + 'uid': self.uid(repo), + 'name': repo['name'], + 'full_name': repo['path_with_namespace'], + 'html_url': repo['web_url'], + 'origin_url': repo['http_url_to_repo'], + 'origin_type': 'git', + 'description': repo['description'], + } + + def transport_quota_check(self, response): + """Deal with rate limit if any. + + """ + # not all gitlab instance have rate limit + if 'RateLimit-Remaining' in response.headers: + reqs_remaining = int(response.headers['RateLimit-Remaining']) + if response.status_code == 403 and reqs_remaining == 0: + reset_at = int(response.headers['RateLimit-Reset']) + delay = min(reset_at - time.time(), 3600) + return True, delay + return False, 0 + + def get_next_target_from_response(self, response): + """Determine the next page identifier. + + """ + _next = utils.get(response.headers, ['X-Next-Page', 'x-next-page']) + if _next: + return int(_next) + + def get_pages_information(self): + """Determine pages information. 
+ + """ + response = self.transport_head(identifier=1) + h = response.headers + total = utils.get(h, ['X-Total', 'x-total']) + total_pages = utils.get(h, ['X-Total-Pages', 'x-total-pages']) + per_page = utils.get(h, ['X-Per-Page', 'x-per-page']) + if total is not None: + total = int(total) + if total_pages is not None: + total_pages = int(total_pages) + if per_page is not None: + per_page = int(per_page) + return total, total_pages, per_page + + def transport_response_simplified(self, response): + repos = response.json() + return [self.get_model_from_repo(repo) for repo in repos] diff --git a/swh/lister/gitlab/models.py b/swh/lister/gitlab/models.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/models.py @@ -0,0 +1,28 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, Integer, String + +from ..core.models import ModelBase + + +class GitLabModel(ModelBase): + """a Gitlab repository from a gitlab instance + + """ + __tablename__ = 'gitlab_repo' + + uid = Column(String, primary_key=True) + instance = Column(String, index=True) + + def __init__(self, uid=None, indexable=None, name=None, + full_name=None, html_url=None, origin_url=None, + origin_type=None, description=None, task_id=None, + origin_id=None, instance=None): + super().__init__(uid=uid, name=name, + full_name=full_name, html_url=html_url, + origin_url=origin_url, origin_type=origin_type, + description=description, task_id=task_id, + origin_id=origin_id) + self.instance = instance diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tasks.py @@ -0,0 +1,63 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random + +from celery 
import group + +from .. import utils +from ..core.tasks import ListerTaskBase, RangeListerTask +from .lister import GitLabLister + + +class GitLabListerTask(ListerTaskBase): + def new_lister(self, api_baseurl='https://gitlab.com/api/v4', + instance='gitlab.com'): + return GitLabLister(api_baseurl=api_baseurl, instance=instance) + + +class RangeGitLabLister(GitLabListerTask, RangeListerTask): + """Range GitLab lister (list available origins on specified range) + + """ + task_queue = 'swh_lister_gitlab_refresh' + + +class FullGitLabRelister(GitLabListerTask): + """Full GitLab lister (list all available origins from the api). + + """ + task_queue = 'swh_lister_gitlab_refresh' + + # nb pages + nb_pages = 10 + + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + _, total_pages, _ = lister.get_pages_information() + ranges = list(utils.split_range(total_pages, self.nb_pages)) + random.shuffle(ranges) + range_task = RangeGitLabLister() + group(range_task.s(minv, maxv, *args, **kwargs) + for minv, maxv in ranges)() + + +class IncrementalGitLabLister(ListerTaskBase): + """Incremental GitLab lister (list only new available origins). 
+ + """ + task_queue = 'swh_lister_gitlab_discover' + + def new_lister(self, api_baseurl='https://gitlab.com/api/v4', + instance='gitlab.com'): + # assuming going forward in desc order, page 1 through + return GitLabLister(instance=instance, api_baseurl=api_baseurl, + sort='desc') + + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + _, total_pages, _ = lister.get_pages_information() + # stopping as soon as existing origins for that instance are detected + return lister.run(min_bound=1, max_bound=total_pages, + check_existence=True) diff --git a/swh/lister/gitlab/tests/__init__.py b/swh/lister/gitlab/tests/__init__.py new file mode 100644 diff --git a/swh/lister/gitlab/tests/api_empty_response.json b/swh/lister/gitlab/tests/api_empty_response.json new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tests/api_empty_response.json @@ -0,0 +1 @@ +[] diff --git a/swh/lister/gitlab/tests/api_response.json b/swh/lister/gitlab/tests/api_response.json new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tests/api_response.json @@ -0,0 +1,170 @@ +[{"avatar_url": null, + "created_at": "2012-10-15T17:26:53.000Z", + "default_branch": "master", + "description": null, + "forks_count": 3, + "http_url_to_repo": "https://gitlab.com/leberwurscht/teardownwalls.git", + "id": 143, + "last_activity_at": "2013-10-03T08:08:46.000Z", + "name": "TearDownWalls", + "name_with_namespace": "Leberwurscht / TearDownWalls", + "path": "teardownwalls", + "path_with_namespace": "leberwurscht/teardownwalls", + "readme_url": "https://gitlab.com/leberwurscht/teardownwalls/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:leberwurscht/teardownwalls.git", + "star_count": 1, + "tag_list": [], + "web_url": "https://gitlab.com/leberwurscht/teardownwalls"}, + {"avatar_url": null, + "created_at": "2012-12-12T21:30:14.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": 
"https://gitlab.com/technomancy/leiningen.git", + "id": 450, + "last_activity_at": "2018-06-24T00:07:06.666Z", + "name": "Leiningen", + "name_with_namespace": "Phil Hagelberg / Leiningen", + "path": "leiningen", + "path_with_namespace": "technomancy/leiningen", + "readme_url": "https://gitlab.com/technomancy/leiningen/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:technomancy/leiningen.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/technomancy/leiningen"}, + {"avatar_url": null, + "created_at": "2012-12-18T17:25:39.000Z", + "default_branch": "master", + "description": null, + "forks_count": 4, + "http_url_to_repo": "https://gitlab.com/jonan/heroes-of-wesnoth.git", + "id": 526, + "last_activity_at": "2015-04-09T14:43:49.363Z", + "name": "Heroes of Wesnoth", + "name_with_namespace": "Jonan / Heroes of Wesnoth", + "path": "heroes-of-wesnoth", + "path_with_namespace": "jonan/heroes-of-wesnoth", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:jonan/heroes-of-wesnoth.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/jonan/heroes-of-wesnoth"}, + {"avatar_url": null, + "created_at": "2012-12-18T17:33:03.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/jonan/k.git", + "id": 527, + "last_activity_at": "2014-10-11T22:29:04.138Z", + "name": "K", + "name_with_namespace": "Jonan / K", + "path": "k", + "path_with_namespace": "jonan/k", + "readme_url": "https://gitlab.com/jonan/k/blob/master/README", + "ssh_url_to_repo": "git@gitlab.com:jonan/k.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/jonan/k"}, + {"avatar_url": null, + "created_at": "2013-01-06T20:35:42.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/hcs/hcs_utils.git", + "id": 1025, + "last_activity_at": "2015-09-14T12:01:11.151Z", + "name": "hcs_utils", + 
"name_with_namespace": "Christer Sjöholm / hcs_utils", + "path": "hcs_utils", + "path_with_namespace": "hcs/hcs_utils", + "readme_url": "https://gitlab.com/hcs/hcs_utils/blob/master/README.txt", + "ssh_url_to_repo": "git@gitlab.com:hcs/hcs_utils.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/hcs/hcs_utils"}, + {"avatar_url": null, + "created_at": "2013-01-24T08:41:56.000Z", + "default_branch": null, + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/soeren/sspssptest.git", + "id": 1702, + "last_activity_at": "2013-10-03T08:31:54.000Z", + "name": "sspssptest", + "name_with_namespace": "kruemel / sspssptest", + "path": "sspssptest", + "path_with_namespace": "soeren/sspssptest", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:soeren/sspssptest.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/soeren/sspssptest"}, + {"avatar_url": null, + "created_at": "2013-01-28T22:59:31.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/dpp/slothbeast.git", + "id": 1865, + "last_activity_at": "2013-05-05T09:44:57.000Z", + "name": "slothbeast", + "name_with_namespace": "David Pollak / slothbeast", + "path": "slothbeast", + "path_with_namespace": "dpp/slothbeast", + "readme_url": "https://gitlab.com/dpp/slothbeast/blob/master/README.md", + "ssh_url_to_repo": "git@gitlab.com:dpp/slothbeast.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/dpp/slothbeast"}, + {"avatar_url": null, + "created_at": "2013-02-07T20:50:20.000Z", + "default_branch": "master", + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/rocksoniko/easy.git", + "id": 2227, + "last_activity_at": "2013-05-05T09:45:00.000Z", + "name": "easy", + "name_with_namespace": "Hugo / easy", + "path": "easy", + "path_with_namespace": "rocksoniko/easy", + "readme_url": 
"https://gitlab.com/rocksoniko/easy/blob/master/README", + "ssh_url_to_repo": "git@gitlab.com:rocksoniko/easy.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/rocksoniko/easy"}, + {"avatar_url": null, + "created_at": "2013-02-10T17:21:24.000Z", + "default_branch": null, + "description": null, + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/grup/grup.git", + "id": 2294, + "last_activity_at": "2013-05-05T09:45:01.000Z", + "name": "grup", + "name_with_namespace": "grup / grup", + "path": "grup", + "path_with_namespace": "grup/grup", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:grup/grup.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/grup/grup"}, + {"avatar_url": null, + "created_at": "2013-02-14T09:31:50.000Z", + "default_branch": "master", + "description": "", + "forks_count": 0, + "http_url_to_repo": "https://gitlab.com/varac/test.git", + "id": 2390, + "last_activity_at": "2016-02-11T13:51:47.463Z", + "name": "test", + "name_with_namespace": "varac / test", + "path": "test", + "path_with_namespace": "varac/test", + "readme_url": null, + "ssh_url_to_repo": "git@gitlab.com:varac/test.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gitlab.com/varac/test"}] diff --git a/swh/lister/gitlab/tests/test_gitlab_lister.py b/swh/lister/gitlab/tests/test_gitlab_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/gitlab/tests/test_gitlab_lister.py @@ -0,0 +1,38 @@ +# Copyright (C) 2017-2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re +import unittest + +from datetime import datetime, timedelta + +from swh.lister.gitlab.lister import GitLabLister +from swh.lister.core.tests.test_lister import HttpListerTesterBase + + +class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): + Lister = GitLabLister + test_re = 
re.compile(r'^.*/projects.*page=(\d+).*') + lister_subdir = 'gitlab' + good_api_response_file = 'api_response.json' + bad_api_response_file = 'api_empty_response.json' + first_index = 1 + entries_per_page = 10 + + def response_headers(self, request): + headers = {'RateLimit-Remaining': '1'} + if self.request_index(request) == str(self.first_index): + headers.update({ + 'x-next-page': '3', + }) + + return headers + + def mock_rate_quota(self, n, request, context): + self.rate_limit += 1 + context.status_code = 403 + context.headers['RateLimit-Remaining'] = '0' + one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) + context.headers['RateLimit-Reset'] = str(one_second) + return '{"error":"dummy"}' diff --git a/swh/lister/tests/__init__.py b/swh/lister/tests/__init__.py new file mode 100644 diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py new file mode 100644 --- /dev/null +++ b/swh/lister/tests/test_utils.py @@ -0,0 +1,67 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from nose.tools import istest + +from swh.lister import utils + + +class UtilsTest(unittest.TestCase): + + @istest + def get(self): + data = { + 'X-Next-Page': None, + 'x-next-page': 1, + } + actual_value = utils.get(data, ['X-Next-Page', 'x-next-page']) + + self.assertEqual(actual_value, 1) + + data = { + 'X-Next-Page': 10, + 'x-next-page': 1, + } + actual_value = utils.get(data, ['X-Next-Page', 'x-next-page']) + + self.assertEqual(actual_value, 10) + + data = { + 'x-next-page': 100, + } + actual_value = utils.get(data, ['X-Next-Page', 'x-next-page']) + + self.assertEqual(actual_value, 100) + + @istest + def get_empty(self): + self.assertIsNone(utils.get({}, [])) + self.assertIsNone(utils.get({'a': 1}, ['b'])) + self.assertIsNone(utils.get({'b': 2}, [])) + self.assertIsNone(utils.get({'b': 2}, [])) + + 
+    @istest
+    def get_errors(self):
+        with self.assertRaises(TypeError):
+            self.assertIsNone(utils.get({}, None))
+        with self.assertRaises(AttributeError):
+            self.assertIsNone(utils.get(None, ['a']))
+
+    @istest
+    def split_range(self):
+        actual_ranges = list(utils.split_range(14, 5))
+        self.assertEqual(actual_ranges, [(0, 5), (5, 10), (10, 14)])
+
+        actual_ranges = list(utils.split_range(19, 10))
+        self.assertEqual(actual_ranges, [(0, 10), (10, 19)])
+
+    @istest
+    def split_range_errors(self):
+        with self.assertRaises(TypeError):
+            list(utils.split_range(None, 1))
+
+        with self.assertRaises(TypeError):
+            list(utils.split_range(100, None))
diff --git a/swh/lister/utils.py b/swh/lister/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/utils.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def get(d, keys):
+    """Return the first value of `d` that is not None among `keys`.
+
+    Keys are looked up in order with `d.get`, so a missing key and a key
+    mapped to None are both skipped.
+
+    Args:
+        d: dict-like object exposing a `get` method
+        keys: iterable of candidate keys, tried in order
+
+    Returns:
+        The first value found that is not None, None otherwise.
+
+    """
+    for key in keys:
+        v = d.get(key)
+        if v is not None:
+            return v
+    return None
+
+
+def split_range(total_pages, nb_pages):
+    """Split the range 0..total_pages into chunks of nb_pages pages.
+
+    Args:
+        total_pages (int): upper bound of the range to split
+        nb_pages (int): size of a chunk
+
+    Yields:
+        (start, end) tuples, each end being the next tuple's start; the
+        last end is total_pages, so the last chunk may be shorter.
+        Nothing is yielded when the range is empty (total_pages of 0).
+
+    Raises:
+        TypeError: total_pages or nb_pages is not an integer
+        ValueError: nb_pages is 0
+
+    """
+    start = None
+    for next_start in range(0, total_pages, nb_pages):
+        if start is not None:
+            yield start, next_start
+        start = next_start
+
+    # start is still None when the range was empty: no last chunk then
+    if start is not None:
+        yield start, total_pages