diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py
new file mode 100644
index 0000000..f3c5b5f
--- /dev/null
+++ b/swh/lister/core/paging_lister.py
@@ -0,0 +1,120 @@
+# Copyright (C) 2015-2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import abc
+import logging
+
+from .lister_transports import SWHListerHttpTransport
+from .lister_base import SWHListerBase
+
+
+class SWHPagingLister(SWHListerBase):
+    """Lister* intermediate class for any service that follows the simple
+    pagination page pattern.
+
+    - Client sends a request to list repositories starting from a
+      given page identifier.
+
+    - Client receives structured (json/xml/etc) response with
+      information about a sequential series of repositories (per page)
+      starting from a given index. And, if available, some indication
+      of the next page index for fetching the remaining repository
+      data.
+
+    See :class:`swh.lister.core.lister_base.SWHListerBase` for more
+    details.
+
+    This class cannot be instantiated. To create a new Lister for a
+    source code listing service that follows the model described
+    above, you must subclass this class. Then provide the required
+    overrides in addition to any unmet implementation/override
+    requirements of this class's base (see parent class and member
+    docstrings for details).
+
+    Required Overrides::
+
+        def get_next_target_from_response
+
+    """
+    @abc.abstractmethod
+    def get_next_target_from_response(self, response):
+        """Find the next server endpoint page given the entire response.
+
+        Implementation of this method depends on the server API spec
+        and the shape of the network response object returned by the
+        transport_request method.
+
+        For example, some APIs provide the next page through the Link
+        headers of the response.
+
+        Args:
+            response (transport response): response page from the server
+
+        Returns:
+            index of next page, possibly extracted from a next href url
+
+        """
+        pass
+
+    # You probably don't need to override anything below this line.
+
+    def run(self, min_index=None, max_index=None):
+        """Main entry function. Sequentially fetches repository data from the
+        service according to the basic outline in the class
+        docstring. Continually fetching sublists until either there
+        is no next index reference given or the given next index is
+        greater than the desired max_index.
+
+        Args:
+            min_index (indexable type): optional index to start from
+            max_index (indexable type): optional index to stop at
+
+        Returns:
+            nothing
+
+        """
+        # NOTE(review): '' is the start marker when no min_index is given;
+        # listers whose PATH_TEMPLATE expects an integer ('%d') presumably
+        # need an explicit min_index -- to be confirmed.
+        index = min_index or ''
+        loop_count = 0
+        self.min_index = min_index
+        self.max_index = max_index
+
+        while self.is_within_bounds(index, self.min_index, self.max_index):
+            logging.info('listing repos starting at %s', index)
+
+            response, _ = self.ingest_data(index)
+            next_index = self.get_next_target_from_response(response)
+
+            # termination condition
+
+            if (next_index is None) or (next_index == index):
+                logging.info('stopping after index %s, no next link found',
+                             index)
+                break
+            else:
+                index = next_index
+
+            loop_count += 1
+            if loop_count == 20:
+                logging.info('flushing updates')
+                loop_count = 0
+                self.db_session.commit()
+                self.db_session = self.mk_session()
+
+        self.db_session.commit()
+        self.db_session = self.mk_session()
+
+
+class SWHPagingHttpLister(SWHListerHttpTransport, SWHPagingLister):
+    """Convenience class for ensuring right lookup and init order when
+    combining SWHPagingLister and SWHListerHttpTransport.
+
+    """
+    def __init__(self, lister_name=None, api_baseurl=None,
+                 override_config=None):
+        SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
+        SWHPagingLister.__init__(self, lister_name=lister_name,
+                                 override_config=override_config)
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
index 224d297..1c0248b 100644
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -1,125 +1,119 @@
 # Copyright (C) 2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import random
 import re
 import time
 
-from ..core.indexing_lister import SWHIndexingHttpLister
+from ..core.paging_lister import SWHPagingHttpLister
 from .models import GitLabModel
 
 
-class GitLabLister(SWHIndexingHttpLister):
-    # Path to give and mentioning the last id for the next page
+class GitLabLister(SWHPagingHttpLister):
+    # Template path expecting an integer that represents the page id
     PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
-    # gitlab api do not have an indexable identifier so using the page
-    # id
-    API_URL_INDEX_RE = re.compile(r'^.*/projects.*\&page=(\d+).*')
-    # The indexable field, the one we are supposed to use in the api
-    # query is not part of the lookup query. So, we cannot filter
-    # (method filter_before_inject), nor detect and disable origins
-    # (method disable_deleted_repo_tasks)
+    API_URL_INDEX_RE = re.compile(r'^.*/projects.*[?&]page=(\d+).*')
     MODEL = GitLabModel
 
     @property
     def CONFIG_BASE_FILENAME(self):
         """One gitlab lister for all instances. We discriminate between the
         origin on a per instance basis in the table.
 
         """
         return 'lister-gitlab'
 
     @property
     def ADDITIONAL_CONFIG(self):
         """Override additional config as the 'credentials' structure change
         between the ancestor classes and the subclass.
 
         cf. request_params method below
 
         """
         return {
             'lister_db_url':
                 ('str', 'postgresql:///lister-gitlab'),
             'credentials':  # credentials is a dict
                 ('dict', {}),
             'cache_responses':
                 ('bool', False),
             'cache_dir':
                 ('str', '~/.cache/swh/lister/%s' % self.lister_name),
         }
 
     def request_params(self, identifier):
         """Get the full parameters passed to requests given the
         transport_request identifier.
 
         For the gitlab lister, the 'credentials' entries is configured
         per instance. For example:
 
         - credentials:
           - gitlab.com:
             - username: user0
               password:
             - username: user1
               password:
             - ...
           - other-gitlab-instance:
             ...
 
         """
         params = {
             'headers': self.request_headers() or {}
         }
         # Retrieve the credentials per instance
         creds = self.config['credentials']
         if creds:
-            creds_lister = creds[self.lister_name]
-            auth = random.choice(creds_lister) if creds else None
+            # An instance absent from the credentials (or with an empty
+            # list) is queried anonymously instead of raising.
+            creds_lister = creds.get(self.lister_name)
+            auth = random.choice(creds_lister) if creds_lister else None
             if auth:
                 params['auth'] = (auth['username'], auth['password'])
         return params
 
-    def filter_before_inject(self, models_list):
-        """We cannot filter so returns the models_list as is.
-
-        """
-        return models_list
-
     def get_model_from_repo(self, repo):
         return {
             'instance': self.lister_name,
             'uid': repo['id'],
             'indexable': repo['id'],
             'name': repo['name'],
             'full_name': repo['path_with_namespace'],
             'html_url': repo['web_url'],
             'origin_url': repo['http_url_to_repo'],
             'origin_type': 'git',
             'description': repo['description'],
         }
 
     def transport_quota_check(self, response):
         """Deal with rate limit
 
         """
+        # Rate limit headers may be absent (e.g. no rate limiting set up)
+        if 'RateLimit-Remaining' not in response.headers:
+            return False, 0
         reqs_remaining = int(response.headers['RateLimit-Remaining'])
         # TODO: need to dig further about the actual returned code
         # (not seen yet in documentation)
         if response.status_code == 403 and reqs_remaining == 0:
             reset_at = int(response.headers['RateLimit-Reset'])
-            delay = min(reset_at - time.time(), 3600)
+            # Never return a negative delay if the reset time is past
+            delay = max(0, min(reset_at - time.time(), 3600))
             return True, delay
         return False, 0
 
     def get_next_target_from_response(self, response):
         """Deal with pagination
 
         """
         if 'next' in response.links:
             next_url = response.links['next']['url']
             return int(self.API_URL_INDEX_RE.match(next_url).group(1))
         return None
 
     def transport_response_simplified(self, response):
         repos = response.json()
         return [self.get_model_from_repo(repo) for repo in repos]