diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py index d65d4a4..78cc5de 100644 --- a/swh/lister/core/paging_lister.py +++ b/swh/lister/core/paging_lister.py @@ -1,136 +1,153 @@ # Copyright (C) 2015-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import logging from .lister_transports import SWHListerHttpTransport from .lister_base import SWHListerBase class PageByPageLister(SWHListerBase): """Lister* intermediate class for any service that follows the simple pagination page pattern. - Client sends a request to list repositories starting from a given page identifier. - Client receives structured (json/xml/etc) response with information about a sequential series of repositories (per page) starting from a given index. And, if available, some indication of the next page index for fetching the remaining repository data. See :class:`swh.lister.core.lister_base.SWHListerBase` for more details. This class cannot be instantiated. To create a new Lister for a source code listing service that follows the model described above, you must subclass this class. Then provide the required overrides in addition to any unmet implementation/override requirements of this class's base (see parent class and member docstrings for details). Required Overrides:: def get_next_target_from_response """ @abc.abstractmethod def get_next_target_from_response(self, response): """Find the next server endpoint page given the entire response. Implementation of this method depends on the server API spec and the shape of the network response object returned by the transport_request method. For example, some api can use the headers links to provide the next page. Args: response (transport response): response page from the server Returns: index of next page, possibly extracted from a next href url """ pass @abc.abstractmethod def get_pages_information(self): """Find the total number of pages. Implementation of this method depends on the server API spec and the shape of the network response object returned by the transport_request method. For example, some api can use dedicated headers: - x-total-pages to provide the total number of pages - x-total to provide the total number of repositories - x-per-page to provide the number of elements per page Returns: tuple (total number of repositories, total number of pages, per_page) """ pass # You probably don't need to override anything below this line. - def run(self, min_bound=None, max_bound=None): + def check_existence(self, injected_repos): + """Given a list of injected repos, check if we already have them. + + """ + # FIXME: Implement the check + return False + + def run(self, min_bound=None, max_bound=None, check_existence=False): """Main entry function. Sequentially fetches repository data from the service according to the basic outline in the class docstring. Continually fetching sublists until either there is no next page reference given or the given next page is greater than the desired max_page. Args: min_bound: optional page to start from max_bound: optional page to stop at + check_existence (bool): optional existence check (for + incremental lister whose sort + order is inverted) Returns: nothing """ page = min_bound or '' loop_count = 0 self.min_page = min_bound self.max_page = max_bound + already_seen = False while self.is_within_bounds(page, self.min_page, self.max_page): logging.info('listing repos starting at %s' % page) response, injected_repos = self.ingest_data(page) next_page = self.get_next_target_from_response(response) + if check_existence: + already_seen = self.check_existence(injected_repos) + # termination condition if (next_page is None) or (next_page == page): logging.info('stopping after page %s, no next link found' % page) break + elif already_seen: + logging.info('Repositories already seen, stopping') + break else: page = next_page loop_count += 1 if loop_count == 20: logging.info('flushing updates') loop_count = 0 self.db_session.commit() self.db_session = self.mk_session() self.db_session.commit() self.db_session = self.mk_session() class PageByPageHttpLister(SWHListerHttpTransport, PageByPageLister): """Convenience class for ensuring right lookup and init order when combining PageByPageLister and SWHListerHttpTransport. """ def __init__(self, api_baseurl=None, override_config=None): SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl) PageByPageLister.__init__(self, override_config=override_config) diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index a60cd23..608e148 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,124 +1,124 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import re import time from ..core.paging_lister import PageByPageHttpLister from .models import GitLabModel class GitLabLister(PageByPageHttpLister): # Template path expecting an integer that represents the page id - PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true' + PATH_TEMPLATE = '/projects?page=%d&order_by=id' API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*') MODEL = GitLabModel LISTER_NAME = 'gitlab' def __init__(self, api_baseurl=None, instance=None, override_config=None, sort='asc'): super().__init__(api_baseurl=api_baseurl, override_config=override_config) self.instance = instance self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort) @property def ADDITIONAL_CONFIG(self): """Override additional config as the 'credentials' structure change between the ancestor classes and this class. cf. request_params method below """ default_config = super().ADDITIONAL_CONFIG # 'credentials' is a dict of (instance, {username, password}) dict default_config['credentials'] = ('dict', {}) return default_config def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. For the gitlab lister, the 'credentials' entries is configured per instance. For example: - credentials: - gitlab.com: - username: user0 password: - username: user1 password: - ... - other-gitlab-instance: ... """ params = { 'headers': self.request_headers() or {} } # Retrieve the credentials per instance creds = self.config['credentials'] if creds: creds_lister = creds[self.instance] auth = random.choice(creds_lister) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def get_model_from_repo(self, repo): return { 'instance': self.instance, 'uid': repo['id'], 'indexable': repo['id'], 'name': repo['name'], 'full_name': repo['path_with_namespace'], 'html_url': repo['web_url'], 'origin_url': repo['http_url_to_repo'], 'origin_type': 'git', 'description': repo['description'], } def transport_quota_check(self, response): """Deal with rate limit if any. """ # not all gitlab instance have rate limit if 'RateLimit-Remaining' in response.headers: reqs_remaining = int(response.headers['RateLimit-Remaining']) if response.status_code == 403 and reqs_remaining == 0: reset_at = int(response.headers['RateLimit-Reset']) delay = min(reset_at - time.time(), 3600) return True, delay return False, 0 def get_next_target_from_response(self, response): """Determine the next page identifier. """ if 'next' in response.links: next_url = response.links['next']['url'] return int(self.API_URL_INDEX_RE.match(next_url).group(1)) return None def get_pages_information(self): - """Determine some pages information. + """Determine pages information. """ response = self.transport_head(identifier=1) h = response.headers total = h.get('x-total', h.get('X-Total')) total_pages = h.get('x-total-pages', h.get('X-Total-Pages')) per_page = h.get('x-per-page', h.get('X-Per-Page')) if total is not None: total = int(total) if total_pages is not None: total_pages = int(total_pages) if per_page is not None: per_page = int(per_page) return total, total_pages, per_page def transport_response_simplified(self, response): repos = response.json() return [self.get_model_from_repo(repo) for repo in repos] diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py index ba6290e..dbae2c3 100644 --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -1,43 +1,65 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random from celery import group from ..core.tasks import ListerTaskBase, RangeListerTask from .lister import GitLabLister class GitLabListerTask(ListerTaskBase): def new_lister(self, api_baseurl='https://gitlab.com/api/v4', instance='gitlab.com'): return GitLabLister(api_baseurl=api_baseurl, instance=instance) class RangeGitLabLister(GitLabListerTask, RangeListerTask): - """GitLab lister working on specified range (start, end) arguments. + """Range GitLab lister (list available origins on specified range) """ task_queue = 'swh_lister_gitlab_refresh' class FullGitLabRelister(GitLabListerTask): + """Full GitLab lister (list all available origins from the api). + + """ task_queue = 'swh_lister_gitlab_refresh' def run_task(self, *args, **kwargs): lister = self.new_lister(*args, **kwargs) total, _, per_page = lister.get_pages_information() ranges = [] prev_index = None for index in range(0, total, per_page): if index is not None and prev_index is not None: ranges.append((prev_index, index)) prev_index = index random.shuffle(ranges) range_task = RangeGitLabLister() group(range_task.s(minv, maxv, *args, **kwargs) for minv, maxv in ranges)() + + +class IncrementalGitLabLister(ListerTaskBase): + """Incremental GitLab lister (list only new available origins). + + """ + task_queue = 'swh_lister_gitlab_discover' + + def new_lister(self, api_baseurl='https://gitlab.com/api/v4', + instance='gitlab.com',): + # will invert the order of the lister's result + return GitLabLister(instance=instance, api_baseurl=api_baseurl, + sort='desc') + + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + # will check for existing data and exit when found + return lister.run(min_bound=None, max_bound=None, + check_existence=True)