diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py
--- a/swh/lister/core/lister_transports.py
+++ b/swh/lister/core/lister_transports.py
@@ -100,14 +100,19 @@
         self.session = requests.Session()
         self.lister_version = __version__
 
-    def transport_request(self, identifier):
-        """Implements SWHListerBase.transport_request for HTTP using Requests.
+    def _transport_action(self, identifier, method='get'):
+        """Query the api for identifier with the given http method
+        ('get' by default, 'head' to only ask for the headers).
+
         """
         path = self.request_uri(identifier)
         params = self.request_params(identifier)
 
         try:
-            response = self.session.get(path, **params)
+            if method == 'head':
+                response = self.session.head(path, **params)
+            else:
+                response = self.session.get(path, **params)
         except requests.exceptions.ConnectionError as e:
             raise FetchError(e)
         else:
@@ -115,6 +120,20 @@
                 raise FetchError(response)
             return response
 
+    def transport_head(self, identifier):
+        """Retrieve head information on api.
+
+        """
+        return self._transport_action(identifier, method='head')
+
+    def transport_request(self, identifier):
+        """Implements SWHListerBase.transport_request for HTTP using Requests.
+
+        Retrieve get information on api.
+
+        """
+        return self._transport_action(identifier)
+
     def transport_response_to_string(self, response):
         """Implements SWHListerBase.transport_response_to_string for HTTP given
         Requests responses.
diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py
--- a/swh/lister/core/paging_lister.py
+++ b/swh/lister/core/paging_lister.py
@@ -57,6 +57,26 @@
         """
         pass
 
+    @abc.abstractmethod
+    def get_pages_information(self):
+        """Find the total number of pages.
+
+        Implementation of this method depends on the server API spec
+        and the shape of the network response object returned by the
+        transport_request method.
+
+        For example, some api can use dedicated headers:
+        - x-total-pages to provide the total number of pages
+        - x-total to provide the total number of repositories
+        - x-per-page to provide the number of elements per page
+
+        Returns:
+            tuple (total number of repositories, total number of
+            pages, per_page)
+
+        """
+        pass
+
     # You probably don't need to override anything below this line.
 
     def run(self, min_index=None, max_index=None):
@@ -76,6 +96,7 @@
         """
         index = min_index or ''
         loop_count = 0
+
         self.min_index = min_index
         self.max_index = max_index
 
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -100,7 +100,7 @@
         return False, 0
 
     def get_next_target_from_response(self, response):
-        """Deal with pagination
+        """Determine the next page identifier.
 
         """
         if 'next' in response.links:
@@ -108,6 +108,31 @@
             return int(self.API_URL_INDEX_RE.match(next_url).group(1))
         return None
 
+    def get_pages_information(self):
+        """Determine some pages information.
+
+        Ask the api for the headers of the first page (x-total,
+        x-total-pages, x-per-page) without fetching its content.
+
+        Returns:
+            tuple (total number of repositories, total number of
+            pages, per_page). A value is None when the api does not
+            send the matching header.
+
+        """
+        response = self.transport_head(identifier=1)
+        h = response.headers
+        total = h.get('x-total', h.get('X-Total'))
+        total_pages = h.get('x-total-pages', h.get('X-Total-Pages'))
+        per_page = h.get('x-per-page', h.get('X-Per-Page'))
+        if total is not None:
+            total = int(total)
+        if total_pages is not None:
+            total_pages = int(total_pages)
+        if per_page is not None:
+            per_page = int(per_page)
+        return total, total_pages, per_page
+
     def transport_response_simplified(self, response):
         repos = response.json()
         return [self.get_model_from_repo(repo) for repo in repos]
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -2,22 +2,63 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.lister.core.tasks import ListerTaskBase, RangeListerTask
+import random
+
+from celery import group
 
+from ..core.tasks import ListerTaskBase, RangeListerTask
 from .lister import GitLabLister
 
 
-class GitLabDotComListerTask(ListerTaskBase):
+class GitLabListerTask(ListerTaskBase):
     def new_lister(self, lister_name='gitlab.com',
                    api_baseurl='https://gitlab.com/api/v4'):
         return GitLabLister(
             lister_name=lister_name, api_baseurl=api_baseurl)
 
 
-class RangeGitLabLister(GitLabDotComListerTask, RangeListerTask):
+class RangeGitLabLister(GitLabListerTask, RangeListerTask):
     """GitLab lister working on specified range (start, end) arguments.
 
     """
     task_queue = 'swh_lister_gitlab_refresh'
 
 
+class FullGitLabRelister(GitLabListerTask):
+    """Full GitLab lister (all available origins from the api).
+
+    The listing is split in ranges of pages, each range being
+    delegated to a RangeGitLabLister task.
+
+    """
+    task_queue = 'swh_lister_gitlab_refresh'
+
+    # Step (in pages) between the bounds of two consecutive ranges.
+    nb_pages = 10
+
+    def run_task(self, *args, **kwargs):
+        """Spawn one RangeGitLabLister task per range of pages.
+
+        Raises:
+            ValueError: the api did not provide its number of pages.
+
+        """
+        lister = self.new_lister(*args, **kwargs)
+        _, total_pages, _ = lister.get_pages_information()
+        if total_pages is None:
+            raise ValueError(
+                'The api did not provide its total number of pages')
+        if total_pages < 1:
+            return
+
+        # Ranges are page indexes (the first page is 1), not repository
+        # counts. Consecutive ranges share their boundary page and the
+        # last range always stops at the last page.
+        bounds = list(range(1, total_pages, self.nb_pages))
+        bounds.append(total_pages)
+        ranges = list(zip(bounds, bounds[1:])) or [(1, total_pages)]
+
+        random.shuffle(ranges)
+        range_task = RangeGitLabLister()
+        group(range_task.s(minv, maxv, *args, **kwargs)
+              for minv, maxv in ranges)()