diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py --- a/swh/lister/core/paging_lister.py +++ b/swh/lister/core/paging_lister.py @@ -79,7 +79,14 @@ # You probably don't need to override anything below this line. - def run(self, min_bound=None, max_bound=None): + def check_existence(self, injected_repos): + """Given a list of injected repos, check if we already have them. + + """ + # FIXME: Implement the check; this stub always answers False + return False + + def run(self, min_bound=None, max_bound=None, check_existence=False): """Main entry function. Sequentially fetches repository data from the service according to the basic outline in the class docstring. Continually fetching sublists until either there @@ -89,6 +96,9 @@ Args: min_bound: optional page to start from max_bound: optional page to stop at + check_existence (bool): optional existence check (for + incremental lister whose sort + order is inverted) Returns: nothing @@ -99,6 +109,7 @@ self.min_page = min_bound self.max_page = max_bound + already_seen = False while self.is_within_bounds(page, self.min_page, self.max_page): logging.info('listing repos starting at %s' % page) @@ -106,12 +117,18 @@ response, injected_repos = self.ingest_data(page) next_page = self.get_next_target_from_response(response) + if check_existence: + already_seen = self.check_existence(injected_repos) + # termination condition if (next_page is None) or (next_page == page): logging.info('stopping after page %s, no next link found' % page) break + elif already_seen: + logging.info('Repositories already seen, stopping') + break else: page = next_page diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -12,7 +12,7 @@ class GitLabLister(PageByPageHttpLister): # Template path expecting an integer that represents the page id - PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true' + PATH_TEMPLATE = '/projects?page=%d&order_by=id' 
API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*') MODEL = GitLabModel LISTER_NAME = 'gitlab' @@ -103,7 +103,7 @@ return None def get_pages_information(self): - """Determine some pages information. + """Determine pages information. """ response = self.transport_head(identifier=1) diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -17,13 +17,16 @@ class RangeGitLabLister(GitLabListerTask, RangeListerTask): - """GitLab lister working on specified range (start, end) arguments. + """Range GitLab lister (list available origins on specified range) """ task_queue = 'swh_lister_gitlab_refresh' class FullGitLabRelister(GitLabListerTask): + """Full GitLab lister (list all available origins from the api). + + """ task_queue = 'swh_lister_gitlab_refresh' def run_task(self, *args, **kwargs): @@ -41,3 +44,22 @@ range_task = RangeGitLabLister() group(range_task.s(minv, maxv, *args, **kwargs) for minv, maxv in ranges)() + + +class IncrementalGitLabLister(ListerTaskBase): + """Incremental GitLab lister (list only new available origins). + + """ + task_queue = 'swh_lister_gitlab_discover' + + def new_lister(self, api_baseurl='https://gitlab.com/api/v4', + instance='gitlab.com',): + # will invert the order of the lister's result + return GitLabLister(instance=instance, api_baseurl=api_baseurl, + sort='desc') + + def run_task(self, *args, **kwargs): + lister = self.new_lister(*args, **kwargs) + # will check for existing data and exit when found + return lister.run(min_bound=None, max_bound=None, + check_existence=True)