diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py
--- a/swh/lister/core/paging_lister.py
+++ b/swh/lister/core/paging_lister.py
@@ -79,7 +79,14 @@
 
     # You probably don't need to override anything below this line.
 
-    def run(self, min_index=None, max_index=None):
+    def check_existence(self, injected_repos):
+        """Given a list of injected repos, check if we already have them.
+
+        """
+        # FIXME: Implement the check
+        return False
+
+    def run(self, min_index=None, max_index=None, check_existence=False):
         """Main entry function. Sequentially fetches repository data
         from the service according to the basic outline in the class
         docstring. Continually fetching sublists until either there
@@ -89,6 +96,9 @@
         Args:
             min_index (indexable type): optional index to start from
             max_index (indexable type): optional index to stop at
+            check_existence (bool): optional existence check (for
+                                    incremental lister whose sort
+                                    order is inverted)
 
         Returns:
             nothing
@@ -99,6 +109,7 @@
 
         self.min_index = min_index
         self.max_index = max_index
+        already_seen = False
 
         while self.is_within_bounds(index, self.min_index, self.max_index):
             logging.info('listing repos starting at %s' % index)
@@ -106,12 +117,18 @@
             response, injected_repos = self.ingest_data(index)
             next_index = self.get_next_target_from_response(response)
 
+            if check_existence:
+                already_seen = self.check_existence(injected_repos)
+
             # termination condition
 
             if (next_index is None) or (next_index == index):
                 logging.info('stopping after index %s, no next link found' %
                              index)
                 break
+            elif already_seen:
+                logging.info('Repositories already seen, stopping')
+                break
             else:
                 index = next_index
 
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -12,14 +12,21 @@
 
 class GitLabLister(SWHPagingHttpLister):
     # Template path expecting an integer that represents the page id
-    PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
+    PATH_TEMPLATE = '/projects?page=%d&order_by=id'
     API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*')
     MODEL = GitLabModel
 
+    def __init__(self, lister_name=None, api_baseurl=None,
+                 override_config=None,
+                 sort='asc'):
+        super().__init__(lister_name=lister_name, api_baseurl=api_baseurl,
+                         override_config=override_config)
+        self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort)
+
     @property
     def CONFIG_BASE_FILENAME(self):
         """One gitlab lister for all instances. We discriminate between the
-           origin on a per instance basis in the table.
+        origin on a per instance basis in the table.
 
         """
         return 'lister-gitlab'
@@ -109,7 +116,7 @@
         return None
 
     def get_pages_information(self):
-        """Determine some pages information.
+        """Determine pages information.
 
         """
         response = self.transport_head(identifier=1)
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -18,13 +18,16 @@
 
 
 class RangeGitLabLister(GitLabListerTask, RangeListerTask):
-    """GitLab lister working on specified range (start, end) arguments.
+    """Range GitLab lister (list available origins on specified range)
 
     """
     task_queue = 'swh_lister_gitlab_refresh'
 
 
 class FullGitLabRelister(GitLabListerTask):
+    """Full GitLab lister (list all available origins from the api).
+
+    """
     task_queue = 'swh_lister_gitlab_refresh'
 
     def run_task(self, *args, **kwargs):
@@ -42,3 +45,23 @@
         range_task = RangeGitLabLister()
         group(range_task.s(minv, maxv, *args, **kwargs)
               for minv, maxv in ranges)()
+
+
+class IncrementalGitLabLister(ListerTaskBase):
+    """Incremental GitLab lister (list only new available origins).
+
+    """
+    task_queue = 'swh_lister_gitlab_discover'
+
+    def new_lister(self, lister_name='gitlab.com',
+                   api_baseurl='https://gitlab.com/api/v4'):
+        # will invert the order of the lister's result
+        return GitLabLister(
+            lister_name=lister_name, api_baseurl=api_baseurl,
+            sort='desc')
+
+    def run_task(self, *args, **kwargs):
+        lister = self.new_lister(*args, **kwargs)
+        # will check for existing data and exit when found
+        return lister.run(min_index=None, max_index=None,
+                          check_existence=True)