diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -6,7 +6,6 @@
 import iso8601
 
 from datetime import datetime
-
 from urllib import parse
 
 from swh.lister.bitbucket.models import BitBucketModel
@@ -15,7 +14,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_BITBUCKET_PAGE = 10
 
 
@@ -29,9 +27,20 @@
     def __init__(self, api_baseurl, override_config=None, per_page=100):
         super().__init__(
             api_baseurl=api_baseurl, override_config=override_config)
-        if per_page != DEFAULT_BITBUCKET_PAGE:
-            self.PATH_TEMPLATE = '%s&pagelen=%s' % (
-                self.PATH_TEMPLATE, per_page)
+        self.per_page = per_page
+
+    def request_params(self, identifier):
+        """Deal properly with extra api call query parameters.
+
+        This installs the `pagelen` query parameter in charge of quantity of
+        repositories to return per api call.
+
+        """
+        params = super().request_params(identifier)
+        query_params = params.get('params', {})  # future query parameters here
+        query_params.update({'pagelen': self.per_page})
+        params['params'] = query_params
+        return params
 
     def get_model_from_repo(self, repo):
         return {
@@ -45,15 +54,31 @@
         }
 
     def get_next_target_from_response(self, response):
+        """This will read the next link from the api response. It so happens
+        that sometimes, the next link stays the same between consecutive api
+        calls... Thus stopping the listing...
+
+        NOTE(review): the workaround described here (shifting the next index
+        1 day in the future, creating a hole; shifts below one day bore no
+        success) is not implemented in this method -- confirm where it lives.
+
+        Returns:
+            next date (isoformatted) to use as pagination index
+
+        """
         body = response.json()
-        if 'next' in body:
-            return parse.unquote(body['next'].split('after=')[1])
+        next_ = body.get('next')
+        if next_ is not None:
+            return parse.unquote(next_.split('after=')[1])
 
     def transport_response_simplified(self, response):
         repos = response.json()['values']
         return [self.get_model_from_repo(repo) for repo in repos]
 
     def request_uri(self, identifier):
+        # Quote only when set: parse.quote(None) raises TypeError, and an
+        # empty result lets the default start date below apply.
+        identifier = parse.quote(identifier or '')
         return super().request_uri(identifier or '1970-01-01')
 
     def is_within_bounds(self, inner, lower=None, upper=None):
diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py
--- a/swh/lister/core/indexing_lister.py
+++ b/swh/lister/core/indexing_lister.py
@@ -217,17 +217,22 @@
                 self.disable_deleted_repo_tasks(index, next_index, keep_these)
 
                 # termination condition
-                if next_index is None or next_index == index:
-                    logger.info('stopping after index %s, no next link found' %
+                if next_index is None:
+                    logger.info('No next link found, stopping after index %s',
                                 index)
                     return
+                if next_index == index:
+                    logger.warning(
+                        'Next link found same as current one %s',
+                        next_index)
+                    return
                 index = next_index
                 logger.debug('Index: %s', index)
                 yield i
 
         for i in ingest_indexes():
             if (i % 20) == 0:
-                logger.info('flushing updates at index %s', i)
+                logger.debug('Flushing updates at index %s', i)
                 self.db_session.commit()
                 self.db_session = self.mk_session()
 