diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -30,6 +30,13 @@
         if per_page != DEFAULT_BITBUCKET_PAGE:
             self.PATH_TEMPLATE = '%s&pagelen=%s' % (
                 self.PATH_TEMPLATE, per_page)
+        # hack: to determine the incremental lister's starting point
+        try:
+            self.last_index = super().db_last_index()
+        except Exception:
+            self.last_index = None
+        # variable to work around pagination caveat
+        self.last_seen_index = None
 
     def get_model_from_repo(self, repo):
         return {
@@ -42,28 +49,51 @@
             'origin_type': repo['scm'],
         }
 
+    def _heuristic_next_index(self, index):
+        """Shift the next index date one day into the future.
+
+        """
+        from datetime import timedelta
+        d = iso8601.parse_date(index)
+        delta = timedelta(days=+1)
+        return (d + delta).isoformat()
+
     def get_next_target_from_response(self, response):
+        """Read the next link from the api response. It so happens that
+        sometimes the next link stays the same between consecutive api
+        calls, which stops the listing.
+
+        This works around that by shifting the detected next index one
+        day into the future (creating a hole). Experiments with shifts
+        smaller than one day were unsuccessful.
+
+        """
         body = response.json()
-        if 'next' in body:
-            return parse.unquote(body['next'].split('after=')[1])
+        next_ = body.get('next')
+        if next_ is not None:
+            next_ = parse.unquote(next_.split('after=')[1])
+            if self.last_seen_index == next_:
+                next_ = self._heuristic_next_index(next_)
+            self.last_seen_index = next_
+            return next_
 
     def transport_response_simplified(self, response):
        repos = response.json()['values']
        return [self.get_model_from_repo(repo) for repo in repos]
 
     def db_first_index(self):
-        """For the first time listing, there is no data in db, so fallback to the
-        bitbucket starting year.
+        """For the first time listing, there is no data in db, so fallback to
+        the bitbucket starting year.
 
         """
         return super().db_first_index() or '2008-01-01T00:00:00Z'
 
     def db_last_index(self):
-        """For the first time listing, there is no data in db, so fallback to the time
-        of the first run as max date.
+        """For the first time listing (full lister), there is no data in db, so
+        fallback to the time of the first run as max date.
 
         """
-        return super().db_last_index() or datetime.datetime.now(
+        return self.last_index or datetime.datetime.now(
             tz=datetime.timezone.utc).isoformat()
 
     def request_uri(self, identifier):
diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py
--- a/swh/lister/bitbucket/tasks.py
+++ b/swh/lister/bitbucket/tasks.py
@@ -19,7 +19,7 @@
 @app.task(name=__name__ + '.IncrementalBitBucketLister')
 def incremental_bitbucket_lister(**lister_args):
     lister = new_lister(**lister_args)
-    lister.run(min_bound=lister.db_last_index(), max_bound=None)
+    lister.run(min_bound=lister.last_index, max_bound=None)
 
 
 @app.task(name=__name__ + '.RangeBitBucketLister')
diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py
--- a/swh/lister/core/indexing_lister.py
+++ b/swh/lister/core/indexing_lister.py
@@ -148,8 +148,6 @@
         t = self.db_session.query(func.min(self.MODEL.indexable)).first()
         if t:
             return t[0]
-        else:
-            return None
 
     def db_last_index(self):
         """Look in the db for the largest indexable value
@@ -160,8 +158,6 @@
         t = self.db_session.query(func.max(self.MODEL.indexable)).first()
         if t:
             return t[0]
-        else:
-            return None
 
     def disable_deleted_repo_tasks(self, start, end, keep_these):
         """Disable tasks for repos that no longer exist between start and end.
@@ -222,11 +218,12 @@
                     index)
                 return
             index = next_index
+            logger.debug('Index: %s', index)
             yield i
 
         for i in ingest_indexes():
             if (i % 20) == 0:
-                logger.info('flushing updates')
+                logger.info('flushing updates at index %s', i)
                 self.db_session.commit()
                 self.db_session = self.mk_session()