diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -16,6 +16,7 @@ DEFAULT_BITBUCKET_PAGE = 10 +BITBUCKET_STARTING_TIME = '2008-01-01T00:00:00Z' class BitBucketLister(IndexingHttpLister): @@ -56,7 +57,7 @@ bitbucket starting year. """ - return super().db_first_index() or '2008-01-01T00:00:00Z' + return super().db_first_index() or BITBUCKET_STARTING_TIME def db_last_index(self): """For the first time listing, there is no data in db, so fallback to the time diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -7,7 +7,7 @@ from swh.scheduler.celery_backend.config import app -from .lister import BitBucketLister +from .lister import BitBucketLister, BITBUCKET_STARTING_TIME GROUP_SPLIT = 10000 @@ -19,7 +19,11 @@ @app.task(name=__name__ + '.IncrementalBitBucketLister') def incremental_bitbucket_lister(**lister_args): lister = new_lister(**lister_args) - lister.run(min_bound=lister.db_last_index(), max_bound=None) + min_bound = lister.db_first_index() + if min_bound != BITBUCKET_STARTING_TIME: + # data in db, start incrementally from the last seen index + min_bound = lister.db_last_index() + lister.run(min_bound=min_bound, max_bound=None) @app.task(name=__name__ + '.RangeBitBucketLister')