diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -2,10 +2,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime import logging import iso8601 +from datetime import datetime + from urllib import parse from swh.lister.bitbucket.models import BitBucketModel @@ -23,6 +24,7 @@ MODEL = BitBucketModel LISTER_NAME = 'bitbucket' instance = 'bitbucket' + default_min_bound = datetime.utcfromtimestamp(0).isoformat() def __init__(self, api_baseurl, override_config=None, per_page=100): super().__init__( @@ -54,21 +56,6 @@ repos = response.json()['values'] return [self.get_model_from_repo(repo) for repo in repos] - def db_first_index(self): - """For the first time listing, there is no data in db, so fallback to the - bitbucket starting year. - - """ - return super().db_first_index() or '2008-01-01T00:00:00Z' - - def db_last_index(self): - """For the first time listing, there is no data in db, so fallback to the time - of the first run as max date. - - """ - return super().db_last_index() or datetime.datetime.now( - tz=datetime.timezone.utc).isoformat() - def request_uri(self, identifier): return super().request_uri(identifier or '1970-01-01') diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -30,8 +30,17 @@ @app.task(name=__name__ + '.FullBitBucketRelister', bind=True) def full_bitbucket_relister(self, split=None, **lister_args): + """Relist from the beginning of what's already been listed. + + It's not to be called for an initial listing. + + """ lister = new_lister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) + if not ranges: + self.log.info('Nothing to list') + return + random.shuffle(ranges) promise = group(range_bitbucket_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges)() diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py --- a/swh/lister/core/indexing_lister.py +++ b/swh/lister/core/indexing_lister.py @@ -17,6 +17,7 @@ class IndexingLister(ListerBase): flush_packet_db = 20 + default_min_bound = '' """Lister* intermediate class for any service that follows the pattern: - The service must report at least one stable unique identifier, known @@ -95,17 +96,18 @@ def db_partition_indices(self, partition_size): """Describe an index-space compartmentalization of the db table - in equal sized chunks. This is used to describe min&max bounds for - parallelizing fetch tasks. + in equal sized chunks. This is used to describe min&max bounds for + parallelizing fetch tasks. Args: partition_size (int): desired size to make each partition + Returns: a list of tuples (begin, end) of indexable value that - declare approximately equal-sized ranges of existing - repos - """ + declare approximately equal-sized ranges of existing + repos + """ n = max(self.db_num_entries(), 10) partition_size = min(partition_size, n) n_partitions = n // partition_size @@ -114,7 +116,8 @@ max_index = self.db_last_index() if not min_index or not max_index: - raise ValueError("Can't partition an empty range") + # Nothing to list + return [] if isinstance(min_index, str): def format_bound(bound): @@ -201,7 +204,7 @@ self.max_index = max_bound def ingest_indexes(): - index = min_bound or '' + index = min_bound or self.default_min_bound for i in count(1): response, injected_repos = self.ingest_data(index) if not response and not injected_repos: diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -30,8 +30,16 @@ @app.task(name=__name__ + '.FullGitHubRelister', bind=True) def full_github_relister(self, split=None, **lister_args): + """Relist from the beginning of what's already been listed. + + It's not to be called for an initial listing. + + """ lister = new_lister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) + if not ranges: + self.log.info('Nothing to list') + return random.shuffle(ranges) promise = group(range_github_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges)() diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -38,6 +38,11 @@ @app.task(name=__name__ + '.FullGitLabRelister', bind=True) def full_gitlab_relister(self, **lister_args): + """Full lister + + This should be renamed as such. + + """ lister = new_lister(**lister_args) _, total_pages, _ = lister.get_pages_information() ranges = list(utils.split_range(total_pages, NBPAGES)) diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -31,6 +31,14 @@ super().__init__(api_baseurl=api_baseurl, override_config=override_config) + @property + def default_min_bound(self): + """Starting boundary when `min_bound` is not defined (db empty). This + is used within the fn:`run` call. + + """ + return self._bootstrap_repositories_listing() + def _build_query_params(self, params, api_token): """Build query params to include the forge's api token @@ -134,21 +142,6 @@ self.schedule_missing_tasks(models_list, injected) return self.max_index - def run(self, min_bound=None, max_bound=None): - """ - (Override) Run the lister on the specified Phabricator instance - - Args: - min_bound (int): Optional repository index to start the listing - after it - max_bound (int): Optional repository index to stop the listing - after it - """ - # initial call to the lister, we need to bootstrap it in that case - if min_bound is None: - min_bound = self._bootstrap_repositories_listing() - super().run(min_bound, max_bound) - def get_repo_url(attachments): """