Page Menu | Home | Software Heritage

D1632.id.diff
No One | Temporary

D1632.id.diff

diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -2,10 +2,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import datetime
import logging
import iso8601
+from datetime import datetime
+
from urllib import parse
from swh.lister.bitbucket.models import BitBucketModel
@@ -23,6 +24,7 @@
MODEL = BitBucketModel
LISTER_NAME = 'bitbucket'
instance = 'bitbucket'
+ default_min_bound = datetime.utcfromtimestamp(0).isoformat()
def __init__(self, api_baseurl, override_config=None, per_page=100):
super().__init__(
@@ -54,21 +56,6 @@
repos = response.json()['values']
return [self.get_model_from_repo(repo) for repo in repos]
- def db_first_index(self):
- """For the first time listing, there is no data in db, so fallback to the
- bitbucket starting year.
-
- """
- return super().db_first_index() or '2008-01-01T00:00:00Z'
-
- def db_last_index(self):
- """For the first time listing, there is no data in db, so fallback to the time
- of the first run as max date.
-
- """
- return super().db_last_index() or datetime.datetime.now(
- tz=datetime.timezone.utc).isoformat()
-
def request_uri(self, identifier):
return super().request_uri(identifier or '1970-01-01')
diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py
--- a/swh/lister/bitbucket/tasks.py
+++ b/swh/lister/bitbucket/tasks.py
@@ -30,8 +30,17 @@
@app.task(name=__name__ + '.FullBitBucketRelister', bind=True)
def full_bitbucket_relister(self, split=None, **lister_args):
+ """Relist from the beginning of what's already been listed.
+
+ It's not to be called for an initial listing.
+
+ """
lister = new_lister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
+ if not ranges:
+ self.log.info('Nothing to list')
+ return
+
random.shuffle(ranges)
promise = group(range_bitbucket_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()
diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py
--- a/swh/lister/core/indexing_lister.py
+++ b/swh/lister/core/indexing_lister.py
@@ -17,6 +17,7 @@
class IndexingLister(ListerBase):
flush_packet_db = 20
+ default_min_bound = ''
"""Lister* intermediate class for any service that follows the pattern:
- The service must report at least one stable unique identifier, known
@@ -95,17 +96,18 @@
def db_partition_indices(self, partition_size):
"""Describe an index-space compartmentalization of the db table
- in equal sized chunks. This is used to describe min&max bounds for
- parallelizing fetch tasks.
+ in equal sized chunks. This is used to describe min&max bounds for
+ parallelizing fetch tasks.
Args:
partition_size (int): desired size to make each partition
+
Returns:
a list of tuples (begin, end) of indexable value that
- declare approximately equal-sized ranges of existing
- repos
- """
+ declare approximately equal-sized ranges of existing
+ repos
+ """
n = max(self.db_num_entries(), 10)
partition_size = min(partition_size, n)
n_partitions = n // partition_size
@@ -114,7 +116,8 @@
max_index = self.db_last_index()
if not min_index or not max_index:
- raise ValueError("Can't partition an empty range")
+ # Nothing to list
+ return []
if isinstance(min_index, str):
def format_bound(bound):
@@ -201,7 +204,7 @@
self.max_index = max_bound
def ingest_indexes():
- index = min_bound or ''
+ index = min_bound or self.default_min_bound
for i in count(1):
response, injected_repos = self.ingest_data(index)
if not response and not injected_repos:
diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py
--- a/swh/lister/github/tasks.py
+++ b/swh/lister/github/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2018 the Software Heritage developers
+# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -30,8 +30,16 @@
@app.task(name=__name__ + '.FullGitHubRelister', bind=True)
def full_github_relister(self, split=None, **lister_args):
+ """Relist from the beginning of what's already been listed.
+
+ It's not to be called for an initial listing.
+
+ """
lister = new_lister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
+ if not ranges:
+ self.log.info('Nothing to list')
+ return
random.shuffle(ranges)
promise = group(range_github_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -38,6 +38,11 @@
@app.task(name=__name__ + '.FullGitLabRelister', bind=True)
def full_gitlab_relister(self, **lister_args):
+ """Full lister
+
+ This task is a full lister, not a relister, and should be renamed accordingly.
+
+ """
lister = new_lister(**lister_args)
_, total_pages, _ = lister.get_pages_information()
ranges = list(utils.split_range(total_pages, NBPAGES))
diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -31,6 +31,14 @@
super().__init__(api_baseurl=api_baseurl,
override_config=override_config)
+ @property
+ def default_min_bound(self):
+ """Starting boundary when `min_bound` is not defined (db empty). This
+ is used within the fn:`run` call.
+
+ """
+ return self._bootstrap_repositories_listing()
+
def _build_query_params(self, params, api_token):
"""Build query params to include the forge's api token
@@ -134,21 +142,6 @@
self.schedule_missing_tasks(models_list, injected)
return self.max_index
- def run(self, min_bound=None, max_bound=None):
- """
- (Override) Run the lister on the specified Phabricator instance
-
- Args:
- min_bound (int): Optional repository index to start the listing
- after it
- max_bound (int): Optional repository index to stop the listing
- after it
- """
- # initial call to the lister, we need to bootstrap it in that case
- if min_bound is None:
- min_bound = self._bootstrap_repositories_listing()
- super().run(min_bound, max_bound)
-
def get_repo_url(attachments):
"""

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 11:44 AM (12 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227912

Event Timeline