Page MenuHomeSoftware Heritage

D367.id1174.diff
No OneTemporary

D367.id1174.diff

diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py
--- a/swh/lister/core/paging_lister.py
+++ b/swh/lister/core/paging_lister.py
@@ -79,7 +79,16 @@
# You probably don't need to override anything below this line.
+ def check_existence(self, injected_repos):
+ """Given a list of injected repos, check if we already have them.
+
+ """
+ # Placeholder: the existence check is not implemented yet, so this
+ # method ignores `injected_repos` and always returns False (i.e. it
+ # never reports the repositories as already seen).
+ # FIXME: Implement the check
+ return False
+
- def run(self, min_bound=None, max_bound=None):
+ def run(self, min_bound=None, max_bound=None, check_existence=False):
"""Main entry function. Sequentially fetches repository data from the
service according to the basic outline in the class
docstring. Continually fetching sublists until either there
@@ -89,6 +98,9 @@
Args:
min_bound: optional page to start from
max_bound: optional page to stop at
+ check_existence (bool): optional existence check (for
+ incremental lister whose sort
+ order is inverted)
Returns:
nothing
@@ -99,6 +111,7 @@
self.min_page = min_bound
self.max_page = max_bound
+ already_seen = False
while self.is_within_bounds(page, self.min_page, self.max_page):
logging.info('listing repos starting at %s' % page)
@@ -106,12 +119,18 @@
response, injected_repos = self.ingest_data(page)
next_page = self.get_next_target_from_response(response)
+ if check_existence:
+ already_seen = self.check_existence(injected_repos)
+
# termination condition
if (next_page is None) or (next_page == page):
logging.info('stopping after page %s, no next link found' %
page)
break
+ elif already_seen:
+ logging.info('Repositories already seen, stopping')
+ break
else:
page = next_page
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -12,7 +12,7 @@
class GitLabLister(PageByPageHttpLister):
# Template path expecting an integer that represents the page id
- PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
+ PATH_TEMPLATE = '/projects?page=%d&order_by=id'
API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*')
MODEL = GitLabModel
LISTER_NAME = 'gitlab'
@@ -103,7 +103,7 @@
return None
def get_pages_information(self):
- """Determine some pages information.
+ """Determine pages information.
"""
response = self.transport_head(identifier=1)
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -17,13 +17,16 @@
class RangeGitLabLister(GitLabListerTask, RangeListerTask):
- """GitLab lister working on specified range (start, end) arguments.
+ """Range GitLab lister (list available origins on specified range)
"""
task_queue = 'swh_lister_gitlab_refresh'
class FullGitLabRelister(GitLabListerTask):
+ """Full GitLab lister (list all available origins from the api).
+
+ """
task_queue = 'swh_lister_gitlab_refresh'
def run_task(self, *args, **kwargs):
@@ -41,3 +44,22 @@
range_task = RangeGitLabLister()
group(range_task.s(minv, maxv, *args, **kwargs)
for minv, maxv in ranges)()
+
+
+class IncrementalGitLabLister(ListerTaskBase):
+ """Incremental GitLab lister (list only new available origins).
+
+ """
+ task_queue = 'swh_lister_gitlab_discover'
+
+ def new_lister(self, api_baseurl='https://gitlab.com/api/v4',
+ instance='gitlab.com'):
+ """Build the GitLabLister for the given instance and api base url.
+
+ """
+ # will invert the order of the lister's result
+ # NOTE(review): assumes GitLabLister accepts a `sort` keyword; its
+ # constructor is not visible in this diff -- confirm.
+ return GitLabLister(instance=instance, api_baseurl=api_baseurl,
+ sort='desc')
+
+ def run_task(self, *args, **kwargs):
+ """Instantiate the lister and run it with the existence check on.
+
+ """
+ lister = self.new_lister(*args, **kwargs)
+ # will check for existing data and exit when found; the keyword has
+ # to be `check_existence`, the name the paging lister's run()
+ # documents and reads
+ return lister.run(min_bound=None, max_bound=None,
+ check_existence=True)

File Metadata

Mime Type
text/plain
Expires
Dec 17 2024, 12:49 AM (4 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230594

Event Timeline