Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/github/tasks.py
# Copyright (C) 2016 the Software Heritage developers | # Copyright (C) 2017 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import random | from swh.lister.core.tasks import (IndexingDiscoveryListerTask, | ||||
IndexingRangeListerTask, | |||||
from celery import group | IndexingRefreshListerTask, ListerTaskBase) | ||||
from swh.scheduler.task import Task | |||||
from .lister import GitHubLister | from .lister import GitHubLister | ||||
GROUP_SPLIT = 10000 | |||||
class IncrementalGitHubLister(Task): | |||||
task_queue = 'swh_lister_github_incremental' | |||||
def run(self): | |||||
lister = GitHubLister() | |||||
last_id = lister.last_repo_id() | |||||
lister.fetch(min_id=last_id + 1, max_id=None) | |||||
class RangeGitHubLister(Task): | class GitHubListerTask(ListerTaskBase): | ||||
task_queue = 'swh_lister_github_full' | def new_lister(self): | ||||
return GitHubLister(lister_name='github.com', | |||||
api_baseurl='https://github.com') | |||||
def run(self, start, end): | |||||
lister = GitHubLister() | |||||
lister.fetch(min_id=start, max_id=end) | |||||
class IncrementalGitHubLister(GitHubListerTask, IndexingDiscoveryListerTask): | |||||
task_queue = 'swh_lister_github_discover' | |||||
class FullGitHubLister(Task): | |||||
task_queue = 'swh_lister_github_full' | |||||
def run(self): | class RangeGitHubLister(GitHubListerTask, IndexingRangeListerTask): | ||||
lister = GitHubLister() | task_queue = 'swh_lister_github_refresh' | ||||
last_id = lister.last_repo_id() | |||||
ranges = [ | |||||
(i, min(last_id, i + GROUP_SPLIT - 1)) | |||||
for i in range(1, last_id, GROUP_SPLIT) | |||||
] | |||||
random.shuffle(ranges) | |||||
range_task = RangeGitHubLister() | class FullGitHubRelister(GitHubListerTask, IndexingRefreshListerTask): | ||||
group(range_task.s(min, max) for min, max in ranges)() | task_queue = 'swh_lister_github_refresh' |