diff --git a/tasks/send-1000-stars-repo.py b/tasks/send-1000-stars-repo.py
index 57a63a3..6420044 100755
--- a/tasks/send-1000-stars-repo.py
+++ b/tasks/send-1000-stars-repo.py
@@ -1,110 +1,116 @@
 #!/usr/bin/env python3

 import click
 import logging
-import json
 import time

 import requests

 from urllib.parse import quote

 from swh.scheduler.celery_backend.config import app
 from swh.loader.git import tasks  # noqa


 def parse_links(links):
     """Parse links

     Args:
         links (dict): with keys next and last

     """
     next_link = links['next']['url']
     last_link = links['last']['url']
     return next_link, last_link


 # max sleeping time in seconds
 MAX_SLEEP = 3600


 def list_repositories(start_page=None, last_page=None):
-    """List desired gh repositories.
-
-    """
-    next_page = 'https://api.github.com/search/repositories?q=%s&per_page=100' % quote(
-        'stars:>=1000')
+    """List GitHub repositories with at least 1000 stars, following the
+    search API's Link-header pagination.
+    """
+    # keep the base query around: page markers are appended to copies of it
+    base_page = 'https://api.github.com/search/repositories?q=%s&per_page=100' % quote(
+        'stars:>=1000')
+    next_page = base_page
     if start_page:
         next_page = '%s&page=%s' % (next_page, start_page)
     if last_page:
-        last_page = '%s&page=%s' % (next_page, last_page)
+        # build the stop URL from the base query, not from next_page, which
+        # may already carry a &page=<start_page> parameter
+        last_page = '%s&page=%s' % (base_page, last_page)

     while True:
         logging.info('Querying: %s' % next_page)
         r = requests.get(next_page)
-        rate_limit = r.headers['X-RateLimit-Remaining']
+        # the header value is a string; coerce it so the == 0 check can match
+        rate_limit = int(r.headers.get('X-RateLimit-Remaining', 1))
         logging.debug('Remaining %s' % rate_limit)
         if r.ok:
             data = r.json()
             next_page = r.links.get('next')
             if next_page:
                 next_page = next_page['url']
             last_page = r.links.get('last')
             if last_page:
                 last_page = last_page['url']

-            print('Repos: %s ' % len(data['items']))
+            logging.info('Repos: %s' % len(data['items']))
             for repo in data['items']:
                 yield repo

         # detect throttling, pause computations
         if r.status_code == 403 or rate_limit == 0:
             delay = int(r.headers['X-RateLimit-Reset']) - time.time()
-            delay = min(delay, MAX_SLEEP)
-            logging.warn('rate limited upon, sleep for %d seconds' % int(delay))
+            # clamp between 0 and MAX_SLEEP; time.sleep() rejects negatives
+            delay = min(max(delay, 0), MAX_SLEEP)
+            logging.warning('rate limited, sleeping for %d seconds' % int(delay))
             time.sleep(delay)

         if not next_page or not last_page or next_page == last_page:
             break


 @click.command()
 @click.option('--queue', default="swh.loader.git.tasks.ReaderGitRepository",
               help="Destination queue to send results")
 @click.option('--start-page', default=None, help='Starting page to read from')
 @click.option('--last-page', default=None, help='Ending page')
-@click.option('--dump-repositories', default='/tmp/repositories',
-              help='Default path to keep the list of repositories')
+@click.option('--dump-repositories', default=None,
+              help='Optional path to dump the list of repositories')
 def main(queue, start_page, last_page, dump_repositories):
     logging.basicConfig(
         level=logging.DEBUG,
         format='%(asctime)s %(process)d %(message)s'
     )

     if start_page:
         start_page = int(start_page)

     if last_page:
         last_page = int(last_page)

     task_destination = app.tasks[queue]

     if dump_repositories:
         f = open(dump_repositories, 'w+')

-    for repo in list_repositories(start_page):
+    for repo in list_repositories(start_page, last_page):
         url = repo['html_url']
         task_destination.delay(url)
-        f.write('%s %s\n' % (url, repo['stargazers_count']))
+        if dump_repositories:
+            f.write('%s %s\n' % (url, repo['stargazers_count']))
         logging.info(url)

     if dump_repositories:
         f.close()


 if __name__ == '__main__':
     main()
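
Note on the pagination pattern: the loop above interleaves Link-header walking, throttle detection and Celery scheduling. Below is a minimal, self-contained sketch of just the pagination plus back-off pattern the script relies on, assuming only the requests library; the function name walk_pages and the __main__ driver are illustrative, not part of the patched module.

#!/usr/bin/env python3
# Minimal sketch of Link-header pagination with rate-limit back-off.
# walk_pages and the standalone driver are illustrative assumptions,
# not part of tasks/send-1000-stars-repo.py.

import time

from urllib.parse import quote

import requests

MAX_SLEEP = 3600  # cap on a single back-off pause, as in the script


def walk_pages(url):
    """Yield one decoded JSON payload per result page."""
    while url:
        r = requests.get(url)
        if r.status_code == 403:
            # GitHub reports the epoch second at which the quota resets
            reset = int(r.headers.get('X-RateLimit-Reset', time.time()))
            time.sleep(min(max(reset - time.time(), 0), MAX_SLEEP))
            continue  # retry the same page after the pause
        r.raise_for_status()
        yield r.json()
        # requests parses the Link header into r.links; the 'next' key
        # disappears on the final page, which terminates the loop
        url = r.links.get('next', {}).get('url')


if __name__ == '__main__':
    start = ('https://api.github.com/search/repositories?q=%s&per_page=100'
             % quote('stars:>=1000'))
    for page in walk_pages(start):
        for repo in page['items']:
            print(repo['html_url'], repo['stargazers_count'])

Taking the patched click interface at face value, a bounded run of the real script without a dump file would be invoked as: python3 tasks/send-1000-stars-repo.py --start-page 1 --last-page 10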