# Patch summary (leading text before the first "diff --git" header; patch
# tools skip it).
#
# Purpose: change how the GitHub lister computes its rate-limit back-off.
#
# swh/lister/github/lister.py
#   - GitHubLister.transport_quota_check: on a 403 response whose
#     X-RateLimit-Remaining is 0, the returned delay becomes
#     int(response.headers['Retry-After']) instead of
#     min(int(X-RateLimit-Reset) - time.time(), 3600).
#   - Drops the now-unused "import time"; bumps the copyright year to 2020.
#
# swh/lister/github/tests/test_lister.py
#   - GitHubListerTester.mock_rate_quota: the mocked 403 response now carries
#     'Retry-After': '1' instead of a computed 'X-RateLimit-Reset' timestamp.
#   - Drops the now-unused datetime/timedelta import; bumps the copyright year.
#
# NOTE(review): 'Retry-After' is read with a subscript, not .get(); if a
#   rate-limited 403 arrives without that header this presumably raises
#   KeyError -- confirm GitHub always sends it for this case, or fall back to
#   'X-RateLimit-Reset'.
# NOTE(review): the previous 3600-second upper bound on the delay is gone.
# NOTE(review): in the test hunk the 'Link' string literals are formatted with
#   "% self.last_index" but contain no placeholder; the "<url>" parts look
#   like they were lost in extraction -- verify against the repository.
# NOTE(review): the line breaks of this patch appear to have been collapsed;
#   restore them from the original commit before trying to apply it.
diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 30e686e..40eb4e2 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,73 +1,71 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re -import time from typing import Any from swh.lister.core.indexing_lister import IndexingHttpLister from swh.lister.github.models import GitHubModel class GitHubLister(IndexingHttpLister): PATH_TEMPLATE = '/repositories?since=%d' MODEL = GitHubModel DEFAULT_URL = 'https://api.github.com' API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)') LISTER_NAME = 'github' instance = 'github' # There is only 1 instance of such lister default_min_bound = 0 # type: Any def get_model_from_repo(self, repo): return { 'uid': repo['id'], 'indexable': repo['id'], 'name': repo['name'], 'full_name': repo['full_name'], 'html_url': repo['html_url'], 'origin_url': repo['html_url'], 'origin_type': 'git', 'fork': repo['fork'], } def transport_quota_check(self, response): x_rate_limit_remaining = response.headers.get('X-RateLimit-Remaining') if not x_rate_limit_remaining: return False, 0 reqs_remaining = int(x_rate_limit_remaining) if response.status_code == 403 and reqs_remaining == 0: - reset_at = int(response.headers['X-RateLimit-Reset']) - delay = min(reset_at - time.time(), 3600) + delay = int(response.headers['Retry-After']) return True, delay return False, 0 def get_next_target_from_response(self, response): if 'next' in response.links: next_url = response.links['next']['url'] return int(self.API_URL_INDEX_RE.match(next_url).group(1)) def transport_response_simplified(self, response): repos = response.json() return [self.get_model_from_repo(repo) for repo in repos] def 
request_headers(self): """(Override) Set requests headers to send when querying the GitHub API """ headers = super().request_headers() headers['Accept'] = 'application/vnd.github.v3+json' return headers def disable_deleted_repo_tasks(self, index, next_index, keep_these): """ (Overrides) Fix provided index value to avoid erroneously disabling some scheduler tasks """ # Next listed repository ids are strictly greater than the 'since' # parameter, so increment the index to avoid disabling the latest # created task when processing a new repositories page returned by # the Github API return super().disable_deleted_repo_tasks(index + 1, next_index, keep_these) diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py index f2c085f..c0b2711 100644 --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -1,81 +1,78 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest import requests_mock -from datetime import datetime, timedelta - from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.github.lister import GitHubLister class GitHubListerTester(HttpListerTester, unittest.TestCase): Lister = GitHubLister test_re = re.compile(r'/repositories\?since=([^?&]+)') lister_subdir = 'github' good_api_response_file = 'data/https_api.github.com/first_response.json' bad_api_response_file = 'data/https_api.github.com/empty_response.json' first_index = 0 last_index = 369 entries_per_page = 100 convert_type = int def response_headers(self, request): headers = {'X-RateLimit-Remaining': '1'} if self.request_index(request) == self.first_index: headers.update({ 'Link': ';' ' rel="next",' ';' ' rel="first"' % 
self.last_index }) else: headers.update({ 'Link': ';' ' rel="first"' }) return headers def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 context.headers['X-RateLimit-Remaining'] = '0' - one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) - context.headers['X-RateLimit-Reset'] = str(one_second) + context.headers['Retry-After'] = '1' # 1 second return '{"error":"dummy"}' @requests_mock.Mocker() def test_scheduled_tasks(self, http_mocker): self.scheduled_tasks_test( 'data/https_api.github.com/next_response.json', 876, http_mocker) def test_lister_github(swh_listers, requests_mock_datadir): """Simple github listing should create scheduled tasks """ lister = swh_listers['github'] lister.run() r = lister.scheduler.search_tasks(task_type='load-git') assert len(r) == 100 for row in r: assert row['type'] == 'load-git' # arguments check args = row['arguments']['args'] assert len(args) == 0 # kwargs kwargs = row['arguments']['kwargs'] url = kwargs['url'] assert url.startswith('https://github.com') assert row['policy'] == 'recurring' assert row['priority'] is None