# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# File: swh/lister/github/lister.py

import re
import time

from swh.lister.core.indexing_lister import IndexingHttpLister
from swh.lister.github.models import GitHubModel


class GitHubLister(IndexingHttpLister):
    """Lister paging through ``/repositories?since=<id>`` on the GitHub API.

    Pages are indexed by repository id: the index of the next page is read
    from the ``next`` link of each response.
    """
    PATH_TEMPLATE = '/repositories?since=%d'
    MODEL = GitHubModel
    DEFAULT_URL = 'https://api.github.com'
    API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')
    LISTER_NAME = 'github'
    instance = 'github'  # There is only 1 instance of such lister
    default_min_bound = 0

    def get_model_from_repo(self, repo):
        """Build the model field dict for one repository entry.

        Args:
            repo: mapping with at least the keys ``id``, ``name``,
                ``full_name``, ``html_url`` and ``fork``.

        Returns:
            dict whose ``uid`` and ``indexable`` are both the repository id
            and whose ``origin_url`` is the repository ``html_url``.
        """
        return {
            'uid': repo['id'],
            'indexable': repo['id'],
            'name': repo['name'],
            'full_name': repo['full_name'],
            'html_url': repo['html_url'],
            'origin_url': repo['html_url'],
            'origin_type': 'git',
            'fork': repo['fork'],
        }

    def transport_quota_check(self, response):
        """Check whether the response signals an exhausted rate limit.

        Args:
            response: HTTP response exposing ``headers`` and ``status_code``.

        Returns:
            A 2-tuple. The first item is True only for a 403 response whose
            ``X-RateLimit-Remaining`` header is 0; the second item is then
            the number of seconds until ``X-RateLimit-Reset``, kept within
            [0, 3600]. In every other case the tuple is ``(False, 0)``.

        Raises:
            KeyError: rate limit exhausted but no ``X-RateLimit-Reset``
                header is present.
        """
        x_rate_limit_remaining = response.headers.get('X-RateLimit-Remaining')
        if not x_rate_limit_remaining:
            # No quota information at all: nothing to wait for.
            return False, 0
        reqs_remaining = int(x_rate_limit_remaining)
        if response.status_code == 403 and reqs_remaining == 0:
            reset_at = int(response.headers['X-RateLimit-Reset'])
            # Cap the wait at one hour, and never report a negative delay
            # when the reset time is already behind the local clock.
            # NOTE(review): the delay is presumably slept on by the
            # transport layer, which would reject a negative value - confirm.
            delay = max(0, min(reset_at - time.time(), 3600))
            return True, delay
        return False, 0

    def get_next_target_from_response(self, response):
        """Return the ``since`` index of the next page.

        Returns:
            The integer captured by ``API_URL_INDEX_RE`` in the URL of the
            ``next`` link, or None when the response has no ``next`` link.
        """
        if 'next' not in response.links:
            return None
        next_url = response.links['next']['url']
        return int(self.API_URL_INDEX_RE.match(next_url).group(1))

    def transport_response_simplified(self, response):
        """Convert the JSON body of a page into a list of model dicts."""
        repos = response.json()
        return [self.get_model_from_repo(repo) for repo in repos]

    def request_headers(self):
        """Return the request headers selecting the v3 JSON media type."""
        return {'Accept': 'application/vnd.github.v3+json'}

    def disable_deleted_repo_tasks(self, index, next_index, keep_these):
        """ (Overrides) Fix provided index value to avoid erroneously disabling
        some scheduler tasks
        """
        # Next listed repository ids are strictly greater than the 'since'
        # parameter, so increment the index to avoid disabling the latest
        # created task when processing a new repositories page returned by
        # the Github API
        return super().disable_deleted_repo_tasks(index + 1, next_index,
                                                  keep_these)


# Test fixtures relocated by the same change (content unchanged):
#   swh/lister/github/tests/api_empty_response.json
#     -> swh/lister/github/tests/data/api.github.com/empty_response.json
#   swh/lister/github/tests/api_next_response.json
#     -> swh/lister/github/tests/data/api.github.com/next_response.json
#   swh/lister/github/tests/api_first_response.json
#     -> swh/lister/github/tests/data/api.github.com/repositories,since=0
# New symlink (mode 120000):
#   swh/lister/github/tests/data/api.github.com/first_response.json
#     -> repositories,since=0
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# File: swh/lister/github/tests/test_lister.py
# (renamed from swh/lister/github/tests/test_gh_lister.py)

import re
import unittest
import requests_mock
from datetime import datetime, timedelta

from swh.lister.core.tests.test_lister import HttpListerTester
from swh.lister.github.lister import GitHubLister


class GitHubListerTester(HttpListerTester, unittest.TestCase):
    """Run the shared HTTP lister test suite against ``GitHubLister``."""
    Lister = GitHubLister
    test_re = re.compile(r'/repositories\?since=([^?&]+)')
    lister_subdir = 'github'
    good_api_response_file = 'data/api.github.com/first_response.json'
    bad_api_response_file = 'data/api.github.com/empty_response.json'
    first_index = 0
    last_index = 369
    entries_per_page = 100
    convert_type = int

    def response_headers(self, request):
        """Build the mocked response headers for ``request``.

        The first page advertises a ``next`` link pointing at
        ``last_index``; any other page only advertises a ``first`` link.
        """
        headers = {'X-RateLimit-Remaining': '1'}
        # The 'next' target must carry the 'since=<index>' query string so
        # that the single '%s' placeholder receives last_index.
        # NOTE(review): the URL targets were missing from the collapsed
        # source; the rel="first" URI-template form below is a best guess -
        # confirm against the original test module.
        if self.request_index(request) == self.first_index:
            headers.update({
                'Link': '<https://api.github.com/repositories?since=%s>;'
                        ' rel="next",'
                        '<https://api.github.com/repositories{?since}>;'
                        ' rel="first"' % self.last_index
            })
        else:
            headers.update({
                'Link': '<https://api.github.com/repositories{?since}>;'
                        ' rel="first"'
            })
        return headers

    def mock_rate_quota(self, n, request, context):
        """Answer a mocked request with an exhausted-quota 403 response.

        Increments ``self.rate_limit`` and sets the reset time 1.5 seconds
        ahead of the current time, truncated to a whole second.
        """
        self.rate_limit += 1
        context.status_code = 403
        context.headers['X-RateLimit-Remaining'] = '0'
        reset_at = int((datetime.now() + timedelta(seconds=1.5)).timestamp())
        context.headers['X-RateLimit-Reset'] = str(reset_at)
        return '{"error":"dummy"}'

    @requests_mock.Mocker()
    def test_scheduled_tasks(self, http_mocker):
        """Run the inherited scheduled-tasks check on the next-page data."""
        self.scheduled_tasks_test(
            'data/api.github.com/next_response.json', 876, http_mocker)


def test_lister_github(swh_listers, requests_mock_datadir):
    """Simple github listing should create scheduled tasks

    """
    lister = swh_listers['github']

    lister.run()

    tasks = lister.scheduler.search_tasks(task_type='load-git')
    assert len(tasks) == 100

    for task in tasks:
        assert task['type'] == 'load-git'

        # Exactly one positional argument: the origin url
        arguments = task['arguments']
        args = arguments['args']
        assert len(args) == 1
        assert args[0].startswith('https://github.com')

        # No keyword arguments
        assert arguments['kwargs'] == {}

        assert task['policy'] == 'recurring'
        assert task['priority'] is None