diff --git a/requirements-github.txt b/requirements-github.txt new file mode 100644 --- /dev/null +++ b/requirements-github.txt @@ -0,0 +1,3 @@ +# requirements for swh.core.github +requests +tenacity diff --git a/requirements-http.txt b/requirements-http.txt --- a/requirements-http.txt +++ b/requirements-http.txt @@ -6,4 +6,4 @@ iso8601 msgpack >= 1.0.0 requests -tenacity + diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ "logging": parse_requirements("logging"), "db": parse_requirements("db", "db-pytestplugin"), "http": parse_requirements("http"), + "github": parse_requirements("github"), # kitchen sink, please do not use "testing": parse_requirements( "test", "db", "db-pytestplugin", "http", "logging" @@ -71,6 +72,7 @@ db=swh.core.cli.db [pytest11] pytest_swh_core = swh.core.pytest_plugin + pytest_swh_github = swh.core.github.pytest_plugin """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/core/tests/test_github_utils.py b/swh/core/github/pytest_plugin.py rename from swh/core/tests/test_github_utils.py rename to swh/core/github/pytest_plugin.py --- a/swh/core/tests/test_github_utils.py +++ b/swh/core/github/pytest_plugin.py @@ -1,64 +1,19 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging import time from typing import Dict, Iterator, List, Optional, Union import pytest import requests_mock -from swh.core.github.utils import ( - GitHubSession, - _sanitize_github_url, - _url_github_api, - _url_github_html, - get_canonical_github_origin_url, -) - -KNOWN_GH_REPO = "https://github.com/user/repo" - - -@pytest.mark.parametrize( - "user_repo, expected_url", - [ - ("user/repo.git", KNOWN_GH_REPO), - ("user/repo.git/", KNOWN_GH_REPO), - ("user/repo/", KNOWN_GH_REPO), - ("user/repo", KNOWN_GH_REPO), - ("user/repo/.git", KNOWN_GH_REPO), - # edge cases - ("https://github.com/unknown-page", None), # unknown gh origin returns None - ("user/repo/with/some/deps", None), # url kind is not dealt with for now - ], -) -def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock): - """It should return a canonical github origin when it exists, None otherwise""" - html_url = _url_github_html(user_repo) - api_url = _url_github_api(_sanitize_github_url(user_repo)) - - if expected_url is not None: - status_code = 200 - response = {"html_url": _sanitize_github_url(html_url)} - else: - status_code = 404 - response = {} - - requests_mock.get(api_url, [{"status_code": status_code, "json": response}]) - - assert get_canonical_github_origin_url(html_url) == expected_url - - -def test_get_canonical_github_origin_url_not_gh_origin(): - """It should return the input url when that origin is not a github one""" - url = "https://example.org" - assert get_canonical_github_origin_url(url) == url +HTTP_GITHUB_API_URL = "https://api.github.com/repositories" def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None): - """Record calls to time.sleep in the sleep_calls list""" + """Record calls to time.sleep in the sleep_calls list.""" if duration < 0: raise ValueError("Can't sleep for a negative amount of time!") if sleep_calls is not None: @@ -136,9 +91,6 @@ return repo -HTTP_GH_API_URL = "https://api.github.com/repositories" - - def github_response_callback( request: requests_mock.request._RequestObjectProxy, context: requests_mock.response._Context, @@ -162,7 +114,7 @@ if next_page < origin_count: # the first id for the next page is within our origin count; add a Link # header to the response - next_url = f"{HTTP_GH_API_URL}?per_page={page_size}&since={next_page}" + next_url = f"{HTTP_GITHUB_API_URL}?per_page={page_size}&since={next_page}" context.headers["Link"] = f"<{next_url}>; rel=next" return [github_repo(i) for i in range(since + 1, min(next_page, origin_count) + 1)] @@ -208,7 +160,7 @@ return github_response_callback(request, context) with requests_mock.Mocker() as mock: - mock.get(HTTP_GH_API_URL, json=response_callback) + mock.get(HTTP_GITHUB_API_URL, json=response_callback) yield mock @@ -230,108 +182,3 @@ """Return the list of tokens matching the static credential""" return [t.get("token", t.get("password")) for t in github_credentials] - - -def test_github_session_anonymous_session(): - user_agent = ("GitHub Session Test",) - github_session = GitHubSession( - user_agent=user_agent, - ) - assert github_session.anonymous is True - - actual_headers = github_session.session.headers - assert actual_headers["Accept"] == "application/vnd.github.v3+json" - assert actual_headers["User-Agent"] == user_agent - - -@pytest.mark.parametrize( - "num_ratelimit", [1] # return a single rate-limit response, then continue -) -def test_github_session_ratelimit_once_recovery( - caplog, - requests_ratelimited, - num_ratelimit, - monkeypatch_sleep_calls, - github_credentials, -): - """GitHubSession should recover from hitting the rate-limit once""" - caplog.set_level(logging.DEBUG, "swh.core.github.utils") - - github_session = GitHubSession( - user_agent="GitHub Session Test", credentials=github_credentials - ) - - res = github_session.request(f"{HTTP_GH_API_URL}?per_page=1000&since=10") - assert res.status_code == 200 - - token_users = [] - for record in caplog.records: - if "Using authentication token" in record.message: - token_users.append(record.args[0]) - - # check that we used one more token than we saw rate limited requests - assert len(token_users) == 1 + num_ratelimit - - # check that we slept for one second between our token uses - assert monkeypatch_sleep_calls == [1] - - -def test_github_session_authenticated_credentials( - caplog, github_credentials, all_tokens -): - """GitHubSession should have Authorization headers set in authenticated mode""" - caplog.set_level(logging.DEBUG, "swh.core.github.utils") - - github_session = GitHubSession( - "GitHub Session Test", credentials=github_credentials - ) - - assert github_session.anonymous is False - assert github_session.token_index == 0 - assert ( - sorted(github_session.credentials, key=lambda t: t["username"]) - == github_credentials - ) - assert github_session.session.headers["Authorization"] in [ - f"token {t}" for t in all_tokens - ] - - -@pytest.mark.parametrize( - # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a - # set value for X-Ratelimit-Reset, then resume listing successfully. - "num_before_ratelimit, num_ratelimit, ratelimit_reset", - [(5, 6, 123456)], -) -def test_github_session_ratelimit_reset_sleep( - caplog, - requests_ratelimited, - monkeypatch_sleep_calls, - num_before_ratelimit, - num_ratelimit, - ratelimit_reset, - github_credentials, -): - """GitHubSession should handle rate-limit with authentication tokens.""" - caplog.set_level(logging.DEBUG, "swh.core.github.utils") - - github_session = GitHubSession( - user_agent="GitHub Session Test", credentials=github_credentials - ) - - for _ in range(num_ratelimit): - github_session.request(f"{HTTP_GH_API_URL}?per_page=1000&since=10") - - # We sleep 1 second every time we change credentials, then we sleep until - # ratelimit_reset + 1 - expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] - assert monkeypatch_sleep_calls == expected_sleep_calls - - found_exhaustion_message = False - for record in caplog.records: - if record.levelname == "INFO": - if "Rate limits exhausted for all tokens" in record.message: - found_exhaustion_message = True - break - - assert found_exhaustion_message is True diff --git a/swh/core/github/tests/__init__.py b/swh/core/github/tests/__init__.py new file mode 100644 diff --git a/swh/core/github/tests/test_github_utils.py b/swh/core/github/tests/test_github_utils.py new file mode 100644 --- /dev/null +++ b/swh/core/github/tests/test_github_utils.py @@ -0,0 +1,160 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + +import pytest + +from swh.core.github.pytest_plugin import HTTP_GITHUB_API_URL +from swh.core.github.utils import ( + GitHubSession, + _sanitize_github_url, + _url_github_api, + _url_github_html, + get_canonical_github_origin_url, +) + +KNOWN_GH_REPO = "https://github.com/user/repo" + + +@pytest.mark.parametrize( + "user_repo, expected_url", + [ + ("user/repo.git", KNOWN_GH_REPO), + ("user/repo.git/", KNOWN_GH_REPO), + ("user/repo/", KNOWN_GH_REPO), + ("user/repo", KNOWN_GH_REPO), + ("user/repo/.git", KNOWN_GH_REPO), + # edge cases + ("https://github.com/unknown-page", None), # unknown gh origin returns None + ("user/repo/with/some/deps", None), # url kind is not dealt with for now + ], +) +def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock): + """It should return a canonical github origin when it exists, None otherwise""" + html_url = _url_github_html(user_repo) + api_url = _url_github_api(_sanitize_github_url(user_repo)) + + if expected_url is not None: + status_code = 200 + response = {"html_url": _sanitize_github_url(html_url)} + else: + status_code = 404 + response = {} + + requests_mock.get(api_url, [{"status_code": status_code, "json": response}]) + + assert get_canonical_github_origin_url(html_url) == expected_url + + +def test_get_canonical_github_origin_url_not_gh_origin(): + """It should return the input url when that origin is not a github one""" + url = "https://example.org" + assert get_canonical_github_origin_url(url) == url + + +def test_github_session_anonymous_session(): + user_agent = ("GitHub Session Test",) + github_session = GitHubSession( + user_agent=user_agent, + ) + assert github_session.anonymous is True + + actual_headers = github_session.session.headers + assert actual_headers["Accept"] == "application/vnd.github.v3+json" + assert actual_headers["User-Agent"] == user_agent + + +@pytest.mark.parametrize( + "num_ratelimit", [1] # return a single rate-limit response, then continue +) +def test_github_session_ratelimit_once_recovery( + caplog, + requests_ratelimited, + num_ratelimit, + monkeypatch_sleep_calls, + github_credentials, +): + """GitHubSession should recover from hitting the rate-limit once""" + caplog.set_level(logging.DEBUG, "swh.core.github.utils") + + github_session = GitHubSession( + user_agent="GitHub Session Test", credentials=github_credentials + ) + + res = github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10") + assert res.status_code == 200 + + token_users = [] + for record in caplog.records: + if "Using authentication token" in record.message: + token_users.append(record.args[0]) + + # check that we used one more token than we saw rate limited requests + assert len(token_users) == 1 + num_ratelimit + + # check that we slept for one second between our token uses + assert monkeypatch_sleep_calls == [1] + + +def test_github_session_authenticated_credentials( + caplog, github_credentials, all_tokens +): + """GitHubSession should have Authorization headers set in authenticated mode""" + caplog.set_level(logging.DEBUG, "swh.core.github.utils") + + github_session = GitHubSession( + "GitHub Session Test", credentials=github_credentials + ) + + assert github_session.anonymous is False + assert github_session.token_index == 0 + assert ( + sorted(github_session.credentials, key=lambda t: t["username"]) + == github_credentials + ) + assert github_session.session.headers["Authorization"] in [ + f"token {t}" for t in all_tokens + ] + + +@pytest.mark.parametrize( + # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a + # set value for X-Ratelimit-Reset, then resume listing successfully. + "num_before_ratelimit, num_ratelimit, ratelimit_reset", + [(5, 6, 123456)], +) +def test_github_session_ratelimit_reset_sleep( + caplog, + requests_ratelimited, + monkeypatch_sleep_calls, + num_before_ratelimit, + num_ratelimit, + ratelimit_reset, + github_credentials, +): + """GitHubSession should handle rate-limit with authentication tokens.""" + caplog.set_level(logging.DEBUG, "swh.core.github.utils") + + github_session = GitHubSession( + user_agent="GitHub Session Test", credentials=github_credentials + ) + + for _ in range(num_ratelimit): + github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10") + + # We sleep 1 second every time we change credentials, then we sleep until + # ratelimit_reset + 1 + expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] + assert monkeypatch_sleep_calls == expected_sleep_calls + + found_exhaustion_message = False + for record in caplog.records: + if record.levelname == "INFO": + if "Rate limits exhausted for all tokens" in record.message: + found_exhaustion_message = True + break + + assert found_exhaustion_message is True diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist=black,flake8,mypy,py3-{core,db,server} +envlist=black,flake8,mypy,py3-{core,db,github,server} [testenv] passenv = PYTHONASYNCIODEBUG @@ -7,6 +7,7 @@ testing-core core: logging db: db + github: github server: http deps = cover: pytest-cov @@ -15,6 +16,7 @@ slow: --hypothesis-profile=slow \ cover: --cov={envsitepackagesdir}/swh/core --cov-branch \ core: {envsitepackagesdir}/swh/core/tests \ + github: {envsitepackagesdir}/swh/core/github/tests \ db: {envsitepackagesdir}/swh/core/db/tests \ server: {envsitepackagesdir}/swh/core/api/tests \ {posargs} @@ -44,6 +46,7 @@ testing-core logging db + github http deps = mypy==0.942 @@ -82,6 +85,7 @@ testing-core logging db + github http deps = # install swh-docs in develop mode