Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/github/tests/test_lister.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import time | from typing import Any, Dict, Iterator, List | ||||
from typing import Any, Dict, Iterator, List, Optional, Union | |||||
import pytest | import pytest | ||||
import requests_mock | import requests_mock | ||||
from swh.core.github.pytest_plugin import github_response_callback | |||||
from swh.lister.github.lister import GitHubLister | from swh.lister.github.lister import GitHubLister | ||||
from swh.lister.pattern import CredentialsType, ListerStats | from swh.lister.pattern import CredentialsType, ListerStats | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import Lister | from swh.scheduler.model import Lister | ||||
# Number of pages of results expected from a complete listing in these tests
NUM_PAGES = 10
# Total number of origins across NUM_PAGES pages of GitHubLister.PAGE_SIZE each
ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES
def github_repo(i: int) -> Dict[str, Union[int, str]]:
    """Build the minimal repository record served by the mocked GitHub API.

    Every record carries the repository ``id`` and its ``html_url``; the
    repository with id 4321 additionally carries a ``pushed_at`` timestamp.
    """
    base_fields: Dict[str, Union[int, str]] = {
        "id": i,
        "html_url": f"https://github.com/origin/{i}",
    }
    if i != 4321:
        return base_fields
    # Exactly one origin gets a pushed_at date
    return {**base_fields, "pushed_at": "2018-11-08T13:16:24Z"}
def github_response_callback(
    request: requests_mock.request._RequestObjectProxy,
    context: requests_mock.response._Context,
) -> List[Dict[str, Union[str, int]]]:
    """Serve one page of fake repositories, for the common case where the
    lister hasn't been rate-limited.

    The request is checked for the expected ``Accept`` and ``User-Agent``
    headers and for the ``per_page`` and ``since`` query parameters. A
    ``Link`` header pointing at the following page is set on the response as
    long as that page starts below ``ORIGIN_COUNT``.
    """
    sent_headers = request.headers
    assert sent_headers["Accept"] == "application/vnd.github.v3+json"
    assert "Software Heritage Lister" in sent_headers["User-Agent"]

    # per_page must be the lister's page size; since is the last repository id seen
    query = request.qs
    page_size = GitHubLister.PAGE_SIZE
    assert "per_page" in query
    assert query["per_page"] == [str(page_size)]
    assert "since" in query

    since = int(query["since"][0])
    next_since = since + page_size

    if next_since < ORIGIN_COUNT:
        # More origins remain after this page: advertise the next one
        next_url = f"{GitHubLister.API_URL}?per_page={page_size}&since={next_since}"
        context.headers["Link"] = f"<{next_url}>; rel=next"

    last_id = min(next_since, ORIGIN_COUNT)
    page = []
    for repo_id in range(since + 1, last_id + 1):
        page.append(github_repo(repo_id))
    return page
@pytest.fixture()
def requests_mocker() -> Iterator[requests_mock.Mocker]:
    """Yield a requests mock whose GET handler for the GitHub API URL answers
    through `github_response_callback`."""
    with requests_mock.Mocker() as mocker:
        mocker.get(GitHubLister.API_URL, json=github_response_callback)
        yield mocker
def get_lister_data(swh_scheduler: SchedulerInterface) -> Lister: | def get_lister_data(swh_scheduler: SchedulerInterface) -> Lister: | ||||
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines | def test_relister(swh_scheduler, caplog, requests_mocker) -> None: | ||||
# Make sure we got two full pages of results | # Make sure we got two full pages of results | ||||
assert res == ListerStats(pages=2, origins=2000) | assert res == ListerStats(pages=2, origins=2000) | ||||
# Check that the relisting mode hasn't touched the stored state. | # Check that the relisting mode hasn't touched the stored state. | ||||
lister_data = get_lister_data(swh_scheduler) | lister_data = get_lister_data(swh_scheduler) | ||||
assert lister_data.current_state == {"last_seen_id": 123} | assert lister_data.current_state == {"last_seen_id": 123} | ||||
def github_ratelimit_callback(
    request: requests_mock.request._RequestObjectProxy,
    context: requests_mock.response._Context,
    ratelimit_reset: Optional[int],
) -> Dict[str, str]:
    """Produce a rate-limited GitHub API response.

    Requests carrying an ``Authorization`` header get a 429 status code, all
    others get a 403. When ``ratelimit_reset`` is not None, it is sent back
    as the ``X-Ratelimit-Reset`` response header.
    """
    sent_headers = request.headers
    assert sent_headers["Accept"] == "application/vnd.github.v3+json"
    assert "Software Heritage Lister" in sent_headers["User-Agent"]

    authenticated = "Authorization" in sent_headers
    context.status_code = 429 if authenticated else 403

    if ratelimit_reset is not None:
        context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)

    return {
        "message": "API rate limit exceeded for <IP>.",
        "documentation_url": "https://developer.github.com/v3/#rate-limiting",
    }
@pytest.fixture()
def num_before_ratelimit() -> int:
    """Default for how many requests succeed before the ratelimit hits"""
    successful_requests = 0
    return successful_requests
@pytest.fixture()
def num_ratelimit() -> Optional[int]:
    """Default for how many requests get rate-limited; None stands for infinity"""
    unbounded = None
    return unbounded
@pytest.fixture()
def ratelimit_reset() -> Optional[int]:
    """Default X-Ratelimit-Reset header value for ratelimited responses
    (None by default)"""
    no_reset_value = None
    return no_reset_value
@pytest.fixture() | |||||
def requests_ratelimited( | |||||
num_before_ratelimit: int, | |||||
num_ratelimit: Optional[int], | |||||
ratelimit_reset: Optional[int], | |||||
) -> Iterator[requests_mock.Mocker]: | |||||
"""Mock requests to the GitHub API, returning a rate-limiting status code | |||||
after `num_before_ratelimit` requests. | |||||
GitHub does inconsistent rate-limiting: | |||||
- Anonymous requests return a 403 status code | |||||
- Authenticated requests return a 429 status code, with an | |||||
X-Ratelimit-Reset header. | |||||
This fixture takes multiple arguments (which can be overridden with a | |||||
:func:`pytest.mark.parametrize` parameter): | |||||
- num_before_ratelimit: the global number of requests until the | |||||
ratelimit triggers | |||||
- num_ratelimit: the number of requests that return a | |||||
rate-limited response. | |||||
- ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the | |||||
request is authenticated. | |||||
The default values set in the previous fixtures make all requests return a rate | |||||
limit response. | |||||
""" | |||||
current_request = 0 | |||||
def response_callback(request, context): | |||||
nonlocal current_request | |||||
current_request += 1 | |||||
if num_before_ratelimit < current_request and ( | |||||
num_ratelimit is None | |||||
or current_request < num_before_ratelimit + num_ratelimit + 1 | |||||
): | |||||
return github_ratelimit_callback(request, context, ratelimit_reset) | |||||
else: | |||||
return github_response_callback(request, context) | |||||
with requests_mock.Mocker() as mock: | |||||
mock.get(GitHubLister.API_URL, json=response_callback) | |||||
yield mock | |||||
def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
    """Check that an anonymous lister running against the rate-limited mock
    lists nothing and ends on a warning about the missing X-Ratelimit-Reset
    value."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler)
    assert lister.github_session.anonymous
    setup_log = caplog.records[-1]
    assert "using anonymous mode" in setup_log.message
    # Drop the records emitted during setup so only the run's logs remain
    caplog.clear()

    stats = lister.run()
    assert stats == ListerStats(pages=0, origins=0)

    final_log = caplog.records[-1]
    assert final_log.levelname == "WARNING"
    assert "No X-Ratelimit-Reset value found in responses" in final_log.message
@pytest.fixture
def github_credentials() -> List[Dict[str, str]]:
    """Provide a static list of GitHub credentials, ordered by username.

    Three entries keep their secret under ``token``; three legacy entries
    keep it under ``password``.
    """
    credentials: List[Dict[str, str]] = []
    for i in range(3):
        credentials.append({"username": f"swh{i:d}", "token": f"token-{i:d}"})
    for i in range(3):
        credentials.append(
            {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"}
        )
    credentials.sort(key=lambda cred: cred["username"])
    return credentials
@pytest.fixture
def all_tokens(github_credentials) -> List[str]:
    """Collect the secret of each static credential: its ``token`` when
    present, its ``password`` otherwise."""
    tokens = []
    for credential in github_credentials:
        if "token" in credential:
            tokens.append(credential["token"])
        else:
            tokens.append(credential.get("password"))
    return tokens
@pytest.fixture
def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType:
    """Wrap the static credentials in the nested mapping passed to the lister
    as its ``credentials`` argument"""
    per_instance = {"github": github_credentials}
    return {"github": per_instance}
def test_authenticated_credentials(
    swh_scheduler, caplog, github_credentials, lister_credentials, all_tokens
):
    """Check how an authenticated lister sets up its credentials"""
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
    github_session = lister.github_session
    assert github_session.token_index == 0

    def by_username(credential):
        return credential["username"]

    assert sorted(lister.credentials, key=by_username) == github_credentials

    # The Authorization header must match one of the known tokens
    accepted_headers = [f"token {token}" for token in all_tokens]
    assert github_session.session.headers["Authorization"] in accepted_headers
def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None):
    """Stand-in for time.sleep that returns immediately.

    The requested duration is appended to `sleep_calls` when a list is given.

    Raises:
        ValueError: if `duration` is negative.
    """
    if duration < 0:
        raise ValueError("Can't sleep for a negative amount of time!")
    if sleep_calls is None:
        return
    sleep_calls.append(duration)
def fake_time_time():
    """Stand-in for time.time() that always returns 0"""
    frozen_now = 0
    return frozen_now
@pytest.fixture
def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]:
    """Swap `time.sleep` and `time.time` for their fake counterparts.

    Yields the list that accumulates every duration passed to the patched
    time.sleep().
    """
    recorded: List[float] = []

    def record_sleep(d):
        return fake_time_sleep(d, recorded)

    monkeypatch.setattr(time, "time", fake_time_time)
    monkeypatch.setattr(time, "sleep", record_sleep)
    yield recorded
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"num_ratelimit", [1] | "num_ratelimit", [1] | ||||
) # return a single rate-limit response, then continue | ) # return a single rate-limit response, then continue | ||||
def test_ratelimit_once_recovery( | def test_ratelimit_once_recovery( | ||||
swh_scheduler, | swh_scheduler, | ||||
caplog, | caplog, | ||||
requests_ratelimited, | requests_ratelimited, | ||||
num_ratelimit, | num_ratelimit, | ||||
monkeypatch_sleep_calls, | monkeypatch_sleep_calls, | ||||
lister_credentials, | lister_credentials, | ||||
): | ): | ||||
"""Check that the lister recovers from hitting the rate-limit once""" | """Check that the lister recovers from hitting the rate-limit once""" | ||||
caplog.set_level(logging.DEBUG, "swh.lister.github.utils") | caplog.set_level(logging.DEBUG, "swh.core.github.utils") | ||||
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) | lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) | ||||
res = lister.run() | res = lister.run() | ||||
# check that we used all the pages | # check that we used all the pages | ||||
assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) | assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) | ||||
token_users = [] | token_users = [] | ||||
Show All 21 Lines | def test_ratelimit_reset_sleep( | ||||
monkeypatch_sleep_calls, | monkeypatch_sleep_calls, | ||||
num_before_ratelimit, | num_before_ratelimit, | ||||
ratelimit_reset, | ratelimit_reset, | ||||
github_credentials, | github_credentials, | ||||
lister_credentials, | lister_credentials, | ||||
): | ): | ||||
"""Check that the lister properly handles rate-limiting when providing it with | """Check that the lister properly handles rate-limiting when providing it with | ||||
authentication tokens""" | authentication tokens""" | ||||
caplog.set_level(logging.DEBUG, "swh.lister.github.utils") | caplog.set_level(logging.DEBUG, "swh.core.github.utils") | ||||
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) | lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) | ||||
res = lister.run() | res = lister.run() | ||||
assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) | assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) | ||||
# We sleep 1 second every time we change credentials, then we sleep until | # We sleep 1 second every time we change credentials, then we sleep until | ||||
# ratelimit_reset + 1 | # ratelimit_reset + 1 | ||||
Show All 11 Lines |