diff --git a/conftest.py b/conftest.py --- a/conftest.py +++ b/conftest.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -pytest_plugins = ["swh.scheduler.pytest_plugin"] +pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"] os.environ["LC_ALL"] = "C.UTF-8" diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db] >= 0.9 +swh.core[db,github] >= 2.6 swh.scheduler >= 0.8 diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -11,12 +11,12 @@ import iso8601 +from swh.core.github.utils import GitHubSession, MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister -from .utils import GitHubSession, MissingRateLimitReset logger = logging.getLogger(__name__) diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -1,16 +1,16 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging -import time -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List import pytest import requests_mock +from swh.core.github.pytest_plugin import github_response_callback from swh.lister.github.lister import GitHubLister from swh.lister.pattern import CredentialsType, ListerStats from swh.scheduler.interface import SchedulerInterface @@ -20,51 +20,6 @@ ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES -def github_repo(i: int) -> Dict[str, Union[int, str]]: - """Basic repository information returned by the GitHub API""" - - repo: Dict[str, Union[int, str]] = { - "id": i, - "html_url": f"https://github.com/origin/{i}", - } - - # Set the pushed_at date on one of the origins - if i == 4321: - repo["pushed_at"] = "2018-11-08T13:16:24Z" - - return repo - - -def github_response_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, -) -> List[Dict[str, Union[str, int]]]: - """Return minimal GitHub API responses for the common case where the loader - hasn't been rate-limited""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - - # Check request parameters: per_page == 1000, since = last_repo_id - assert "per_page" in request.qs - assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)] - assert "since" in request.qs - - since = int(request.qs["since"][0]) - - next_page = since + GitHubLister.PAGE_SIZE - if next_page < ORIGIN_COUNT: - # the first id for the next page is within our origin count; add a Link - # header to the response - next_url = ( - GitHubLister.API_URL - + f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}" - ) - context.headers["Link"] = f"<{next_url}>; rel=next" - - return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)] - - @pytest.fixture() def requests_mocker() -> Iterator[requests_mock.Mocker]: with requests_mock.Mocker() as mock: @@ -182,93 +137,8 @@ assert lister_data.current_state == {"last_seen_id": 123} -def github_ratelimit_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, - ratelimit_reset: Optional[int], -) -> Dict[str, str]: - """Return a rate-limited GitHub API response.""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - if "Authorization" in request.headers: - context.status_code = 429 - else: - context.status_code = 403 - - if ratelimit_reset is not None: - context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset) - - return { - "message": "API rate limit exceeded for .", - "documentation_url": "https://developer.github.com/v3/#rate-limiting", - } - - -@pytest.fixture() -def num_before_ratelimit() -> int: - """Number of successful requests before the ratelimit hits""" - return 0 - - -@pytest.fixture() -def num_ratelimit() -> Optional[int]: - """Number of rate-limited requests; None means infinity""" - return None - - -@pytest.fixture() -def ratelimit_reset() -> Optional[int]: - """Value of the X-Ratelimit-Reset header on ratelimited responses""" - return None - - -@pytest.fixture() -def requests_ratelimited( - num_before_ratelimit: int, - num_ratelimit: Optional[int], - ratelimit_reset: Optional[int], -) -> Iterator[requests_mock.Mocker]: - """Mock requests to the GitHub API, returning a rate-limiting status code - after `num_before_ratelimit` requests. - - GitHub does inconsistent rate-limiting: - - Anonymous requests return a 403 status code - - Authenticated requests return a 429 status code, with an - X-Ratelimit-Reset header. - - This fixture takes multiple arguments (which can be overridden with a - :func:`pytest.mark.parametrize` parameter): - - num_before_ratelimit: the global number of requests until the - ratelimit triggers - - num_ratelimit: the number of requests that return a - rate-limited response. - - ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the - request is authenticated. - - The default values set in the previous fixtures make all requests return a rate - limit response. - """ - current_request = 0 - - def response_callback(request, context): - nonlocal current_request - current_request += 1 - if num_before_ratelimit < current_request and ( - num_ratelimit is None - or current_request < num_before_ratelimit + num_ratelimit + 1 - ): - return github_ratelimit_callback(request, context, ratelimit_reset) - else: - return github_response_callback(request, context) - - with requests_mock.Mocker() as mock: - mock.get(GitHubLister.API_URL, json=response_callback) - yield mock - - def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None: - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) assert lister.github_session.anonymous @@ -283,26 +153,6 @@ assert "No X-Ratelimit-Reset value found in responses" in last_log.message -@pytest.fixture -def github_credentials() -> List[Dict[str, str]]: - """Return a static list of GitHub credentials""" - return sorted( - [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)] - + [ - {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"} - for i in range(3) - ], - key=lambda c: c["username"], - ) - - -@pytest.fixture -def all_tokens(github_credentials) -> List[str]: - """Return the list of tokens matching the static credential""" - - return [t.get("token", t.get("password")) for t in github_credentials] - - @pytest.fixture def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType: """Return the credentials formatted for use by the lister""" @@ -323,29 +173,6 @@ ] -def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None): - """Record calls to time.sleep in the sleep_calls list""" - if duration < 0: - raise ValueError("Can't sleep for a negative amount of time!") - if sleep_calls is not None: - sleep_calls.append(duration) - - -def fake_time_time(): - """Return 0 when running time.time()""" - return 0 - - -@pytest.fixture -def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]: - """Monkeypatch `time.time` and `time.sleep`. Returns a list cumulating the arguments - passed to time.sleep().""" - sleeps: List[float] = [] - monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps)) - monkeypatch.setattr(time, "time", fake_time_time) - yield sleeps - - @pytest.mark.parametrize( "num_ratelimit", [1] ) # return a single rate-limit response, then continue @@ -358,7 +185,7 @@ lister_credentials, ): """Check that the lister recovers from hitting the rate-limit once""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) @@ -396,7 +223,7 @@ ): """Check that the lister properly handles rate-limiting when providing it with authentication tokens""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) diff --git a/swh/lister/github/utils.py b/swh/lister/github/utils.py --- a/swh/lister/github/utils.py +++ b/swh/lister/github/utils.py @@ -3,168 +3,4 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -import random -import time -from typing import Dict, List, Optional - -import requests -from tenacity import ( - retry, - retry_any, - retry_if_exception_type, - retry_if_result, - wait_exponential, -) - -logger = logging.getLogger(__name__) - - -class RateLimited(Exception): - def __init__(self, response): - self.reset_time: Optional[int] - - # Figure out how long we need to sleep because of that rate limit - ratelimit_reset = response.headers.get("X-Ratelimit-Reset") - retry_after = response.headers.get("Retry-After") - if ratelimit_reset is not None: - self.reset_time = int(ratelimit_reset) - elif retry_after is not None: - self.reset_time = int(time.time()) + int(retry_after) + 1 - else: - logger.warning( - "Received a rate-limit-like status code %s, but no rate-limit " - "headers set. Response content: %s", - response.status_code, - response.content, - ) - self.reset_time = None - self.response = response - - -class MissingRateLimitReset(Exception): - pass - - -class GitHubSession: - """Manages a :class:`requests.Session` with (optionally) multiple credentials, - and cycles through them when reaching rate-limits.""" - - def __init__( - self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None - ) -> None: - """Initialize a requests session with the proper headers for requests to - GitHub.""" - self.credentials = credentials - if self.credentials: - random.shuffle(self.credentials) - - self.session = requests.Session() - - self.session.headers.update( - {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent} - ) - - self.anonymous = not self.credentials - - if self.anonymous: - logger.warning("No tokens set in configuration, using anonymous mode") - - self.token_index = -1 - self.current_user: Optional[str] = None - - if not self.anonymous: - # Initialize the first token value in the session headers - self.set_next_session_token() - - def set_next_session_token(self) -> None: - """Update the current authentication token with the next one in line.""" - - assert self.credentials - - self.token_index = (self.token_index + 1) % len(self.credentials) - - auth = self.credentials[self.token_index] - - self.current_user = auth["username"] - logger.debug("Using authentication token for user %s", self.current_user) - - if "password" in auth: - token = auth["password"] - else: - token = auth["token"] - - self.session.headers.update({"Authorization": f"token {token}"}) - - @retry( - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_any( - # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. - # when running the lister on a connection with high latency - retry_if_exception_type(requests.exceptions.ChunkedEncodingError), - # 502 status codes happen for a Server Error, sometimes - retry_if_result(lambda r: r.status_code == 502), - ), - ) - def _request(self, url: str) -> requests.Response: - response = self.session.get(url) - - if ( - # GitHub returns inconsistent status codes between unauthenticated - # rate limit and authenticated rate limits. Handle both. - response.status_code == 429 - or (self.anonymous and response.status_code == 403) - ): - raise RateLimited(response) - - return response - - def request(self, url) -> requests.Response: - """Repeatedly requests the given URL, cycling through credentials and sleeping - if necessary; until either a successful response or :exc:`MissingRateLimitReset` - """ - # The following for/else loop handles rate limiting; if successful, - # it provides the rest of the function with a `response` object. - # - # If all tokens are rate-limited, we sleep until the reset time, - # then `continue` into another iteration of the outer while loop, - # attempting to get data from the same URL again. - - while True: - max_attempts = len(self.credentials) if self.credentials else 1 - reset_times: Dict[int, int] = {} # token index -> time - for attempt in range(max_attempts): - try: - return self._request(url) - except RateLimited as e: - reset_info = "(unknown reset)" - if e.reset_time is not None: - reset_times[self.token_index] = e.reset_time - reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) - - if not self.anonymous: - logger.info( - "Rate limit exhausted for current user %s %s", - self.current_user, - reset_info, - ) - # Use next token in line - self.set_next_session_token() - # Wait one second to avoid triggering GitHub's abuse rate limits - time.sleep(1) - - # All tokens have been rate-limited. What do we do? - - if not reset_times: - logger.warning( - "No X-Ratelimit-Reset value found in responses for any token; " - "Giving up." - ) - raise MissingRateLimitReset() - - sleep_time = max(reset_times.values()) - time.time() + 1 - logger.info( - "Rate limits exhausted for all tokens. Sleeping for %f seconds.", - sleep_time, - ) - time.sleep(sleep_time) +from swh.core.github.utils import * # noqa