D7840.diff
No OneTemporary
Actions

Size

17 KB

Subscribers

None

D7840.diff
View Options

	diff --git a/requirements-http.txt b/requirements-http.txt
	--- a/requirements-http.txt
	+++ b/requirements-http.txt
	@@ -6,3 +6,4 @@
	iso8601
	msgpack >= 1.0.0
	requests
	+tenacity
	diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py
	--- a/swh/core/github/utils.py
	+++ b/swh/core/github/utils.py
	@@ -1,17 +1,30 @@
	-# Copyright (C) 2022 The Software Heritage developers
	+# Copyright (C) 2020-2022 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information


	+import logging
	+import random
	import re
	-from typing import Optional
	+import time
	+from typing import Dict, List, Optional

	import requests
	+from tenacity import (
	+ retry,
	+ retry_any,
	+ retry_if_exception_type,
	+ retry_if_result,
	+ wait_exponential,
	+)

	GITHUB_PATTERN = re.compile(r"https?://github.com/(?P<user_repo>.*)")


	+logger = logging.getLogger(__name__)
	+
	+
	def _url_github_html(user_repo: str) -> str:
	"""Given the user repo, returns the expected github html url."""
	return f"https://github.com/{user_repo}"
	@@ -46,3 +59,156 @@
	return None
	data = response.json()
	return data["html_url"]
	+
	+
	+class RateLimited(Exception):
	+    """Raised when a response carries a rate-limit-like status code.
	+
	+    Attributes:
	+        response: the response object that triggered the exception
	+        reset_time: timestamp (seconds) at which the limit is expected to be
	+          lifted, read from ``X-Ratelimit-Reset`` or derived from
	+          ``Retry-After``; None when neither header is present
	+    """
	+
	+    def __init__(self, response):
	+        self.response = response
	+
	+        # Figure out how long we need to sleep because of that rate limit
	+        headers = response.headers
	+        reset_header = headers.get("X-Ratelimit-Reset")
	+        retry_after_header = headers.get("Retry-After")
	+
	+        self.reset_time: Optional[int] = None
	+        if reset_header is not None:
	+            # Absolute timestamp given by the server
	+            self.reset_time = int(reset_header)
	+        elif retry_after_header is not None:
	+            # Relative delay: anchor it on the local clock, plus one second
	+            self.reset_time = int(time.time()) + int(retry_after_header) + 1
	+        else:
	+            logger.warning(
	+                "Received a rate-limit-like status code %s, but no rate-limit "
	+                "headers set. Response content: %s",
	+                response.status_code,
	+                response.content,
	+            )
	+
	+
	+class MissingRateLimitReset(Exception):
	+    """Raised when every attempt was rate-limited and no reset time is known."""
	+
	+
	+class GitHubSession:
	+    """Manages a :class:`requests.Session` with (optionally) multiple credentials,
	+    and cycles through them when reaching rate-limits.
	+
	+    Without credentials the session runs in anonymous mode: no ``Authorization``
	+    header is set and there is no token to rotate.
	+    """
	+
	+    credentials: Optional[List[Dict[str, str]]] = None
	+
	+    def __init__(
	+        self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None
	+    ) -> None:
	+        """Initialize a requests session with the proper headers for requests to
	+        GitHub.
	+
	+        Args:
	+            user_agent: value of the ``User-Agent`` header sent with each request
	+            credentials: optional list of dicts, each holding a ``username`` key
	+              and the API token under either ``password`` or ``token``
	+        """
	+        if credentials:
	+            # Shuffle a copy: the caller's list is left untouched
	+            creds = credentials.copy()
	+            random.shuffle(creds)
	+            self.credentials = creds
	+
	+        self.session = requests.Session()
	+
	+        self.session.headers.update(
	+            {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent}
	+        )
	+
	+        self.anonymous = not self.credentials
	+
	+        if self.anonymous:
	+            logger.warning("No tokens set in configuration, using anonymous mode")
	+
	+        # -1 so that the first call to set_next_session_token() selects index 0
	+        self.token_index = -1
	+        self.current_user: Optional[str] = None
	+
	+        if not self.anonymous:
	+            # Initialize the first token value in the session headers
	+            self.set_next_session_token()
	+
	+    def set_next_session_token(self) -> None:
	+        """Update the current authentication token with the next one in line.
	+
	+        Wraps around to the first credential after the last one. Must only be
	+        called when credentials are configured.
	+        """
	+
	+        assert self.credentials
	+
	+        self.token_index = (self.token_index + 1) % len(self.credentials)
	+
	+        auth = self.credentials[self.token_index]
	+
	+        self.current_user = auth["username"]
	+        logger.debug("Using authentication token for user %s", self.current_user)
	+
	+        # The token may be stored under either key; "password" takes precedence
	+        if "password" in auth:
	+            token = auth["password"]
	+        else:
	+            token = auth["token"]
	+
	+        self.session.headers.update({"Authorization": f"token {token}"})
	+
	+    # NOTE: no ``stop`` condition is configured, so a URL that keeps failing
	+    # with one of the conditions below is retried indefinitely.
	+    @retry(
	+        wait=wait_exponential(multiplier=1, min=4, max=10),
	+        retry=retry_any(
	+            # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
	+            # when running the lister on a connection with high latency
	+            retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
	+            # 502 status codes happen for a Server Error, sometimes
	+            retry_if_result(lambda r: r.status_code == 502),
	+        ),
	+    )
	+    def _request(self, url: str) -> requests.Response:
	+        """GET ``url`` once with the current session headers.
	+
	+        Raises:
	+            RateLimited: on a 429 response, or on a 403 response in anonymous mode
	+        """
	+        response = self.session.get(url)
	+
	+        if (
	+            # GitHub returns inconsistent status codes between unauthenticated
	+            # rate limit and authenticated rate limits. Handle both.
	+            response.status_code == 429
	+            or (self.anonymous and response.status_code == 403)
	+        ):
	+            raise RateLimited(response)
	+
	+        return response
	+
	+    def request(self, url: str) -> requests.Response:
	+        """Repeatedly requests the given URL, cycling through credentials and sleeping
	+        if necessary; until either a response that is not rate-limited is obtained
	+        or :exc:`MissingRateLimitReset` is raised.
	+
	+        Raises:
	+            MissingRateLimitReset: when every attempt of a round was rate-limited
	+              and none of the responses carried a usable reset time
	+        """
	+        # Each iteration of the outer `while` loop is one round: the inner `for`
	+        # loop tries the URL once per available token (once in anonymous mode)
	+        # and returns as soon as a response is not rate-limited.
	+        #
	+        # If all tokens are rate-limited, we sleep until the latest known reset
	+        # time, then start another round on the same URL.
	+
	+        while True:
	+            max_attempts = len(self.credentials) if self.credentials else 1
	+            reset_times: Dict[int, int] = {}  # token index -> time
	+            for _ in range(max_attempts):
	+                try:
	+                    return self._request(url)
	+                except RateLimited as e:
	+                    reset_info = "(unknown reset)"
	+                    if e.reset_time is not None:
	+                        reset_times[self.token_index] = e.reset_time
	+                        reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
	+
	+                    if not self.anonymous:
	+                        logger.info(
	+                            "Rate limit exhausted for current user %s %s",
	+                            self.current_user,
	+                            reset_info,
	+                        )
	+                        # Use next token in line
	+                        self.set_next_session_token()
	+                        # Wait one second to avoid triggering GitHub's abuse rate limits
	+                        time.sleep(1)
	+
	+            # All tokens have been rate-limited. What do we do?
	+
	+            if not reset_times:
	+                logger.warning(
	+                    "No X-Ratelimit-Reset value found in responses for any token; "
	+                    "Giving up."
	+                )
	+                raise MissingRateLimitReset()
	+
	+            # Clamp to zero: a reset time already in the past (e.g. because of
	+            # clock skew with the server) would otherwise yield a negative
	+            # duration, which time.sleep() rejects with ValueError.
	+            sleep_time = max(0.0, max(reset_times.values()) - time.time() + 1)
	+            logger.info(
	+                "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
	+                sleep_time,
	+            )
	+            time.sleep(sleep_time)
	diff --git a/swh/core/tests/test_github_utils.py b/swh/core/tests/test_github_utils.py
	--- a/swh/core/tests/test_github_utils.py
	+++ b/swh/core/tests/test_github_utils.py
	@@ -3,9 +3,15 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	+import logging
	+import time
	+from typing import Dict, Iterator, List, Optional, Union
	+
	import pytest
	+import requests_mock

	from swh.core.github.utils import (
	+ GitHubSession,
	_sanitize_github_url,
	_url_github_api,
	_url_github_html,
	@@ -49,3 +55,283 @@
	"""It should return the input url when that origin is not a github one"""
	url = "https://example.org"
	assert get_canonical_github_origin_url(url) == url
	+
	+
	+def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None):
	+    """Stand-in for time.sleep: reject negative durations like the real function
	+    does, and append the duration to `sleep_calls` when a list is provided."""
	+    if duration < 0:
	+        raise ValueError("Can't sleep for a negative amount of time!")
	+    if sleep_calls is None:
	+        return
	+    sleep_calls.append(duration)
	+
	+
	+def fake_time_time():
	+ """Stand-in for time.time() that always returns 0, so that durations computed
	+ from the current time are deterministic in tests"""
	+ return 0
	+
	+
	+@pytest.fixture
	+def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]:
	+    """Monkeypatch `time.time` and `time.sleep`. Yields a list accumulating the
	+    arguments passed to time.sleep()."""
	+    recorded: List[float] = []
	+
	+    monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, recorded))
	+    monkeypatch.setattr(time, "time", fake_time_time)
	+
	+    yield recorded
	+
	+
	+@pytest.fixture()
	+def num_before_ratelimit() -> int:
	+ """Number of successful requests before the ratelimit hits; defaults to 0, so
	+ the very first request is already rate-limited"""
	+ return 0
	+
	+
	+@pytest.fixture()
	+def num_ratelimit() -> Optional[int]:
	+ """Number of rate-limited requests; None (the default) means infinity"""
	+ return None
	+
	+
	+@pytest.fixture()
	+def ratelimit_reset() -> Optional[int]:
	+ """Value of the X-Ratelimit-Reset header on ratelimited responses; None (the
	+ default) leaves the header unset"""
	+ return None
	+
	+
	+def github_ratelimit_callback(
	+    request: requests_mock.request._RequestObjectProxy,
	+    context: requests_mock.response._Context,
	+    ratelimit_reset: Optional[int],
	+) -> Dict[str, str]:
	+    """Build a rate-limited GitHub API response.
	+
	+    Requests carrying an Authorization header get a 429 status code, the others
	+    a 403. The X-Ratelimit-Reset header is only set when `ratelimit_reset` is
	+    not None.
	+    """
	+    # Check request headers
	+    assert request.headers["Accept"] == "application/vnd.github.v3+json"
	+    assert request.headers["User-Agent"] is not None
	+
	+    authenticated = "Authorization" in request.headers
	+    context.status_code = 429 if authenticated else 403
	+
	+    if ratelimit_reset is not None:
	+        context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)
	+
	+    body = {
	+        "message": "API rate limit exceeded for <IP>.",
	+        "documentation_url": "https://developer.github.com/v3/#rate-limiting",
	+    }
	+    return body
	+
	+
	+def github_repo(i: int) -> Dict[str, Union[int, str]]:
	+    """Minimal repository record, shaped like a GitHub API result, for id `i`"""
	+
	+    record: Dict[str, Union[int, str]] = {}
	+    record["id"] = i
	+    record["html_url"] = f"https://github.com/origin/{i}"
	+
	+    # Only one specific origin gets a pushed_at date
	+    if i == 4321:
	+        record["pushed_at"] = "2018-11-08T13:16:24Z"
	+
	+    return record
	+
	+
	+HTTP_GH_API_URL = "https://api.github.com/repositories"
	+
	+
	+def github_response_callback(
	+    request: requests_mock.request._RequestObjectProxy,
	+    context: requests_mock.response._Context,
	+    page_size: int = 1000,
	+    origin_count: int = 10000,
	+) -> List[Dict[str, Union[str, int]]]:
	+    """Build a minimal, non-rate-limited GitHub API response: one page of
	+    repositories whose ids follow the `since` query parameter, plus a `Link`
	+    header when more pages remain below `origin_count`"""
	+    # Check request headers
	+    headers = request.headers
	+    assert headers["Accept"] == "application/vnd.github.v3+json"
	+    assert headers["User-Agent"] is not None
	+
	+    # Check request parameters: per_page == page_size, since = last_repo_id
	+    query = request.qs
	+    assert "per_page" in query
	+    assert query["per_page"] == [str(page_size)]
	+    assert "since" in query
	+
	+    last_seen = int(query["since"][0])
	+    following = last_seen + page_size
	+
	+    if following < origin_count:
	+        # the first id for the next page is within our origin count; add a Link
	+        # header to the response
	+        link_target = f"{HTTP_GH_API_URL}?per_page={page_size}&since={following}"
	+        context.headers["Link"] = f"<{link_target}>; rel=next"
	+
	+    final_id = min(following, origin_count)
	+    return [github_repo(repo_id) for repo_id in range(last_seen + 1, final_id + 1)]
	+
	+
	+@pytest.fixture()
	+def requests_ratelimited(
	+    num_before_ratelimit: int,
	+    num_ratelimit: Optional[int],
	+    ratelimit_reset: Optional[int],
	+) -> Iterator[requests_mock.Mocker]:
	+    """Mock requests to the GitHub API, answering with a rate-limiting status
	+    code once `num_before_ratelimit` requests have been served.
	+
	+    GitHub does inconsistent rate-limiting:
	+      - Anonymous requests return a 403 status code
	+      - Authenticated requests return a 429 status code, with an
	+        X-Ratelimit-Reset header.
	+
	+    The behavior is driven by three fixtures, each of which can be overridden
	+    with a :func:`pytest.mark.parametrize` parameter:
	+      - num_before_ratelimit: how many requests are answered normally before
	+        the rate limit kicks in
	+      - num_ratelimit: how many requests then get a rate-limited response
	+        (None: all of them)
	+      - ratelimit_reset: the timestamp sent back in X-Ratelimit-Reset, when set
	+
	+    With the default values of these fixtures, every request is rate-limited.
	+    """
	+    served = 0
	+
	+    def response_callback(request, context):
	+        nonlocal served
	+        served += 1
	+
	+        past_successes = served > num_before_ratelimit
	+        still_limited = (
	+            num_ratelimit is None or served <= num_before_ratelimit + num_ratelimit
	+        )
	+        if past_successes and still_limited:
	+            return github_ratelimit_callback(request, context, ratelimit_reset)
	+        return github_response_callback(request, context)
	+
	+    with requests_mock.Mocker() as mocker:
	+        mocker.get(HTTP_GH_API_URL, json=response_callback)
	+        yield mocker
	+
	+
	+@pytest.fixture
	+def github_credentials() -> List[Dict[str, str]]:
	+    """Static list of GitHub credentials, sorted by username: three entries
	+    using the `token` key and three legacy entries using the `password` key"""
	+    current = [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)]
	+    legacy = [
	+        {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"}
	+        for i in range(3)
	+    ]
	+    return sorted(current + legacy, key=lambda c: c["username"])
	+
	+
	+@pytest.fixture
	+def all_tokens(github_credentials) -> List[str]:
	+    """Tokens of the static credentials, in the same order; read from the
	+    `token` key, falling back to `password`"""
	+    tokens = []
	+    for credential in github_credentials:
	+        tokens.append(credential.get("token", credential.get("password")))
	+    return tokens
	+
	+
	+def test_github_session_anonymous_session():
	+    """GitHubSession without credentials should run in anonymous mode, with the
	+    Accept and User-Agent headers set on its session"""
	+    # A plain string: the original `("GitHub Session Test",)` was a one-element
	+    # tuple caused by a stray trailing comma, while `user_agent` is a `str`.
	+    user_agent = "GitHub Session Test"
	+    github_session = GitHubSession(
	+        user_agent=user_agent,
	+    )
	+    assert github_session.anonymous is True
	+
	+    actual_headers = github_session.session.headers
	+    assert actual_headers["Accept"] == "application/vnd.github.v3+json"
	+    assert actual_headers["User-Agent"] == user_agent
	+
	+
	+@pytest.mark.parametrize(
	+    "num_ratelimit", [1]  # return a single rate-limit response, then continue
	+)
	+def test_github_session_ratelimit_once_recovery(
	+    caplog,
	+    requests_ratelimited,
	+    num_ratelimit,
	+    monkeypatch_sleep_calls,
	+    github_credentials,
	+):
	+    """GitHubSession should recover from hitting the rate-limit once"""
	+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")
	+
	+    github_session = GitHubSession(
	+        user_agent="GitHub Session Test", credentials=github_credentials
	+    )
	+
	+    url = f"{HTTP_GH_API_URL}?per_page=1000&since=10"
	+    response = github_session.request(url)
	+    assert response.status_code == 200
	+
	+    # Collect the user of every "token switch" debug message
	+    token_users = [
	+        record.args[0]
	+        for record in caplog.records
	+        if "Using authentication token" in record.message
	+    ]
	+
	+    # check that we used one more token than we saw rate limited requests
	+    assert len(token_users) == 1 + num_ratelimit
	+
	+    # check that we slept for one second between our token uses
	+    assert monkeypatch_sleep_calls == [1]
	+
	+
	+def test_github_session_authenticated_credentials(
	+    caplog, github_credentials, all_tokens
	+):
	+    """GitHubSession should have Authorization headers set in authenticated mode"""
	+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")
	+
	+    github_session = GitHubSession(
	+        "GitHub Session Test", credentials=github_credentials
	+    )
	+
	+    assert github_session.anonymous is False
	+    assert github_session.token_index == 0
	+
	+    # The session shuffles its credentials: compare them regardless of order
	+    session_credentials = sorted(
	+        github_session.credentials, key=lambda t: t["username"]
	+    )
	+    assert session_credentials == github_credentials
	+
	+    acceptable_headers = [f"token {t}" for t in all_tokens]
	+    assert github_session.session.headers["Authorization"] in acceptable_headers
	+
	+
	+@pytest.mark.parametrize(
	+    # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a
	+    # set value for X-Ratelimit-Reset, then resume listing successfully.
	+    "num_before_ratelimit, num_ratelimit, ratelimit_reset",
	+    [(5, 6, 123456)],
	+)
	+def test_github_session_ratelimit_reset_sleep(
	+    caplog,
	+    requests_ratelimited,
	+    monkeypatch_sleep_calls,
	+    num_before_ratelimit,
	+    num_ratelimit,
	+    ratelimit_reset,
	+    github_credentials,
	+):
	+    """GitHubSession should handle rate-limit with authentication tokens."""
	+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")
	+
	+    github_session = GitHubSession(
	+        user_agent="GitHub Session Test", credentials=github_credentials
	+    )
	+
	+    url = f"{HTTP_GH_API_URL}?per_page=1000&since=10"
	+    for _ in range(num_ratelimit):
	+        github_session.request(url)
	+
	+    # We sleep 1 second every time we change credentials, then we sleep until
	+    # ratelimit_reset + 1
	+    token_switch_sleeps = [1] * len(github_credentials)
	+    expected_sleep_calls = token_switch_sleeps + [ratelimit_reset + 1]
	+    assert monkeypatch_sleep_calls == expected_sleep_calls
	+
	+    # An INFO record must report that every token got exhausted
	+    found_exhaustion_message = any(
	+        record.levelname == "INFO"
	+        and "Rate limits exhausted for all tokens" in record.message
	+        for record in caplog.records
	+    )
	+
	+    assert found_exhaustion_message is True

File Metadata

Mime Type: text/plain
Expires: Sun, Aug 17, 8:04 PM (1 w, 1 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3213663

D7840.diffNo OneTemporaryActions

D7840.diffView Options

File Metadata

Event Timeline

D7840.diff
No OneTemporary
Actions

D7840.diff
View Options