diff --git a/swh/core/github/tests/test_github_utils.py b/swh/core/github/tests/test_github_utils.py index da8bf7b..95fa7ed 100644 --- a/swh/core/github/tests/test_github_utils.py +++ b/swh/core/github/tests/test_github_utils.py @@ -1,160 +1,192 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import pytest from swh.core.github.pytest_plugin import HTTP_GITHUB_API_URL from swh.core.github.utils import ( GitHubSession, _sanitize_github_url, _url_github_api, _url_github_html, get_canonical_github_origin_url, ) KNOWN_GH_REPO = "https://github.com/user/repo" @pytest.mark.parametrize( "user_repo, expected_url", [ ("user/repo.git", KNOWN_GH_REPO), ("user/repo.git/", KNOWN_GH_REPO), ("user/repo/", KNOWN_GH_REPO), ("user/repo", KNOWN_GH_REPO), ("user/repo/.git", KNOWN_GH_REPO), # edge cases ("https://github.com/unknown-page", None), # unknown gh origin returns None ("user/repo/with/some/deps", None), # url kind is not dealt with for now ], ) -def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock): +def test_get_canonical_github_origin_url( + user_repo, expected_url, requests_mock, github_credentials +): """It should return a canonical github origin when it exists, None otherwise""" html_url = _url_github_html(user_repo) api_url = _url_github_api(_sanitize_github_url(user_repo)) if expected_url is not None: status_code = 200 response = {"html_url": _sanitize_github_url(html_url)} else: status_code = 404 response = {} requests_mock.get(api_url, [{"status_code": status_code, "json": response}]) + # anonymous assert get_canonical_github_origin_url(html_url) == expected_url + # with credentials + assert ( + get_canonical_github_origin_url(html_url, credentials=github_credentials) + == expected_url + ) + + # anonymous + assert ( + 
GitHubSession( + user_agent="GitHub Session Test", + ).get_canonical_url(html_url) + == expected_url + ) + + # with credentials + assert ( + GitHubSession( + user_agent="GitHub Session Test", credentials=github_credentials + ).get_canonical_url(html_url) + == expected_url + ) + def test_get_canonical_github_origin_url_not_gh_origin(): """It should return the input url when that origin is not a github one""" url = "https://example.org" assert get_canonical_github_origin_url(url) == url + assert ( + GitHubSession( + user_agent="GitHub Session Test", + ).get_canonical_url(url) + == url + ) + def test_github_session_anonymous_session(): user_agent = ("GitHub Session Test",) github_session = GitHubSession( user_agent=user_agent, ) assert github_session.anonymous is True actual_headers = github_session.session.headers assert actual_headers["Accept"] == "application/vnd.github.v3+json" assert actual_headers["User-Agent"] == user_agent @pytest.mark.parametrize( "num_ratelimit", [1] # return a single rate-limit response, then continue ) def test_github_session_ratelimit_once_recovery( caplog, requests_ratelimited, num_ratelimit, monkeypatch_sleep_calls, github_credentials, ): """GitHubSession should recover from hitting the rate-limit once""" caplog.set_level(logging.DEBUG, "swh.core.github.utils") github_session = GitHubSession( user_agent="GitHub Session Test", credentials=github_credentials ) res = github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10") assert res.status_code == 200 token_users = [] for record in caplog.records: if "Using authentication token" in record.message: token_users.append(record.args[0]) # check that we used one more token than we saw rate limited requests assert len(token_users) == 1 + num_ratelimit # check that we slept for one second between our token uses assert monkeypatch_sleep_calls == [1] def test_github_session_authenticated_credentials( caplog, github_credentials, all_tokens ): """GitHubSession should have Authorization 
headers set in authenticated mode""" caplog.set_level(logging.DEBUG, "swh.core.github.utils") github_session = GitHubSession( "GitHub Session Test", credentials=github_credentials ) assert github_session.anonymous is False assert github_session.token_index == 0 assert ( sorted(github_session.credentials, key=lambda t: t["username"]) == github_credentials ) assert github_session.session.headers["Authorization"] in [ f"token {t}" for t in all_tokens ] @pytest.mark.parametrize( # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a # set value for X-Ratelimit-Reset, then resume listing successfully. "num_before_ratelimit, num_ratelimit, ratelimit_reset", [(5, 6, 123456)], ) def test_github_session_ratelimit_reset_sleep( caplog, requests_ratelimited, monkeypatch_sleep_calls, num_before_ratelimit, num_ratelimit, ratelimit_reset, github_credentials, ): """GitHubSession should handle rate-limit with authentication tokens.""" caplog.set_level(logging.DEBUG, "swh.core.github.utils") github_session = GitHubSession( user_agent="GitHub Session Test", credentials=github_credentials ) for _ in range(num_ratelimit): github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10") # We sleep 1 second every time we change credentials, then we sleep until # ratelimit_reset + 1 expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] assert monkeypatch_sleep_calls == expected_sleep_calls found_exhaustion_message = False for record in caplog.records: if record.levelname == "INFO": if "Rate limits exhausted for all tokens" in record.message: found_exhaustion_message = True break assert found_exhaustion_message is True diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py index c21c8bb..0c9522d 100644 --- a/swh/core/github/utils.py +++ b/swh/core/github/utils.py @@ -1,214 +1,230 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: 
GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import random import re import time from typing import Dict, List, Optional import requests from tenacity import ( retry, retry_any, retry_if_exception_type, retry_if_result, wait_exponential, ) GITHUB_PATTERN = re.compile(r"https?://github.com/(?P<user_repo>.*)") logger = logging.getLogger(__name__) def _url_github_html(user_repo: str) -> str: """Given the user repo, returns the expected github html url.""" return f"https://github.com/{user_repo}" def _url_github_api(user_repo: str) -> str: """Given the user_repo, returns the expected github api url.""" return f"https://api.github.com/repos/{user_repo}" def _sanitize_github_url(url: str) -> str: """Sanitize github url.""" return url.lower().rstrip("/").rstrip(".git").rstrip("/") -def get_canonical_github_origin_url(url: str) -> Optional[str]: +def get_canonical_github_origin_url( + url: str, credentials: Optional[List[Dict[str, str]]] = None +) -> Optional[str]: """Retrieve canonical github url out of an url if any or None otherwise. - This triggers an anonymous http request to the github api url to determine the - canonical repository url. + This triggers an http request to the github api url to determine the canonical + repository url (if no credentials are provided, the http request is anonymous. + Either way that request can be rate-limited by github.) 
""" - url_ = url.lower() - - match = GITHUB_PATTERN.match(url_) - if not match: - return url - - user_repo = _sanitize_github_url(match.groupdict()["user_repo"]) - response = requests.get(_url_github_api(user_repo)) - if response.status_code != 200: - return None - data = response.json() - return data["html_url"] + return GitHubSession( + user_agent="SWH core library", credentials=credentials + ).get_canonical_url(url) class RateLimited(Exception): def __init__(self, response): self.reset_time: Optional[int] # Figure out how long we need to sleep because of that rate limit ratelimit_reset = response.headers.get("X-Ratelimit-Reset") retry_after = response.headers.get("Retry-After") if ratelimit_reset is not None: self.reset_time = int(ratelimit_reset) elif retry_after is not None: self.reset_time = int(time.time()) + int(retry_after) + 1 else: logger.warning( "Received a rate-limit-like status code %s, but no rate-limit " "headers set. Response content: %s", response.status_code, response.content, ) self.reset_time = None self.response = response class MissingRateLimitReset(Exception): pass class GitHubSession: """Manages a :class:`requests.Session` with (optionally) multiple credentials, and cycles through them when reaching rate-limits.""" credentials: Optional[List[Dict[str, str]]] = None def __init__( self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None ) -> None: """Initialize a requests session with the proper headers for requests to GitHub.""" if credentials: creds = credentials.copy() random.shuffle(creds) self.credentials = creds self.session = requests.Session() self.session.headers.update( {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent} ) self.anonymous = not self.credentials if self.anonymous: logger.warning("No tokens set in configuration, using anonymous mode") self.token_index = -1 self.current_user: Optional[str] = None if not self.anonymous: # Initialize the first token value in the session headers 
self.set_next_session_token() def set_next_session_token(self) -> None: """Update the current authentication token with the next one in line.""" assert self.credentials self.token_index = (self.token_index + 1) % len(self.credentials) auth = self.credentials[self.token_index] self.current_user = auth["username"] logger.debug("Using authentication token for user %s", self.current_user) if "password" in auth: token = auth["password"] else: token = auth["token"] self.session.headers.update({"Authorization": f"token {token}"}) @retry( wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_any( # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. # when running the lister on a connection with high latency retry_if_exception_type(requests.exceptions.ChunkedEncodingError), # 502 status codes happen for a Server Error, sometimes retry_if_result(lambda r: r.status_code == 502), ), ) def _request(self, url: str) -> requests.Response: response = self.session.get(url) if ( # GitHub returns inconsistent status codes between unauthenticated # rate limit and authenticated rate limits. Handle both. response.status_code == 429 or (self.anonymous and response.status_code == 403) ): raise RateLimited(response) return response def request(self, url) -> requests.Response: """Repeatedly requests the given URL, cycling through credentials and sleeping if necessary; until either a successful response or :exc:`MissingRateLimitReset` """ # The following for/else loop handles rate limiting; if successful, # it provides the rest of the function with a `response` object. # # If all tokens are rate-limited, we sleep until the reset time, # then `continue` into another iteration of the outer while loop, # attempting to get data from the same URL again. 
while True: max_attempts = len(self.credentials) if self.credentials else 1 reset_times: Dict[int, int] = {} # token index -> time for attempt in range(max_attempts): try: return self._request(url) except RateLimited as e: reset_info = "(unknown reset)" if e.reset_time is not None: reset_times[self.token_index] = e.reset_time reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) if not self.anonymous: logger.info( "Rate limit exhausted for current user %s %s", self.current_user, reset_info, ) # Use next token in line self.set_next_session_token() # Wait one second to avoid triggering GitHub's abuse rate limits time.sleep(1) # All tokens have been rate-limited. What do we do? if not reset_times: logger.warning( "No X-Ratelimit-Reset value found in responses for any token; " "Giving up." ) raise MissingRateLimitReset() sleep_time = max(reset_times.values()) - time.time() + 1 logger.info( "Rate limits exhausted for all tokens. Sleeping for %f seconds.", sleep_time, ) time.sleep(sleep_time) + + def get_canonical_url(self, url: str) -> Optional[str]: + """Retrieve canonical github url out of an url if any or None otherwise. + + This triggers an http request to the github api url to determine the + canonical repository url. + + Returns + The canonical url if any, None otherwise. + """ + url_ = url.lower() + + match = GITHUB_PATTERN.match(url_) + if not match: + return url + + user_repo = _sanitize_github_url(match.groupdict()["user_repo"]) + response = self.request(_url_github_api(user_repo)) + if response.status_code != 200: + return None + data = response.json() + return data["html_url"]