diff --git a/swh/core/github/tests/test_github_utils.py b/swh/core/github/tests/test_github_utils.py
index d9d940c..c7b7087 100644
--- a/swh/core/github/tests/test_github_utils.py
+++ b/swh/core/github/tests/test_github_utils.py
@@ -1,199 +1,218 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 
 import pytest
 
 from swh.core.github.pytest_plugin import HTTP_GITHUB_API_URL
 from swh.core.github.utils import (
     GitHubSession,
     _sanitize_github_url,
     _url_github_api,
     get_canonical_github_origin_url,
 )
 
 KNOWN_GH_REPO = "https://github.com/user/repo"
 
 
-def _url_github_html(user_repo: str, protocol: str = "https") -> str:
-    """Given the user repo, returns the expected github html url."""
-    return f"{protocol}://github.com/{user_repo}"
-
-
 @pytest.mark.parametrize(
     "user_repo, expected_url",
     [
         ("user/repo.git", KNOWN_GH_REPO),
         ("user/repo.git/", KNOWN_GH_REPO),
         ("user/repo/", KNOWN_GH_REPO),
         ("user/repo", KNOWN_GH_REPO),
         ("user/repo/.git", KNOWN_GH_REPO),
         ("unknown/page", None),  # unknown gh origin returns None
         ("user/with/deps", None),  # url kind is not dealt with
     ],
 )
 def test_get_canonical_github_origin_url(
     user_repo, expected_url, requests_mock, github_credentials
 ):
     """It should return a canonical github origin when it exists, None otherwise"""
-    for protocol in ["https", "git", "http"]:
-        html_input_url = _url_github_html(user_repo, protocol=protocol)
-        html_url = _url_github_html(user_repo)
-        api_url = _url_github_api(_sanitize_github_url(user_repo))
-
-        if expected_url is not None:
-            status_code = 200
-            response = {"html_url": _sanitize_github_url(html_url)}
-        else:
-            status_code = 404
-            response = {}
-
-        requests_mock.get(api_url, [{"status_code": status_code, "json": response}])
-
-        # anonymous
-        assert get_canonical_github_origin_url(html_input_url) == expected_url
-
-        # with credentials
-        assert (
-            get_canonical_github_origin_url(
-                html_input_url, credentials=github_credentials
+    for separator in ["/", ":"]:
+        for prefix in [
+            "http://",
+            "https://",
+            "git://",
+            "ssh://",
+            "//",
+            "git@",
+            "ssh://git@",
+            "https://${env.GITHUB_TOKEN_USR}:${env.GITHUB_TOKEN_PSW}@",
+            "[fetch=]git@",
+        ]:
+            html_input_url = f"{prefix}github.com{separator}{user_repo}"
+            html_url = f"https://github.com/{user_repo}"
+            api_url = _url_github_api(_sanitize_github_url(user_repo))
+
+            if expected_url is not None:
+                status_code = 200
+                response = {"html_url": _sanitize_github_url(html_url)}
+            else:
+                status_code = 404
+                response = {}
+
+            requests_mock.get(api_url, [{"status_code": status_code, "json": response}])
+
+            # anonymous
+            assert get_canonical_github_origin_url(html_input_url) == expected_url
+
+            # with credentials
+            assert (
+                get_canonical_github_origin_url(
+                    html_input_url, credentials=github_credentials
+                )
+                == expected_url
+            )
+
+            # anonymous
+            assert (
+                GitHubSession(
+                    user_agent="GitHub Session Test",
+                ).get_canonical_url(html_input_url)
+                == expected_url
+            )
+
+            # with credentials
+            assert (
+                GitHubSession(
+                    user_agent="GitHub Session Test", credentials=github_credentials
+                ).get_canonical_url(html_input_url)
+                == expected_url
             )
-            == expected_url
-        )
-
-        # anonymous
-        assert (
-            GitHubSession(
-                user_agent="GitHub Session Test",
-            ).get_canonical_url(html_input_url)
-            == expected_url
-        )
-
-        # with credentials
-        assert (
-            GitHubSession(
-                user_agent="GitHub Session Test", credentials=github_credentials
-            ).get_canonical_url(html_input_url)
-            == expected_url
-        )
 
 
 def test_get_canonical_github_origin_url_not_gh_origin():
     """It should return the input url when that origin is not a github one"""
     url = "https://example.org"
     assert get_canonical_github_origin_url(url) == url
 
     assert (
         GitHubSession(
             user_agent="GitHub Session Test",
         ).get_canonical_url(url)
         == url
     )
 
 
+@pytest.mark.parametrize(
+    "user_repo, expected",
+    [
+        ("user/test.git", "user/test"),
+        ("user/digit", "user/digit"),
+        ("User/Repo.git/", "user/repo"),
+    ],
+)
+def test_sanitize_github_url(user_repo, expected):
+    """It should only strip a literal ``.git`` suffix, never trailing letters"""
+    assert _sanitize_github_url(user_repo) == expected
+
+
 def test_github_session_anonymous_session():
     user_agent = ("GitHub Session Test",)
     github_session = GitHubSession(
         user_agent=user_agent,
     )
     assert github_session.anonymous is True
 
     actual_headers = github_session.session.headers
     assert actual_headers["Accept"] == "application/vnd.github.v3+json"
     assert actual_headers["User-Agent"] == user_agent
 
 
 @pytest.mark.parametrize(
     "num_ratelimit", [1]  # return a single rate-limit response, then continue
 )
 def test_github_session_ratelimit_once_recovery(
     caplog,
     requests_ratelimited,
     num_ratelimit,
     monkeypatch_sleep_calls,
     github_credentials,
 ):
     """GitHubSession should recover from hitting the rate-limit once"""
     caplog.set_level(logging.DEBUG, "swh.core.github.utils")
 
     github_session = GitHubSession(
         user_agent="GitHub Session Test", credentials=github_credentials
     )
 
     res = github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10")
     assert res.status_code == 200
 
     token_users = []
     for record in caplog.records:
         if "Using authentication token" in record.message:
             token_users.append(record.args[0])
 
     # check that we used one more token than we saw rate limited requests
     assert len(token_users) == 1 + num_ratelimit
 
     # check that we slept for one second between our token uses
     assert monkeypatch_sleep_calls == [1]
 
 
 def test_github_session_authenticated_credentials(
     caplog, github_credentials, all_tokens
 ):
     """GitHubSession should have Authorization headers set in authenticated mode"""
     caplog.set_level(logging.DEBUG, "swh.core.github.utils")
 
     github_session = GitHubSession(
         "GitHub Session Test", credentials=github_credentials
     )
 
     assert github_session.anonymous is False
     assert github_session.token_index == 0
     assert (
         sorted(github_session.credentials, key=lambda t: t["username"])
         == github_credentials
     )
     assert github_session.session.headers["Authorization"] in [
         f"token {t}" for t in all_tokens
     ]
 
 
 @pytest.mark.parametrize(
     # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a
     # set value for X-Ratelimit-Reset, then resume listing successfully.
     "num_before_ratelimit, num_ratelimit, ratelimit_reset",
     [(5, 6, 123456)],
 )
 def test_github_session_ratelimit_reset_sleep(
     caplog,
     requests_ratelimited,
     monkeypatch_sleep_calls,
     num_before_ratelimit,
     num_ratelimit,
     ratelimit_reset,
     github_credentials,
 ):
     """GitHubSession should handle rate-limit with authentication tokens."""
     caplog.set_level(logging.DEBUG, "swh.core.github.utils")
 
     github_session = GitHubSession(
         user_agent="GitHub Session Test", credentials=github_credentials
     )
 
     for _ in range(num_ratelimit):
         github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10")
 
     # We sleep 1 second every time we change credentials, then we sleep until
     # ratelimit_reset + 1
     expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1]
     assert monkeypatch_sleep_calls == expected_sleep_calls
 
     found_exhaustion_message = False
     for record in caplog.records:
         if record.levelname == "INFO":
             if "Rate limits exhausted for all tokens" in record.message:
                 found_exhaustion_message = True
                 break
 
     assert found_exhaustion_message is True
diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py
index 867b2e4..80ffa2b 100644
--- a/swh/core/github/utils.py
+++ b/swh/core/github/utils.py
@@ -1,225 +1,234 @@
 # Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 import random
 import re
 import time
 from typing import Dict, List, Optional
 
 import requests
 from tenacity import (
     retry,
     retry_any,
     retry_if_exception_type,
     retry_if_result,
     wait_exponential,
 )
 
-GITHUB_PATTERN = re.compile(r"(git|https?)://github.com/(?P<user_repo>.*)")
+GITHUB_PATTERN = re.compile(
+    r"(//|git://|git@|git//|https?://|ssh://|.*@)github\.com[/:](?P<user_repo>.*)"
+)
 
 logger = logging.getLogger(__name__)
 
 
 def _url_github_api(user_repo: str) -> str:
     """Given the user_repo, returns the expected github api url."""
     return f"https://api.github.com/repos/{user_repo}"
 
 
 def _sanitize_github_url(url: str) -> str:
-    """Sanitize github url."""
-    return url.lower().rstrip("/").rstrip(".git").rstrip("/")
+    """Sanitize github url: lowercase it, then drop trailing slashes and at most
+    one trailing ``.git`` suffix."""
+    url = url.lower().rstrip("/")
+    # NB: rstrip(".git") strips a *character set*, mangling "user/test.git"
+    if url.endswith(".git"):
+        url = url[: -len(".git")]
+    return url.rstrip("/")
 
 
 def get_canonical_github_origin_url(
     url: str, credentials: Optional[List[Dict[str, str]]] = None
 ) -> Optional[str]:
     """Retrieve canonical github url out of an url if any or None otherwise.
 
     This triggers an http request to the github api url to determine the canonical
     repository url (if no credentials is provided, the http request is anonymous.
     Either way that request can be rate-limited by github.)
 
     """
     return GitHubSession(
         user_agent="SWH core library", credentials=credentials
     ).get_canonical_url(url)
 
 
 class RateLimited(Exception):
     def __init__(self, response):
         self.reset_time: Optional[int]
 
         # Figure out how long we need to sleep because of that rate limit
         ratelimit_reset = response.headers.get("X-Ratelimit-Reset")
         retry_after = response.headers.get("Retry-After")
         if ratelimit_reset is not None:
             self.reset_time = int(ratelimit_reset)
         elif retry_after is not None:
             self.reset_time = int(time.time()) + int(retry_after) + 1
         else:
             logger.warning(
                 "Received a rate-limit-like status code %s, but no rate-limit "
                 "headers set. Response content: %s",
                 response.status_code,
                 response.content,
             )
             self.reset_time = None
 
         self.response = response
 
 
 class MissingRateLimitReset(Exception):
     pass
 
 
 class GitHubSession:
     """Manages a :class:`requests.Session` with (optionally) multiple credentials,
     and cycles through them when reaching rate-limits."""
 
     credentials: Optional[List[Dict[str, str]]] = None
 
     def __init__(
         self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None
     ) -> None:
         """Initialize a requests session with the proper headers for requests to
         GitHub."""
         if credentials:
             creds = credentials.copy()
             random.shuffle(creds)
             self.credentials = creds
 
         self.session = requests.Session()
 
         self.session.headers.update(
             {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent}
         )
 
         self.anonymous = not self.credentials
 
         if self.anonymous:
             logger.warning("No tokens set in configuration, using anonymous mode")
 
         self.token_index = -1
         self.current_user: Optional[str] = None
 
         if not self.anonymous:
             # Initialize the first token value in the session headers
             self.set_next_session_token()
 
     def set_next_session_token(self) -> None:
         """Update the current authentication token with the next one in line."""
 
         assert self.credentials
 
         self.token_index = (self.token_index + 1) % len(self.credentials)
 
         auth = self.credentials[self.token_index]
 
         self.current_user = auth["username"]
         logger.debug("Using authentication token for user %s", self.current_user)
 
         if "password" in auth:
             token = auth["password"]
         else:
             token = auth["token"]
 
         self.session.headers.update({"Authorization": f"token {token}"})
 
     @retry(
         wait=wait_exponential(multiplier=1, min=4, max=10),
         retry=retry_any(
             # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
             # when running the lister on a connection with high latency
             retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
             # 502 status codes happen for a Server Error, sometimes
             retry_if_result(lambda r: r.status_code == 502),
         ),
     )
     def _request(self, url: str) -> requests.Response:
         response = self.session.get(url)
 
         if (
             # GitHub returns inconsistent status codes between unauthenticated
             # rate limit and authenticated rate limits. Handle both.
             response.status_code == 429
             or (self.anonymous and response.status_code == 403)
         ):
             raise RateLimited(response)
 
         return response
 
     def request(self, url) -> requests.Response:
         """Repeatedly requests the given URL, cycling through credentials and sleeping
         if necessary; until either a successful response or :exc:`MissingRateLimitReset`
         """
 
         # The following for/else loop handles rate limiting; if successful,
         # it provides the rest of the function with a `response` object.
         #
         # If all tokens are rate-limited, we sleep until the reset time,
         # then `continue` into another iteration of the outer while loop,
         # attempting to get data from the same URL again.
 
         while True:
             max_attempts = len(self.credentials) if self.credentials else 1
             reset_times: Dict[int, int] = {}  # token index -> time
             for attempt in range(max_attempts):
                 try:
                     return self._request(url)
                 except RateLimited as e:
                     reset_info = "(unknown reset)"
                     if e.reset_time is not None:
                         reset_times[self.token_index] = e.reset_time
                         reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
 
                     if not self.anonymous:
                         logger.info(
                             "Rate limit exhausted for current user %s %s",
                             self.current_user,
                             reset_info,
                         )
                         # Use next token in line
                         self.set_next_session_token()
                         # Wait one second to avoid triggering GitHub's abuse rate limits
                         time.sleep(1)
 
             # All tokens have been rate-limited. What do we do?
 
             if not reset_times:
                 logger.warning(
                     "No X-Ratelimit-Reset value found in responses for any token; "
                     "Giving up."
                 )
                 raise MissingRateLimitReset()
 
             sleep_time = max(reset_times.values()) - time.time() + 1
             logger.info(
                 "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
                 sleep_time,
             )
             time.sleep(sleep_time)
 
     def get_canonical_url(self, url: str) -> Optional[str]:
         """Retrieve canonical github url out of an url if any or None otherwise.
 
         This triggers an http request to the github api url to determine the
         canonical repository url.
 
         Returns
-            The canonical url if any, None otherwise.
+            The canonical url when the GitHub api knows the repository, None when
+            it does not (non-200 response); an url which is not a GitHub one is
+            returned as is.
         """
         url_ = url.lower()
 
         match = GITHUB_PATTERN.match(url_)
         if not match:
             return url
 
         user_repo = _sanitize_github_url(match.groupdict()["user_repo"])
         response = self.request(_url_github_api(user_repo))
         if response.status_code != 200:
             return None
         data = response.json()
         return data["html_url"]