Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9312159
D7630.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D7630.diff
View Options
diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py
--- a/swh/lister/github/lister.py
+++ b/swh/lister/github/lister.py
@@ -30,19 +30,6 @@
logger = logging.getLogger(__name__)
-def init_session(session: Optional[requests.Session] = None) -> requests.Session:
- """Initialize a requests session with the proper headers for requests to
- GitHub."""
- if not session:
- session = requests.Session()
-
- session.headers.update(
- {"Accept": "application/vnd.github.v3+json", "User-Agent": USER_AGENT}
- )
-
- return session
-
-
class RateLimited(Exception):
def __init__(self, response):
self.reset_time: Optional[int]
@@ -65,30 +52,130 @@
self.response = response
-@retry(
- wait=wait_exponential(multiplier=1, min=4, max=10),
- retry=retry_any(
- # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
- # when running the lister on a connection with high latency
- retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
- # 502 status codes happen for a Server Error, sometimes
- retry_if_result(lambda r: r.status_code == 502),
- ),
-)
-def github_request(url: str, session: requests.Session) -> requests.Response:
- response = session.get(url)
+class MissingRateLimitReset(Exception):
+ pass
- anonymous = "Authorization" not in session.headers
- if (
- # GitHub returns inconsistent status codes between unauthenticated
- # rate limit and authenticated rate limits. Handle both.
- response.status_code == 429
- or (anonymous and response.status_code == 403)
- ):
- raise RateLimited(response)
+class GitHubSession:
+ """Manages a :class:`requests.Session` with (optionally) multiple credentials,
+ and cycles through them when reaching rate-limits."""
+
+ def __init__(self, credentials: Optional[List[Dict[str, str]]] = None) -> None:
+ """Initialize a requests session with the proper headers for requests to
+ GitHub."""
+ self.credentials = credentials
+ if self.credentials:
+ random.shuffle(self.credentials)
+
+ self.session = requests.Session()
+
+ self.session.headers.update(
+ {"Accept": "application/vnd.github.v3+json", "User-Agent": USER_AGENT}
+ )
+
+ self.anonymous = not self.credentials
+
+ if self.anonymous:
+ logger.warning("No tokens set in configuration, using anonymous mode")
+
+ self.token_index = -1
+ self.current_user: Optional[str] = None
+
+ if not self.anonymous:
+ # Initialize the first token value in the session headers
+ self.set_next_session_token()
+
+ def set_next_session_token(self) -> None:
+ """Update the current authentication token with the next one in line."""
+
+ assert self.credentials
+
+ self.token_index = (self.token_index + 1) % len(self.credentials)
+
+ auth = self.credentials[self.token_index]
- return response
+ self.current_user = auth["username"]
+ logger.debug("Using authentication token for user %s", self.current_user)
+
+ if "password" in auth:
+ token = auth["password"]
+ else:
+ token = auth["token"]
+
+ self.session.headers.update({"Authorization": f"token {token}"})
+
+ @retry(
+ wait=wait_exponential(multiplier=1, min=4, max=10),
+ retry=retry_any(
+ # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
+ # when running the lister on a connection with high latency
+ retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
+ # 502 status codes happen for a Server Error, sometimes
+ retry_if_result(lambda r: r.status_code == 502),
+ ),
+ )
+ def _request(self, url: str) -> requests.Response:
+ response = self.session.get(url)
+
+ if (
+ # GitHub returns inconsistent status codes between unauthenticated
+ # rate limit and authenticated rate limits. Handle both.
+ response.status_code == 429
+ or (self.anonymous and response.status_code == 403)
+ ):
+ raise RateLimited(response)
+
+ return response
+
+ def request(self, url) -> requests.Response:
+ """Repeatedly requests the given URL, cycling through credentials and sleeping
+ if necessary; until either a successful response or :exc:`MissingRateLimitReset`
+ """
+ # The following for/else loop handles rate limiting; if successful,
+ # it provides the rest of the function with a `response` object.
+ #
+ # If all tokens are rate-limited, we sleep until the reset time,
+ # then `continue` into another iteration of the outer while loop,
+ # attempting to get data from the same URL again.
+
+ while True:
+ max_attempts = len(self.credentials) if self.credentials else 1
+ reset_times: Dict[int, int] = {} # token index -> time
+ for attempt in range(max_attempts):
+ try:
+ return self._request(url)
+ except RateLimited as e:
+ reset_info = "(unknown reset)"
+ if e.reset_time is not None:
+ reset_times[self.token_index] = e.reset_time
+ reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
+
+ if not self.anonymous:
+ logger.info(
+ "Rate limit exhausted for current user %s %s",
+ self.current_user,
+ reset_info,
+ )
+ # Use next token in line
+ self.set_next_session_token()
+ # Wait one second to avoid triggering GitHub's abuse rate limits
+ time.sleep(1)
+
+ # All tokens have been rate-limited. What do we do?
+
+ if not reset_times:
+ logger.warning(
+ "No X-Ratelimit-Reset value found in responses for any token; "
+ "Giving up."
+ )
+ raise MissingRateLimitReset()
+
+ sleep_time = max(reset_times.values()) - time.time() + 1
+ logger.info(
+ "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
+ sleep_time,
+ )
+ time.sleep(sleep_time)
@dataclass
@@ -156,37 +243,7 @@
self.relisting = self.first_id is not None or self.last_id is not None
- self.session = init_session()
-
- random.shuffle(self.credentials)
-
- self.anonymous = not self.credentials
-
- if self.anonymous:
- logger.warning("No tokens set in configuration, using anonymous mode")
-
- self.token_index = -1
- self.current_user: Optional[str] = None
-
- if not self.anonymous:
- # Initialize the first token value in the session headers
- self.set_next_session_token()
-
- def set_next_session_token(self) -> None:
- """Update the current authentication token with the next one in line."""
-
- self.token_index = (self.token_index + 1) % len(self.credentials)
-
- auth = self.credentials[self.token_index]
- if "password" in auth:
- token = auth["password"]
- else:
- token = auth["token"]
-
- self.current_user = auth["username"]
- logger.debug("Using authentication token for user %s", self.current_user)
-
- self.session.headers.update({"Authorization": f"token {token}"})
+ self.github_session = GitHubSession(credentials=self.credentials)
def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
return GitHubListerState(**d)
@@ -206,54 +263,11 @@
while self.last_id is None or current_id < self.last_id:
logger.debug("Getting page %s", current_url)
- # The following for/else loop handles rate limiting; if successful,
- # it provides the rest of the function with a `response` object.
- #
- # If all tokens are rate-limited, we sleep until the reset time,
- # then `continue` into another iteration of the outer while loop,
- # attempting to get data from the same URL again.
-
- max_attempts = 1 if self.anonymous else len(self.credentials)
- reset_times: Dict[int, int] = {} # token index -> time
- for attempt in range(max_attempts):
- try:
- response = github_request(current_url, session=self.session)
- break
- except RateLimited as e:
- reset_info = "(unknown reset)"
- if e.reset_time is not None:
- reset_times[self.token_index] = e.reset_time
- reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
-
- if not self.anonymous:
- logger.info(
- "Rate limit exhausted for current user %s %s",
- self.current_user,
- reset_info,
- )
- # Use next token in line
- self.set_next_session_token()
- # Wait one second to avoid triggering GitHub's abuse rate limits
- time.sleep(1)
- else:
- # All tokens have been rate-limited. What do we do?
-
- if not reset_times:
- logger.warning(
- "No X-Ratelimit-Reset value found in responses for any token; "
- "Giving up."
- )
- break
-
- sleep_time = max(reset_times.values()) - time.time() + 1
- logger.info(
- "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
- sleep_time,
- )
- time.sleep(sleep_time)
- # This goes back to the outer page-by-page loop, doing one more
- # iteration on the same page
- continue
+ try:
+ response = self.github_session.request(current_url)
+ except MissingRateLimitReset:
+ # Give up
+ break
# We've successfully retrieved a (non-ratelimited) `response`. We
# still need to check it for validity.
diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py
--- a/swh/lister/github/tests/test_lister.py
+++ b/swh/lister/github/tests/test_lister.py
@@ -270,7 +270,7 @@
caplog.set_level(logging.DEBUG, "swh.lister.github.lister")
lister = GitHubLister(scheduler=swh_scheduler)
- assert lister.anonymous
+ assert lister.github_session.anonymous
assert "using anonymous mode" in caplog.records[-1].message
caplog.clear()
@@ -315,9 +315,9 @@
caplog.set_level(logging.DEBUG, "swh.lister.github.lister")
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
- assert lister.token_index == 0
+ assert lister.github_session.token_index == 0
assert sorted(lister.credentials, key=lambda t: t["username"]) == github_credentials
- assert lister.session.headers["Authorization"] in [
+ assert lister.github_session.session.headers["Authorization"] in [
"token %s" % t for t in all_tokens
]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jul 2, 10:44 AM (2 w, 10 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220196
Attached To
D7630: github: Refactor rate-limiting out of the GitHubLister class
Event Timeline
Log In to Comment