diff --git a/requirements.txt b/requirements.txt index 9ae9172..bd9bfc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ SQLAlchemy arrow python_debian requests setuptools xmltodict iso8601 beautifulsoup4 pytz -launchpadlib \ No newline at end of file +launchpadlib +tenacity diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index 0d965e3..bf804fc 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,41 +1,133 @@ # Copyright (C) 2018-2020 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest +import requests +from requests.status_codes import codes +from tenacity.wait import wait_fixed from testing.postgresql import Postgresql -from swh.lister import utils +from swh.lister.utils import ( + MAX_NUMBER_ATTEMPTS, + WAIT_EXP_BASE, + split_range, + throttling_retry, +) @pytest.mark.parametrize( "total_pages,nb_pages,expected_ranges", [ (14, 5, [(0, 4), (5, 9), (10, 14)]), (19, 10, [(0, 9), (10, 19)]), (20, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)]), (21, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21),],), ], ) def test_split_range(total_pages, nb_pages, expected_ranges): - actual_ranges = list(utils.split_range(total_pages, nb_pages)) + actual_ranges = list(split_range(total_pages, nb_pages)) assert actual_ranges == expected_ranges @pytest.mark.parametrize("total_pages,nb_pages", [(None, 1), (100, None)]) def test_split_range_errors(total_pages, nb_pages): for total_pages, nb_pages in [(None, 1), (100, None)]: with pytest.raises(TypeError): - next(utils.split_range(total_pages, nb_pages)) + next(split_range(total_pages, nb_pages)) def init_db(): """Factorize the db_url instantiation Returns: db object to ease db manipulation """ initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"] initdb_args = " ".join([initdb_args, "-E UTF-8"]) 
return Postgresql(initdb_args=initdb_args) + + +TEST_URL = "https://example.og/api/repositories" + + +@throttling_retry() +def make_request(): + response = requests.get(TEST_URL) + response.raise_for_status() + return response + + +def _assert_sleep_calls(mocker, mock_sleep, sleep_params): + try: + mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) + except AssertionError: + # tenacity < 5.1 has a different behavior for wait_exponential + # https://github.com/jd/tenacity/commit/aac4307a0aa30d7befd0ebe4212ee4fc69083a95 + mock_sleep.assert_has_calls( + [mocker.call(param * WAIT_EXP_BASE) for param in sleep_params] + ) + + +def test_throttling_retry(requests_mock, mocker): + data = {"result": {}} + requests_mock.get( + TEST_URL, + [ + {"status_code": codes.too_many_requests}, + {"status_code": codes.too_many_requests}, + {"status_code": codes.ok, "json": data}, + ], + ) + + mock_sleep = mocker.patch.object(make_request.retry, "sleep") + + response = make_request() + + _assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE]) + + assert response.json() == data + + +def test_throttling_retry_max_attemps(requests_mock, mocker): + requests_mock.get( + TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), + ) + + mock_sleep = mocker.patch.object(make_request.retry, "sleep") + + with pytest.raises(requests.exceptions.HTTPError) as e: + make_request() + + assert e.value.response.status_code == codes.too_many_requests + + _assert_sleep_calls( + mocker, + mock_sleep, + [float(WAIT_EXP_BASE ** i) for i in range(MAX_NUMBER_ATTEMPTS - 1)], + ) + + +@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE)) +def make_request_wait_fixed(): + response = requests.get(TEST_URL) + response.raise_for_status() + return response + + +def test_throttling_retry_wait_fixed(requests_mock, mocker): + requests_mock.get( + TEST_URL, + [ + {"status_code": codes.too_many_requests}, + {"status_code": codes.too_many_requests}, + {"status_code": codes.ok}, + 
], + ) + + mock_sleep = mocker.patch.object(make_request_wait_fixed.retry, "sleep") + + make_request_wait_fixed() + + _assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2) diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 465bdd8..c7b6a4c 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,29 +1,105 @@ -# Copyright (C) 2018-2020 the Software Heritage developers +# Copyright (C) 2018-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterator, Tuple +from requests.exceptions import HTTPError +from requests.status_codes import codes +from tenacity import retry as tenacity_retry +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_exponential + def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can have one more element. >>> list(split_range(19, 10)) [(0, 9), (10, 19)] >>> list(split_range(20, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)] >>> list(split_range(21, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21)] """ prev_index = None for index in range(0, total_pages, nb_pages): if index is not None and prev_index is not None: yield prev_index, index - 1 prev_index = index if index != total_pages: yield index, total_pages + + +def is_throttling_exception(e: Exception) -> bool: + """ + Checks if an exception is a requests.exceptions.HTTPError for + a response with status code 429 (too many requests). + """ + return ( + isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests + ) + + +def retry_attempt(retry_state): + """ + Utility function to get last retry attempt info based on the + tenacity version (as debian buster packages version 4.12). 
+ """ + try: + attempt = retry_state.outcome + except AttributeError: + # tenacity < 5.0 + attempt = retry_state + return attempt + + +def retry_if_throttling(retry_state) -> bool: + """ + Custom tenacity retry predicate for handling HTTP responses with + status code 429 (too many requests). + """ + attempt = retry_attempt(retry_state) + if attempt.failed: + exception = attempt.exception() + return is_throttling_exception(exception) + return False + + +WAIT_EXP_BASE = 10 +MAX_NUMBER_ATTEMPTS = 5 + + +def throttling_retry( + retry=retry_if_throttling, + wait=wait_exponential(exp_base=WAIT_EXP_BASE), + stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), + **retry_args, +): + """ + Decorator based on `tenacity` for retrying a function possibly raising + requests.exceptions.HTTPError for status code 429 (too many requests). + + It provides a default configuration that should work properly in most + cases but all `tenacity.retry` parameters can also be overridden in client + code. + + When the maximum number of attempts is reached, the HTTPError exception will then + be reraised. + + Args: + retry: function defining request retry condition (default to 429 status code) + https://tenacity.readthedocs.io/en/latest/#whether-to-retry + + wait: function defining wait strategy before retrying (default to exponential + backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying + + stop: function defining when to stop retrying (default after 5 attempts) + https://tenacity.readthedocs.io/en/latest/#stopping + + """ + return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args)