diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py --- a/docs/new_lister_template.py +++ b/docs/new_lister_template.py @@ -11,7 +11,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -77,11 +77,11 @@ def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url, params) -> requests.Response: # Do the network resource request under a retrying decorator # to handle rate limiting and transient errors up to a limit. - # `throttling_retry` by default use the `requests` library to check + # `http_retry` by default use the `requests` library to check # only for rate-limit and a base-10 exponential waiting strategy. # This can be customized by passed waiting, retrying and logging strategies # as functions. See the `tenacity` library documentation. diff --git a/docs/tutorial.rst b/docs/tutorial.rst --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -317,7 +317,7 @@ then immediately stop the listing by doing an equivalent of :py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. -It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from +It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from :py:mod:`swh.lister.utils`. Pagination diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -16,7 +16,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -132,7 +132,7 @@ } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -14,7 +14,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -107,7 +107,7 @@ if username is not None and password is not None: self.session.auth = (username, password) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) + @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def page_request(self, last_repo_cdate: str) -> requests.Response: self.url_params["after"] = last_repo_cdate diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -8,7 +8,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -49,7 +49,7 @@ } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -15,7 +15,7 @@ from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -79,7 +79,7 @@ ) self.base_git_url = base_git_url - @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) + @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.session.get(url) diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -138,7 +138,7 @@ assert stats.pages == 1 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gitea_list_http_error( swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2 ): diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -17,7 +17,7 @@ from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import is_retryable_exception, throttling_retry +from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -12,7 +12,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -116,7 +116,7 @@ def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request( self, url: str, params: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -177,7 +177,7 @@ assert stats.origins == 6 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gogs_list_http_error( swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last ): diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -13,7 +13,7 @@ import requests from tenacity import before_sleep_log -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -87,8 +87,7 @@ ): self.updated = True - @throttling_retry( - retry=retry_policy_generic, + @http_retry( before_sleep=before_sleep_log(logger, logging.WARNING), ) def api_request(self, url: str) -> List[str]: diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -14,7 +14,7 @@ from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import retry_if_exception, throttling_retry +from swh.lister.utils import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -100,7 +100,7 @@ d[attribute_name] = date_last_modified.isoformat() return d - @throttling_retry( + @http_retry( retry=retry_if_restful_error, before_sleep=before_sleep_log(logger, logging.WARNING), ) diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -16,7 +16,7 @@ from tenacity.before_sleep import before_sleep_log from swh.core.github.utils import GitHubSession -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -112,7 +112,7 @@ def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -125,6 +125,11 @@ requests_mock.get(URL_POM_3, content=maven_pom_3) +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(MavenLister.page_request.retry, "sleep") + + def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -12,7 +12,7 @@ from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -95,7 +95,7 @@ params["startkey"] = last_package_id return params - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, last_package_id: str) -> requests.Response: params = self.request_params(last_package_id) logger.debug("Fetching URL %s with params %s", self.url, params) diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -35,6 +35,11 @@ return json.loads(Path(datadir, "npm_incremental_page2.json").read_text()) +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(NpmLister.page_request.retry, "sleep") + + def _check_listed_npm_packages(lister, packages, scheduler_origins): for package in packages: package_name = package["doc"]["name"] diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -10,7 +10,7 @@ from requests.exceptions import HTTPError from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,7 +60,7 @@ } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -13,7 +13,7 @@ from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -88,7 +88,7 @@ def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: @@ -97,7 +97,7 @@ assert isinstance(serial, int) return serial - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -18,7 +18,7 @@ from tenacity.before_sleep import before_sleep_log from swh.core.api.classes import stream_results -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -208,8 +208,7 @@ self._project_last_modified = listed_origins return listed_origins - @throttling_retry( - retry=retry_policy_generic, + @http_retry( before_sleep=before_sleep_log(logger, logging.WARNING), ) def page_request(self, url, params) -> requests.Response: diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,12 +7,7 @@ from requests.status_codes import codes from tenacity.wait import wait_fixed -from swh.lister.utils import ( - MAX_NUMBER_ATTEMPTS, - WAIT_EXP_BASE, - split_range, - throttling_retry, -) +from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range @pytest.mark.parametrize( @@ -51,7 +46,7 @@ TEST_URL = "https://example.og/api/repositories" -@throttling_retry() +@http_retry() def make_request(): response = requests.get(TEST_URL) response.raise_for_status() @@ -62,13 +57,22 @@ mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) -def test_throttling_retry(requests_mock, mocker): +@pytest.mark.parametrize( + "status_code", + [ + codes.too_many_requests, + codes.internal_server_error, + codes.bad_gateway, + codes.service_unavailable, + ], +) +def test_http_retry(requests_mock, mocker, status_code): data = {"result": {}} requests_mock.get( TEST_URL, [ - {"status_code": codes.too_many_requests}, - {"status_code": codes.too_many_requests}, + {"status_code": status_code}, + {"status_code": status_code}, {"status_code": codes.ok, "json": data}, ], ) @@ -82,7 +86,7 @@ assert response.json() == data -def test_throttling_retry_max_attemps(requests_mock, mocker): +def test_http_retry_max_attemps(requests_mock, mocker): requests_mock.get( TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), @@ -102,14 +106,14 @@ ) -@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE)) +@http_retry(wait=wait_fixed(WAIT_EXP_BASE)) def make_request_wait_fixed(): response = requests.get(TEST_URL) response.raise_for_status() return response -def test_throttling_retry_wait_fixed(requests_mock, mocker): +def test_http_retry_wait_fixed(requests_mock, mocker): requests_mock.get( TEST_URL, [ diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -11,7 +11,7 @@ import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -65,7 +65,7 @@ } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py --- a/swh/lister/tuleap/tests/test_lister.py +++ b/swh/lister/tuleap/tests/test_lister.py @@ -92,6 +92,11 @@ return text, headers, page_results, origin_urls +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(TuleapLister.page_request.retry, "sleep") + + def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. diff --git a/swh/lister/utils.py b/swh/lister/utils.py --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -66,14 +66,6 @@ return False -def retry_if_throttling(retry_state) -> bool: - """ - Custom tenacity retry predicate for handling HTTP responses with - status code 429 (too many requests). - """ - return retry_if_exception(retry_state, is_throttling_exception) - - def retry_policy_generic(retry_state) -> bool: """ Custom tenacity retry predicate for handling failed requests: @@ -90,8 +82,8 @@ MAX_NUMBER_ATTEMPTS = 5 -def throttling_retry( - retry=retry_if_throttling, +def http_retry( + retry=retry_policy_generic, wait=wait_exponential(exp_base=WAIT_EXP_BASE), stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), **retry_args,