diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -149,8 +149,8 @@ "status": "failed", "status_detail": { "loading": [ - "Failed to load branch HEAD for some-url-2: Fail to query " - "'https://deposit.softwareheritage.org/1/private/666/raw/'. Reason: 404" + "Failed to load branch HEAD for some-url-2: 404 Client Error: None " + "for url: https://deposit.softwareheritage.org/1/private/666/raw/" ] }, } diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -11,6 +11,7 @@ from urllib.parse import quote import pytest +from requests.exceptions import HTTPError from swh.loader.exception import NotFound import swh.loader.package @@ -29,10 +30,10 @@ status_code = 404 requests_mock.get(url, status_code=status_code) - with pytest.raises(ValueError) as e: + with pytest.raises(HTTPError) as e: download(url, tmp_path) - assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (url, status_code) + assert e.value.args[0] == f"{status_code} Client Error: None for url: {url}" _filename = "requests-0.0.1.tar.gz" @@ -234,3 +235,39 @@ ("0.0.2", "something", "releases/0.0.2/something"), ]: assert release_name(version, filename) == expected_release + + +@pytest.fixture(autouse=True) +def mock_download_retry_sleep(mocker): + mocker.patch.object(download.retry, "sleep") + + +def test_download_retry(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "text": _data, + "headers": {"content-length": str(len(_data))}, + "status_code": 200, + }, + ], + ) + + _check_download_ok(url, dest=str(tmp_path)) + + +def test_download_retry_reraise(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError): + _check_download_ok(url, dest=str(tmp_path)) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -14,6 +14,11 @@ from urllib.request import urlopen import requests +from requests.exceptions import HTTPError +from tenacity import retry +from tenacity.before_sleep import before_sleep_log +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_exponential from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS @@ -63,6 +68,26 @@ return fname +def _retry_if_throttling(retry_state) -> bool: + """Custom tenacity retry predicate for handling HTTP responses with + status code 429 (too many requests). + """ + attempt = retry_state.outcome + if attempt.failed: + exception = attempt.exception() + return ( + isinstance(exception, HTTPError) and exception.response.status_code == 429 + ) + return False + + +@retry( + retry=_retry_if_throttling, + wait=wait_exponential(exp_base=10), + stop=stop_after_attempt(max_attempt_number=5), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, +) def download( url: str, dest: str, @@ -104,10 +129,7 @@ response_data = itertools.takewhile(bool, chunks) else: response = requests.get(url, **params, timeout=timeout, stream=True) - if response.status_code != 200: - raise ValueError( - "Fail to query '%s'. Reason: %s" % (url, response.status_code) - ) + response.raise_for_status() # update URL to response one as requests follow redirection by default # on GET requests url = response.url