diff --git a/swh/storage/proxies/retry.py b/swh/storage/proxies/retry.py --- a/swh/storage/proxies/retry.py +++ b/swh/storage/proxies/retry.py @@ -6,8 +6,10 @@ import logging import traceback -from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity import RetryCallState, retry, stop_after_attempt, wait_random_exponential +from tenacity.wait import wait_base +from swh.core.api import TransientRemoteException from swh.storage import get_storage from swh.storage.exc import StorageArgumentException from swh.storage.interface import StorageInterface @@ -15,9 +17,10 @@ logger = logging.getLogger(__name__) -def should_retry_adding(retry_state) -> bool: +def should_retry_adding(retry_state: RetryCallState) -> bool: """Retry if the error/exception is (probably) not about a caller error""" attempt = retry_state.outcome + assert attempt if attempt.failed: error = attempt.exception() @@ -48,9 +51,25 @@ return False +class wait_transient_exceptions(wait_base): + """Wait longer when servers return HTTP 503.""" + + def __init__(self, wait: float) -> None: + self.wait = wait + + def __call__(self, retry_state: RetryCallState) -> float: + attempt = retry_state.outcome + assert attempt + + if attempt.failed and isinstance(attempt.exception(), TransientRemoteException): + return self.wait + else: + return 0.0 + + swh_retry = retry( retry=should_retry_adding, - wait=wait_random_exponential(multiplier=1, max=10), + wait=wait_random_exponential(multiplier=1, max=10) + wait_transient_exceptions(10), stop=stop_after_attempt(3), reraise=True, ) diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -9,6 +9,7 @@ import psycopg2 import pytest +from swh.core.api import TransientRemoteException from swh.storage.exc import HashCollision, StorageArgumentException from swh.storage.utils import now @@ -77,6 +78,7 @@ sample_content = sample_data.content + sleep = mocker.patch("time.sleep") content = swh_storage.content_get_data(sample_content.sha1) assert content is None @@ -91,6 +93,52 @@ ] ) + assert len(sleep.mock_calls) == 2 + (_name, args1, _kwargs) = sleep.mock_calls[0] + (_name, args2, _kwargs) = sleep.mock_calls[1] + assert 0 < args1[0] < 1 + assert 0 < args2[0] < 2 + + +def test_retrying_proxy_storage_content_add_with_retry_of_transient( + monkeypatch_sleep, + swh_storage, + sample_data, + mocker, +): + """Multiple retries for hash collision and psycopg2 error but finally ok + after many attempts""" + mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.content_add") + mock_memory.side_effect = [ + TransientRemoteException("temporary failure"), + TransientRemoteException("temporary failure"), + # ok then! + {"content:add": 1}, + ] + + sample_content = sample_data.content + + content = swh_storage.content_get_data(sample_content.sha1) + assert content is None + + sleep = mocker.patch("time.sleep") + s = swh_storage.content_add([sample_content]) + assert s == {"content:add": 1} + + mock_memory.assert_has_calls( + [ + call([sample_content]), + call([sample_content]), + call([sample_content]), + ] + ) + + assert len(sleep.mock_calls) == 2 + (_name, args1, _kwargs) = sleep.mock_calls[0] + (_name, args2, _kwargs) = sleep.mock_calls[1] + assert 10 < args1[0] < 11 + assert 10 < args2[0] < 12 + def test_retrying_proxy_swh_storage_content_add_failure( swh_storage, sample_data, mocker