Page MenuHomeSoftware Heritage

D8519.diff
No OneTemporary

D8519.diff

diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py
--- a/docs/new_lister_template.py
+++ b/docs/new_lister_template.py
@@ -11,7 +11,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -77,11 +77,11 @@
def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
return asdict(state)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
# Do the network resource request under a retrying decorator
# to handle rate limiting and transient errors up to a limit.
- # `throttling_retry` by default use the `requests` library to check
+ # `http_retry` by default use the `requests` library to check
# only for rate-limit and a base-10 exponential waiting strategy.
# This can be customized by passed waiting, retrying and logging strategies
# as functions. See the `tenacity` library documentation.
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -317,7 +317,7 @@
then immediately stop the listing by doing an equivalent of
:py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting
errors, we have a strategy of using a flexible decorator to handle the retrying for us.
-It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from
+It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from
:py:mod:`swh.lister.utils`.
Pagination
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -16,7 +16,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -132,7 +132,7 @@
}
)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -14,7 +14,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -107,7 +107,7 @@
if username is not None and password is not None:
self.session.auth = (username, password)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def page_request(self, last_repo_cdate: str) -> requests.Response:
self.url_params["after"] = last_repo_cdate
diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py
--- a/swh/lister/bower/lister.py
+++ b/swh/lister/bower/lister.py
@@ -8,7 +8,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -49,7 +49,7 @@
}
)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -15,7 +15,7 @@
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, StatelessLister
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -79,7 +79,7 @@
)
self.base_git_url = base_git_url
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)
diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py
--- a/swh/lister/gitea/tests/test_lister.py
+++ b/swh/lister/gitea/tests/test_lister.py
@@ -138,7 +138,7 @@
assert stats.pages == 1
-@pytest.mark.parametrize("http_code", [400, 500, 502])
+@pytest.mark.parametrize("http_code", [400, 500])
def test_gitea_list_http_error(
swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2
):
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -17,7 +17,7 @@
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
-from swh.lister.utils import is_retryable_exception, throttling_retry
+from swh.lister.utils import http_retry, is_retryable_exception
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
@@ -138,7 +138,7 @@
def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]:
return asdict(state)
- @throttling_retry(
+ @http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def get_page_result(self, url: str) -> PageResult:
diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py
--- a/swh/lister/gogs/lister.py
+++ b/swh/lister/gogs/lister.py
@@ -12,7 +12,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -116,7 +116,7 @@
def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
return asdict(state)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(
self, url: str, params: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py
--- a/swh/lister/gogs/tests/test_lister.py
+++ b/swh/lister/gogs/tests/test_lister.py
@@ -177,7 +177,7 @@
assert stats.origins == 6
-@pytest.mark.parametrize("http_code", [400, 500, 502])
+@pytest.mark.parametrize("http_code", [400, 500])
def test_gogs_list_http_error(
swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last
):
diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py
--- a/swh/lister/golang/lister.py
+++ b/swh/lister/golang/lister.py
@@ -13,7 +13,7 @@
import requests
from tenacity import before_sleep_log
-from swh.lister.utils import retry_policy_generic, throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -87,8 +87,7 @@
):
self.updated = True
- @throttling_retry(
- retry=retry_policy_generic,
+ @http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def api_request(self, url: str) -> List[str]:
diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py
--- a/swh/lister/launchpad/lister.py
+++ b/swh/lister/launchpad/lister.py
@@ -14,7 +14,7 @@
from lazr.restfulclient.resource import Collection
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import retry_if_exception, throttling_retry
+from swh.lister.utils import http_retry, retry_if_exception
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -100,7 +100,7 @@
d[attribute_name] = date_last_modified.isoformat()
return d
- @throttling_retry(
+ @http_retry(
retry=retry_if_restful_error,
before_sleep=before_sleep_log(logger, logging.WARNING),
)
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -16,7 +16,7 @@
from tenacity.before_sleep import before_sleep_log
from swh.core.github.utils import GitHubSession
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -112,7 +112,7 @@
def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
return asdict(state)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -125,6 +125,11 @@
requests_mock.get(URL_POM_3, content=maven_pom_3)
+@pytest.fixture(autouse=True)
+def retry_sleep_mock(mocker):
+ mocker.patch.object(MavenLister.page_request.retry, "sleep")
+
+
def test_maven_full_listing(swh_scheduler):
"""Covers full listing of multiple pages, checking page results and listed
origins, statelessness."""
diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py
--- a/swh/lister/npm/lister.py
+++ b/swh/lister/npm/lister.py
@@ -12,7 +12,7 @@
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -95,7 +95,7 @@
params["startkey"] = last_package_id
return params
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, last_package_id: str) -> requests.Response:
params = self.request_params(last_package_id)
logger.debug("Fetching URL %s with params %s", self.url, params)
diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py
--- a/swh/lister/npm/tests/test_lister.py
+++ b/swh/lister/npm/tests/test_lister.py
@@ -35,6 +35,11 @@
return json.loads(Path(datadir, "npm_incremental_page2.json").read_text())
+@pytest.fixture(autouse=True)
+def retry_sleep_mock(mocker):
+ mocker.patch.object(NpmLister.page_request.retry, "sleep")
+
+
def _check_listed_npm_packages(lister, packages, scheduler_origins):
for package in packages:
package_name = package["doc"]["name"]
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
--- a/swh/lister/pubdev/lister.py
+++ b/swh/lister/pubdev/lister.py
@@ -10,7 +10,7 @@
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -60,7 +60,7 @@
}
)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -13,7 +13,7 @@
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -88,7 +88,7 @@
def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
return asdict(state)
- @throttling_retry(
+ @http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def _changelog_last_serial(self, client: ServerProxy) -> int:
@@ -97,7 +97,7 @@
assert isinstance(serial, int)
return serial
- @throttling_retry(
+ @http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def _changelog_since_serial(
diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -18,7 +18,7 @@
from tenacity.before_sleep import before_sleep_log
from swh.core.api.classes import stream_results
-from swh.lister.utils import retry_policy_generic, throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -208,8 +208,7 @@
self._project_last_modified = listed_origins
return listed_origins
- @throttling_retry(
- retry=retry_policy_generic,
+ @http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def page_request(self, url, params) -> requests.Response:
diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py
--- a/swh/lister/tests/test_utils.py
+++ b/swh/lister/tests/test_utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 the Software Heritage developers
+# Copyright (C) 2018-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,12 +7,7 @@
from requests.status_codes import codes
from tenacity.wait import wait_fixed
-from swh.lister.utils import (
- MAX_NUMBER_ATTEMPTS,
- WAIT_EXP_BASE,
- split_range,
- throttling_retry,
-)
+from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range
@pytest.mark.parametrize(
@@ -51,7 +46,7 @@
TEST_URL = "https://example.og/api/repositories"
-@throttling_retry()
+@http_retry()
def make_request():
response = requests.get(TEST_URL)
response.raise_for_status()
@@ -62,13 +57,22 @@
mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params])
-def test_throttling_retry(requests_mock, mocker):
+@pytest.mark.parametrize(
+ "status_code",
+ [
+ codes.too_many_requests,
+ codes.internal_server_error,
+ codes.bad_gateway,
+ codes.service_unavailable,
+ ],
+)
+def test_http_retry(requests_mock, mocker, status_code):
data = {"result": {}}
requests_mock.get(
TEST_URL,
[
- {"status_code": codes.too_many_requests},
- {"status_code": codes.too_many_requests},
+ {"status_code": status_code},
+ {"status_code": status_code},
{"status_code": codes.ok, "json": data},
],
)
@@ -82,7 +86,7 @@
assert response.json() == data
-def test_throttling_retry_max_attemps(requests_mock, mocker):
+def test_http_retry_max_attemps(requests_mock, mocker):
requests_mock.get(
TEST_URL,
[{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS),
@@ -102,14 +106,14 @@
)
-@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE))
+@http_retry(wait=wait_fixed(WAIT_EXP_BASE))
def make_request_wait_fixed():
response = requests.get(TEST_URL)
response.raise_for_status()
return response
-def test_throttling_retry_wait_fixed(requests_mock, mocker):
+def test_http_retry_wait_fixed(requests_mock, mocker):
requests_mock.get(
TEST_URL,
[
diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py
--- a/swh/lister/tuleap/lister.py
+++ b/swh/lister/tuleap/lister.py
@@ -11,7 +11,7 @@
import requests
from tenacity.before_sleep import before_sleep_log
-from swh.lister.utils import throttling_retry
+from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -65,7 +65,7 @@
}
)
- @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py
--- a/swh/lister/tuleap/tests/test_lister.py
+++ b/swh/lister/tuleap/tests/test_lister.py
@@ -92,6 +92,11 @@
return text, headers, page_results, origin_urls
+@pytest.fixture(autouse=True)
+def retry_sleep_mock(mocker):
+ mocker.patch.object(TuleapLister.page_request.retry, "sleep")
+
+
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
"""Asserts that the two collections have the same origin URLs.
diff --git a/swh/lister/utils.py b/swh/lister/utils.py
--- a/swh/lister/utils.py
+++ b/swh/lister/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 the Software Heritage developers
+# Copyright (C) 2018-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -66,14 +66,6 @@
return False
-def retry_if_throttling(retry_state) -> bool:
- """
- Custom tenacity retry predicate for handling HTTP responses with
- status code 429 (too many requests).
- """
- return retry_if_exception(retry_state, is_throttling_exception)
-
-
def retry_policy_generic(retry_state) -> bool:
"""
Custom tenacity retry predicate for handling failed requests:
@@ -90,8 +82,8 @@
MAX_NUMBER_ATTEMPTS = 5
-def throttling_retry(
- retry=retry_if_throttling,
+def http_retry(
+ retry=retry_policy_generic,
wait=wait_exponential(exp_base=WAIT_EXP_BASE),
stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS),
**retry_args,

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 12:27 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221952

Event Timeline