diff --git a/swh/loader/package/crates/loader.py b/swh/loader/package/crates/loader.py --- a/swh/loader/package/crates/loader.py +++ b/swh/loader/package/crates/loader.py @@ -14,7 +14,7 @@ from typing_extensions import TypedDict from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @@ -248,7 +248,7 @@ Returns: Content response as bytes. Content response is a json document. """ - return api_info(self.url) + return get_url_body(self.url) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/golang/loader.py b/swh/loader/package/golang/loader.py --- a/swh/loader/package/golang/loader.py +++ b/swh/loader/package/golang/loader.py @@ -11,7 +11,12 @@ import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + get_url_body, + release_name, + cached_method, +) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @@ -54,7 +59,7 @@ self.url = _uppercase_encode(self.url) def get_versions(self) -> Sequence[str]: - versions = api_info(f"{self.url}/@v/list").decode().splitlines() + versions = get_url_body(f"{self.url}/@v/list").decode().splitlines() # some go packages only have a development version not listed by the endpoint above, # so ensure to return it or it will be missed by the golang loader default_version = self.get_default_version() @@ -64,12 +69,12 @@ @cached_method def get_default_version(self) -> str: - latest = api_info(f"{self.url}/@latest") + latest = get_url_body(f"{self.url}/@latest") return 
json.loads(latest)["Version"] def _raw_info(self, version: str) -> dict: url = f"{self.url}/@v/{_uppercase_encode(version)}.info" - return json.loads(api_info(url)) + return json.loads(get_url_body(url)) def get_package_info(self, version: str) -> Iterator[Tuple[str, GolangPackageInfo]]: # Encode the name because creating nested folders can become problematic diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -17,7 +17,7 @@ PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method +from swh.loader.package.utils import EMPTY_AUTHOR, cached_method, get_url_body from swh.model import hashutil from swh.model.model import ( MetadataAuthority, @@ -195,7 +195,7 @@ def retrieve_sources(url: str) -> bytes: """Retrieve sources. Potentially raise NotFound error.""" - return api_info(url, allow_redirects=True) + return get_url_body(url, allow_redirects=True) def parse_sources(raw_sources: bytes) -> Dict[str, Any]: diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -19,7 +19,7 @@ PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, @@ -116,7 +116,7 @@ @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py --- a/swh/loader/package/pubdev/loader.py +++ b/swh/loader/package/pubdev/loader.py @@ -14,8 +14,8 @@ from swh.loader.package.utils import ( EMPTY_AUTHOR, Person, - api_info, 
cached_method, + get_url_body, release_name, ) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone @@ -78,7 +78,7 @@ ) def _raw_info(self) -> bytes: - return api_info(self.package_info_url) + return get_url_body(self.package_info_url) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -18,7 +18,12 @@ PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + cached_method, + get_url_body, + release_name, +) from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, @@ -83,7 +88,7 @@ @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -13,9 +13,8 @@ import pytest from requests.exceptions import HTTPError -from swh.loader.exception import NotFound import swh.loader.package -from swh.loader.package.utils import api_info, download, release_name +from swh.loader.package.utils import download, get_url_body, release_name def test_version_generation(): @@ -213,17 +212,17 @@ status_code = 400 requests_mock.get(url, status_code=status_code) - with pytest.raises(NotFound) as e0: - api_info(url) - - assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) + with pytest.raises( + HTTPError, match=f"{status_code} Client Error: None for url: {url}" + ): + get_url_body(url) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = "https://pypi.org/pypi/requests/json" requests_mock.get(url, text='{"version": "0.0.1"}') - actual_info = json.loads(api_info(url)) + actual_info = json.loads(get_url_body(url)) assert actual_info == { "version": "0.0.1", } @@ -271,3 +270,39 @@ with pytest.raises(HTTPError): _check_download_ok(url, dest=str(tmp_path)) + + +@pytest.fixture(autouse=True) +def mock_api_info_retry_sleep(mocker): + mocker.patch.object(get_url_body.retry, "sleep") + + +def test_api_info_retry(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + json_data = {"foo": "bar"} + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "json": json_data, + "status_code": 200, + }, + ], + ) + + assert json.loads(get_url_body(url)) == json_data + + +def test_api_info_retry_reraise(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + + requests_mock.get( + url, + [{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError, match=f"429 Client Error: None for url: {url}"): + get_url_body(url) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -34,26 +34,6 @@ EMPTY_AUTHOR = Person.from_fullname(b"") -def api_info(url: str, **extra_params) -> bytes: - """Basic api client to retrieve information on project. This deals with - fetching json metadata about pypi projects. - - Args: - url (str): The api url (e.g PyPI, npm, etc...) - - Raises: - NotFound in case of query failures (for some reasons: 404, ...) 
- - Returns: - The associated response's information - - """ - response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) - if response.status_code != 200: - raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}") - return response.content - - def _content_disposition_filename(header: str) -> Optional[str]: fname = None fnames = re.findall(r"filename[\*]?=([^;]+)", header) @@ -81,13 +61,16 @@ return False -@retry( +throttling_retry = retry( retry=_retry_if_throttling, wait=wait_exponential(exp_base=10), stop=stop_after_attempt(max_attempt_number=5), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) + + +@throttling_retry def download( url: str, dest: str, @@ -181,6 +164,29 @@ return filepath, extrinsic_metadata +@throttling_retry +def get_url_body(url: str, **extra_params) -> bytes: + """Basic api client to retrieve information, typically JSON metadata, + on a software package. + + Args: + url (str): The api url (e.g. PyPI, npm, etc...) + + Raises: + NotFound on a 404 response, HTTPError on any other error status + + Returns: + The associated response's information + + """ + logger.debug("Fetching %s", url) + response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) + if response.status_code == 404: + raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}") + response.raise_for_status() + return response.content + + def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename)