diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py
index 04c3e1b..14d1bc4 100644
--- a/swh/lister/gitlab/tests/test_lister.py
+++ b/swh/lister/gitlab/tests/test_lister.py
@@ -1,220 +1,220 @@
 # Copyright (C) 2017-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import logging
 from pathlib import Path
 from typing import Dict, List
 
 import pytest
 from requests.status_codes import codes
 
 from swh.lister import USER_AGENT
 from swh.lister.gitlab.lister import GitLabLister, _parse_page_id
 from swh.lister.pattern import ListerStats
-from swh.lister.tests.test_utils import _assert_sleep_calls
+from swh.lister.tests.test_utils import assert_sleep_calls
 from swh.lister.utils import WAIT_EXP_BASE
 
 logger = logging.getLogger(__name__)
 
 
 def api_url(instance: str) -> str:
     return f"https://{instance}/api/v4/"
 
 
 def url_page(api_url: str, page_id: int) -> str:
     return f"{api_url}projects?page={page_id}&order_by=id&sort=asc&per_page=20"
 
 
 def _match_request(request):
     return request.headers.get("User-Agent") == USER_AGENT
 
 
 def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
     """Gitlab lister supports full listing
 
     """
     instance = "gitlab.com"
     url = api_url(instance)
 
     response = gitlab_page_response(datadir, instance, 1)
 
     requests_mock.get(
         url_page(url, 1), [{"json": response}], additional_matcher=_match_request,
     )
 
     lister_gitlab = GitLabLister(
         swh_scheduler, url=api_url(instance), instance=instance
     )
 
     listed_result = lister_gitlab.run()
     expected_nb_origins = len(response)
     assert listed_result == ListerStats(pages=1, origins=expected_nb_origins)
 
     scheduler_origins = lister_gitlab.scheduler.get_listed_origins(
         lister_gitlab.lister_obj.id
     ).origins
     assert len(scheduler_origins) == expected_nb_origins
 
     for listed_origin in scheduler_origins:
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(f"https://{instance}")
 
 
 def gitlab_page_response(datadir, instance: str, page_id: int) -> List[Dict]:
     """Return list of repositories (out of test dataset)"""
     datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_id}.json")
-    return json.loads(datapath.read_text()) if datapath.exists else []
+    return json.loads(datapath.read_text()) if datapath.exists() else []
 
 
 def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir):
     """Gitlab lister supports pagination
 
     """
     instance = "gite.lirmm.fr"
     url = api_url(instance)
 
     response1 = gitlab_page_response(datadir, instance, 1)
     response2 = gitlab_page_response(datadir, instance, 2)
 
     requests_mock.get(
         url_page(url, 1),
         [{"json": response1, "headers": {"Link": f"<{url_page(url, 2)}>; rel=next"}}],
         additional_matcher=_match_request,
     )
 
     requests_mock.get(
         url_page(url, 2), [{"json": response2}], additional_matcher=_match_request,
     )
 
     lister = GitLabLister(swh_scheduler, url=url)
     listed_result = lister.run()
 
     expected_nb_origins = len(response1) + len(response2)
     assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)
 
     scheduler_origins = lister.scheduler.get_listed_origins(
         lister.lister_obj.id
     ).origins
     assert len(scheduler_origins) == expected_nb_origins
 
     for listed_origin in scheduler_origins:
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(f"https://{instance}")
 
 
 def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
     """Gitlab lister supports incremental visits
 
     """
     instance = "gite.lirmm.fr"
     url = api_url(instance)
 
     url_page1 = url_page(url, 1)
     response1 = gitlab_page_response(datadir, instance, 1)
     url_page2 = url_page(url, 2)
     response2 = gitlab_page_response(datadir, instance, 2)
     url_page3 = url_page(url, 3)
     response3 = gitlab_page_response(datadir, instance, 3)
 
     requests_mock.get(
         url_page1,
         [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}],
         additional_matcher=_match_request,
     )
     requests_mock.get(
         url_page2, [{"json": response2}], additional_matcher=_match_request,
     )
 
     lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True)
     listed_result = lister.run()
 
     expected_nb_origins = len(response1) + len(response2)
     assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)
     assert lister.state.last_seen_next_link == url_page2
 
     lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True)
     requests_mock.reset()
     # Lister will start back at the last stop
     requests_mock.get(
         url_page2,
         [{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}],
         additional_matcher=_match_request,
     )
     requests_mock.get(
         url_page3, [{"json": response3}], additional_matcher=_match_request,
     )
 
     listed_result2 = lister2.run()
 
     assert listed_result2 == ListerStats(
         pages=2, origins=len(response2) + len(response3)
     )
     assert lister2.state.last_seen_next_link == url_page3
 
     assert lister.lister_obj.id == lister2.lister_obj.id
     scheduler_origins = lister2.scheduler.get_listed_origins(
         lister2.lister_obj.id
     ).origins
     assert len(scheduler_origins) == len(response1) + len(response2) + len(response3)
 
     for listed_origin in scheduler_origins:
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(f"https://{instance}")
 
 
 def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker):
     """Gitlab lister supports rate-limit
 
     """
     instance = "gite.lirmm.fr"
     url = api_url(instance)
 
     url_page1 = url_page(url, 1)
     response1 = gitlab_page_response(datadir, instance, 1)
     url_page2 = url_page(url, 2)
     response2 = gitlab_page_response(datadir, instance, 2)
 
     requests_mock.get(
         url_page1,
         [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}],
         additional_matcher=_match_request,
     )
     requests_mock.get(
         url_page2,
         [
             # rate limited twice
             {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}},
             {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}},
             # ok
             {"json": response2},
         ],
         additional_matcher=_match_request,
     )
 
     lister = GitLabLister(swh_scheduler, url=url, instance=instance)
 
     # To avoid this test being too slow, we mock sleep within the retry behavior
     mock_sleep = mocker.patch.object(lister.get_page_result.retry, "sleep")
 
     listed_result = lister.run()
 
     expected_nb_origins = len(response1) + len(response2)
     assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)
 
-    _assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
+    assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
 
 
 @pytest.mark.parametrize(
     "url,expected_result",
     [
         (None, None),
         ("http://dummy/?query=1", None),
         ("http://dummy/?foo=bar&page=1&some=result", 1),
         ("http://dummy/?foo=bar&page=&some=result", None),
     ],
 )
 def test__parse_page_id(url, expected_result):
     assert _parse_page_id(url) == expected_result
diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py
index bf804fc..68aed42 100644
--- a/swh/lister/tests/test_utils.py
+++ b/swh/lister/tests/test_utils.py
@@ -1,133 +1,133 @@
 # Copyright (C) 2018-2020 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 import requests
 from requests.status_codes import codes
 from tenacity.wait import wait_fixed
 from testing.postgresql import Postgresql
 
 from swh.lister.utils import (
     MAX_NUMBER_ATTEMPTS,
     WAIT_EXP_BASE,
     split_range,
     throttling_retry,
 )
 
 
 @pytest.mark.parametrize(
     "total_pages,nb_pages,expected_ranges",
     [
         (14, 5, [(0, 4), (5, 9), (10, 14)]),
         (19, 10, [(0, 9), (10, 19)]),
         (20, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)]),
         (21, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21),],),
     ],
 )
 def test_split_range(total_pages, nb_pages, expected_ranges):
     actual_ranges = list(split_range(total_pages, nb_pages))
     assert actual_ranges == expected_ranges
 
 
 @pytest.mark.parametrize("total_pages,nb_pages", [(None, 1), (100, None)])
 def test_split_range_errors(total_pages, nb_pages):
     for total_pages, nb_pages in [(None, 1), (100, None)]:
         with pytest.raises(TypeError):
             next(split_range(total_pages, nb_pages))
 
 
 def init_db():
     """Factorize the db_url instantiation
 
     Returns:
         db object to ease db manipulation
 
     """
     initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"]
     initdb_args = " ".join([initdb_args, "-E UTF-8"])
     return Postgresql(initdb_args=initdb_args)
 
 
 TEST_URL = "https://example.og/api/repositories"
 
 
 @throttling_retry()
 def make_request():
     response = requests.get(TEST_URL)
     response.raise_for_status()
     return response
 
 
-def _assert_sleep_calls(mocker, mock_sleep, sleep_params):
+def assert_sleep_calls(mocker, mock_sleep, sleep_params):
     try:
         mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params])
     except AssertionError:
         # tenacity < 5.1 has a different behavior for wait_exponential
         # https://github.com/jd/tenacity/commit/aac4307a0aa30d7befd0ebe4212ee4fc69083a95
         mock_sleep.assert_has_calls(
             [mocker.call(param * WAIT_EXP_BASE) for param in sleep_params]
         )
 
 
 def test_throttling_retry(requests_mock, mocker):
     data = {"result": {}}
     requests_mock.get(
         TEST_URL,
         [
             {"status_code": codes.too_many_requests},
             {"status_code": codes.too_many_requests},
             {"status_code": codes.ok, "json": data},
         ],
     )
 
     mock_sleep = mocker.patch.object(make_request.retry, "sleep")
 
     response = make_request()
 
-    _assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
+    assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
 
     assert response.json() == data
 
 
 def test_throttling_retry_max_attemps(requests_mock, mocker):
     requests_mock.get(
         TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS),
     )
 
     mock_sleep = mocker.patch.object(make_request.retry, "sleep")
 
     with pytest.raises(requests.exceptions.HTTPError) as e:
         make_request()
 
     assert e.value.response.status_code == codes.too_many_requests
 
-    _assert_sleep_calls(
+    assert_sleep_calls(
         mocker,
         mock_sleep,
         [float(WAIT_EXP_BASE ** i) for i in range(MAX_NUMBER_ATTEMPTS - 1)],
     )
 
 
 @throttling_retry(wait=wait_fixed(WAIT_EXP_BASE))
 def make_request_wait_fixed():
     response = requests.get(TEST_URL)
     response.raise_for_status()
     return response
 
 
 def test_throttling_retry_wait_fixed(requests_mock, mocker):
     requests_mock.get(
         TEST_URL,
         [
             {"status_code": codes.too_many_requests},
             {"status_code": codes.too_many_requests},
             {"status_code": codes.ok},
         ],
     )
 
     mock_sleep = mocker.patch.object(make_request_wait_fixed.retry, "sleep")
 
     make_request_wait_fixed()
 
-    _assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2)
+    assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2)