diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -111,6 +111,7 @@ ) self.incremental = incremental self.last_page: Optional[str] = None + self.per_page = 100 self.session = requests.Session() self.session.headers.update( @@ -145,7 +146,25 @@ response.url, response.content, ) - response.raise_for_status() + + # GitLab API can return errors 500 when listing projects. + # https://gitlab.com/gitlab-org/gitlab/-/issues/262629 + # To avoid ending the listing prematurely, skip buggy URLs and move + # to next pages. + if response.status_code == 500: + id_after = _parse_id_after(url) + assert id_after is not None + while True: + next_id_after = id_after + self.per_page + url = url.replace(f"id_after={id_after}", f"id_after={next_id_after}") + response = self.session.get(url) + if response.status_code == 200: + break + else: + id_after = next_id_after + else: + response.raise_for_status() + repositories: Tuple[Repository, ...] = tuple(response.json()) if hasattr(response, "links") and response.links.get("next"): next_page = response.links["next"]["url"] @@ -160,7 +179,7 @@ "order_by": "id", "sort": "asc", "simple": "true", - "per_page": "100", + "per_page": f"{self.per_page}", } if id_after is not None: parameters["id_after"] = str(id_after) diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -244,6 +244,39 @@ assert_sleep_calls(mocker, mock_sleep, [1]) +def test_lister_gitlab_http_error_500(swh_scheduler, requests_mock, datadir): + """Gitlab lister should skip buggy URL and move to next page. + + """ + instance = "gite.lirmm.fr" + url = api_url(instance) + lister = GitLabLister(swh_scheduler, url=url, instance=instance) + + url_page1 = lister.page_url() + response1 = gitlab_page_response(datadir, instance, 1) + url_page2 = lister.page_url(lister.per_page) + url_page3 = lister.page_url(2 * lister.per_page) + response3 = gitlab_page_response(datadir, instance, 3) + + requests_mock.get( + url_page1, + [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], + additional_matcher=_match_request, + ) + requests_mock.get( + url_page2, [{"status_code": 500},], additional_matcher=_match_request, + ) + + requests_mock.get( + url_page3, [{"json": response3}], additional_matcher=_match_request, + ) + + listed_result = lister.run() + + expected_nb_origins = len(response1) + len(response3) + assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) + + def test_lister_gitlab_credentials(swh_scheduler): """Gitlab lister supports credentials configuration