diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -9,6 +9,7 @@ import pytest import requests +from requests import HTTPError from swh.lister.gitea.lister import GiteaLister from swh.lister.gogs.lister import GogsListerPage @@ -138,16 +139,38 @@ @pytest.mark.parametrize("http_code", [400, 500, 502]) -def test_gitea_list_http_error(swh_scheduler, requests_mock, http_code): +def test_gitea_list_http_error( + swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2 +): """Test handling of some HTTP errors commonly encountered""" lister = GiteaLister(scheduler=swh_scheduler, url=TRYGITEA_URL, page_size=3) + p1_text, p1_headers, _, p1_origin_urls = trygitea_p1 + p3_text, p3_headers, _, p3_origin_urls = trygitea_p2 + base_url = TRYGITEA_URL + lister.REPO_LIST_PATH - requests_mock.get(base_url, status_code=http_code) + requests_mock.get( + base_url, + [ + {"text": p1_text, "headers": p1_headers, "status_code": 200}, + {"status_code": http_code}, + {"text": p3_text, "headers": p3_headers, "status_code": 200}, + ], + ) - with pytest.raises(requests.HTTPError): + # pages with fatal repositories should be skipped (no error raised) + # See T4423 for more details + if http_code == 500: lister.run() + else: + with pytest.raises(HTTPError): + lister.run() + # Both P1 and P3 origins should be listed in case of 500 error + # While in other cases, only P1 origins should be listed scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 0 + check_listed_origins( + (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls, + scheduler_origins, + ) diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -2,12 +2,11 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - from dataclasses import asdict, dataclass import logging import random -from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import parse_qs, urljoin, urlparse +from typing import Any, Dict, Iterator, List, Optional, Tuple +from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse import iso8601 import requests @@ -97,8 +96,6 @@ # Raises an error on Gogs, or a warning on Gitea self.on_anonymous_mode() - self.max_page_limit = 2 - self.session = requests.Session() self.session.headers.update( { @@ -120,7 +117,9 @@ return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) - def page_request(self, url, params) -> requests.Response: + def page_request( + self, url: str, params: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: logger.debug("Fetching URL %s with params %s", url, params) @@ -133,9 +132,20 @@ response.url, response.content, ) - response.raise_for_status() - - return response + if ( + response.status_code == 500 + ): # Temporary hack for skipping fatal repos (T4423) + url_parts = urlparse(url) + query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) + query.update({"page": _parse_page_id(url) + 1}) + next_page_link = url_parts._replace(query=urlencode(query)).geturl() + body: Dict[str, Any] = {"data": []} + links = {"next": {"url": next_page_link}} + return body, links + else: + response.raise_for_status() + + return response.json(), response.links @classmethod def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: @@ -149,21 +159,24 @@ # base with trailing slash, path without leading slash for urljoin next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) - response = self.page_request(next_link, {**self.query_params, "page": page_id}) + + body, links = self.page_request( + next_link, {**self.query_params, "page": page_id} + ) while next_link is not None: - repos = self.extract_repos(response.json()) + repos = self.extract_repos(body) - assert len(response.links) > 0, "API changed: no Link header found" - if "next" in response.links: - next_link = response.links["next"]["url"] + assert len(links) > 0, "API changed: no Link header found" + if "next" in links: + next_link = links["next"]["url"] else: next_link = None # Happens for the last page yield GogsListerPage(repos=repos, next_link=next_link) if next_link is not None: - response = self.page_request(next_link, {}) + body, links = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -186,7 +186,7 @@ lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 - p3_text, p3_headers, _, _ = trygogs_p3_last + p3_text, p3_headers, _, p3_origin_urls = trygogs_p3_last base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH requests_mock.get( @@ -198,13 +198,21 @@ ], ) - with pytest.raises(HTTPError): + # pages with fatal repositories should be skipped (no error raised) + # See T4423 for more details + if http_code == 500: lister.run() + else: + with pytest.raises(HTTPError): + lister.run() + # Both P1 and P3 origins should be listed in case of 500 error + # While in other cases, only P1 origins should be listed scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( - p1_origin_urls, scheduler_origins - ) # Only the first page is listed + (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls, + scheduler_origins, + ) def test_gogs_incremental_lister(