diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -4,8 +4,12 @@ # See top-level LICENSE file for more information import logging +from typing import Optional + +from swh.scheduler.interface import SchedulerInterface from ..gogs.lister import GogsLister +from ..pattern import CredentialsType logger = logging.getLogger(__name__) @@ -21,6 +25,25 @@ LISTER_NAME = "gitea" + def __init__( + self, + scheduler: SchedulerInterface, + url: str, + instance: Optional[str] = None, + api_token: Optional[str] = None, + page_size: int = 50, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=url, + instance=instance, + api_token=api_token, + page_size=page_size, + credentials=credentials, + ) + self.skip_on_500 = False + def on_anonymous_mode(self): logger.warning( "No authentication token set in configuration, using anonymous mode" diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -2,12 +2,11 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - from dataclasses import asdict, dataclass import logging import random -from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import parse_qs, urljoin, urlparse +from typing import Any, Dict, Iterator, List, Optional, Tuple +from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse import iso8601 import requests @@ -81,6 +80,8 @@ instance=instance, ) + self.skip_on_500 = True + self.query_params = { "limit": page_size, } @@ -97,8 +98,6 @@ # Raises an error on Gogs, or a warning on Gitea self.on_anonymous_mode() - self.max_page_limit = 2 - self.session = requests.Session() self.session.headers.update( { @@ -120,7 +119,9 @@ return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) - def page_request(self, url, params) -> requests.Response: + def page_request( + self, url: str, params: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: logger.debug("Fetching URL %s with params %s", url, params) @@ -133,9 +134,20 @@ response.url, response.content, ) - response.raise_for_status() - - return response + if ( + self.skip_on_500 and response.status_code == 500 + ): # Temporary hack for skipping fatal repos (T4423) + url_parts = urlparse(url) + query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) + query.update({"page": _parse_page_id(url) + 1}) + next_page_link = url_parts._replace(query=urlencode(query)).geturl() + body: Dict[str, Any] = {"data": []} + links = {"next": {"url": next_page_link}} + return body, links + else: + response.raise_for_status() + + return response.json(), response.links @classmethod def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: @@ -149,21 +161,24 @@ # base with trailing slash, path without leading slash for urljoin next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) - response = self.page_request(next_link, {**self.query_params, "page": page_id}) + + body, links = self.page_request( + next_link, {**self.query_params, "page": page_id} + ) while next_link is not None: - repos = self.extract_repos(response.json()) + repos = self.extract_repos(body) - assert len(response.links) > 0, "API changed: no Link header found" - if "next" in response.links: - next_link = response.links["next"]["url"] + assert len(links) > 0, "API changed: no Link header found" + if "next" in links: + next_link = links["next"]["url"] else: next_link = None # Happens for the last page yield GogsListerPage(repos=repos, next_link=next_link) if next_link is not None: - response = self.page_request(next_link, {}) + body, links = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -186,7 +186,7 @@ lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 - p3_text, p3_headers, _, _ = trygogs_p3_last + p3_text, p3_headers, _, p3_origin_urls = trygogs_p3_last base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH requests_mock.get( @@ -198,13 +198,21 @@ ], ) - with pytest.raises(HTTPError): + # pages with fatal repositories should be skipped (no error raised) + # See T4423 for more details + if http_code == 500: lister.run() + else: + with pytest.raises(HTTPError): + lister.run() + # Both P1 and P3 origins should be listed in case of 500 error + # While in other cases, only P1 origins should be listed scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( - p1_origin_urls, scheduler_origins - ) # Only the first page is listed + (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls, + scheduler_origins, + ) def test_gogs_incremental_lister(