Page MenuHomeSoftware Heritage

D8509.id30649.diff
No OneTemporary

D8509.id30649.diff

diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py
--- a/swh/lister/gitea/tests/test_lister.py
+++ b/swh/lister/gitea/tests/test_lister.py
@@ -9,6 +9,7 @@
import pytest
import requests
+from requests import HTTPError
from swh.lister.gitea.lister import GiteaLister
from swh.lister.gogs.lister import GogsListerPage
@@ -138,16 +139,38 @@
@pytest.mark.parametrize("http_code", [400, 500, 502])
-def test_gitea_list_http_error(swh_scheduler, requests_mock, http_code):
+def test_gitea_list_http_error(
+ swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2
+):
"""Test handling of some HTTP errors commonly encountered"""
lister = GiteaLister(scheduler=swh_scheduler, url=TRYGITEA_URL, page_size=3)
+ p1_text, p1_headers, _, p1_origin_urls = trygitea_p1
+ p3_text, p3_headers, _, p3_origin_urls = trygitea_p2
+
base_url = TRYGITEA_URL + lister.REPO_LIST_PATH
- requests_mock.get(base_url, status_code=http_code)
+ requests_mock.get(
+ base_url,
+ [
+ {"text": p1_text, "headers": p1_headers, "status_code": 200},
+ {"status_code": http_code},
+ {"text": p3_text, "headers": p3_headers, "status_code": 200},
+ ],
+ )
- with pytest.raises(requests.HTTPError):
+ # pages with fatal repositories should be skipped (no error raised)
+ # See T4423 for more details
+ if http_code == 500:
lister.run()
+ else:
+ with pytest.raises(HTTPError):
+ lister.run()
+ # On a 500 error, both P1 and P3 origins should be listed;
+ # in all other cases, only P1 origins should be listed.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- assert len(scheduler_origins) == 0
+ check_listed_origins(
+ (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls,
+ scheduler_origins,
+ )
diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py
--- a/swh/lister/gogs/lister.py
+++ b/swh/lister/gogs/lister.py
@@ -2,12 +2,11 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
from dataclasses import asdict, dataclass
import logging
import random
-from typing import Any, Dict, Iterator, List, Optional
-from urllib.parse import parse_qs, urljoin, urlparse
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse
import iso8601
import requests
@@ -97,8 +96,6 @@
# Raises an error on Gogs, or a warning on Gitea
self.on_anonymous_mode()
- self.max_page_limit = 2
-
self.session = requests.Session()
self.session.headers.update(
{
@@ -120,7 +117,9 @@
return asdict(state)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
- def page_request(self, url, params) -> requests.Response:
+ def page_request(
+ self, url: str, params: Dict[str, Any]
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
logger.debug("Fetching URL %s with params %s", url, params)
@@ -133,9 +132,20 @@
response.url,
response.content,
)
- response.raise_for_status()
-
- return response
+ if (
+ response.status_code == 500
+ ): # Temporary hack for skipping fatal repos (T4423)
+ url_parts = urlparse(url)
+ query: Dict[str, Any] = dict(parse_qsl(url_parts.query))
+ query.update({"page": _parse_page_id(url) + 1})
+ next_page_link = url_parts._replace(query=urlencode(query)).geturl()
+ body: Dict[str, Any] = {"data": []}
+ links = {"next": {"url": next_page_link}}
+ return body, links
+ else:
+ response.raise_for_status()
+
+ return response.json(), response.links
@classmethod
def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]:
@@ -149,21 +159,24 @@
# base with trailing slash, path without leading slash for urljoin
next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
- response = self.page_request(next_link, {**self.query_params, "page": page_id})
+
+ body, links = self.page_request(
+ next_link, {**self.query_params, "page": page_id}
+ )
while next_link is not None:
- repos = self.extract_repos(response.json())
+ repos = self.extract_repos(body)
- assert len(response.links) > 0, "API changed: no Link header found"
- if "next" in response.links:
- next_link = response.links["next"]["url"]
+ assert len(links) > 0, "API changed: no Link header found"
+ if "next" in links:
+ next_link = links["next"]["url"]
else:
next_link = None # Happens for the last page
yield GogsListerPage(repos=repos, next_link=next_link)
if next_link is not None:
- response = self.page_request(next_link, {})
+ body, links = self.page_request(next_link, {})
def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]:
"""Convert a page of Gogs repositories into a list of ListedOrigins"""
diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py
--- a/swh/lister/gogs/tests/test_lister.py
+++ b/swh/lister/gogs/tests/test_lister.py
@@ -186,7 +186,7 @@
lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret")
p1_text, p1_headers, _, p1_origin_urls = trygogs_p1
- p3_text, p3_headers, _, _ = trygogs_p3_last
+ p3_text, p3_headers, _, p3_origin_urls = trygogs_p3_last
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
requests_mock.get(
@@ -198,13 +198,21 @@
],
)
- with pytest.raises(HTTPError):
+ # pages with fatal repositories should be skipped (no error raised)
+ # See T4423 for more details
+ if http_code == 500:
lister.run()
+ else:
+ with pytest.raises(HTTPError):
+ lister.run()
+ # On a 500 error, both P1 and P3 origins should be listed;
+ # in all other cases, only P1 origins should be listed.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
check_listed_origins(
- p1_origin_urls, scheduler_origins
- ) # Only the first page is listed
+ (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls,
+ scheduler_origins,
+ )
def test_gogs_incremental_lister(

File Metadata

Mime Type
text/plain
Expires
Tue, Jun 3, 7:22 PM (5 d, 21 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228297

Event Timeline