diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -195,10 +195,9 @@ if response.status_code != 200: # Log response content to ease debugging logger.warning( - "Unexpected HTTP status code %s on %s: %s", + "Unexpected HTTP status code %s for URL %s", response.status_code, response.url, - response.content, ) # The lister must fail on blocking errors response.raise_for_status() @@ -294,7 +293,8 @@ else: logger.debug("Project '%s' does not have any VCS", project) else: - # Should always match, let's log it + # Should almost always match, let's log it + # The only ones that don't match are mostly specialized one-off URLs. msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) @@ -324,11 +324,15 @@ msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) - res = self.page_request(endpoint, {}).json() + try: + res = self.page_request(endpoint, {}).json() + except requests.HTTPError: + # We've already logged in `page_request` + return [] tools = res.get("tools") if tools is None: - # This probably never happens + # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -338,3 +338,35 @@ with pytest.raises(HTTPError): lister.run() + + +@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404]) +def test_sourceforge_lister_project_error( + datadir, swh_scheduler, requests_mock, status_code, +): + lister = SourceForgeLister(scheduler=swh_scheduler) + + requests_mock.get( + MAIN_SITEMAP_URL, + text=get_main_sitemap(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-0.xml", + text=get_subsitemap_0(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-1.xml", + text=get_subsitemap_1(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("https://sourceforge.net/rest/.*"), status_code=status_code + ) + + # Make sure that a single non-OK status does not cause the entire lister to stop + stats = lister.run() + # ... and that it does not mysteriously list something + assert stats.pages == 0 + assert stats.origins == 0