diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -258,9 +258,9 @@ logger.info("Fetching poms..") for pom in out_pom: - text = self.page_request(pom, {}) try: - project = xmltodict.parse(text.content.decode()) + response = self.page_request(pom, {}) + project = xmltodict.parse(response.content.decode()) if "scm" in project["project"]: if "connection" in project["project"]["scm"]: scm = project["project"]["scm"]["connection"] @@ -278,6 +278,11 @@ logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) + except requests.HTTPError: + logger.warning( + "POM info page could not be fetched, skipping project '%s'", + pom, + ) except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -291,35 +291,37 @@ @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) -def test_maven_list_http_error( +def test_maven_list_http_error_on_index_read( swh_scheduler, requests_mock, mocker, maven_index, http_code ): - """Test handling of some common HTTP errors: - - 400: Bad request. - - 404: Resource no found. - - 500: Internal server error. - - 502: Bad gateway ou proxy Error. - """ + """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) - - # Test failure of index retrieval. - requests_mock.get(INDEX_URL, status_code=http_code) - - with pytest.raises(requests.HTTPError): + with pytest.raises(requests.HTTPError): # listing cannot continue, so stop lister.run() - # Test failure of artefacts retrieval. 
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 0 + +@pytest.mark.parametrize("http_code", [400, 404, 500, 502]) +def test_maven_list_http_error_artifacts( + swh_scheduler, requests_mock, mocker, maven_index, http_code, maven_pom_2 +): + """should continue listing when failing to retrieve artifacts.""" + # Test failure of artefacts retrieval. requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, status_code=http_code) + requests_mock.get(URL_POM_2, text=maven_pom_2) - with pytest.raises(requests.HTTPError): - lister.run() + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + # on artifacts though, the HTTP error is logged and listing continues + lister.run() # If the maven_index step succeeded but not the get_pom step, # then we get only the 2 maven-jar origins (and not the 2 additional # src origins). scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 2 + assert len(scheduler_origins) == 3