diff --git a/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom b/swh/lister/maven/tests/data/citrus-parent-3.0.7.pom similarity index 100% rename from swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom rename to swh/lister/maven/tests/data/citrus-parent-3.0.7.pom diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.0_sprova4j-0.1.0.pom similarity index 100% rename from swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom rename to swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.0_sprova4j-0.1.0.pom diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.1_sprova4j-0.1.1.pom similarity index 100% rename from swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom rename to swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.1_sprova4j-0.1.1.pom diff --git a/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom b/swh/lister/maven/tests/data/https_repo1.maven.org/maven2_com_arangodb_arangodb-graphql_1.2_arangodb-graphql-1.2.pom similarity index 100% rename from swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom rename to swh/lister/maven/tests/data/https_repo1.maven.org/maven2_com_arangodb_arangodb-graphql_1.2_arangodb-graphql-1.2.pom diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom b/swh/lister/maven/tests/data/sprova4j-0.1.0.malformed.pom similarity index 100% rename from swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom rename to swh/lister/maven/tests/data/sprova4j-0.1.0.malformed.pom diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index e0608e4..9936a9b 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,388 +1,359 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" USER_REPO0 = "aldialimucaj/sprova4j" GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" ORIGIN_GIT = GIT_REPO_URL0_HTTPS USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" ORIGIN_GIT_INCR = GIT_REPO_URL1_HTTPS USER_REPO2 = "webx/citrus" GIT_REPO_URL2_HTTPS = f"https://github.com/{USER_REPO2}" GIT_REPO_URL2_API = f"https://api.github.com/repos/{USER_REPO2}" ORIGIN_SRC = MVN_URL + "al/aldi/sprova4j" LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() -@pytest.fixture -def maven_pom_1(datadir) -> bytes: - return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() - - @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() -@pytest.fixture -def maven_pom_1_malformed(datadir) -> bytes: - return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() - - -@pytest.fixture -def maven_pom_2(datadir) -> bytes: - return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() - - -@pytest.fixture -def maven_pom_3(datadir) -> bytes: - return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() - - -@pytest.fixture -def maven_pom_multi_byte_encoding(datadir) -> bytes: - return Path(datadir, "https_maven.org", "citrus-parent-3.0.7.pom").read_bytes() - - -@pytest.fixture -def requests_mock(requests_mock): - """If github api calls for the configured scm repository, returns its canonical url.""" +@pytest.fixture(autouse=True) +def network_requests_mock(requests_mock, requests_mock_datadir, maven_index_full): + # If github api calls for the configured scm repository, returns its canonical url. for url_api, url_html in [ (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), (GIT_REPO_URL2_API, GIT_REPO_URL2_HTTPS), ]: requests_mock.get( url_api, json={"html_url": url_html}, ) - yield requests_mock - -@pytest.fixture(autouse=True) -def network_requests_mock( - requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 -): requests_mock.get(INDEX_URL, content=maven_index_full) - requests_mock.get(URL_POM_1, content=maven_pom_1) - requests_mock.get(URL_POM_2, content=maven_pom_2) - requests_mock.get(URL_POM_3, content=maven_pom_3) @pytest.fixture(autouse=True) def retry_sleep_mock(mocker): mocker.patch.object(MavenLister.http_request.retry, "sleep") def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC} assert len(set(origin_urls)) == len(origin_urls) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, - maven_pom_1_malformed, + datadir, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. - requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) + requests_mock.get( + URL_POM_1, content=Path(datadir, "sprova4j-0.1.0.malformed.pom").read_bytes() + ) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC} assert len(origin_urls) == len(set(origin_urls)) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_SRC} assert len(origin_urls) == len(set(origin_urls)) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert set(origin_urls) == {ORIGIN_SRC, ORIGIN_GIT, ORIGIN_GIT_INCR} assert len(origin_urls) == len(set(origin_urls)) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert set(origin_urls) == {ORIGIN_SRC, ORIGIN_GIT_INCR} assert len(origin_urls) == len(set(origin_urls)) def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None -def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): +def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one - requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) + requests_mock.get( + URL_POM_1, + content=requests.get(URL_POM_1).content.decode("utf-8").encode("utf-32"), + ) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 -def test_maven_list_pom_multi_byte_encoding( - swh_scheduler, requests_mock, maven_pom_multi_byte_encoding -): +def test_maven_list_pom_multi_byte_encoding(swh_scheduler, requests_mock, datadir): """should parse POM file with multi-byte encoding.""" # replace pom file with a multi-byte encoding one - requests_mock.get(URL_POM_1, content=maven_pom_multi_byte_encoding) + requests_mock.get( + URL_POM_1, content=Path(datadir, "citrus-parent-3.0.7.pom").read_bytes() + ) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 3