diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -97,6 +97,8 @@ } ) + self.jar_origins: Dict[str, ListedOrigin] = {} + def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) @@ -185,22 +187,10 @@ m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) - if ( - self.incremental - and self.state - and self.state.last_seen_doc - and self.state.last_seen_doc >= doc_id - ): - # jar_src["doc"] contains the id of the current document, whatever - # its type (scm or jar). - jar_src["doc"] = None - else: - jar_src["doc"] = doc_id + # jar_src["doc"] contains the id of the current document, whatever + # its type (scm or jar). + jar_src["doc"] = doc_id else: - # If incremental mode, we don't record any line that is - # before our last recorded doc id. - if self.incremental and jar_src["doc"] is None: - continue m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() @@ -315,35 +305,61 @@ ) yield origin else: - # Origin is a source archive: + # Origin is gathering source archives: last_update_dt = None last_update_iso = "" last_update_seconds = str(page["time"])[:-3] try: last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) - last_update_dt_tz = last_update_dt.astimezone(timezone.utc) + last_update_dt = last_update_dt.astimezone(timezone.utc) except OverflowError: logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: - last_update_iso = last_update_dt_tz.isoformat() - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type=page["type"], - last_update=last_update_dt_tz, - extra_loader_arguments={ - "artifacts": [ - { - "time": last_update_iso, - "gid": page["gid"], - "aid": page["aid"], - "version": page["version"], - "base_url": self.BASE_URL, - } - ] - }, - ) - yield origin + last_update_iso = last_update_dt.isoformat() + + # Origin URL will target page holding sources for all versions of + # an artifactId (package name) inside a groupId (namespace) + path = "/".join(page["gid"].split(".")) + origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") + + artifact = { + **{k: v for k, v in page.items() if k != "doc"}, + "time": last_update_iso, + "base_url": self.BASE_URL, + } + + if origin_url not in self.jar_origins: + # Create ListedOrigin instance if we did not see that origin yet + jar_origin = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type=page["type"], + last_update=last_update_dt, + extra_loader_arguments={"artifacts": [artifact]}, + ) + self.jar_origins[origin_url] = jar_origin + else: + # Update list of source artifacts for that origin otherwise + jar_origin = self.jar_origins[origin_url] + artifacts = jar_origin.extra_loader_arguments["artifacts"] + if artifact not in artifacts: + artifacts.append(artifact) + + if ( + jar_origin.last_update + and last_update_dt + and last_update_dt > jar_origin.last_update + ): + jar_origin.last_update = last_update_dt + + if not self.incremental or ( + self.state and page["doc"] > self.state.last_seen_doc + ): + # Yield origin with updated source artifacts, multiple instances of + # ListedOrigin for the same origin URL but with different artifacts + # list will be sent to the scheduler but it will deduplicate them and + # take the latest one to upsert in database + yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. diff --git a/swh/lister/maven/tests/data/http_indexes/export.fld b/swh/lister/maven/tests/data/http_indexes/export.fld deleted file mode 100755 --- a/swh/lister/maven/tests/data/http_indexes/export.fld +++ /dev/null @@ -1,113 +0,0 @@ -doc 0 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.0|sources|jar - field 1 - name m - type string - value 1626111735737 - field 2 - name i - type string - value jar|1626109619335|14316|2|2|0|jar - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 1 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.0|NA|pom - field 1 - name m - type string - value 1626111735764 - field 2 - name i - type string - value jar|1626109636636|-1|1|0|0|pom - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 2 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.1|sources|jar - field 1 - name m - type string - value 1626111784883 - field 2 - name i - type string - value jar|1626111425534|14510|2|2|0|jar - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 3 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.1|NA|pom - field 1 - name m - type string - value 1626111784915 - field 2 - name i - type string - value jar|1626111437014|-1|1|0|0|pom - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 4 - field 14 - name DESCRIPTOR - type string - value NexusIndex - field 15 - name IDXINFO - type string - value 1.0|index -doc 5 - field 16 - name allGroups - type string - value allGroups - field 17 - name allGroupsList - type string - value al.aldi -doc 6 - field 18 - name rootGroups - type string - value rootGroups - field 19 - name rootGroupsList - type string - value al -END -checksum 00000000003321211082 diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr.fld b/swh/lister/maven/tests/data/http_indexes/export_full.fld old mode 100755 new mode 100644 rename from swh/lister/maven/tests/data/http_indexes/export_incr.fld rename to swh/lister/maven/tests/data/http_indexes/export_full.fld diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld b/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld @@ -0,0 +1,42 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1633786348254 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1633786348271 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,9 +1,8 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import timezone from pathlib import Path import iso8601 @@ -26,10 +25,7 @@ LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) -LIST_SRC = ( - MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", - MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", -) +LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { @@ -40,6 +36,7 @@ "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", + "base_url": MVN_URL, }, { "type": "maven", @@ -49,18 +46,19 @@ "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", + "base_url": MVN_URL, }, ) @pytest.fixture -def maven_index(datadir) -> str: - return Path(datadir, "http_indexes", "export.fld").read_text() +def maven_index_full(datadir) -> str: + return Path(datadir, "http_indexes", "export_full.fld").read_text() @pytest.fixture -def maven_index_incr(datadir) -> str: - return Path(datadir, "http_indexes", "export_incr.fld").read_text() +def maven_index_incr_first(datadir) -> str: + return Path(datadir, "http_indexes", "export_incr_first.fld").read_text() @pytest.fixture @@ -83,17 +81,21 @@ return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() -def test_maven_full_listing( - swh_scheduler, - requests_mock, - mocker, - maven_index, - maven_pom_1, - maven_pom_2, +@pytest.fixture(autouse=True) +def network_requests_mock( + requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): + requests_mock.get(INDEX_URL, text=maven_index_full) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + requests_mock.get(URL_POM_3, text=maven_pom_3) + + +def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" + # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, @@ -102,43 +104,25 @@ incremental=False, ) - # Set up test. - index_text = maven_index - requests_mock.get(INDEX_URL, text=index_text) - requests_mock.get(URL_POM_1, text=maven_pom_1) - requests_mock.get(URL_POM_2, text=maven_pom_2) - - # Then run the lister. stats = lister.run() # Start test checks. - assert stats.pages == 4 - assert stats.origins == 4 + assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] - assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) + + # 3 git origins + 1 maven origin with 2 releases (one per jar) + assert len(origin_urls) == 4 + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - last_update_src = iso8601.parse_date(src.get("time")).astimezone( - tz=timezone.utc - ) - assert last_update_src == origin.last_update - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - assert MVN_URL == artifact["base_url"] - break - else: - raise AssertionError( - "Could not find scheduler origin in referenced origins." - ) + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) + scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 @@ -148,10 +132,7 @@ def test_maven_full_listing_malformed( swh_scheduler, requests_mock, - mocker, - maven_index, maven_pom_1_malformed, - maven_pom_2, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" @@ -165,39 +146,28 @@ ) # Set up test. - index_text = maven_index - requests_mock.get(INDEX_URL, text=index_text) requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) - requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. - assert stats.pages == 4 - assert stats.origins == 3 + assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] - LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) - assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + # 2 git origins + 1 maven origin with 2 releases (one per jar) + assert len(origin_urls) == 3 + assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - assert MVN_URL == artifact["base_url"] - break - else: - raise AssertionError( - "Could not find scheduler origin in referenced origins." - ) + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) + scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 @@ -207,12 +177,8 @@ def test_maven_incremental_listing( swh_scheduler, requests_mock, - mocker, - maven_index, - maven_index_incr, - maven_pom_1, - maven_pom_2, - maven_pom_3, + maven_index_full, + maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" @@ -226,9 +192,7 @@ ) # Set up test. - requests_mock.get(INDEX_URL, text=maven_index) - requests_mock.get(URL_POM_1, text=maven_pom_1) - requests_mock.get(URL_POM_2, text=maven_pom_2) + requests_mock.get(INDEX_URL, text=maven_index_incr_first) # Then run the lister. stats = lister.run() @@ -236,8 +200,20 @@ # Start test checks. assert lister.incremental assert lister.updated - assert stats.pages == 4 - assert stats.origins == 4 + assert stats.pages == 2 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + origin_urls = [origin.url for origin in scheduler_origins] + + # 1 git origins + 1 maven origin with 1 release (one per jar) + assert len(origin_urls) == 2 + assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) + assert last_update_src == origin.last_update + assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( @@ -250,12 +226,11 @@ scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None - assert scheduler_state.last_seen_doc == 3 - assert scheduler_state.last_seen_pom == 3 + assert scheduler_state.last_seen_doc == 1 + assert scheduler_state.last_seen_pom == 1 # Set up test. - requests_mock.get(INDEX_URL, text=maven_index_incr) - requests_mock.get(URL_POM_3, text=maven_pom_3) + requests_mock.get(INDEX_URL, text=maven_index_full) # Then run the lister. stats = lister.run() @@ -263,26 +238,19 @@ # Start test checks. assert lister.incremental assert lister.updated - assert stats.pages == 1 - assert stats.origins == 1 + assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - break - else: - raise AssertionError + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None @@ -291,9 +259,7 @@ @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) -def test_maven_list_http_error_on_index_read( - swh_scheduler, requests_mock, mocker, maven_index, http_code -): +def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) @@ -307,21 +273,20 @@ @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( - swh_scheduler, requests_mock, mocker, maven_index, http_code, maven_pom_2 + swh_scheduler, + requests_mock, + http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. - requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, status_code=http_code) - requests_mock.get(URL_POM_2, text=maven_pom_2) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() - # If the maven_index step succeeded but not the get_pom step, - # then we get only the 2 maven-jar origins (and not the 2 additional - # src origins). + # If the maven_index_full step succeeded but not the get_pom step, + # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 3