# Patch summary (leading text before the first "diff --git" header; patch
# tools skip it).
#
# swh/loader/pypi/loader.py
#   * hunk -106: _known_artifacts returns {} when last_snapshot is falsy or
#     has no 'branches' key, and now builds a dict keyed by
#     (filename, sha256) whose value is revision['id'], instead of a list of
#     (filename, sha256) tuples.
#   * hunk -146: the previous snapshot is no longer copied into
#     self._snapshot; the mapping is kept on self.known_artifacts and the
#     iterator returned by self.project.download_new_releases(...) is stored
#     as self.new_artifacts (was self.release_artifacts).
#   * hunk -182: the next artifact is pulled from self.new_artifacts.
#   * hunk -223: a freshly built revision is recorded in
#     self.known_artifacts[(filename, sha256)] instead of being added as a
#     branch of self._snapshot.
#     New method target_from_artifact(filename, sha256) returns
#     {'target': <revision id>, 'target_type': 'revision'} when the pair is
#     in self.known_artifacts with a truthy id, otherwise None.
#     generate_and_load_snapshot now rebuilds every branch from
#     self.project.all_release_artifacts(): one branch per filename
#     (utf-8 encoded), targeting whatever target_from_artifact returns, then
#     computes the snapshot id and calls self.maybe_load_snapshot.
#
# swh/loader/pypi/tests/test_loader.py
#   * hunks -217 and -374: the _known_artifacts test stubs return dicts
#     mapping (filename, sha256) to revision-id bytes (the first stub was a
#     generator using "yield from", the second returned a list).
#   * hunk -399: the "What wrongly happens now" section of the test
#     docstring is removed.
#   * hunk -458: the 4c99891f... entry is dropped from old_revisions,
#     version comments are added, and expected_snapshot_id changes.
#
# NOTE(review): generate_and_load_snapshot stores target = None for any
#   (filename, sha256) missing from self.known_artifacts, while
#   _known_artifacts evaluates rev['target'] for every value of
#   last_snapshot['branches']. If a snapshot with a None branch is read back
#   unchanged on a later visit, that subscript would raise TypeError --
#   TODO confirm what the storage returns for such branches.
# NOTE(review): the loop variable `release` in generate_and_load_snapshot is
#   never used; only the artifact lists are read.
# NOTE(review): the hunk bodies below are whitespace-collapsed (the patch
#   sits on three physical lines), so the line counts in the @@ headers do
#   not match the physical lines; line breaks and indentation must be
#   restored before this can be applied.
diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -106,13 +106,16 @@ list of (filename, sha256) tuples. """ + if not last_snapshot or 'branches' not in last_snapshot: + return {} + revs = [rev['target'] for rev in last_snapshot['branches'].values()] known_revisions = self.storage.revision_get(revs) - ret = [] + ret = {} for revision in known_revisions: if 'original_artifact' in revision['metadata']: artifact = revision['metadata']['original_artifact'] - ret.append((artifact['filename'], artifact['sha256'])) + ret[artifact['filename'], artifact['sha256']] = revision['id'] return ret def _last_snapshot(self): @@ -146,18 +149,11 @@ """ last_snapshot = self._last_snapshot() - if last_snapshot: - self._snapshot = last_snapshot.copy() - known_artifacts = self._known_artifacts(self._snapshot) - else: - self._snapshot = { - 'branches': {} - } - known_artifacts = [] + self.known_artifacts = self._known_artifacts(last_snapshot) # and the artifacts # that will be the source of data to retrieve - self.release_artifacts = self.project.download_new_releases( - known_artifacts + self.new_artifacts = self.project.download_new_releases( + self.known_artifacts ) # temporary state self._contents = [] @@ -182,7 +178,7 @@ return False try: - data = next(self.release_artifacts) + data = next(self.new_artifacts) except StopIteration: self.done = True return False @@ -223,18 +219,34 @@ revision_identifier(_revision)) self._revisions.append(_revision) - branch_name = artifact['filename'].encode('utf-8') - self._snapshot['branches'][branch_name] = { - 'target': _revision['id'], - 'target_type': 'revision', - } + artifact_key = artifact['filename'], artifact['sha256'] + self.known_artifacts[artifact_key] = _revision['id'] return not self.done + def target_from_artifact(self, filename, sha256): + target = self.known_artifacts.get((filename, sha256)) + if target: + return { + 'target': target, + 'target_type': 
'revision', + } + return None + def generate_and_load_snapshot(self): - self._snapshot['id'] = identifier_to_bytes( - snapshot_identifier(self._snapshot)) - self.maybe_load_snapshot(self._snapshot) + branches = {} + for release, artifacts in self.project.all_release_artifacts().items(): + for filename, sha256 in artifacts: + branch_name = filename + target = self.target_from_artifact(filename, sha256) + branches[branch_name.encode('utf-8')] = target + + snapshot = { + 'branches': branches, + } + snapshot['id'] = identifier_to_bytes( + snapshot_identifier(snapshot)) + self.maybe_load_snapshot(snapshot) def store_data(self): """(override) This sends collected objects to storage. diff --git a/swh/loader/pypi/tests/test_loader.py b/swh/loader/pypi/tests/test_loader.py --- a/swh/loader/pypi/tests/test_loader.py +++ b/swh/loader/pypi/tests/test_loader.py @@ -217,16 +217,16 @@ def _known_artifacts(self, last_snapshot): """List corresponding seen release artifacts""" - yield from [ + return { ( '0805nexter-1.1.0.zip', '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035' # noqa - ), + ): b'L\x99\x89\x1f\x93\xb8\x14P8Ww#Z7\xb5\xe9f\xdd\x15q', ( '0805nexter-1.2.0.zip', '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709' # noqa - ) - ] + ): b'\xe4E\xdaM\xa2+1\xbf\xeb\xb6\xff\xc48=\xbf\x83\x9a\x07M!', + } class LoaderNoNewChangesSinceLastVisitITest(BaseLoaderITest): @@ -374,21 +374,21 @@ } def _known_artifacts(self, last_snapshot): - """List corresponding seen release artifacts""" - return [ + """Map previously seen release artifacts to their revision""" + return { ( '0805nexter-1.1.0.zip', '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035' # noqa - ), + ): b'L\x99\x89\x1f\x93\xb8\x14P8Ww#Z7\xb5\xe9f\xdd\x15q', ( '0805nexter-1.2.0.zip', '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709' # noqa - ), + ): b'\xe4E\xdaM\xa2+1\xbf\xeb\xb6\xff\xc48=\xbf\x83\x9a\x07M!', ( '0805nexter-1.3.0.zip', 
'7097c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1' # noqa - ) - ] + ): b'\xfbF\xe4\x96\x05\xb0\xbb\xe6\x9f\x8cS\xd3\x15\xe8\x93p\xe7\xc6\xcb]', # noqa + } class LoaderChangesOldReleaseArtifactRemovedSinceLastVisit(BaseLoaderITest): @@ -399,14 +399,6 @@ - a new release has been uploaded - an older one has been removed - # What wrongly happens now - - The visit results in a new snapshot, such snapshot shares the same - branches as last visit's snapshot (including the unpublished - release artifacts). - - # What should happen: - The visit should result in a new snapshot. Such snapshot shares some of the same branches as prior visit (but not all): @@ -458,19 +450,21 @@ self.assertDirectoriesOk(expected_directories) expected_revisions = { + # 1.4.0 '5e91875f096ac48c98d74acf307439a3490f2827': '770e21215ecac53cea331d8ea4dc0ffc9d979367', # noqa } self.assertRevisionsOk(expected_revisions) old_revisions = { - '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d', # noqa + # 1.2.0 'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4', # noqa + # 1.3.0 'fb46e49605b0bbe69f8c53d315e89370e7c6cb5d': 'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a', # noqa } for rev, dir_id in old_revisions.items(): expected_revisions[rev] = dir_id - expected_snapshot_id = '0f36392a2c825bafa766b371f2485f9549eec41c' + expected_snapshot_id = 'fb192f35397812776377fa758e0ba4cf20a4cf5d' self.assertSnapshotOk(expected_snapshot_id, expected_revisions) _id = hashutil.hash_to_hex(self.loader._last_snapshot()['id'])