Patch overview (text before the first "diff --git" line is ignored by patch tools).

Purpose: let the remote-dump loader return early when the last revision
already loaded in the archive still matches the remote subversion server,
so that no dump of all revisions is generated and loaded again.

swh/loader/svn/loader.py
  * Add a `skip_post_load` attribute, initialised to False.
  * `check_history_not_altered` no longer takes a `svnrepo` argument; it reads
    `self.svnrepo` instead and asserts that it is not None before using it.
    The single visible call site is updated accordingly.
  * `post_load` returns immediately when `skip_post_load` is set.
  * Before dumping revisions, look up the latest snapshot/revision for the
    origin; when one exists, instantiate `SvnRepo` on the origin URL and run
    `check_history_not_altered`. If it returns True, reuse the previous
    snapshot and revision, set `done` and `skip_post_load`, and return before
    `dump_svn_revisions` is called.

swh/loader/svn/tests/test_loader.py
  * Rename `test_loader_svn_loader_from_remote_dump` to
    `test_svn_loader_from_remote_dump` and remove its docstring.
  * Add `test_svn_loader_from_remote_dump_multiple_load_on_stale_repo`, which
    loads the same repository twice and checks that the second load is
    "uneventful" and calls none of the dump, mount, load or post-load
    divergence-check steps.

NOTE(review): the line breaks and leading indentation below were reconstructed
from a whitespace-collapsed copy of this patch; hunk line counts match the
"@@" headers, but exact indentation should be confirmed against the
repository before applying.

diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -102,6 +102,7 @@
         self.temp_directory = temp_directory
         self.done = False
         self.svnrepo = None
+        self.skip_post_load = False
         # Revision check is configurable
         self.check_revision = check_revision
         # internal state used to store swh objects
@@ -223,15 +224,15 @@
             rev, commit, self.svnrepo.uuid, dir_id, parents
         )
 
-    def check_history_not_altered(
-        self, svnrepo: SvnRepo, revision_start: int, swh_rev: Revision
-    ) -> bool:
+    def check_history_not_altered(self, revision_start: int, swh_rev: Revision) -> bool:
         """Given a svn repository, check if the history was modified in between visits.
 
         """
         revision_id = swh_rev.id
         parents = swh_rev.parents
-        hash_data_per_revs = svnrepo.swh_hash_data_at_revision(revision_start)
+
+        assert self.svnrepo is not None
+        hash_data_per_revs = self.svnrepo.swh_hash_data_at_revision(revision_start)
 
         rev = revision_start
         commit, root_dir = list(hash_data_per_revs)[0]
@@ -279,9 +280,7 @@
                 revision_start,
             )
 
-            if not self.check_history_not_altered(
-                self.svnrepo, revision_start, self.latest_revision
-            ):
+            if not self.check_history_not_altered(revision_start, self.latest_revision):
                 self.log.debug(
                     (
                         "History of svn %s@%s altered. "
@@ -540,6 +539,8 @@
         return self._visit_status
 
     def post_load(self, success: bool = True) -> None:
+        if self.skip_post_load:
+            return
         if success and self._last_revision is not None:
             # check if the reconstructed filesystem for the last loaded revision is
             # consistent with the one obtained with a svn export operation. If it is not
@@ -782,6 +783,23 @@
         # subversion origin and get the number of the last one
         last_loaded_svn_rev = self.get_last_loaded_svn_rev(self.svn_url)
 
+        # Then check if the last loaded revision in the archive is different
+        # from the last revision on the remote subversion server.
+        # Skip the dump of all revisions and the loading process if they are identical
+        # to save some disk space and processing time.
+        last_loaded_snp_and_rev = self._latest_snapshot_revision(self.origin_url)
+        if last_loaded_snp_and_rev is not None:
+            last_loaded_snp, last_loaded_rev = last_loaded_snp_and_rev
+            self.svnrepo = SvnRepo(
+                self.origin_url, self.origin_url, self.temp_dir, self.max_content_size
+            )
+            if self.check_history_not_altered(last_loaded_svn_rev, last_loaded_rev):
+                self._snapshot = last_loaded_snp
+                self._last_revision = last_loaded_rev
+                self.done = True
+                self.skip_post_load = True
+                return
+
         # Then try to generate a dump file containing relevant svn revisions
         # to load, an exception will be thrown if something wrong happened
         dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -742,14 +742,7 @@
     assert not os.path.exists(loader.temp_dir)
 
 
-def test_loader_svn_loader_from_remote_dump(swh_storage, datadir, tmp_path):
-    """Repository with wrong symlinks should be ingested ok nonetheless
-
-    Edge case:
-       - wrong symbolic link
-       - wrong symbolic link with empty space names
-
-    """
+def test_svn_loader_from_remote_dump(swh_storage, datadir, tmp_path):
     archive_name = "pkg-gourmet"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
@@ -810,6 +803,59 @@
     assert loaderFromDump.load() == {"status": "uneventful"}
 
 
+def test_svn_loader_from_remote_dump_multiple_load_on_stale_repo(
+    swh_storage, datadir, tmp_path, mocker
+):
+    archive_name = "pkg-gourmet"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    # first load: a dump file will be created, mounted to a local repository
+    # and the latter will be loaded into the archive
+    loaderFromDump = SvnLoaderFromRemoteDump(
+        swh_storage, repo_url, temp_directory=tmp_path
+    )
+    assert loaderFromDump.load() == {"status": "eventful"}
+    assert_last_visit_matches(
+        loaderFromDump.storage,
+        repo_url,
+        status="full",
+        type="svn",
+        snapshot=GOURMET_SNAPSHOT.id,
+    )
+
+    # second load on same repository: the loader will detect there is no changes
+    # since last load and will skip the dump, mount and load phases
+    loaderFromDump = SvnLoaderFromRemoteDump(
+        swh_storage, repo_url, temp_directory=tmp_path
+    )
+
+    loaderFromDump.dump_svn_revisions = mocker.MagicMock()
+    init_svn_repo_from_dump = mocker.patch(
+        "swh.loader.svn.loader.init_svn_repo_from_dump"
+    )
+    loaderFromDump.process_svn_revisions = mocker.MagicMock()
+    loaderFromDump._check_revision_divergence = mocker.MagicMock()
+
+    assert loaderFromDump.load() == {"status": "uneventful"}
+    assert_last_visit_matches(
+        loaderFromDump.storage,
+        repo_url,
+        status="full",
+        type="svn",
+        snapshot=GOURMET_SNAPSHOT.id,
+    )
+
+    # no dump
+    loaderFromDump.dump_svn_revisions.assert_not_called()
+    # no mount
+    init_svn_repo_from_dump.assert_not_called()
+    # no loading
+    loaderFromDump.process_svn_revisions.assert_not_called()
+    # no redundant post_load processing
+    loaderFromDump._check_revision_divergence.assert_not_called()
+
+
 def test_loader_user_defined_svn_properties(swh_storage, datadir, tmp_path):
     """Edge cases: The repository held some user defined svn-properties with special
     encodings, this prevented the repository from being loaded even though we do not