Page Menu | Home | Software Heritage
Paste P1219

SvnLoaderFromRemoteDump optimization
Active | Public

Authored by anlambert on Nov 18 2021, 4:54 PM.
(swh) ✔ ~/swh/swh-environment/swh-loader-svn [master|✚ 2⚑ 10]
16:43 $ git diff
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
index eac96a3..698babe 100644
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -223,15 +223,15 @@ Local repository not cleaned up for investigation: %s""",
rev, commit, self.svnrepo.uuid, dir_id, parents
)
- def check_history_not_altered(
- self, svnrepo: SvnRepo, revision_start: int, swh_rev: Revision
- ) -> bool:
+ def check_history_not_altered(self, revision_start: int, swh_rev: Revision) -> bool:
"""Given a svn repository, check if the history was modified in between visits.
"""
revision_id = swh_rev.id
parents = swh_rev.parents
- hash_data_per_revs = svnrepo.swh_hash_data_at_revision(revision_start)
+
+ assert self.svnrepo is not None
+ hash_data_per_revs = self.svnrepo.swh_hash_data_at_revision(revision_start)
rev = revision_start
commit, root_dir = list(hash_data_per_revs)[0]
@@ -279,9 +279,7 @@ Local repository not cleaned up for investigation: %s""",
revision_start,
)
- if not self.check_history_not_altered(
- self.svnrepo, revision_start, self.latest_revision
- ):
+ if not self.check_history_not_altered(revision_start, self.latest_revision):
self.log.debug(
(
"History of svn %s@%s altered. "
@@ -782,6 +780,22 @@ class SvnLoaderFromRemoteDump(SvnLoader):
# subversion origin and get the number of the last one
last_loaded_svn_rev = self.get_last_loaded_svn_rev(self.svn_url)
+ # Then check if the last loaded revision in the archive is different
+ # from the last revision on the remote subversion server.
+ # Skip the dump of all revisions and the loading process if they are identical
+ # to save some disk space and processing time.
+ last_loaded_snp_and_rev = self._latest_snapshot_revision(self.origin_url)
+ if last_loaded_snp_and_rev is not None:
+ last_loaded_snp, last_loaded_rev = last_loaded_snp_and_rev
+ self.svnrepo = SvnRepo(
+ self.origin_url, self.origin_url, self.temp_dir, self.max_content_size
+ )
+ if self.check_history_not_altered(last_loaded_svn_rev, last_loaded_rev):
+ self._snapshot = last_loaded_snp
+ self._last_revision = last_loaded_rev
+ self.done = True
+ return
+
# Then try to generate a dump file containing relevant svn revisions
# to load, an exception will be thrown if something wrong happened
dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)