diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -120,7 +120,7 @@ self.snapshot: Optional[Snapshot] = None # state from previous visit self.latest_snapshot = None - self.latest_revision = None + self.latest_revision: Optional[Revision] = None def pre_cleanup(self): """Cleanup potential dangling files from prior runs (e.g. OOM killed @@ -239,11 +239,11 @@ swh_revision_id = swh_revision.id return swh_revision_id == revision_id - def start_from(self) -> Tuple[int, int, Dict[int, Tuple[bytes, ...]]]: + def start_from(self) -> Tuple[int, int]: """Determine from where to start the loading. Returns: - tuple (revision_start, revision_end, revision_parents) + tuple (revision_start, revision_end) Raises: @@ -261,15 +261,10 @@ revision_start = self.svnrepo.initial_revision() revision_end = revision_head - revision_parents: Dict[int, Tuple[bytes, ...]] = {revision_start: ()} - # start from a previous revision if any if self.incremental and self.latest_revision is not None: extra_headers = dict(self.latest_revision.extra_headers) revision_start = int(extra_headers[b"svn_revision"]) - revision_parents = { - revision_start: self.latest_revision.parents, - } if not self.check_history_not_altered(revision_start, self.latest_revision): self.log.debug( @@ -284,9 +279,6 @@ # now we know history is ok, we start at next revision revision_start = revision_start + 1 - # and the parent become the latest know revision for - # that repository - revision_parents[revision_start] = (self.latest_revision.id,) if revision_start > revision_end: msg = "%s@%s already injected." % (self.svnrepo.remote_url, revision_end) @@ -299,7 +291,7 @@ self.svnrepo, ) - return revision_start, revision_end, revision_parents + return revision_start, revision_end def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None: """Check for hash revision computation divergence. 
@@ -328,7 +320,7 @@ raise ValueError(err) def process_svn_revisions( - self, svnrepo, revision_start, revision_end, revision_parents + self, svnrepo, revision_start, revision_end ) -> Iterator[ Tuple[List[Content], List[SkippedContent], List[Directory], Revision] ]: @@ -351,18 +343,16 @@ """ gen_revs = svnrepo.swh_hash_data_per_revision(revision_start, revision_end) - swh_revision = None + parents = (self.latest_revision.id,) if self.latest_revision is not None else () count = 0 - for rev, nextrev, commit, new_objects, root_directory in gen_revs: + for rev, commit, new_objects, root_directory in gen_revs: count += 1 # Send the associated contents/directories _contents, _skipped_contents, _directories = new_objects # compute the fs tree's checksums dir_id = root_directory.hash - swh_revision = self.build_swh_revision( - rev, commit, dir_id, revision_parents.get(rev, ()) - ) + swh_revision = self.build_swh_revision(rev, commit, dir_id, parents) self.log.debug( "rev: %s, swhrev: %s, dir: %s", @@ -378,8 +368,7 @@ ): self._check_revision_divergence(rev, dir_id) - if nextrev: - revision_parents[nextrev] = [swh_revision.id] + parents = (swh_revision.id,) yield _contents, _skipped_contents, _directories, swh_revision @@ -392,11 +381,12 @@ self.origin = Origin(url=self.origin_url if self.origin_url else self.svn_url) def prepare(self): - latest_snapshot_revision = self._latest_snapshot_revision(self.origin_url) - if latest_snapshot_revision: - self.latest_snapshot, self.latest_revision = latest_snapshot_revision - self._snapshot = self.latest_snapshot - self._last_revision = self.latest_revision + if self.incremental: + latest_snapshot_revision = self._latest_snapshot_revision(self.origin_url) + if latest_snapshot_revision: + self.latest_snapshot, self.latest_revision = latest_snapshot_revision + self._snapshot = self.latest_snapshot + self._last_revision = self.latest_revision local_dirname = self._create_tmp_dir(self.temp_directory) @@ -416,9 +406,9 @@ raise try: - 
revision_start, revision_end, revision_parents = self.start_from() + revision_start, revision_end = self.start_from() self.swh_revision_gen = self.process_svn_revisions( - self.svnrepo, revision_start, revision_end, revision_parents + self.svnrepo, revision_start, revision_end ) except SvnLoaderUneventful as e: self.log.warning(e) diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -300,7 +300,6 @@ ) -> Iterator[ Tuple[ int, - Optional[int], Dict, Tuple[List[Content], List[SkippedContent], List[Directory]], DirectoryFromDisk, @@ -318,7 +317,6 @@ - Tuple (rev, nextrev, commit, objects_per_path): + Tuple (rev, commit, objects_per_path): - rev: current revision - - nextrev: next revision or None if we reached end_revision. - commit: commit data (author, date, message) for such revision - objects_per_path: Tuple of list of objects between start_revision and end_revision @@ -334,15 +332,10 @@ rev = commit["rev"] objects = self.swhreplay.compute_objects(rev) - if rev == end_revision: - nextrev = None - else: - nextrev = rev + 1 - if rev >= start_revision: # start yielding new data to archive once we reached the revision to # resume the loading from - yield rev, nextrev, commit, objects, self.swhreplay.directory + yield rev, commit, objects, self.swhreplay.directory def swh_hash_data_at_revision( self, revision: int