diff --git a/swh/loader/svn/replay.py b/swh/loader/svn/replay.py --- a/swh/loader/svn/replay.py +++ b/swh/loader/svn/replay.py @@ -443,14 +443,30 @@ svnrepo=self.svnrepo, ) - def add_directory(self, path: str, *args) -> DirEditor: + def add_directory( + self, path: str, copyfrom_path: Optional[str] = None, copyfrom_rev: int = -1 + ) -> DirEditor: """Adding a new directory.""" path_bytes = os.fsencode(path) + fullpath = os.path.join(self.rootpath, path_bytes) - os.makedirs(os.path.join(self.rootpath, path_bytes), exist_ok=True) - if path_bytes and path_bytes not in self.directory: - self.dir_states[path_bytes] = DirState() - self.directory[path_bytes] = from_disk.Directory() + os.makedirs(fullpath, exist_ok=True) + if copyfrom_rev == -1: + if path_bytes and path_bytes not in self.directory: + self.dir_states[path_bytes] = DirState() + self.directory[path_bytes] = from_disk.Directory() + else: + url = svn_urljoin(self.svnrepo.remote_url, copyfrom_path) + self.remove_child(path_bytes) + self.svnrepo.export( + url, + to=fullpath, + peg_rev=copyfrom_rev, + ignore_keywords=True, + overwrite=True, + ignore_externals=True, + ) + self.directory[path_bytes] = from_disk.Directory.from_disk(path=fullpath) return DirEditor( self.directory, @@ -474,12 +490,28 @@ svnrepo=self.svnrepo, ) - def add_file(self, path: str, *args) -> FileEditor: + def add_file( + self, path: str, copyfrom_path: Optional[str] = None, copyfrom_rev: int = -1 + ) -> FileEditor: """Creating a new file.""" path_bytes = os.fsencode(path) - self.directory[path_bytes] = from_disk.Content() fullpath = os.path.join(self.rootpath, path_bytes) + self.file_states[fullpath] = FileState() + if copyfrom_rev == -1: + self.directory[path_bytes] = from_disk.Content() + else: + url = svn_urljoin(self.svnrepo.remote_url, copyfrom_path) + self.remove_child(path_bytes) + self.svnrepo.export( + url, + to=fullpath, + peg_rev=copyfrom_rev, + ignore_keywords=True, + overwrite=True, + ) + self.directory[path_bytes] = from_disk.Content.from_file(path=fullpath) + return FileEditor( self.directory, self.rootpath, @@ -931,7 +963,7 @@ rootpath=rootpath, directory=directory, svnrepo=svnrepo, temp_dir=temp_dir ) - def replay(self, rev: int) -> from_disk.Directory: + def replay(self, rev: int, low_water_mark: int) -> from_disk.Directory: """Replay svn actions between rev and rev+1. This method updates in place the self.editor.directory, as well as the @@ -942,12 +974,12 @@ """ codecs.register_error("strict", _ra_codecs_error_handler) - self.conn.replay(rev, rev + 1, self.editor) + self.conn.replay(rev, low_water_mark, self.editor) codecs.register_error("strict", codecs.strict_errors) return self.editor.directory def compute_objects( - self, rev: int + self, rev: int, low_water_mark: int ) -> Tuple[List[Content], List[SkippedContent], List[Directory]]: """Compute objects added or modified at revisions rev. Expects the state to be at previous revision's objects. @@ -960,7 +992,7 @@ mutates the filesystem at rootpath accordingly. """ - self.replay(rev) + self.replay(rev, low_water_mark) contents: List[Content] = [] skipped_contents: List[SkippedContent] = [] diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -220,6 +220,7 @@ "author_name": author, "message": message, "has_changes": has_changes, + "changed_paths": changed_paths, } def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]: @@ -249,7 +250,7 @@ paths=None, start=revision_start, end=revision_end, - discover_changed_paths=self.from_dump, + discover_changed_paths=True, ): yield self.__to_entry(log_entry) @@ -539,7 +540,23 @@ first_revision = 1 if start_revision else 0 # handle empty repository edge case for commit in self.logs(first_revision, end_revision): rev = commit["rev"] - objects = self.swhreplay.compute_objects(rev) + copyfrom_revs = ( + [ + copyfrom_rev + for (_, _, copyfrom_rev, _) in commit["changed_paths"].values() + if copyfrom_rev != -1 + ] + if commit["changed_paths"] + else None + ) + low_water_mark = rev + 1 + if copyfrom_revs: + # when files or directories in the revision to replay have been copied from + # ancestor revisions, we need to adjust the low water mark revision used by + # svn replay API to handle the copies in our commit editor and to ensure + # replace operations after copy will be replayed + low_water_mark = min(copyfrom_revs) + objects = self.swhreplay.compute_objects(rev, low_water_mark) if rev >= start_revision: # start yielding new data to archive once we reached the revision to diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -2197,3 +2197,92 @@ with open(dest_path, "rb") as f: assert f.read() == content + + +@pytest.mark.parametrize("svn_loader_cls", [SvnLoader, SvnLoaderFromRemoteDump]) +def test_loader_repo_with_copyfrom_and_replace_operations( + swh_storage, repo_url, tmp_path, svn_loader_cls +): + add_commit( + repo_url, + "Create trunk/data folder", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/foo", + data=b"foo", + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/bar", + data=b"bar", + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/baz/", + ), + ], + ) + + add_commit( + repo_url, + "Create trunk/project folder", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/project/", + ), + ], + ) + + add_commit( + repo_url, + "Create trunk/project/bar as copy of trunk/data/bar from revision 1", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/project/bar", + copyfrom_path=repo_url + "/trunk/data/bar", + copyfrom_rev=1, + ), + ], + ) + + add_commit( + repo_url, + ( + "Create trunk/project/data/ folder as a copy of /trunk/data from revision 1" + " and replace the trunk/project/data/baz/ folder by a trunk/project/data/baz file" + ), + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/project/data/", + copyfrom_path=repo_url + "/trunk/data/", + copyfrom_rev=1, + ), + CommitChange( + change_type=CommitChangeType.Delete, + path="trunk/project/data/baz/", + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/project/data/baz", + data=b"baz", + ), + ], + ) + + loader = svn_loader_cls( + swh_storage, repo_url, temp_directory=tmp_path, check_revision=1 + ) + + assert loader.load() == {"status": "eventful"} + + assert_last_visit_matches( + loader.storage, + repo_url, + status="full", + type="svn", + ) + check_snapshot(loader.snapshot, loader.storage) diff --git a/swh/loader/svn/tests/utils.py b/swh/loader/svn/tests/utils.py --- a/swh/loader/svn/tests/utils.py +++ b/swh/loader/svn/tests/utils.py @@ -23,6 +23,8 @@ path: str properties: Dict[str, str] data: bytes + copyfrom_path: str + copyfrom_rev: int def add_commit(repo_url: str, message: str, changes: List[CommitChange]) -> None: @@ -35,17 +37,19 @@ else: dir_change = change["path"].endswith("/") split_path = change["path"].rstrip("/").split("/") + copyfrom_path = change.get("copyfrom_path") + copyfrom_rev = change.get("copyfrom_rev", -1) for i in range(len(split_path)): path = "/".join(split_path[0 : i + 1]) if i < len(split_path) - 1: try: - root.add_directory(path).close() + root.add_directory(path, copyfrom_path, copyfrom_rev).close() except SubversionException: pass else: if dir_change: try: - dir = root.add_directory(path) + dir = root.add_directory(path, copyfrom_path, copyfrom_rev) except SubversionException: dir = root.open_directory(path) if "properties" in change: @@ -54,7 +58,7 @@ dir.close() else: try: - file = root.add_file(path) + file = root.add_file(path, copyfrom_path, copyfrom_rev) except SubversionException: file = root.open_file(path) if "properties" in change: