diff --git a/swh/loader/svn/replay.py b/swh/loader/svn/replay.py --- a/swh/loader/svn/replay.py +++ b/swh/loader/svn/replay.py @@ -361,6 +361,8 @@ externals: Dict[str, List[ExternalDefinition]] = field(default_factory=dict) """Map a path in the directory to a list of (external_url, revision, relative_url) targeting it""" + externals_paths: Set[bytes] = field(default_factory=set) + """Keep track of all external paths reachable from the directory""" class DirEditor: @@ -413,11 +415,8 @@ path: to remove from the current objects. """ - try: + if path in self.directory: entry_removed = self.directory[path] - except KeyError: - entry_removed = None - else: del self.directory[path] fpath = os.path.join(self.rootpath, path) if isinstance(entry_removed, from_disk.Directory): @@ -542,10 +541,33 @@ def delete_entry(self, path: str, revision: int) -> None: """Remove a path.""" path_bytes = os.fsencode(path) + fullpath = os.path.join(self.rootpath, path_bytes) + + if os.path.isdir(fullpath): + # remove all external paths associated to the removed directory + # (we cannot simply remove a root external directory as externals + # paths associated to ancestor directories can overlap) + for external_path in self.dir_states[path_bytes].externals_paths: + self.remove_external_path( + external_path, + root_path=path_bytes, + remove_subpaths=False, + force=True, + ) + if path_bytes not in self.editor.external_paths: - fullpath = os.path.join(self.rootpath, path_bytes) self.file_states.pop(fullpath, None) self.remove_child(path_bytes) + elif os.path.isdir(fullpath): + # versioned and external paths can overlap so we need to iterate on + # all subpaths to check which ones to remove + for root, dirs, files in os.walk(fullpath): + for p in chain(dirs, files): + full_repo_path = os.path.join(root, p) + repo_path = full_repo_path.replace(self.rootpath + b"/", b"") + if repo_path not in self.editor.external_paths: + self.file_states.pop(full_repo_path, None) + self.remove_child(repo_path) def close(self): """Function called when we finish processing a repository. @@ -714,9 +736,6 @@ # copy exported path to reconstructed filesystem fullpath = os.path.join(self.rootpath, dest_fullpath) - # update from_disk model and store external paths - self.editor.external_paths[dest_fullpath] += 1 - if os.path.isfile(temp_path): if os.path.islink(fullpath): # remove destination file if it is a link @@ -752,37 +771,53 @@ self.directory[dest_fullpath] = from_disk.Directory.from_disk( path=fullpath ) - external_paths = set() - for root, dirs, files in os.walk(fullpath): - external_paths.update( - [ - os.path.join(root.replace(self.rootpath + b"/", b""), p) - for p in chain(dirs, files) - ] - ) - for external_path in external_paths: - self.editor.external_paths[external_path] += 1 + + # update set of external paths reachable from the directory + external_paths = set() + dest_path_part = dest_path.split(b"/") + for i in range(1, len(dest_path_part) + 1): + external_paths.add(b"/".join(dest_path_part[:i])) + + for root, dirs, files in os.walk(temp_path): + external_paths.update( + [ + os.path.join( + dest_path, + os.path.join(root, p).replace(temp_path, b"").strip(b"/"), + ) + for p in chain(dirs, files) + ] + ) + + self.dir_states[self.path].externals_paths.update(external_paths) + + for external_path in external_paths: + self.editor.external_paths[os.path.join(self.path, external_path)] += 1 # ensure hash update for the directory with externals set self.directory[self.path].update_hash(force=True) def remove_external_path( - self, external_path: bytes, remove_subpaths: bool = True, force: bool = False + self, + external_path: bytes, + remove_subpaths: bool = True, + force: bool = False, + root_path: Optional[bytes] = None, ) -> None: """Remove a previously exported SVN external path from the reconstructed filesystem. """ - fullpath = os.path.join(self.path, external_path) + path = root_path if root_path else self.path + fullpath = os.path.join(path, external_path) # decrement number of references for external path when we really remove it # (when remove_subpaths is False, we just cleanup the external path before # copying exported paths in it) - if fullpath in self.editor.external_paths and remove_subpaths: + if force or (fullpath in self.editor.external_paths and remove_subpaths): self.editor.external_paths[fullpath] -= 1 if ( - force - or fullpath in self.editor.external_paths + fullpath in self.editor.external_paths and self.editor.external_paths[fullpath] == 0 ): self.remove_child(fullpath) @@ -795,10 +830,10 @@ self.editor.external_paths.pop(path) if remove_subpaths: - subpath_split = external_path.split(b"/")[:-1] + subpath_split = fullpath.split(b"/")[:-1] for i in reversed(range(1, len(subpath_split) + 1)): # delete external sub-directory only if it is not versioned - subpath = os.path.join(self.path, b"/".join(subpath_split[0:i])) + subpath = b"/".join(subpath_split[0:i]) try: self.svnrepo.client.info( svn_urljoin(self.svnrepo.remote_url, os.fsdecode(subpath)), diff --git a/swh/loader/svn/tests/test_externals.py b/swh/loader/svn/tests/test_externals.py --- a/swh/loader/svn/tests/test_externals.py +++ b/swh/loader/svn/tests/test_externals.py @@ -1565,3 +1565,78 @@ type="svn", ) check_snapshot(loader.snapshot, loader.storage) + + +@pytest.mark.parametrize("remote_external_path", ["src/main/project", "src/main"]) +def test_loader_overlapping_external_paths_removal( + swh_storage, repo_url, external_repo_url, tmp_path, remote_external_path +): + add_commit( + external_repo_url, + "Create external repository layout", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="src/main/project/foo/bar", + data=b"bar", + ), + ], + ) + + add_commit( + repo_url, + "Create repository layout", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/src/main/project/", + ), + ], + ) + + add_commit( + repo_url, + "Add overlapping externals", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/src/main/", + properties={ + "svn:externals": f"{svn_urljoin(external_repo_url, remote_external_path)} project" # noqa + }, + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/src/main/project/", + properties={ + "svn:externals": f'{svn_urljoin(external_repo_url, "src/main/project/foo")} foo' # noqa + }, + ), + ], + ) + + add_commit( + repo_url, + "Remove directory with externals overlapping with those from ancestor directory", + [ + CommitChange( + change_type=CommitChangeType.Delete, + path="trunk/src/main/project/", + ), + ], + ) + + loader = SvnLoader( + swh_storage, + repo_url, + temp_directory=tmp_path, + check_revision=1, + ) + assert loader.load() == {"status": "eventful"} + assert_last_visit_matches( + loader.storage, + repo_url, + status="full", + type="svn", + ) + check_snapshot(loader.snapshot, loader.storage)