diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -121,6 +121,7 @@
         # state from previous visit
         self.latest_snapshot = None
         self.latest_revision: Optional[Revision] = None
+        self.from_dump = False
 
     def pre_cleanup(self):
         """Cleanup potential dangling files from prior runs (e.g. OOM killed
@@ -160,9 +161,9 @@
         """
         assert self.svnrepo is not None
         local_dirname, local_url = self.svnrepo.export_temporary(revision)
-        h = from_disk.Directory.from_disk(path=local_url).hash
+        root_dir = from_disk.Directory.from_disk(path=local_url)
         self.svnrepo.clean_fs(local_dirname)
-        return h
+        return root_dir.hash
 
     def _latest_snapshot_revision(
         self, origin_url: str,
@@ -392,7 +393,11 @@
 
         try:
             self.svnrepo = SvnRepo(
-                self.svn_url, self.origin_url, local_dirname, self.max_content_size
+                self.svn_url,
+                self.origin_url,
+                local_dirname,
+                self.max_content_size,
+                self.from_dump,
             )
         except SubversionException as e:
             error_msgs = [
@@ -577,6 +582,7 @@
         self.archive_path = archive_path
         self.temp_dir = None
         self.repo_path = None
+        self.from_dump = True
 
     def prepare(self):
         self.log.info("Archive to mount and load %s", self.archive_path)
@@ -630,6 +636,7 @@
             check_revision=check_revision,
             max_content_size=max_content_size,
         )
+        self.from_dump = True
         self.temp_dir = self._create_tmp_dir(self.temp_directory)
         self.repo_path = None
         self.truncated_dump = False
diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -53,9 +53,11 @@
         origin_url: str,
         local_dirname: str,
         max_content_length: int,
+        from_dump: bool = False,
     ):
         self.remote_url = remote_url.rstrip("/")
         self.origin_url = origin_url
+        self.from_dump = from_dump
 
         auth = Auth([get_username_provider()])
         # one connection for log iteration
@@ -81,6 +83,12 @@
         self.has_recursive_externals = False
         self.replay_started = False
 
+        # compute root directory path from the remote repository URL, required to
+        # properly load the sub-tree of a repository mounted from a dump file
+        info = self.client.info(origin_url.rstrip("/"))
+        repos_root_url = next(iter(info.values())).repos_root_url
+        self.root_directory = origin_url.replace(repos_root_url, "", 1)
+
     def __str__(self):
         return str(
             {
@@ -157,11 +165,21 @@
             revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE)
         )
 
+        has_changes = (
+            not self.from_dump
+            or changed_paths is not None
+            and any(
+                changed_path.startswith(self.root_directory)
+                for changed_path in changed_paths.keys()
+            )
+        )
+
         return {
             "rev": rev,
             "author_date": author_date,
             "author_name": author,
             "message": message,
+            "has_changes": has_changes,
         }
 
     def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]:
@@ -191,7 +209,7 @@
             paths=None,
             start=revision_start,
             end=revision_end,
-            discover_changed_paths=False,
+            discover_changed_paths=self.from_dump,
         ):
             yield self.__to_entry(log_entry)
 
@@ -293,6 +311,14 @@
                 pass
             else:
                 raise
+
+        if self.from_dump:
+            # when exporting a subpath of a subversion repository mounted from
+            # a dump file generated by svnrdump, exported paths are relative to
+            # the repository root path while they are relative to the subpath
+            # otherwise, so we need to adjust the URL of the exported filesystem
+            local_url = os.path.join(local_url, self.root_directory.strip("/"))
+
         return local_dirname, os.fsencode(local_url)
 
     def swh_hash_data_per_revision(
@@ -335,7 +361,10 @@
             if rev >= start_revision:
                 # start yielding new data to archive once we reached the revision to
                 # resume the loading from
-                yield rev, commit, objects, self.swhreplay.directory
+                if commit["has_changes"] or start_revision == 0:
+                    # yield data only if commit has changes or if repository is empty
+                    root_dir = self.swhreplay.directory[self.root_directory.encode()]
+                    yield rev, commit, objects, root_dir
 
     def swh_hash_data_at_revision(
         self, revision: int
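(Reviewer note, not part of the patch: a minimal sketch of what the root_directory
computation introduced above produces. The URLs below are hypothetical examples;
client.info() stands for the subvertpy call made in SvnRepo.__init__.)

    # Hypothetical values, for illustration only.
    repos_root_url = "svn://example.org/repo"        # as reported by client.info()
    origin_url = "svn://example.org/repo/project1"   # origin pointing at a subproject
    root_directory = origin_url.replace(repos_root_url, "", 1)
    assert root_directory == "/project1"
    # When the origin URL is the repository root itself, root_directory is the
    # empty string, so the startswith() filter in __to_entry matches every
    # changed path and the export path adjustment is effectively a no-op.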
diff --git a/swh/loader/svn/tests/test_externals.py b/swh/loader/svn/tests/test_externals.py
--- a/swh/loader/svn/tests/test_externals.py
+++ b/swh/loader/svn/tests/test_externals.py
@@ -1175,6 +1175,11 @@
     mock_client = mocker.MagicMock()
     mocker.patch.object(client, "Client", mock_client)
 
+    class Info:
+        repos_root_url = repo_url
+
+    mock_client().info.return_value = {"repo": Info()}
+
     loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path)
     loader.load()
 
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -709,9 +709,10 @@
     assert not os.path.exists(loader.temp_dir)
 
 
-def test_svn_loader_from_remote_dump(swh_storage, datadir, tmp_path):
+def test_svn_loader_from_remote_dump(swh_storage, datadir, tmpdir_factory):
     archive_name = "pkg-gourmet"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    tmp_path = tmpdir_factory.mktemp("repo1")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
 
     loaderFromDump = SvnLoaderFromRemoteDump(
@@ -726,7 +727,10 @@
         snapshot=GOURMET_SNAPSHOT.id,
     )
 
-    origin_url = repo_url + "2"  # rename to another origin
+    # rename to another origin
+    tmp_path = tmpdir_factory.mktemp("repo2")
+    origin_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
     loader = SvnLoader(
         swh_storage, repo_url, origin_url=origin_url, temp_directory=tmp_path
     )
@@ -911,7 +915,6 @@
     archive_name = "pkg-gourmet"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
-    origin_url = f"svn://{archive_name}"
 
     dump_filename = f"{archive_name}.dump"
     with open(os.path.join(tmp_path, dump_filename), "wb") as dump_file:
@@ -922,7 +925,7 @@
     # load svn repo from that compressed dump file
     loader = SvnLoaderFromDumpArchive(
         swh_storage,
-        url=origin_url,
+        url=repo_url,
         archive_path=os.path.join(tmp_path, f"{dump_filename}.gz"),
         temp_directory=tmp_path,
     )
@@ -931,7 +934,7 @@
 
     assert_last_visit_matches(
         loader.storage,
-        origin_url,
+        repo_url,
         status="full",
         type="svn",
         snapshot=GOURMET_SNAPSHOT.id,
@@ -1778,3 +1781,99 @@
         loader.storage, repo_url, status="full", type="svn",
     )
     check_snapshot(loader.snapshot, loader.storage)
+
+
+@pytest.mark.parametrize(
+    "svn_loader_cls", [SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump]
+)
+def test_loader_with_subprojects(swh_storage, repo_url, tmp_path, svn_loader_cls):
+
+    # first commit
+    add_commit(
+        repo_url,
+        "Add first project in repository",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="project1/foo.sh",
+                data=b"#!/bin/bash\necho foo",
+            ),
+        ],
+    )
+
+    # second commit
+    add_commit(
+        repo_url,
+        "Add second project in repository",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="project2/bar.sh",
+                data=b"#!/bin/bash\necho bar",
+            ),
+        ],
+    )
+
+    # third commit
+    add_commit(
+        repo_url,
+        "Add third project in repository",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="project3/baz.sh",
+                data=b"#!/bin/bash\necho baz",
+            ),
+        ],
+    )
+
+    def dump_project(origin_url):
+        svnrdump_cmd = ["svnrdump", "dump", origin_url]
+        dump_path = f"{tmp_path}/repo.dump"
+        with open(dump_path, "wb") as dump_file:
+            subprocess.run(svnrdump_cmd, stdout=dump_file)
+        subprocess.run(["gzip", dump_path])
+        return dump_path + ".gz"
+
+    for i in range(1, 4):
+        # load each project in the repository separately
+        origin_url = f"{repo_url}/project{i}"
+
+        loader_params = {
+            "storage": swh_storage,
+            "url": origin_url,
+            "origin_url": origin_url,
+            "temp_directory": tmp_path,
+            "incremental": True,
+            "check_revision": 1,
+        }
+
+        if svn_loader_cls == SvnLoaderFromDumpArchive:
+            loader_params["archive_path"] = dump_project(origin_url)
+
+        loader = svn_loader_cls(**loader_params)
+
+        assert loader.load() == {"status": "eventful"}
+        assert_last_visit_matches(
+            loader.storage, origin_url, status="full", type="svn",
+        )
+        check_snapshot(loader.snapshot, loader.storage)
+
+        if svn_loader_cls == SvnLoaderFromDumpArchive:
+            loader_params["archive_path"] = dump_project(origin_url)
+
+        loader = svn_loader_cls(**loader_params)
+
+        assert loader.load() == {"status": "uneventful"}
+
+        # each project origin must have its expected objects archived
+        assert get_stats(loader.storage) == {
+            "content": i,  # one content per project
+            "directory": 2 * i,  # two directories per project
+            "origin": i,
+            "origin_visit": 2 * i,  # two visits per origin
+            "release": 0,
+            "revision": i,  # one revision per project
+            "skipped_content": 0,
+            "snapshot": i,  # one snapshot per project
+        }
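(Reviewer note, not part of the patch: a minimal usage sketch of what this change
enables, mirroring the loader invocation used in the tests above. The origin URL
is a hypothetical example and swh_storage stands for any configured storage
instance.)

    from swh.loader.svn.loader import SvnLoaderFromRemoteDump

    # Per the patch, paths in an svnrdump-generated dump of a subproject URL are
    # relative to the repository root, so the loader tracks root_directory in
    # order to load only the project1 sub-tree once the dump is mounted.
    loader = SvnLoaderFromRemoteDump(
        swh_storage, "svn://example.org/repo/project1", temp_directory="/tmp"
    )
    assert loader.load() == {"status": "eventful"}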