D7163.diff
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -121,6 +121,7 @@
# state from previous visit
self.latest_snapshot = None
self.latest_revision: Optional[Revision] = None
+ self.from_dump = False
def pre_cleanup(self):
"""Cleanup potential dangling files from prior runs (e.g. OOM killed
@@ -160,9 +161,9 @@
"""
assert self.svnrepo is not None
local_dirname, local_url = self.svnrepo.export_temporary(revision)
- h = from_disk.Directory.from_disk(path=local_url).hash
+ root_dir = from_disk.Directory.from_disk(path=local_url)
self.svnrepo.clean_fs(local_dirname)
- return h
+ return root_dir.hash
def _latest_snapshot_revision(
self, origin_url: str,
@@ -392,7 +393,11 @@
try:
self.svnrepo = SvnRepo(
- self.svn_url, self.origin_url, local_dirname, self.max_content_size
+ self.svn_url,
+ self.origin_url,
+ local_dirname,
+ self.max_content_size,
+ self.from_dump,
)
except SubversionException as e:
error_msgs = [
@@ -577,6 +582,7 @@
self.archive_path = archive_path
self.temp_dir = None
self.repo_path = None
+ self.from_dump = True
def prepare(self):
self.log.info("Archive to mount and load %s", self.archive_path)
@@ -630,6 +636,7 @@
check_revision=check_revision,
max_content_size=max_content_size,
)
+ self.from_dump = True
self.temp_dir = self._create_tmp_dir(self.temp_directory)
self.repo_path = None
self.truncated_dump = False
diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -53,9 +53,11 @@
origin_url: str,
local_dirname: str,
max_content_length: int,
+ from_dump: bool = False,
):
self.remote_url = remote_url.rstrip("/")
self.origin_url = origin_url
+ self.from_dump = from_dump
auth = Auth([get_username_provider()])
# one connection for log iteration
@@ -81,6 +83,12 @@
self.has_recursive_externals = False
self.replay_started = False
+ # compute root directory path from the remote repository URL, required to
+ # properly load the sub-tree of a repository mounted from a dump file
+ info = self.client.info(origin_url.rstrip("/"))
+ repos_root_url = next(iter(info.values())).repos_root_url
+ self.root_directory = origin_url.replace(repos_root_url, "", 1)
+
def __str__(self):
return str(
{
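
A minimal sketch of the root directory computation added above, with hypothetical URLs; repos_root_url is read from the client.info() result, and the origin may point below the repository root:

    # hypothetical values; in SvnRepo.__init__, repos_root_url comes
    # from next(iter(info.values())).repos_root_url
    repos_root_url = "file:///tmp/repo"
    origin_url = "file:///tmp/repo/project1"
    root_directory = origin_url.replace(repos_root_url, "", 1)
    assert root_directory == "/project1"
    # when the origin is the repository root itself, root_directory == ""
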
@@ -157,11 +165,21 @@
revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE)
)
+ has_changes = (
+ not self.from_dump
+ or changed_paths is not None
+ and any(
+ changed_path.startswith(self.root_directory)
+ for changed_path in changed_paths.keys()
+ )
+ )
+
return {
"rev": rev,
"author_date": author_date,
"author_name": author,
"message": message,
+ "has_changes": has_changes,
}
def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]:
@@ -191,7 +209,7 @@
paths=None,
start=revision_start,
end=revision_end,
- discover_changed_paths=False,
+ discover_changed_paths=self.from_dump,
):
yield self.__to_entry(log_entry)
@@ -293,6 +311,14 @@
pass
else:
raise
+
+ if self.from_dump:
+ # when exporting a subpath of a subversion repository mounted from
+ # a dump file generated by svnrdump, exported paths are relative to
+ # the repository root path, whereas they are relative to the subpath
+ # otherwise, so the URL of the exported filesystem must be adjusted
+ local_url = os.path.join(local_url, self.root_directory.strip("/"))
+
return local_dirname, os.fsencode(local_url)
def swh_hash_data_per_revision(
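
A short sketch of the export path adjustment above, assuming a dump-based load of a /project1 subpath (paths are hypothetical):

    import os

    local_url = "/tmp/export/pkg-gourmet"  # assumed export destination
    root_directory = "/project1"           # as computed in SvnRepo.__init__
    local_url = os.path.join(local_url, root_directory.strip("/"))
    assert local_url == "/tmp/export/pkg-gourmet/project1"
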
@@ -335,7 +361,10 @@
if rev >= start_revision:
# start yielding new data to archive once we reached the revision to
# resume the loading from
- yield rev, commit, objects, self.swhreplay.directory
+ if commit["has_changes"] or start_revision == 0:
+ # yield data only if the commit has changes or if no revision was loaded yet
+ root_dir = self.swhreplay.directory[self.root_directory.encode()]
+ yield rev, commit, objects, root_dir
def swh_hash_data_at_revision(
self, revision: int
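
The yield above selects the sub-tree matching the loaded origin by indexing the replayed from_disk.Directory with a byte path; for a non-dump load, root_directory is "", which resolves to the whole tree, as the unconditional lookup implies. A hedged sketch with a hypothetical on-disk layout standing in for the replayed tree:

    import os
    import tempfile

    from swh.model import from_disk

    with tempfile.TemporaryDirectory() as tmp:
        os.makedirs(os.path.join(tmp, "project1"))
        root = from_disk.Directory.from_disk(path=tmp.encode())
        root_directory = "/project1"  # as computed in SvnRepo.__init__
        sub_tree = root[root_directory.encode()]  # byte-path item access
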
diff --git a/swh/loader/svn/tests/test_externals.py b/swh/loader/svn/tests/test_externals.py
--- a/swh/loader/svn/tests/test_externals.py
+++ b/swh/loader/svn/tests/test_externals.py
@@ -1175,6 +1175,11 @@
mock_client = mocker.MagicMock()
mocker.patch.object(client, "Client", mock_client)
+ class Info:
+ repos_root_url = repo_url
+
+ mock_client().info.return_value = {"repo": Info()}
+
loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path)
loader.load()
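
The stub above satisfies the client.info() call that SvnRepo.__init__ now performs; a sketch of the return shape the production code expects, with a hypothetical URL:

    # SvnRepo.__init__ reads: next(iter(info.values())).repos_root_url
    class Info:
        repos_root_url = "file:///tmp/repo"  # assumed repository root

    info = {"repo": Info()}
    assert next(iter(info.values())).repos_root_url == "file:///tmp/repo"
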
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -709,9 +709,10 @@
assert not os.path.exists(loader.temp_dir)
-def test_svn_loader_from_remote_dump(swh_storage, datadir, tmp_path):
+def test_svn_loader_from_remote_dump(swh_storage, datadir, tmpdir_factory):
archive_name = "pkg-gourmet"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+ tmp_path = tmpdir_factory.mktemp("repo1")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
loaderFromDump = SvnLoaderFromRemoteDump(
@@ -726,7 +727,10 @@
snapshot=GOURMET_SNAPSHOT.id,
)
- origin_url = repo_url + "2" # rename to another origin
+ # prepare a second copy of the repository to use as another origin
+ tmp_path = tmpdir_factory.mktemp("repo2")
+ origin_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
loader = SvnLoader(
swh_storage, repo_url, origin_url=origin_url, temp_directory=tmp_path
)
@@ -911,7 +915,6 @@
archive_name = "pkg-gourmet"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
- origin_url = f"svn://{archive_name}"
dump_filename = f"{archive_name}.dump"
with open(os.path.join(tmp_path, dump_filename), "wb") as dump_file:
@@ -922,7 +925,7 @@
# load svn repo from that compressed dump file
loader = SvnLoaderFromDumpArchive(
swh_storage,
- url=origin_url,
+ url=repo_url,
archive_path=os.path.join(tmp_path, f"{dump_filename}.gz"),
temp_directory=tmp_path,
)
@@ -931,7 +934,7 @@
assert_last_visit_matches(
loader.storage,
- origin_url,
+ repo_url,
status="full",
type="svn",
snapshot=GOURMET_SNAPSHOT.id,
@@ -1778,3 +1781,99 @@
loader.storage, repo_url, status="full", type="svn",
)
check_snapshot(loader.snapshot, loader.storage)
+
+
+@pytest.mark.parametrize(
+ "svn_loader_cls", [SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump]
+)
+def test_loader_with_subprojects(swh_storage, repo_url, tmp_path, svn_loader_cls):
+
+ # first commit
+ add_commit(
+ repo_url,
+ "Add first project in repository",
+ [
+ CommitChange(
+ change_type=CommitChangeType.AddOrUpdate,
+ path="project1/foo.sh",
+ data=b"#!/bin/bash\necho foo",
+ ),
+ ],
+ )
+
+ # second commit
+ add_commit(
+ repo_url,
+ "Add second project in repository",
+ [
+ CommitChange(
+ change_type=CommitChangeType.AddOrUpdate,
+ path="project2/bar.sh",
+ data=b"#!/bin/bash\necho bar",
+ ),
+ ],
+ )
+
+ # third commit
+ add_commit(
+ repo_url,
+ "Add third project in repository",
+ [
+ CommitChange(
+ change_type=CommitChangeType.AddOrUpdate,
+ path="project3/baz.sh",
+ data=b"#!/bin/bash\necho baz",
+ ),
+ ],
+ )
+
+ def dump_project(origin_url):
+ svnrdump_cmd = ["svnrdump", "dump", origin_url]
+ dump_path = f"{tmp_path}/repo.dump"
+ with open(dump_path, "wb") as dump_file:
+ subprocess.run(svnrdump_cmd, stdout=dump_file)
+ subprocess.run(["gzip", dump_path])
+ return dump_path + ".gz"
+
+ for i in range(1, 4):
+ # load each project in the repository separately
+ origin_url = f"{repo_url}/project{i}"
+
+ loader_params = {
+ "storage": swh_storage,
+ "url": origin_url,
+ "origin_url": origin_url,
+ "temp_directory": tmp_path,
+ "incremental": True,
+ "check_revision": 1,
+ }
+
+ if svn_loader_cls == SvnLoaderFromDumpArchive:
+ loader_params["archive_path"] = dump_project(origin_url)
+
+ loader = svn_loader_cls(**loader_params)
+
+ assert loader.load() == {"status": "eventful"}
+ assert_last_visit_matches(
+ loader.storage, origin_url, status="full", type="svn",
+ )
+ check_snapshot(loader.snapshot, loader.storage)
+
+ if svn_loader_cls == SvnLoaderFromDumpArchive:
+ loader_params["archive_path"] = dump_project(origin_url)
+
+ loader = svn_loader_cls(**loader_params)
+
+ assert loader.load() == {"status": "uneventful"}
+
+ # check the expected objects have been archived for each project origin
+ assert get_stats(loader.storage) == {
+ "content": i, # one content
+ "directory": 2 * i, # two directories
+ "origin": i,
+ "origin_visit": 2 * i, # two visits
+ "release": 0,
+ "revision": i, # one revision
+ "skipped_content": 0,
+ "snapshot": i, # one snapshot
+ }
