diff --git a/swh/loader/svn/replay.py b/swh/loader/svn/replay.py
--- a/swh/loader/svn/replay.py
+++ b/swh/loader/svn/replay.py
@@ -42,7 +42,11 @@
 if TYPE_CHECKING:
     from swh.loader.svn.svn import SvnRepo
 
-from swh.loader.svn.utils import parse_external_definition, svn_urljoin
+from swh.loader.svn.utils import (
+    is_recursive_external,
+    parse_external_definition,
+    svn_urljoin,
+)
 
 _eol_style = {"native": b"\n", "CRLF": b"\r\n", "LF": b"\n", "CR": b"\r"}
 
@@ -587,6 +591,12 @@
                 # external already exported, nothing to do
                 continue
 
+            if is_recursive_external(
+                self.svnrepo.origin_url, os.fsdecode(self.path), path, external_url
+            ):
+                # recursive external, skip it
+                continue
+
             if external not in self.editor.externals_cache:
 
                 try:
@@ -675,6 +685,21 @@
             [relative_url for (_, relative_url) in self.editor.valid_externals.values()]
         )
 
+        self.svnrepo.has_recursive_externals = any(
+            is_recursive_external(
+                self.svnrepo.origin_url, os.fsdecode(path), external_path, external_url
+            )
+            for path, dir_state in self.dir_states.items()
+            for external_path, (external_url, _, _) in dir_state.externals.items()
+        )
+        if self.svnrepo.has_recursive_externals:
+            # If the repository has recursive externals, we stop processing externals
+            # and remove those already exported.
+            # We will then ignore externals when exporting the revision to check for
+            # divergence with the reconstructed filesystem.
+            for external_path in list(self.editor.external_paths):
+                self.remove_external_path(external_path)
+
     def remove_external_path(self, external_path: bytes) -> None:
         """Remove a previously exported SVN external path from the
         reconstruted filesystem.
diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -28,7 +28,7 @@
 )
 
 from . import converters, replay
-from .utils import parse_external_definition
+from .utils import is_recursive_external, parse_external_definition
 
 # When log message contains empty data
 DEFAULT_AUTHOR_MESSAGE = ""
@@ -78,6 +78,7 @@
         )
         self.max_content_length = max_content_length
         self.has_relative_externals = False
+        self.has_recursive_externals = False
         self.replay_started = False
 
     def __str__(self):
@@ -221,10 +222,11 @@
         if self.replay_started and self.has_relative_externals:
             # externals detected while replaying revisions
             url = self.origin_url
-        elif not self.replay_started and self.remote_url.startswith("file://"):
+        elif not self.replay_started:
             # revisions replay has not started, we need to check if svn:externals
             # properties are set from a checkout of the revision and if some
-            # external URLs are relative to pick the right export URL
+            # external URLs are relative to pick the right export URL;
+            # recursive externals are also checked
             with tempfile.TemporaryDirectory(
                 dir=self.local_dirname, prefix=f"checkout-revision-{revision}."
             ) as co_dirname:
@@ -236,23 +238,42 @@
                     "svn:externals", co_dirname, None, revision, True
                 )
                 self.has_relative_externals = False
+                self.has_recursive_externals = False
                 for path, external_defs in externals.items():
-                    if self.has_relative_externals:
+                    if self.has_relative_externals or self.has_recursive_externals:
                         break
+                    path = path.replace(self.remote_url.rstrip("/") + "/", "")
                     for external_def in os.fsdecode(external_defs).split("\n"):
                         # skip empty line or comment
                         if not external_def or external_def.startswith("#"):
                             continue
-                        _, _, _, relative_url = parse_external_definition(
+                        (
+                            external_path,
+                            external_url,
+                            _,
+                            relative_url,
+                        ) = parse_external_definition(
                             external_def.rstrip("\r"), path, self.origin_url
                         )
+
+                        if is_recursive_external(
+                            self.origin_url, path, external_path, external_url,
+                        ):
+                            self.has_recursive_externals = True
+                            url = self.remote_url
+                            break
+
                         if relative_url:
                             self.has_relative_externals = True
                             url = self.origin_url
                             break
 
         self.client.export(
-            url.rstrip("/"), to=local_url, rev=revision, ignore_keywords=True,
+            url.rstrip("/"),
+            to=local_url,
+            rev=revision,
+            ignore_keywords=True,
+            ignore_externals=self.has_recursive_externals,
         )
 
         return local_dirname, os.fsencode(local_url)
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -2734,3 +2734,99 @@
         loader.storage, repo_url, status="full", type="svn",
     )
     check_snapshot(loader.snapshot, loader.storage)
+
+
+def test_loader_with_recursive_external(
+    swh_storage, repo_url, external_repo_url, tmp_path
+):
+    # first commit on external
+    add_commit(
+        external_repo_url,
+        "Create a file in an external repository",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="code/foo.sh",
+                data=b"#!/bin/bash\necho foo",
+            ),
+        ],
+    )
+
+    # first commit
+    add_commit(
+        repo_url,
+        "Add trunk dir and a file",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="trunk/bar.sh",
+                data=b"#!/bin/bash\necho bar",
+            )
+        ],
+    )
+
+    # second commit
+    add_commit(
+        repo_url,
+        "Set externals code on trunk/externals dir, one being recursive",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="trunk/externals/",
+                properties={
+                    "svn:externals": (
+                        f"{svn_urljoin(external_repo_url, 'code')} code\n"
+                        f"{repo_url} recursive"
+                    )
+                },
+            ),
+        ],
+    )
+
+    # first load
+    loader = SvnLoader(
+        swh_storage, repo_url, temp_directory=tmp_path, check_revision=1,
+    )
+    assert loader.load() == {"status": "eventful"}
+    assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="svn",
+    )
+    check_snapshot(loader.snapshot, loader.storage)
+    assert loader.svnrepo.has_recursive_externals
+
+    # second load on stale repo
+    loader = SvnLoader(
+        swh_storage, repo_url, temp_directory=tmp_path, check_revision=1,
+    )
+    assert loader.load() == {"status": "uneventful"}
+    assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="svn",
+    )
+    check_snapshot(loader.snapshot, loader.storage)
+    assert loader.svnrepo.has_recursive_externals
+
+    # third commit
+    add_commit(
+        repo_url,
+        "Remove recursive external on trunk/externals dir",
+        [
+            CommitChange(
+                change_type=CommitChangeType.AddOrUpdate,
+                path="trunk/externals/",
+                properties={
+                    "svn:externals": (f"{svn_urljoin(external_repo_url, 'code')} code")
+                },
+            ),
+        ],
+    )
+
+    # third load
+    loader = SvnLoader(
+        swh_storage, repo_url, temp_directory=tmp_path, check_revision=1,
+    )
+    assert loader.load() == {"status": "eventful"}
loader.load() == {"status": "eventful"} + assert_last_visit_matches( + loader.storage, repo_url, status="full", type="svn", + ) + check_snapshot(loader.snapshot, loader.storage) + assert not loader.svnrepo.has_recursive_externals diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py --- a/swh/loader/svn/utils.py +++ b/swh/loader/svn/utils.py @@ -11,7 +11,7 @@ from subprocess import PIPE, Popen, call import tempfile from typing import Optional, Tuple -from urllib.parse import urlparse +from urllib.parse import quote, urlparse, urlunparse logger = logging.getLogger(__name__) @@ -292,3 +292,29 @@ # handle URL like http://user@svn.example.org/ pass return (path.rstrip("/"), external_url, revision, relative_url) + + +def is_recursive_external( + origin_url: str, dir_path: str, external_path: str, external_url: str +) -> bool: + """ + Check if an external definition can lead to a recursive subversion export + operation (https://issues.apache.org/jira/browse/SVN-1703). + + Args: + origin_url: repository URL + dir_path: path of the directory where external is defined + external_path: path of the external relative to the directory + external_url: external URL + + Returns: + Whether the external definition is recursive + """ + parsed_origin_url = urlparse(origin_url) + parsed_external_url = urlparse(external_url) + external_url = urlunparse( + parsed_external_url._replace(scheme=parsed_origin_url.scheme) + ) + return svn_urljoin(origin_url, quote(dir_path), quote(external_path)).startswith( + external_url + )