diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -140,7 +140,9 @@ return self.svnrepo.clean_fs() - def swh_revision_hash_tree_at_svn_revision(self, revision: int) -> bytes: + def swh_revision_hash_tree_at_svn_revision( + self, revision: int + ) -> from_disk.Directory: """Compute and return the hash tree at a given svn revision. Args: @@ -154,7 +156,7 @@ local_dirname, local_url = self.svnrepo.export_temporary(revision) root_dir = from_disk.Directory.from_disk(path=local_url) self.svnrepo.clean_fs(local_dirname) - return root_dir.hash + return root_dir def _latest_snapshot_revision( self, @@ -284,7 +286,9 @@ return revision_start, revision_end - def _check_revision_divergence(self, rev: int, dir_id: bytes) -> None: + def _check_revision_divergence( + self, rev: int, dir_id: bytes, dir: from_disk.Directory + ) -> None: """Check for hash revision computation divergence. The Rationale behind this is that svn can trigger unknown edge cases (mixed @@ -301,12 +305,37 @@ """ self.log.debug("Checking hash computations on revision %s...", rev) - checked_dir_id = self.swh_revision_hash_tree_at_svn_revision(rev) + checked_dir = self.swh_revision_hash_tree_at_svn_revision(rev) + checked_dir_id = checked_dir.hash + if checked_dir_id != dir_id: + # do not bother checking tree differences if root directory id of reconstructed + # repository filesystem does not match the id of the one from the last loaded + # revision (can happen when called from post_load and tree differences were checked + # before the last revision to load) + if self.debug and dir_id == dir.hash: + for obj in checked_dir.iter_tree(): + path = obj.data["path"].replace(checked_dir.data["path"], b"")[1:] + if not path: + # ignore root directory + continue + if path not in dir: + self.log.debug( + "%s with path %s is missing in reconstructed repository filesystem", + obj.object_type, # type: ignore + path, + ) + elif dir[path].hash != checked_dir[path].hash: + self.log.debug( + "%s with path %s has different hash in reconstructed repository filesystem", # noqa + obj.object_type, # type: ignore + path, + ) err = ( - "Hash tree computation divergence detected " + "Hash tree computation divergence detected at revision %s " "(%s != %s), stopping!" % ( + rev, hashutil.hash_to_hex(dir_id), hashutil.hash_to_hex(checked_dir_id), ) @@ -360,7 +389,7 @@ and self.check_revision != 0 and count % self.check_revision == 0 ): - self._check_revision_divergence(rev, dir_id) + self._check_revision_divergence(rev, dir_id, root_directory) parents = (swh_revision.id,) @@ -535,6 +564,7 @@ self._check_revision_divergence( int(dict(self._last_revision.extra_headers)[b"svn_revision"]), self._last_revision.directory, + self.svnrepo.swhreplay.directory, ) def _create_tmp_dir(self, root_tmp_dir: str) -> str: diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging import os import shutil import subprocess @@ -25,7 +26,7 @@ get_stats, prepare_repository_from_archive, ) -from swh.model.from_disk import DentryPerms +from swh.model.from_disk import DentryPerms, Directory from swh.model.hashutil import hash_to_bytes from swh.model.model import Snapshot, SnapshotBranch, TargetType @@ -2286,3 +2287,65 @@ type="svn", ) check_snapshot(loader.snapshot, loader.storage) + + +def test_loader_check_tree_divergence(swh_storage, repo_url, tmp_path, caplog): + # create sample repository + add_commit( + repo_url, + "Create trunk/data folder", + [ + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/foo", + data=b"foo", + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/bar", + data=b"bar", + ), + CommitChange( + change_type=CommitChangeType.AddOrUpdate, + path="trunk/data/baz/", + ), + ], + ) + + # load it + loader = SvnLoader( + swh_storage, + repo_url, + temp_directory=tmp_path, + debug=True, + check_revision=1, + ) + assert loader.load() == {"status": "eventful"} + + # export it to a temporary directory + export_path, _ = loader.svnrepo.export_temporary(revision=1) + export_path = os.path.join(export_path, repo_url.split("/")[-1]) + + # modify some file content in the export and remove a path + with open(os.path.join(export_path, "trunk/data/foo"), "wb") as f: + f.write(b"baz") + shutil.rmtree(os.path.join(export_path, "trunk/data/baz/")) + + # create directory model from the modified export + export_dir = Directory.from_disk(path=export_path.encode()) + + # ensure debug logs + caplog.set_level(logging.DEBUG) + + # check exported tree and repository tree are diverging + with pytest.raises(ValueError): + loader._check_revision_divergence(1, export_dir.hash, export_dir) + + # check diverging paths have been detected and logged + for debug_log in ( + "directory with path b'trunk' has different hash in reconstructed repository filesystem", # noqa + "directory with path b'trunk/data' has different hash in reconstructed repository filesystem", # noqa + "content with path b'trunk/data/foo' has different hash in reconstructed repository filesystem", # noqa + "directory with path b'trunk/data/baz' is missing in reconstructed repository filesystem", # noqa + ): + assert debug_log in caplog.text