Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/tests/test_loader.py
# Copyright (C) 2016-2022 The Software Heritage developers | # Copyright (C) 2016-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | |||||
import os | import os | ||||
import shutil | import shutil | ||||
import subprocess | import subprocess | ||||
import textwrap | import textwrap | ||||
from typing import Any, Dict | from typing import Any, Dict | ||||
import pytest | import pytest | ||||
from subvertpy import SubversionException | from subvertpy import SubversionException | ||||
from swh.loader.svn.loader import ( | from swh.loader.svn.loader import ( | ||||
SvnLoader, | SvnLoader, | ||||
SvnLoaderFromDumpArchive, | SvnLoaderFromDumpArchive, | ||||
SvnLoaderFromRemoteDump, | SvnLoaderFromRemoteDump, | ||||
) | ) | ||||
from swh.loader.svn.svn import SvnRepo | from swh.loader.svn.svn import SvnRepo | ||||
from swh.loader.svn.utils import init_svn_repo_from_dump | from swh.loader.svn.utils import init_svn_repo_from_dump | ||||
from swh.loader.tests import ( | from swh.loader.tests import ( | ||||
assert_last_visit_matches, | assert_last_visit_matches, | ||||
check_snapshot, | check_snapshot, | ||||
get_stats, | get_stats, | ||||
prepare_repository_from_archive, | prepare_repository_from_archive, | ||||
) | ) | ||||
from swh.model.from_disk import DentryPerms | from swh.model.from_disk import DentryPerms, Directory | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.model import Snapshot, SnapshotBranch, TargetType | from swh.model.model import Snapshot, SnapshotBranch, TargetType | ||||
from .utils import CommitChange, CommitChangeType, add_commit | from .utils import CommitChange, CommitChangeType, add_commit | ||||
GOURMET_SNAPSHOT = Snapshot( | GOURMET_SNAPSHOT = Snapshot( | ||||
id=hash_to_bytes("889cacc2731e3312abfb2b1a0c18ade82a949e07"), | id=hash_to_bytes("889cacc2731e3312abfb2b1a0c18ade82a949e07"), | ||||
branches={ | branches={ | ||||
▲ Show 20 Lines • Show All 2,244 Lines • ▼ Show 20 Lines | ): | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, | loader.storage, | ||||
repo_url, | repo_url, | ||||
status="full", | status="full", | ||||
type="svn", | type="svn", | ||||
) | ) | ||||
check_snapshot(loader.snapshot, loader.storage) | check_snapshot(loader.snapshot, loader.storage) | ||||
def test_loader_check_tree_divergence(swh_storage, repo_url, tmp_path, caplog):
    """Tamper with an export of revision 1 and expect a divergence error.

    The repository is loaded once, revision 1 is exported to disk, the export
    is altered (one file rewritten, one directory removed), and the loader's
    revision divergence check is then expected to raise ``ValueError`` when
    given the directory model built from the altered export.
    """
    # Single commit creating two files and one directory under trunk/data.
    initial_changes = [
        CommitChange(
            change_type=CommitChangeType.AddOrUpdate,
            path="trunk/data/foo",
            data=b"foo",
        ),
        CommitChange(
            change_type=CommitChangeType.AddOrUpdate,
            path="trunk/data/bar",
            data=b"bar",
        ),
        CommitChange(
            change_type=CommitChangeType.AddOrUpdate,
            path="trunk/data/baz/",
        ),
    ]
    add_commit(repo_url, "Create trunk/data folder", initial_changes)

    # Load the repository with the divergence check enabled.
    loader = SvnLoader(
        swh_storage,
        repo_url,
        temp_directory=tmp_path,
        debug=True,
        check_revision=1,
    )
    load_status = loader.load()
    assert load_status == {"status": "eventful"}

    # Export revision 1; the export lives in a sub-directory named after the
    # last component of the repository URL.
    export_root, _ = loader.svnrepo.export_temporary(revision=1)
    repo_name = repo_url.split("/")[-1]
    export_path = os.path.join(export_root, repo_name)

    # Make the export differ from the repository: rewrite one file's content
    # and remove one directory.
    tampered_file = os.path.join(export_path, "trunk/data/foo")
    with open(tampered_file, "wb") as tampered:
        tampered.write(b"baz")
    shutil.rmtree(os.path.join(export_path, "trunk/data/baz/"))

    # Build the directory model from the modified export.
    tampered_dir = Directory.from_disk(path=export_path.encode())

    # DEBUG level must be captured before the check emits its messages.
    caplog.set_level(logging.DEBUG)

    # The exported tree and the repository tree must be reported as diverging.
    with pytest.raises(ValueError):
        loader._check_revision_divergence(1, tampered_dir.hash, tampered_dir)
# check diverging paths have been detected and logged | |||||
for debug_log in ( | |||||
"directory with path b'trunk' has different hash in reconstructed repository filesystem", # noqa | |||||
ardumont: Those are the top-level directory which detects the divergence too due to the way we compute… | |||||
Done Inline ActionsI thought about it too, we should only check hash difference for contents indeed. anlambert: I thought about it too, we should only check hash difference for contents indeed. | |||||
Not Done Inline Actions It's not blocking (if it's a bit hard-ish to do immediately), you can always land this now and iterate on this in another diff (as you wish heh ;) ardumont: It's not blocking (if it's a bit hard-ish to do immediately), you can always land this now and… | |||||
Done Inline ActionsIn fact, we need to keep those checks as it can happen that the directory model from the replay module missed some hash updates when copying files/directories, see below a bug that I am tracking currently: DEBUG:swh.loader.svn.loader.SvnLoader:rev: 10364, swhrev: 8538c37dccb886ca848151787174d022993485c7, dir: 816e429fd284c5775b97c95ab723e302c44c6f55 DEBUG:swh.loader.svn.loader.SvnLoader:Checking hash computations on revision 10364... DEBUG:swh.loader.svn.svn:svn export -r 10364 --depth infinity --ignore-keywords file:///home/anlambert/tmp/codeblocks_repo /tmp/swh.loader.svn.gji1ywo9-3153480/check-revision-10364.prxv4dgw/codeblocks_repo DEBUG:swh.loader.svn.svn:cleanup /tmp/swh.loader.svn.gji1ywo9-3153480/check-revision-10364.prxv4dgw DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'trunk' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'trunk/src' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'trunk/src/sdk' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:content with path b'trunk/src/sdk/filemanager.cpp' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'branches' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'branches/scintilla_3_5_x' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'branches/scintilla_3_5_x/src' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'branches/scintilla_3_5_x/src/plugins' has different hash in reconstructed repository filesystem DEBUG:swh.loader.svn.loader.SvnLoader:directory with path b'branches/scintilla_3_5_x/src/plugins/contrib' has different hash 
in reconstructed repository filesystem ERROR:swh.loader.svn.loader.SvnLoader:Hash tree computation divergence detected at revision 10364 (816e429fd284c5775b97c95ab723e302c44c6f55 != 576b09a85b208eb2439148b2be49a6fd7365cf32), stopping! Traceback (most recent call last): File "/home/anlambert/swh/swh-environment/swh-loader-svn/swh/loader/svn/loader.py", line 476, in fetch_data data = next(self.swh_revision_gen) File "/home/anlambert/swh/swh-environment/swh-loader-svn/swh/loader/svn/loader.py", line 393, in process_svn_revisions self._check_revision_divergence(rev, dir_id, root_directory) File "/home/anlambert/swh/swh-environment/swh-loader-svn/swh/loader/svn/loader.py", line 343, in _check_revision_divergence raise ValueError(err) ValueError: Hash tree computation divergence detected at revision 10364 (816e429fd284c5775b97c95ab723e302c44c6f55 != 576b09a85b208eb2439148b2be49a6fd7365cf32), stopping! anlambert: In fact, we need to keep those checks as it can happen that the directory model from the replay… | |||||
"directory with path b'trunk/data' has different hash in reconstructed repository filesystem", # noqa | |||||
"content with path b'trunk/data/foo' has different hash in reconstructed repository filesystem", # noqa | |||||
"directory with path b'trunk/data/baz' is missing in reconstructed repository filesystem", # noqa | |||||
): | |||||
assert debug_log in caplog.text |
Those are the top-level directories, which also report the divergence because of the way we compute the hashes.
Maybe it would be worth trying to filter those out; otherwise, we could have a hard time parsing through the debug log.