diff --git a/swh/vault/cli.py b/swh/vault/cli.py --- a/swh/vault/cli.py +++ b/swh/vault/cli.py @@ -71,6 +71,7 @@ """ from swh.core import config from swh.graph.client import RemoteGraphClient + from swh.objstorage.factory import get_objstorage from swh.storage import get_storage from .cookers import COOKER_TYPES, get_cooker_cls @@ -102,9 +103,17 @@ backend = InMemoryVaultBackend() storage = get_storage(**conf["storage"]) + objstorage = get_objstorage(**conf["objstorage"]) if "objstorage" in conf else None graph = RemoteGraphClient(**conf["graph"]) if "graph" in conf else None cooker_cls = get_cooker_cls(cooker_name) - cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage, graph) + cooker = cooker_cls( + obj_type=cooker_name, + obj_id=swhid.object_id, + backend=backend, + storage=storage, + graph=graph, + objstorage=objstorage, + ) cooker.cook() bundle = backend.fetch(cooker_name, swhid.object_id) diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -67,6 +67,7 @@ backend, storage, graph=None, + objstorage=None, max_bundle_size=MAX_BUNDLE_SIZE, ): """Initialize the cooker. @@ -86,6 +87,7 @@ self.obj_id = hashutil.hash_to_bytes(obj_id) self.backend = backend self.storage = storage + self.objstorage = objstorage self.graph = graph self.max_bundle_size = max_bundle_size diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py --- a/swh/vault/cookers/git_bare.py +++ b/swh/vault/cookers/git_bare.py @@ -275,9 +275,15 @@ # It's tricky, because, by definition, we can't write a git object with # the expected hash, so git-fsck *will* choke on it. contents = self.storage.content_get(obj_ids, "sha1_git") - for (obj_id, content) in zip(obj_ids, contents): - assert obj_id == content.sha1_git # just to be sure - content = self.storage.content_get_data(content.sha1) - self.write_object( - obj_id, f"blob {len(content)}\0".encode("ascii") + content - ) + + if self.objstorage is None: + for content in contents: + data = self.storage.content_get_data(content.sha1) + self.write_content(content.sha1_git, data) + else: + content_data = self.objstorage.get_batch(c.sha1 for c in contents) + for (content, data) in zip(contents, content_data): + self.write_content(content.sha1_git, data) + + def write_content(self, obj_id: Sha1Git, content: bytes) -> None: + self.write_object(obj_id, f"blob {len(content)}\0".encode("ascii") + content) diff --git a/swh/vault/tests/test_cli.py b/swh/vault/tests/test_cli.py --- a/swh/vault/tests/test_cli.py +++ b/swh/vault/tests/test_cli.py @@ -92,11 +92,12 @@ raise result.exception cooker_cls.assert_called_once_with( - f"{obj_type}_{cooker_name_suffix}" if cooker_name_suffix else obj_type, - b"\x00" * 20, - backend, - storage, - None, + obj_type=f"{obj_type}_{cooker_name_suffix}" if cooker_name_suffix else obj_type, + obj_id=b"\x00" * 20, + backend=backend, + storage=storage, + graph=None, + objstorage=None, ) cooker.cook.assert_called_once_with() diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -200,14 +200,22 @@ @contextlib.contextmanager -def cook_extract_directory_git_bare(storage, obj_id, fsck=True): +def cook_extract_directory_git_bare( + storage, obj_id, fsck=True, direct_objstorage=False +): """Context manager that cooks a revision and extract it, using GitBareCooker""" backend = unittest.mock.MagicMock() backend.storage = storage # Cook the object - cooker = GitBareCooker("directory", obj_id, backend=backend, storage=storage) + cooker = GitBareCooker( + "directory", + obj_id, + backend=backend, + storage=storage, + objstorage=storage.objstorage if direct_objstorage else None, + ) cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects cooker.fileobj = io.BytesIO() assert cooker.check_exists() @@ -338,7 +346,7 @@ assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE - assert (p / "link").is_symlink + assert (p / "link").is_symlink() assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT @@ -437,6 +445,57 @@ assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 + @pytest.mark.parametrize("direct_objstorage", [True, False]) + def test_directory_objstorage( + self, swh_storage, git_loader, mocker, direct_objstorage + ): + """Like test_directory_simple, but using swh_objstorage directly, without + going through swh_storage.content_get_data()""" + repo = TestRepo() + with repo as rp: + (rp / "file").write_text(TEST_CONTENT) + (rp / "executable").write_bytes(TEST_EXECUTABLE) + (rp / "executable").chmod(0o755) + (rp / "link").symlink_to("file") + (rp / "dir1/dir2").mkdir(parents=True) + (rp / "dir1/dir2/file").write_text(TEST_CONTENT) + c = repo.commit() + loader = git_loader(str(rp)) + loader.load() + + obj_id_hex = repo.repo[c].tree.decode() + obj_id = hashutil.hash_to_bytes(obj_id_hex) + + # Set-up spies + storage_content_get_data = mocker.patch.object( + swh_storage, "content_get_data", wraps=swh_storage.content_get_data + ) + objstorage_content_batch = mocker.patch.object( + swh_storage.objstorage, "get_batch", wraps=swh_storage.objstorage.get_batch + ) + + with cook_extract_directory_git_bare( + loader.storage, obj_id, direct_objstorage=direct_objstorage + ) as p: + assert (p / "file").stat().st_mode == 0o100644 + assert (p / "file").read_text() == TEST_CONTENT + assert (p / "executable").stat().st_mode == 0o100755 + assert (p / "executable").read_bytes() == TEST_EXECUTABLE + assert (p / "link").is_symlink() + assert os.readlink(str(p / "link")) == "file" + assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 + assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT + + directory = from_disk.Directory.from_disk(path=bytes(p)) + assert obj_id_hex == hashutil.hash_to_hex(directory.hash) + + if direct_objstorage: + storage_content_get_data.assert_not_called() + objstorage_content_batch.assert_called() + else: + storage_content_get_data.assert_called() + objstorage_content_batch.assert_not_called() + def test_directory_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" @@ -489,7 +548,7 @@ ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT - assert (p / "link1").is_symlink + assert (p / "link1").is_symlink() assert os.readlink(str(p / "link1")) == "file1" assert (p / "bin").stat().st_mode == 0o100755 assert (p / "bin").read_bytes() == TEST_EXECUTABLE