diff --git a/swh/vault/cli.py b/swh/vault/cli.py --- a/swh/vault/cli.py +++ b/swh/vault/cli.py @@ -53,7 +53,7 @@ @click.argument("outfile", type=click.File("wb")) @click.option( "--cooker-type", - type=click.Choice(["flat", "gitfast"]), + type=click.Choice(["flat", "gitfast", "git_bare"]), help="Selects which cooker to use, when there is more than one available " "for the given object type.", ) diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py --- a/swh/vault/cookers/__init__.py +++ b/swh/vault/cookers/__init__.py @@ -14,6 +14,7 @@ from swh.vault import get_vault from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH from swh.vault.cookers.directory import DirectoryCooker +from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.cookers.revision_flat import RevisionFlatCooker from swh.vault.cookers.revision_gitfast import RevisionGitfastCooker @@ -21,6 +22,8 @@ "directory": DirectoryCooker, "revision_flat": RevisionFlatCooker, "revision_gitfast": RevisionGitfastCooker, + "revision_git_bare": GitBareCooker, + "directory_git_bare": GitBareCooker, } diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -98,6 +98,10 @@ """ raise NotImplementedError + def cache_type_key(self) -> str: + assert self.CACHE_TYPE_KEY + return self.CACHE_TYPE_KEY + def write(self, chunk): self.fileobj.write(chunk) @@ -117,7 +121,7 @@ ) bundle = self.fileobj.getvalue() # TODO: use proper content streaming instead of put_bundle() - self.backend.put_bundle(self.CACHE_TYPE_KEY, self.obj_id, bundle) + self.backend.put_bundle(self.cache_type_key(), self.obj_id, bundle) except PolicyError as e: self.backend.set_status(self.obj_type, self.obj_id, "failed") self.backend.set_progress(self.obj_type, self.obj_id, str(e)) diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py new file mode 100644 --- /dev/null +++ b/swh/vault/cookers/git_bare.py @@ -0,0 
+1,239 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+import os.path
+import subprocess
+import tarfile
+import tempfile
+from typing import Any, Dict
+import zlib
+
+from swh.model import identifiers
+from swh.model.hashutil import hash_to_bytehex, hash_to_hex
+from swh.model.model import (
+    Person,
+    Revision,
+    RevisionType,
+    Sha1Git,
+    TimestampWithTimezone,
+)
+from swh.storage.algos.revisions_walker import DFSRevisionsWalker
+from swh.vault.cookers.base import BaseVaultCooker
+
+
+class GitBareCooker(BaseVaultCooker):
+    """Cooks a revision or a directory as a tarball of a bare git repository.
+
+    ``self.obj_type`` is either a plain object type (``revision``) or one
+    suffixed with the cooker name (``revision_git_bare``); only the part before
+    the first underscore selects which kind of object gets loaded.
+    """
+
+    # Whether to run ``git fsck`` on the repository before repacking it.
+    use_fsck = True
+
+    def cache_type_key(self) -> str:
+        """Returns the key under which the bundle is cached: the object type."""
+        return self.obj_type
+
+    def _base_obj_type(self) -> str:
+        """Returns the object type without its cooker suffix, eg. ``revision``
+        for ``revision_git_bare``."""
+        return self.obj_type.split("_")[0]
+
+    def _object_path(self, obj_id: Sha1Git) -> str:
+        """Returns the path of the loose git object with the given id."""
+        obj_id_hex = hash_to_hex(obj_id)
+        return os.path.join(self.gitdir, "objects", obj_id_hex[0:2], obj_id_hex[2:])
+
+    def check_exists(self) -> bool:
+        """Returns whether the root object is known to the storage."""
+        obj_type = self._base_obj_type()
+        if obj_type == "revision":
+            return not list(self.storage.revision_missing([self.obj_id]))
+        elif obj_type == "directory":
+            return not list(self.storage.directory_missing([self.obj_id]))
+        else:
+            raise NotImplementedError(f"GitBareCooker for {obj_type}")
+
+    def obj_swhid(self) -> identifiers.CoreSWHID:
+        """Returns the core SWHID of the root object."""
+        return identifiers.CoreSWHID(
+            object_type=identifiers.ObjectType[self._base_obj_type().upper()],
+            object_id=self.obj_id,
+        )
+
+    def prepare_bundle(self) -> None:
+        """Builds the bare repository in a temporary directory, then writes it
+        as a tarball to ``self.fileobj``."""
+        with tempfile.TemporaryDirectory() as workdir:
+            # Initialize a Git directory
+            self.workdir = workdir
+            self.gitdir = os.path.join(workdir, "clone.git")
+            os.mkdir(self.gitdir)
+            self.init_git()
+
+            # Load and write all the objects to disk
+            self.load_subgraph(self._base_obj_type(), self.obj_id)
+
+            # Write the root object as a ref.
+            # This must be done before repacking; git-repack ignores orphan objects.
+            self.write_refs()
+
+            self.repack()
+            self.write_archive()
+
+    def init_git(self) -> None:
+        """Initializes ``self.gitdir`` as an empty bare repository."""
+        subprocess.run(["git", "-C", self.gitdir, "init", "--bare"], check=True)
+
+        # Create all possible dirs ahead of time, so we don't have to check for
+        # existence every time.
+        for byte in range(256):
+            os.mkdir(os.path.join(self.gitdir, "objects", f"{byte:02x}"))
+
+    def repack(self) -> None:
+        """Packs the loose objects, optionally checking them with git-fsck first."""
+        if self.use_fsck:
+            subprocess.run(["git", "-C", self.gitdir, "fsck"], check=True)
+
+        # Add objects we wrote in a pack
+        subprocess.run(["git", "-C", self.gitdir, "repack"], check=True)
+
+        # Remove their non-packed originals
+        subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True)
+
+    def write_refs(self) -> None:
+        """Writes ``refs/heads/master``, pointing to the root revision.
+
+        When cooking a directory, a synthetic revision pointing to it is created
+        and written to disk, and the ref points to that revision."""
+        obj_type = self._base_obj_type()
+        if obj_type == "directory":
+            # We need a synthetic revision pointing to the directory
+            # The fullname needs an "<email>" part: git-fsck rejects commits whose
+            # author/committer line lacks one.
+            author = Person.from_fullname(
+                b"swh-vault, git-bare cooker <bot@softwareheritage.org>"
+            )
+            dt = datetime.datetime.now(tz=datetime.timezone.utc)
+            dt = dt.replace(microsecond=0)  # not supported by git
+            date = TimestampWithTimezone.from_datetime(dt)
+            revision = Revision(
+                author=author,
+                committer=author,
+                date=date,
+                committer_date=date,
+                message=b"Initial commit",
+                type=RevisionType.GIT,
+                directory=self.obj_id,
+                synthetic=True,
+            )
+            self.write_revision_node(revision.to_dict())
+            head = revision.id
+        elif obj_type == "revision":
+            head = self.obj_id
+        else:
+            # Raise rather than assert: asserts are stripped by ``python -O``,
+            # which would leave ``head`` unbound below.
+            raise NotImplementedError(f"GitBareCooker.write_refs() for {obj_type}")
+
+        with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd:
+            fd.write(hash_to_bytehex(head))
+
+    def write_archive(self) -> None:
+        """Writes the repository to ``self.fileobj`` as an uncompressed tarball,
+        in a directory named after the SWHID of the root object."""
+        with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf:
+            tf.add(self.gitdir, arcname=f"{self.obj_swhid()}.git", recursive=True)
+
+    def write_object(self, obj_id: Sha1Git, obj: bytes) -> bool:
+        """Writes a git object on disk.
+
+        Returns True if the object was written by this call, and False if it
+        was already on disk."""
+        path = self._object_path(obj_id)
+        if os.path.exists(path):
+            # Already written
+            return False
+
+        # Git requires objects to be zlib-compressed; but repacking decompresses and
+        # removes them, so we don't need to compress them too much.
+        data = zlib.compress(obj, level=1)
+
+        with open(path, "wb") as fd:
+            fd.write(data)
+        return True
+
+    def load_subgraph(self, obj_type, obj_id) -> None:
+        """Loads the object of the given type and everything it points to."""
+        if obj_type == "revision":
+            self.load_revision_subgraph(obj_id)
+        elif obj_type == "directory":
+            self.load_directory_subgraph(obj_id)
+        else:
+            raise NotImplementedError(f"GitBareCooker.load_subgraph({obj_type!r}, ...)")
+
+    def load_revision_subgraph(self, obj_id: Sha1Git) -> None:
+        """Fetches a revision and every revision the history walker yields from
+        it, along with their directories, and writes them to disk"""
+        walker = DFSRevisionsWalker(self.storage, obj_id)
+        for revision in walker:
+            self.write_revision_node(revision)
+            self.load_directory_subgraph(revision["directory"])
+
+    def write_revision_node(self, revision: Dict[str, Any]) -> bool:
+        """Writes a revision object to disk"""
+        git_object = identifiers.revision_git_object(revision)
+        return self.write_object(revision["id"], git_object)
+
+    def load_directory_subgraph(self, obj_id: Sha1Git) -> None:
+        """Fetches a directory and all its children, and writes them to disk.
+
+        Directories already on disk are skipped, as their children either were
+        loaded already or are queued by a traversal in progress; this avoids
+        fetching the same tree again for every revision that shares it."""
+        # Explicit stack instead of recursion, so deep trees can't hit the
+        # interpreter's recursion limit.
+        pending = [obj_id]
+        entry_loaders = {
+            "file": self.load_content,
+            "dir": pending.append,
+            "rev": self.load_revision_subgraph,
+        }
+        while pending:
+            dir_id = pending.pop()
+            if os.path.exists(self._object_path(dir_id)):
+                continue
+            directory = self.load_directory_node(dir_id)
+            for entry in directory["entries"]:
+                entry_loader = entry_loaders[entry["type"]]
+                entry_loader(entry["target"])
+
+    def load_directory_node(self, obj_id: Sha1Git) -> Dict[str, Any]:
+        """Fetches a directory, writes it to disk (non-recursively), and returns it."""
+        entries = list(self.storage.directory_ls(obj_id, recursive=False))
+        directory = {"id": obj_id, "entries": entries}
+        git_object = identifiers.directory_git_object(directory)
+        self.write_object(obj_id, git_object)
+        return directory
+
+    def load_content(self, obj_id: Sha1Git) -> None:
+        """Fetches the data of a content and writes it to disk as a git blob."""
+        if os.path.exists(self._object_path(obj_id)):
+            # Already written; no need to fetch the data again
+            return
+
+        # TODO: add support of filtered objects, somehow?
+        # It's tricky, because, by definition, we can't write a git object with
+        # the expected hash, so git-fsck *will* choke on it.
+        contents = self.storage.content_find({"sha1_git": obj_id})
+        data = self.storage.content_get_data(contents[0].sha1) if contents else None
+        if data is None:
+            raise NotImplementedError(
+                f"GitBareCooker: content {hash_to_hex(obj_id)} is missing or filtered"
+            )
+        self.write_object(obj_id, f"blob {len(data)}\0".encode("ascii") + data)
diff --git a/swh/vault/in_memory_backend.py b/swh/vault/in_memory_backend.py
--- a/swh/vault/in_memory_backend.py
+++ b/swh/vault/in_memory_backend.py
@@ -19,7 +19,7 @@
         self._cache = VaultCache(cls="memory")
 
     def fetch(self, obj_type: str, obj_id: ObjectId) -> Optional[bytes]:
-        return self._cache.get(obj_type, obj_id)
+        return self._cache.get(obj_type, hash_to_bytes(obj_id))
 
     def cook(
         self, obj_type: str, obj_id: ObjectId, email: Optional[str] = None
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -9,6 +9,7 @@
 import io
 import os
 import pathlib
+import shutil
 import subprocess
 import tarfile
 import tempfile
@@ -25,7 +26,7 @@
 from swh.loader.git.from_disk import GitLoaderFromDisk
 from swh.model import from_disk, hashutil
 from swh.model.model import Directory, DirectoryEntry, Person, Revision, RevisionType
-from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker
+from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker
 from swh.vault.tests.vault_testing import hash_content
 from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE
 
@@ -141,7 +142,7 @@
 
 
 @contextlib.contextmanager
-def cook_extract_directory(storage, obj_id):
+def cook_extract_directory_dircooker(storage, obj_id, fsck=True):
     """Context manager that cooks a directory and extract it."""
     backend = unittest.mock.MagicMock()
     backend.storage = storage
@@ -157,6 +158,46 @@
     cooker.storage = None
 
 
+@contextlib.contextmanager
+def cook_extract_directory_git_bare(storage, obj_id, fsck=True):
+    """Context manager that cooks a directory and extracts it,
+    using GitBareCooker"""
+    backend = unittest.mock.MagicMock()
+    backend.storage = storage
+
+    # Cook the object
+    cooker = GitBareCooker("directory", obj_id, backend=backend, storage=storage)
+    cooker.use_fsck = fsck  # Some tests try edge-cases that git-fsck rejects
+    cooker.fileobj = io.BytesIO()
+    assert cooker.check_exists()
+    cooker.prepare_bundle()
+    cooker.fileobj.seek(0)
+
+    # Extract it
+    with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
+        with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
+            tar.extractall(td)
+
+        # Clone it with Dulwich
+        test_repo = TestRepo()
+        with test_repo as p:
+            test_repo.git_shell(
+                "pull", os.path.join(td, f"swh:1:dir:{obj_id.hex()}.git")
+            )
+            shutil.rmtree(p / ".git")
+            yield p
+
+
+@pytest.fixture(
+    scope="module",
+    params=[cook_extract_directory_dircooker, cook_extract_directory_git_bare],
+)
+def cook_extract_directory(request):
+    """A fixture that is instantiated as either cook_extract_directory_dircooker or
+    cook_extract_directory_git_bare."""
+    return request.param
+
+
 @contextlib.contextmanager
 def cook_stream_revision_gitfast(storage, obj_id):
     """Context manager that cooks a revision and stream its fastexport."""
@@ -175,8 +216,9 @@
 
 
 @contextlib.contextmanager
-def cook_extract_revision_gitfast(storage, obj_id):
-    """Context manager that cooks a revision and extract it."""
+def cook_extract_revision_gitfast(storage, obj_id, fsck=True):
+    """Context manager that cooks a revision and extracts it,
+    using RevisionGitfastCooker"""
     test_repo = TestRepo()
     with cook_stream_revision_gitfast(storage, obj_id) as stream, test_repo as p:
         processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
@@ -184,6 +226,45 @@
         yield test_repo, p
 
 
+@contextlib.contextmanager +def cook_extract_revision_git_bare(storage, obj_id, fsck=True): + """Context manager that cooks a revision and extract it, + using GitBareCooker""" + backend = unittest.mock.MagicMock() + backend.storage = storage + + # Cook the object + cooker = GitBareCooker("revision", obj_id, backend=backend, storage=storage) + cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects + cooker.fileobj = io.BytesIO() + assert cooker.check_exists() + cooker.prepare_bundle() + cooker.fileobj.seek(0) + + # Extract it + with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: + with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: + tar.extractall(td) + + # Clone it with Dulwich + test_repo = TestRepo() + with test_repo as p: + test_repo.git_shell( + "pull", os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git") + ) + yield test_repo, p + + +@pytest.fixture( + scope="module", + params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare], +) +def cook_extract_revision(request): + """A fixture that is instantiated as either cook_extract_revision_gitfast or + cook_extract_revision_git_bare.""" + return request.param + + TEST_CONTENT = ( " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces " ) @@ -191,7 +272,7 @@ class TestDirectoryCooker: - def test_directory_simple(self, git_loader): + def test_directory_simple(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) @@ -220,7 +301,9 @@ directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) - def test_directory_filtered_objects(self, git_loader): + def test_directory_filtered_objects(self, git_loader, cook_extract_directory): + if cook_extract_directory is cook_extract_directory_git_bare: + pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = 
hash_content(b"test1") @@ -262,7 +345,7 @@ assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE - def test_directory_bogus_perms(self, git_loader): + def test_directory_bogus_perms(self, git_loader, cook_extract_directory): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the directory # cooker. @@ -286,7 +369,9 @@ assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 - def test_directory_revision_data(self, swh_storage): + def test_directory_revision_data(self, swh_storage, cook_extract_directory): + if cook_extract_directory is cook_extract_directory_git_bare: + pytest.xfail("GitBareCooker does not support submodules yet") target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( @@ -301,13 +386,13 @@ ) swh_storage.directory_add([dir]) - with cook_extract_directory(swh_storage, dir.id) as p: + with cook_extract_directory(swh_storage, dir.id, fsck=False) as p: assert (p / "submodule").is_symlink() assert os.readlink(str(p / "submodule")) == target_rev -class TestRevisionGitfastCooker: - def test_revision_simple(self, git_loader): +class TestRevisionCooker: + def test_revision_simple(self, git_loader, cook_extract_revision): # # 1--2--3--4--5--6--7 # @@ -334,7 +419,7 @@ obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT @@ -346,7 +431,7 @@ assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_roots(self, git_loader): + def test_revision_two_roots(self, git_loader, cook_extract_revision): # # 
1----3---4 # / @@ -367,10 +452,10 @@ loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_double_fork_merge(self, git_loader): + def test_revision_two_double_fork_merge(self, git_loader, cook_extract_revision): # # 2---4---6 # / / / @@ -401,10 +486,10 @@ loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_triple_merge(self, git_loader): + def test_revision_triple_merge(self, git_loader, cook_extract_revision): # # .---.---5 # / / / @@ -429,10 +514,12 @@ loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_filtered_objects(self, git_loader): + def test_revision_filtered_objects(self, git_loader, cook_extract_revision): + if cook_extract_revision is cook_extract_revision_git_bare: + pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") @@ -468,13 +555,13 @@ (id_3,), ) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE - def test_revision_bogus_perms(self, git_loader): + def test_revision_bogus_perms(self, git_loader, cook_extract_revision): # Some early git repositories have 664/775 permissions... 
let's check # if all the weird modes are properly normalized in the revision # cooker. @@ -492,13 +579,13 @@ obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 - def test_revision_null_fields(self, git_loader): + def test_revision_null_fields(self, git_loader, cook_extract_revision): # Our schema doesn't enforce a lot of non-null revision fields. We need # to check these cases don't break the cooker. repo = TestRepo() @@ -527,7 +614,7 @@ storage = loader.storage storage.revision_add([test_revision]) - with cook_extract_revision_gitfast(storage, test_revision.id) as (ert, p): + with cook_extract_revision(storage, test_revision.id, fsck=False) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644