diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,5 @@
 swh.core[db,http] >= 0.14.0
+swh.graph >= 0.3.2
 swh.model >= 0.3
 swh.objstorage >= 0.0.17
 swh.scheduler >= 0.7.0
diff --git a/swh/vault/cli.py b/swh/vault/cli.py
--- a/swh/vault/cli.py
+++ b/swh/vault/cli.py
@@ -53,7 +53,7 @@
 @click.argument("outfile", type=click.File("wb"))
 @click.option(
     "--cooker-type",
-    type=click.Choice(["flat", "gitfast"]),
+    type=click.Choice(["flat", "gitfast", "git_bare"]),
     help="Selects which cooker to use, when there is more than one available "
     "for the given object type.",
 )
@@ -70,6 +70,7 @@
     and outputs it to the given file.
     """
     from swh.core import config
+    from swh.graph.client import RemoteGraphClient
     from swh.storage import get_storage

     from .cookers import COOKER_TYPES, get_cooker_cls
@@ -101,8 +102,9 @@
         backend = InMemoryVaultBackend()

     storage = get_storage(**conf["storage"])
+    graph = RemoteGraphClient(**conf["graph"]) if "graph" in conf else None
     cooker_cls = get_cooker_cls(cooker_name)
-    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage)
+    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage, graph)
     cooker.cook()

     bundle = backend.fetch(cooker_name, swhid.object_id)
diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -10,10 +10,12 @@
 from swh.core.config import load_named_config
 from swh.core.config import read as read_config
+from swh.graph.client import RemoteGraphClient
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH
 from swh.vault.cookers.directory import DirectoryCooker
+from swh.vault.cookers.git_bare import GitBareCooker
 from swh.vault.cookers.revision_flat import RevisionFlatCooker
 from swh.vault.cookers.revision_gitfast import RevisionGitfastCooker

@@ -21,6 +23,8 @@
     "directory": DirectoryCooker,
     "revision_flat": RevisionFlatCooker,
     "revision_gitfast": RevisionGitfastCooker,
+    "revision_git_bare": GitBareCooker,
+    "directory_git_bare": GitBareCooker,
 }

@@ -86,11 +90,13 @@
     storage = get_storage(**vcfg.pop("storage"))
     backend = get_vault(**vcfg)
+    graph = RemoteGraphClient(**vcfg["graph"]) if "graph" in vcfg else None
     return cooker_cls(
         obj_type,
         obj_id,
         backend=backend,
         storage=storage,
+        graph=graph,
         max_bundle_size=cfg["max_bundle_size"],
     )
diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py
--- a/swh/vault/cookers/base.py
+++ b/swh/vault/cookers/base.py
@@ -61,7 +61,7 @@
     CACHE_TYPE_KEY = None  # type: Optional[str]

     def __init__(
-        self, obj_type, obj_id, backend, storage, max_bundle_size=MAX_BUNDLE_SIZE
+        self, obj_type, obj_id, backend, storage, graph=None, max_bundle_size=MAX_BUNDLE_SIZE
     ):
         """Initialize the cooker.
@@ -80,6 +80,7 @@
         self.obj_id = hashutil.hash_to_bytes(obj_id)
         self.backend = backend
         self.storage = storage
+        self.graph = graph
         self.max_bundle_size = max_bundle_size

     @abc.abstractmethod
@@ -98,6 +99,10 @@
         """
         raise NotImplementedError

+    def cache_type_key(self) -> str:
+        assert self.CACHE_TYPE_KEY
+        return self.CACHE_TYPE_KEY
+
     def write(self, chunk):
         self.fileobj.write(chunk)

@@ -117,7 +122,7 @@
             )
             bundle = self.fileobj.getvalue()
             # TODO: use proper content streaming instead of put_bundle()
-            self.backend.put_bundle(self.CACHE_TYPE_KEY, self.obj_id, bundle)
+            self.backend.put_bundle(self.cache_type_key(), self.obj_id, bundle)
         except PolicyError as e:
             self.backend.set_status(self.obj_type, self.obj_id, "failed")
             self.backend.set_progress(self.obj_type, self.obj_id, str(e))
diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
new file mode 100644
--- /dev/null
+++ b/swh/vault/cookers/git_bare.py
@@ -0,0 +1,252 @@
+# Copyright (C) 2021  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+This cooker creates tarballs containing a bare .git directory,
+that can be unpacked and cloned like any git repository.
+
+It works in three steps:
+
+1. Writes objects one by one to :file:`.git/objects/`.
+2. Calls ``git repack`` to pack all these objects into git packfiles.
+3. Creates a tarball of the resulting repository.
+
+To avoid downloading and writing the same objects twice,
+it checks the existence of the object file in the temporary directory.
+To avoid issuing a syscall every time, it also uses ``functools.lru_cache``
+as a first layer of caching before checking the file's existence.
+"""
+
+import datetime
+import functools
+import os.path
+import subprocess
+import tarfile
+import tempfile
+from typing import Any, Callable, Dict, List
+import zlib
+
+from swh.core.utils import grouper
+from swh.graph.client import GraphArgumentException
+from swh.model import identifiers
+from swh.model.hashutil import hash_to_bytehex, hash_to_hex
+from swh.model.model import (
+    Person,
+    Revision,
+    RevisionType,
+    Sha1Git,
+    TimestampWithTimezone,
+)
+from swh.storage.algos.revisions_walker import DFSRevisionsWalker
+from swh.vault.cookers.base import BaseVaultCooker
+
+REVISION_BATCH_SIZE = 10000
+
+
+class GitBareCooker(BaseVaultCooker):
+    use_fsck = True
+
+    def cache_type_key(self) -> str:
+        return self.obj_type
+
+    def check_exists(self):
+        obj_type = self.obj_type.split("_")[0]
+        if obj_type == "revision":
+            return not list(self.storage.revision_missing([self.obj_id]))
+        elif obj_type == "directory":
+            return not list(self.storage.directory_missing([self.obj_id]))
+        else:
+            raise NotImplementedError(f"GitBareCooker for {obj_type}")
+
+    def obj_swhid(self) -> identifiers.CoreSWHID:
+        obj_type = self.obj_type.split("_")[0]
+        return identifiers.CoreSWHID(
+            object_type=identifiers.ObjectType[obj_type.upper()], object_id=self.obj_id,
+        )
+
+    def prepare_bundle(self):
+        with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir:
+            # Initialize a Git directory
+            self.workdir = workdir
+            self.gitdir = os.path.join(workdir, "clone.git")
+            os.mkdir(self.gitdir)
+            self.init_git()
+
+            # Load and write all the objects to disk
+            self.load_subgraph(self.obj_type.split("_")[0], self.obj_id)
+
+            # Write the root object as a ref.
+            # This must be done before repacking; git-repack ignores orphan objects.
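+            # (An object unreachable from any ref would be left out of the pack.)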
+            self.write_refs()
+
+            self.repack()
+            self.write_archive()
+
+    def init_git(self) -> None:
+        subprocess.run(["git", "-C", self.gitdir, "init", "--bare"], check=True)
+
+        # Create all possible dirs ahead of time, so we don't have to check for
+        # existence every time.
+        for byte in range(256):
+            os.mkdir(os.path.join(self.gitdir, "objects", f"{byte:02x}"))
+
+    def repack(self) -> None:
+        if self.use_fsck:
+            subprocess.run(["git", "-C", self.gitdir, "fsck"], check=True)
+
+        # Add objects we wrote in a pack
+        subprocess.run(["git", "-C", self.gitdir, "repack"], check=True)
+
+        # Remove their non-packed originals
+        subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True)
+
+    def write_refs(self):
+        obj_type = self.obj_type.split("_")[0]
+        if obj_type == "directory":
+            # We need a synthetic revision pointing to the directory
+            author = Person.from_fullname(
+                b"swh-vault, git-bare cooker <robot@softwareheritage.org>"
+            )
+            dt = datetime.datetime.now(tz=datetime.timezone.utc)
+            dt = dt.replace(microsecond=0)  # not supported by git
+            date = TimestampWithTimezone.from_datetime(dt)
+            revision = Revision(
+                author=author,
+                committer=author,
+                date=date,
+                committer_date=date,
+                message=b"Initial commit",
+                type=RevisionType.GIT,
+                directory=self.obj_id,
+                synthetic=True,
+            )
+            self.write_revision_node(revision.to_dict())
+            head = revision.id
+        elif obj_type == "revision":
+            head = self.obj_id
+        else:
+            assert False, obj_type
+
+        with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd:
+            fd.write(hash_to_bytehex(head))
+
+    def write_archive(self):
+        with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf:
+            tf.add(self.gitdir, arcname=f"{self.obj_swhid()}.git", recursive=True)
+
+    def _obj_path(self, obj_id: Sha1Git):
+        obj_id_hex = hash_to_hex(obj_id)
+        directory = obj_id_hex[0:2]
+        filename = obj_id_hex[2:]
+        return os.path.join(self.gitdir, "objects", directory, filename)
+
+    def object_exists(self, obj_id: Sha1Git) -> bool:
+        return os.path.exists(self._obj_path(obj_id))
+
+    def write_object(self, obj_id: Sha1Git, obj: bytes) -> bool:
+        """Writes a git object to disk and returns True."""
+        # Git requires objects to be zlib-compressed; but repacking decompresses and
+        # removes them, so we don't need to compress them too much.
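+        # Hence level=1: the fastest setting is enough, since these loose
+        # objects only live until `git repack` replaces them with a packfile.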
+        data = zlib.compress(obj, level=1)
+
+        with open(self._obj_path(obj_id), "wb") as fd:
+            fd.write(data)
+        return True
+
+    def load_subgraph(self, obj_type, obj_id) -> None:
+        if obj_type == "revision":
+            self.load_revision_subgraph(obj_id)
+        elif obj_type == "directory":
+            self.load_directory_subgraph(obj_id)
+        else:
+            raise NotImplementedError(f"GitBareCooker.load_subgraph({obj_type!r}, ...)")
+
+    def load_revision_subgraph(self, obj_id: Sha1Git) -> None:
+        """Fetches a revision and all its ancestors, and writes them to disk"""
+        loaded_from_graph = False
+
+        if self.graph:
+            # First, try to cook using swh-graph, as it is more efficient than
+            # swh-storage for querying the history
+            obj_swhid = identifiers.CoreSWHID(
+                object_type=identifiers.ObjectType.REVISION, object_id=obj_id,
+            )
+            try:
+                revision_ids = (
+                    swhid.object_id
+                    for swhid in map(
+                        identifiers.CoreSWHID.from_string,
+                        self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
+                    )
+                )
+            except GraphArgumentException:
+                # Revision not found in the graph
+                pass
+            else:
+                loaded_from_graph = True
+                for revision_id_group in grouper(revision_ids, REVISION_BATCH_SIZE):
+                    self.load_revisions_and_directory_subgraphs(revision_id_group)
+
+        if not loaded_from_graph:
+            # If swh-graph is not available, or the revision is not yet in
+            # swh-graph, fall back to self.storage.revision_log.
+            walker = DFSRevisionsWalker(self.storage, obj_id)
+            for revision in walker:
+                self.write_revision_node(revision)
+                self.load_directory_subgraph(revision["directory"])
+
+    def load_revisions_and_directory_subgraphs(self, obj_ids: List[Sha1Git]) -> None:
+        """Given a list of revision ids, loads these revisions and their directories,
+        but not their parent revisions."""
+        revisions = self.storage.revision_get(obj_ids)
+        for revision in revisions:
+            self.write_revision_node(revision.to_dict())
+            self.load_directory_subgraph(revision.directory)
+
+    def write_revision_node(self, revision: Dict[str, Any]) -> bool:
+        """Writes a revision object to disk"""
+        git_object = identifiers.revision_git_object(revision)
+        return self.write_object(revision["id"], git_object)
+
+    @functools.lru_cache(10240)
+    def load_directory_subgraph(self, obj_id: Sha1Git) -> None:
+        """Fetches a directory and all its children, and writes them to disk"""
+        if self.object_exists(obj_id):
+            # Checks if the object is already written on disk.
+            # This rarely happens thanks to @lru_cache()
+            return
+        directory = self.load_directory_node(obj_id)
+        entry_loaders: Dict[str, Callable[[Sha1Git], None]] = {
+            "file": self.load_content,
+            "dir": self.load_directory_subgraph,
+            "rev": self.load_revision_subgraph,
+        }
+        for entry in directory["entries"]:
+            entry_loader = entry_loaders[entry["type"]]
+            entry_loader(entry["target"])
+
+    def load_directory_node(self, obj_id: Sha1Git) -> Dict[str, Any]:
+        """Fetches a directory, writes it to disk (non-recursively), and returns it."""
+        entries = list(self.storage.directory_ls(obj_id, recursive=False))
+        directory = {"id": obj_id, "entries": entries}
+        git_object = identifiers.directory_git_object(directory)
+        self.write_object(obj_id, git_object)
+        return directory
+
+    @functools.lru_cache(10240)
+    def load_content(self, obj_id: Sha1Git) -> None:
+        if self.object_exists(obj_id):
+            # Checks if the object is already written on disk.
+            # This rarely happens thanks to @lru_cache()
+            return
+
+        # TODO: add support for filtered objects, somehow?
+        # It's tricky, because, by definition, we can't write a git object with
+        # the expected hash, so git-fsck *will* choke on it.
+        content_sha1 = self.storage.content_find({"sha1_git": obj_id})[0].sha1
+        content = self.storage.content_get_data(content_sha1)
+        self.write_object(obj_id, f"blob {len(content)}\0".encode("ascii") + content)
diff --git a/swh/vault/in_memory_backend.py b/swh/vault/in_memory_backend.py
--- a/swh/vault/in_memory_backend.py
+++ b/swh/vault/in_memory_backend.py
@@ -19,7 +19,7 @@
         self._cache = VaultCache(cls="memory")

     def fetch(self, obj_type: str, obj_id: ObjectId) -> Optional[bytes]:
-        return self._cache.get(obj_type, obj_id)
+        return self._cache.get(obj_type, hash_to_bytes(obj_id))

     def cook(
         self, obj_type: str, obj_id: ObjectId, email: Optional[str] = None
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -10,6 +10,7 @@
 import io
 import os
 import pathlib
+import shutil
 import subprocess
 import tarfile
 import tempfile
@@ -25,8 +26,15 @@
 from swh.loader.git.from_disk import GitLoaderFromDisk
 from swh.model import from_disk, hashutil
-from swh.model.model import Directory, DirectoryEntry, Person, Revision, RevisionType
-from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker
+from swh.model.model import (
+    Directory,
+    DirectoryEntry,
+    Person,
+    Revision,
+    RevisionType,
+    TimestampWithTimezone,
+)
+from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker
 from swh.vault.tests.vault_testing import hash_content
 from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE

@@ -146,7 +154,7 @@
 @contextlib.contextmanager
-def cook_extract_directory(storage, obj_id):
+def cook_extract_directory_dircooker(storage, obj_id, fsck=True):
     """Context manager that cooks a directory and extract it."""
     backend = unittest.mock.MagicMock()
     backend.storage = storage
@@ -162,6 +170,79 @@
     cooker.storage = None


+@contextlib.contextmanager
+def cook_extract_directory_gitfast(storage, obj_id, fsck=True):
+    """Context manager that cooks a revision containing a directory and extracts it,
+    using RevisionGitfastCooker"""
+    test_repo = TestRepo()
+    with test_repo as p:
+        date = TimestampWithTimezone.from_datetime(
+            datetime.datetime.now(datetime.timezone.utc)
+        )
+        revision = Revision(
+            directory=obj_id,
+            message=b"dummy message",
+            author=Person.from_fullname(b"someone"),
+            committer=Person.from_fullname(b"someone"),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            synthetic=False,
+        )
+        storage.revision_add([revision])
+
+    with cook_stream_revision_gitfast(storage, revision.id) as stream, test_repo as p:
+        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
+        processor.import_stream(stream)
+        test_repo.checkout(b"HEAD")
+        shutil.rmtree(p / ".git")
+        yield p
+
+
+@contextlib.contextmanager
+def cook_extract_directory_git_bare(storage, obj_id, fsck=True):
+    """Context manager that cooks a directory and extracts it,
+    using GitBareCooker"""
+    backend = unittest.mock.MagicMock()
+    backend.storage = storage
+
+    # Cook the object
+    cooker = GitBareCooker("directory", obj_id, backend=backend, storage=storage)
+    cooker.use_fsck = fsck  # Some tests try edge-cases that git-fsck rejects
+    cooker.fileobj = io.BytesIO()
+    assert cooker.check_exists()
+    cooker.prepare_bundle()
+    cooker.fileobj.seek(0)
+
+    # Extract it
+    with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
+        with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
+            tar.extractall(td)
+
+        # Clone it with Dulwich
+        test_repo = TestRepo()
+        with test_repo as p:
+            test_repo.git_shell(
+                "pull", os.path.join(td, f"swh:1:dir:{obj_id.hex()}.git")
+            )
+            shutil.rmtree(p / ".git")
+            yield p
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        cook_extract_directory_dircooker,
+        cook_extract_directory_gitfast,
+        cook_extract_directory_git_bare,
+    ],
+)
+def cook_extract_directory(request):
+    """A fixture that is instantiated as one of cook_extract_directory_dircooker,
+    cook_extract_directory_gitfast or cook_extract_directory_git_bare."""
+    return request.param
+
+
 @contextlib.contextmanager
 def cook_stream_revision_gitfast(storage, obj_id):
     """Context manager that cooks a revision and stream its fastexport."""
@@ -180,8 +261,9 @@
 @contextlib.contextmanager
-def cook_extract_revision_gitfast(storage, obj_id):
-    """Context manager that cooks a revision and extract it."""
+def cook_extract_revision_gitfast(storage, obj_id, fsck=True):
+    """Context manager that cooks a revision and extracts it,
+    using RevisionGitfastCooker"""
     test_repo = TestRepo()
     with cook_stream_revision_gitfast(storage, obj_id) as stream, test_repo as p:
         processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
@@ -189,6 +271,45 @@
     yield test_repo, p


+@contextlib.contextmanager
+def cook_extract_revision_git_bare(storage, obj_id, fsck=True):
+    """Context manager that cooks a revision and extracts it,
+    using GitBareCooker"""
+    backend = unittest.mock.MagicMock()
+    backend.storage = storage
+
+    # Cook the object
+    cooker = GitBareCooker("revision", obj_id, backend=backend, storage=storage)
+    cooker.use_fsck = fsck  # Some tests try edge-cases that git-fsck rejects
+    cooker.fileobj = io.BytesIO()
+    assert cooker.check_exists()
+    cooker.prepare_bundle()
+    cooker.fileobj.seek(0)
+
+    # Extract it
+    with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
+        with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
+            tar.extractall(td)
+
+        # Clone it with Dulwich
+        test_repo = TestRepo()
+        with test_repo as p:
+            test_repo.git_shell(
+                "pull", os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git")
+            )
+            yield test_repo, p
+
+
+@pytest.fixture(
+    scope="module",
+    params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare],
+)
+def cook_extract_revision(request):
+    """A fixture that is instantiated as either cook_extract_revision_gitfast or
+    cook_extract_revision_git_bare."""
+    return request.param
+
+
 TEST_CONTENT = (
     " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces "
 )

@@ -196,7 +317,7 @@
 class TestDirectoryCooker:
-    def test_directory_simple(self, git_loader):
+    def test_directory_simple(self, git_loader, cook_extract_directory):
         repo = TestRepo()
         with repo as rp:
             (rp / "file").write_text(TEST_CONTENT)
@@ -225,7 +346,9 @@
         directory = from_disk.Directory.from_disk(path=bytes(p))
         assert obj_id_hex == hashutil.hash_to_hex(directory.hash)

-    def test_directory_filtered_objects(self, git_loader):
+    def test_directory_filtered_objects(self, git_loader, cook_extract_directory):
+        if cook_extract_directory is cook_extract_directory_git_bare:
+            pytest.xfail("GitBareCooker does not support filtered objects (yet?)")
         repo = TestRepo()
         with repo as rp:
             file_1, id_1 = hash_content(b"test1")
@@ -267,7 +390,7 @@
         assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE
         assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE

-    def test_directory_bogus_perms(self, git_loader):
+    def test_directory_bogus_perms(self, git_loader, cook_extract_directory):
         # Some early git repositories have 664/775 permissions... let's check
         # if all the weird modes are properly normalized in the directory
         # cooker.
@@ -329,13 +452,13 @@
         )
         swh_storage.directory_add([dir])

-        with cook_extract_directory(swh_storage, dir.id) as p:
+        with cook_extract_directory_dircooker(swh_storage, dir.id, fsck=False) as p:
             assert (p / "submodule").is_symlink()
             assert os.readlink(str(p / "submodule")) == target_rev


-class TestRevisionGitfastCooker:
-    def test_revision_simple(self, git_loader):
+class TestRevisionCooker:
+    def test_revision_simple(self, git_loader, cook_extract_revision):
         #
         # 1--2--3--4--5--6--7
         #
@@ -362,7 +485,7 @@
         obj_id_hex = repo.repo.refs[b"HEAD"].decode()
         obj_id = hashutil.hash_to_bytes(obj_id_hex)

-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
+        with cook_extract_revision(loader.storage, obj_id) as (ert, p):
             ert.checkout(b"HEAD")
             assert (p / "file1").stat().st_mode == 0o100644
             assert (p / "file1").read_text() == TEST_CONTENT
@@ -374,7 +497,7 @@
             assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
             assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex

-    def test_revision_two_roots(self, git_loader):
+    def test_revision_two_roots(self, git_loader, cook_extract_revision):
         #
         # 1----3---4
         #     /
         # 2----
@@ -395,10 +518,10 @@
         loader = git_loader(str(rp))
         loader.load()

-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
+        with cook_extract_revision(loader.storage, obj_id) as (ert, p):
             assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex

-    def test_revision_two_double_fork_merge(self, git_loader):
+    def test_revision_two_double_fork_merge(self, git_loader, cook_extract_revision):
         #
         # 2---4---6
         #  / / /
@@ -429,10 +552,10 @@
         loader = git_loader(str(rp))
         loader.load()

-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
+        with cook_extract_revision(loader.storage, obj_id) as (ert, p):
             assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex

-    def test_revision_triple_merge(self, git_loader):
+    def test_revision_triple_merge(self, git_loader, cook_extract_revision):
         #
         # .---.---5
         #  / / /
@@ -457,10 +580,12 @@
         loader = git_loader(str(rp))
         loader.load()

-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
+        with cook_extract_revision(loader.storage, obj_id) as (ert, p):
             assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex

-    def test_revision_filtered_objects(self, git_loader):
+    def test_revision_filtered_objects(self, git_loader, cook_extract_revision):
+        if cook_extract_revision is cook_extract_revision_git_bare:
+            pytest.xfail("GitBareCooker does not support filtered objects (yet?)")
         repo = TestRepo()
         with repo as rp:
             file_1, id_1 = hash_content(b"test1")
@@ -496,61 +621,13 @@
             (id_3,),
         )

-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
+        with cook_extract_revision(loader.storage, obj_id) as (ert, p):
             ert.checkout(b"HEAD")
             assert (p / "file").read_bytes() == b"test1"
             assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE
             assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE

-    def test_revision_bogus_perms(self, git_loader):
-        # Some early git repositories have 664/775 permissions... let's check
-        # if all the weird modes are properly normalized in the revision
-        # cooker.
-        repo = TestRepo()
-        with repo as rp:
-            (rp / "file").write_text(TEST_CONTENT)
-            (rp / "file").chmod(0o664)
-            (rp / "executable").write_bytes(TEST_EXECUTABLE)
-            (rp / "executable").chmod(0o775)
-            (rp / "wat").write_text(TEST_CONTENT)
-            (rp / "wat").chmod(0o604)
-
-            # Disable mode cleanup
-            with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode):
-                c = repo.commit("initial commit")
-
-            # Make sure Dulwich didn't normalize the permissions itself.
-            # (if it did, then the test can't check the cooker normalized them)
-            tree_id = repo.repo[c].tree
-            assert {entry.mode for entry in repo.repo[tree_id].items()} == {
-                0o100775,
-                0o100664,
-                0o100604,
-            }
-
-            # Disable mode checks
-            with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None):
-                loader = git_loader(str(rp))
-                loader.load()
-
-            # Make sure swh-loader didn't normalize them either
-            dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id))
-            assert {entry["perms"] for entry in dir_entries} == {
-                0o100664,
-                0o100775,
-                0o100604,
-            }
-
-        obj_id_hex = repo.repo.refs[b"HEAD"].decode()
-        obj_id = hashutil.hash_to_bytes(obj_id_hex)
-
-        with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p):
-            ert.checkout(b"HEAD")
-            assert (p / "file").stat().st_mode == 0o100644
-            assert (p / "executable").stat().st_mode == 0o100755
-            assert (p / "wat").stat().st_mode == 0o100644
-
-    def test_revision_null_fields(self, git_loader):
+    def test_revision_null_fields(self, git_loader, cook_extract_revision):
         # Our schema doesn't enforce a lot of non-null revision fields. We need
         # to check these cases don't break the cooker.
         repo = TestRepo()
@@ -579,7 +656,7 @@
         storage = loader.storage
         storage.revision_add([test_revision])

-        with cook_extract_revision_gitfast(storage, test_revision.id) as (ert, p):
+        with cook_extract_revision(storage, test_revision.id, fsck=False) as (ert, p):
             ert.checkout(b"HEAD")
             assert (p / "file").stat().st_mode == 0o100644
diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py
new file mode 100644
--- /dev/null
+++ b/swh/vault/tests/test_git_bare_cooker.py
@@ -0,0 +1,178 @@
+# Copyright (C) 2021  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+This module contains additional tests for the bare cooker.
+Generic cooker tests (e.g. without swh-graph) in test_cookers.py also
+run on the bare cooker.
+"""
+
+import datetime
+import io
+import subprocess
+import tarfile
+import tempfile
+import unittest.mock
+
+import pytest
+
+from swh.graph.naive_client import NaiveClient as GraphClient
+from swh.model.from_disk import DentryPerms
+from swh.model.model import (
+    Content,
+    Directory,
+    DirectoryEntry,
+    Person,
+    Revision,
+    RevisionType,
+    TimestampWithTimezone,
+)
+from swh.vault.cookers.git_bare import GitBareCooker
+from swh.vault.in_memory_backend import InMemoryVaultBackend
+
+
+def get_objects():
+    """
+    Build objects::
+
+        rev1  <------ rev2
+         |             |
+         v             v
+        dir1          dir2
+         |            / |
+         v           /  v
+        cnt1  <----°   cnt2
+    """
+    date = TimestampWithTimezone.from_datetime(
+        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
+    )
+    author = Person.from_fullname(b"Foo <foo@example.org>")
+    cnt1 = Content.from_data(b"hello")
+    cnt2 = Content.from_data(b"world")
+    dir1 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+        )
+    )
+    dir2 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+            DirectoryEntry(
+                name=b"file2",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt2.sha1_git,
+            ),
+        )
+    )
+    rev1 = Revision(
+        message=b"msg1",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir1.id,
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+    rev2 = Revision(
+        message=b"msg2",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir2.id,
+        parents=(rev1.id,),
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+
+    return (cnt1, cnt2, dir1, dir2, rev1, rev2)
+
+
+@pytest.mark.parametrize("last_revision_in_graph", [True, False])
+def test_graph_revisions(swh_storage, last_revision_in_graph):
+    (cnt1, cnt2, dir1, dir2, rev1, rev2) = get_objects()
+
+    # Add all objects to storage
+    swh_storage.content_add([cnt1, cnt2])
+    swh_storage.directory_add([dir1, dir2])
+    swh_storage.revision_add([rev1, rev2])
+
+    # Add spy on swh_storage, to make sure revision_log is not called
+    # (the graph must be used instead)
+    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)
+
+    # Add all objects to graph
+    if last_revision_in_graph:
+        nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1, rev2]]
+        edges = [
+            (str(s.swhid()), str(d.swhid()))
+            for (s, d) in [
+                (dir1, cnt1),
+                (dir2, cnt1),
+                (dir2, cnt2),
+                (rev1, dir1),
+                (rev2, dir2),
+                (rev2, rev1),
+            ]
+        ]
+    else:
+        nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1]]
+        edges = [
+            (str(s.swhid()), str(d.swhid()))
+            for (s, d) in [(dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1)]
+        ]
+    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
+
+    # Cook
+    backend = InMemoryVaultBackend()
+    cooker = GitBareCooker(
+        "revision_gitbare",
+        rev2.id,
+        backend=backend,
+        storage=swh_storage,
+        graph=swh_graph,
+    )
+    cooker.cook()
+
+    # Get bundle
+    bundle = backend.fetch("revision_gitbare", rev2.id)
+
+    # Extract bundle and make sure both revisions are in it
+    with tempfile.TemporaryDirectory(prefix="swh-vault-test-bare") as tempdir:
+        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
+            tf.extractall(tempdir)
+
+        output = subprocess.check_output(
+            [
+                "git",
+                "-C",
+                f"{tempdir}/{rev2.swhid()}.git",
+                "log",
+                "--format=oneline",
+                "--decorate=",
+            ]
+        )
+
+    assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
+
+    # Make sure the graph was used instead of swh_storage.revision_log
+    swh_graph.visit_nodes.assert_called_once_with(str(rev2.swhid()), edges="rev:rev")
+    if last_revision_in_graph:
+        swh_storage.revision_log.assert_not_called()
+        swh_storage.revision_shortlog.assert_not_called()
+    else:
+        swh_storage.revision_log.assert_called()
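
A minimal usage sketch, mirroring what the `cook` CLI command (in swh/vault/cli.py above) does with the in-memory backend; it assumes a populated `storage` (a StorageInterface) containing a revision whose sha1_git is `obj_id`, and `graph` may be left as None::

    # Sketch only: `storage` and `obj_id` are assumed to exist already.
    from swh.vault.cookers.git_bare import GitBareCooker
    from swh.vault.in_memory_backend import InMemoryVaultBackend

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        "revision_git_bare",  # one of the types registered in COOKER_TYPES above
        obj_id,
        backend=backend,
        storage=storage,
        graph=None,  # optionally a RemoteGraphClient, to speed up history walks
    )
    cooker.cook()

    # The bundle is a tarball containing a bare repository named
    # "swh:1:rev:<hex id>.git", which can be extracted and cloned.
    bundle = backend.fetch("revision_git_bare", obj_id)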