diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -22,6 +22,7 @@
     "directory": DirectoryCooker,
     "revision_flat": RevisionFlatCooker,
     "revision_gitfast": RevisionGitfastCooker,
+    "snapshot_git_bare": GitBareCooker,
     "revision_git_bare": GitBareCooker,
     "directory_git_bare": GitBareCooker,
 }
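For orientation, a minimal sketch of how this registration is consumed, assuming the hunk above edits the module-level COOKER_TYPES mapping (its name sits outside the context lines shown) and that the vault resolves a bundle type by plain key lookup:

    # Hypothetical lookup: the new "snapshot_git_bare" key dispatches to the
    # same GitBareCooker class as its revision/directory siblings.
    from swh.vault.cookers import COOKER_TYPES

    cooker_cls = COOKER_TYPES["snapshot_git_bare"]
    assert cooker_cls is COOKER_TYPES["revision_git_bare"]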
diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
--- a/swh/vault/cookers/git_bare.py
+++ b/swh/vault/cookers/git_bare.py
@@ -23,7 +23,7 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Any, Dict, Iterable, List, Set
+from typing import Any, Dict, Iterable, List, Optional, Set
 import zlib
 
 from swh.core.api.classes import stream_results
@@ -34,9 +34,11 @@
     Revision,
     RevisionType,
     Sha1Git,
+    TargetType,
     TimestampWithTimezone,
 )
 from swh.storage.algos.revisions_walker import DFSRevisionsWalker
+from swh.storage.algos.snapshot import snapshot_get_all_branches
 from swh.vault.cookers.base import BaseVaultCooker
 from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE
@@ -57,6 +59,8 @@
             return not list(self.storage.revision_missing([self.obj_id]))
         elif obj_type == "directory":
             return not list(self.storage.directory_missing([self.obj_id]))
+        elif obj_type == "snapshot":
+            return not list(self.storage.snapshot_missing([self.obj_id]))
         else:
             raise NotImplementedError(f"GitBareCooker for {obj_type}")
@@ -85,6 +89,7 @@
 
         # Set of objects already in any of the stacks:
         self._seen: Set[Sha1Git] = set()
+        self._walker_state: Optional[Any] = None
 
         # Set of errors we expect git-fsck to raise at the end:
         self._expected_fsck_errors = set()
@@ -102,7 +107,7 @@
         # Load and write all the objects to disk
         self.load_objects()
 
-        # Write the root object as a ref.
+        # Write the root object as a ref (this step is skipped if it's a snapshot)
         # This must be done before repacking; git-repack ignores orphan objects.
         self.write_refs()
@@ -150,7 +155,8 @@
             )
         )
 
-    def write_refs(self):
+    def write_refs(self, snapshot=None):
+        refs: Dict[bytes, bytes]  # ref name -> target
         obj_type = self.obj_type.split("_")[0]
         if obj_type == "directory":
             # We need a synthetic revision pointing to the directory
@@ -171,14 +177,29 @@
                 synthetic=True,
             )
             self.write_revision_node(revision.to_dict())
-            head = revision.id
+            refs = {b"refs/heads/master": hash_to_bytehex(revision.id)}
         elif obj_type == "revision":
-            head = self.obj_id
+            refs = {b"refs/heads/master": hash_to_bytehex(self.obj_id)}
+        elif obj_type == "snapshot":
+            if snapshot is None:
+                # refs were already written in a previous step
+                return
+            refs = {
+                branch_name: (
+                    b"ref: " + branch.target
+                    if branch.target_type == TargetType.ALIAS
+                    else hash_to_bytehex(branch.target)
+                )
+                for (branch_name, branch) in snapshot.branches.items()
+            }
         else:
             assert False, obj_type
 
-        with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd:
-            fd.write(hash_to_bytehex(head))
+        for (ref_name, ref_target) in refs.items():
+            path = os.path.join(self.gitdir.encode(), ref_name)
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            with open(path, "wb") as fd:
+                fd.write(ref_target)
 
     def write_archive(self):
         with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf:
@@ -213,6 +234,8 @@
             self.push_revision_subgraph(obj_id)
         elif obj_type == "directory":
             self._push(self._dir_stack, [obj_id])
+        elif obj_type == "snapshot":
+            self.push_snapshot_subgraph(obj_id)
         else:
             raise NotImplementedError(
                 f"GitBareCooker.queue_subgraph({obj_type!r}, ...)"
             )
@@ -261,10 +284,37 @@
             # swh-graph, fall back to self.storage.revision_log.
             # self.storage.revision_log also gives us the full revisions,
             # so we load them right now instead of just pushing them on the stack.
-            walker = DFSRevisionsWalker(self.storage, obj_id)
+            walker = DFSRevisionsWalker(self.storage, obj_id, state=self._walker_state)
             for revision in walker:
                 self.write_revision_node(revision)
                 self._push(self._dir_stack, [revision["directory"]])
+            # Save the state, so the next call to the walker won't return the
+            # same revisions
+            self._walker_state = walker.export_state()
+
+    def push_snapshot_subgraph(self, obj_id: Sha1Git) -> None:
+        """Fetches a snapshot and all its children, and writes them to disk"""
+        loaded_from_graph = False
+
+        if self.graph:
+            pass  # TODO
+
+        # TODO: when self.graph is available and supports edge labels, use it
+        # directly to get branch names.
+        snapshot = snapshot_get_all_branches(self.storage, obj_id)
+        assert snapshot, "Unknown snapshot"  # should have been caught by check_exists()
+        for branch in snapshot.branches.values():
+            if not loaded_from_graph:
+                if branch.target_type == TargetType.REVISION:
+                    self.push_revision_subgraph(branch.target)
+                elif branch.target_type == TargetType.ALIAS:
+                    # Nothing to do, this for loop also iterates on the target branch
+                    # (if it exists)
+                    pass
+                else:
+                    raise NotImplementedError(f"{branch.target_type} branches")
+
+        self.write_refs(snapshot=snapshot)
 
     def load_revisions(self, obj_ids: List[Sha1Git]) -> None:
         """Given a list of revision ids, loads these revisions and their directories;
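The rewritten write_refs above emits one loose ref file per branch instead of a single hardcoded refs/heads/master, and encodes snapshot aliases as symbolic refs (the textual "ref: <target>" format git itself uses for HEAD). A standalone sketch of the resulting on-disk layout, using a hypothetical two-branch snapshot:

    import os
    import tempfile

    # Hypothetical branch map: one alias (symbolic ref) and one revision
    # target (40-character hex id), mirroring the dict built in write_refs.
    branches = {
        b"HEAD": b"ref: refs/heads/master",
        b"refs/heads/master": b"0123abcd" * 5,
    }

    gitdir = tempfile.mkdtemp(suffix=".git")
    for ref_name, ref_target in branches.items():
        path = os.path.join(gitdir.encode(), ref_name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as fd:
            fd.write(ref_target)

    # Resulting layout:
    #   <gitdir>/HEAD              -> b"ref: refs/heads/master"
    #   <gitdir>/refs/heads/master -> 40-byte hex revision id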
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -25,7 +25,7 @@
 import pytest
 
 from swh.loader.git.from_disk import GitLoaderFromDisk
-from swh.model import from_disk, hashutil
+from swh.model import from_disk, hashutil, identifiers
 from swh.model.model import (
     Directory,
     DirectoryEntry,
@@ -294,14 +294,19 @@
 
 
 @contextlib.contextmanager
-def cook_extract_revision_git_bare(storage, obj_id, fsck=True):
-    """Context manager that cooks a revision and extract it, using GitBareCooker"""
+def cook_extract_git_bare(storage, swhid, fsck=True):
+    """Context manager that cooks an object and extracts it, using GitBareCooker"""
     backend = unittest.mock.MagicMock()
     backend.storage = storage
 
     # Cook the object
-    cooker = GitBareCooker("revision", obj_id, backend=backend, storage=storage)
+    cooker = GitBareCooker(
+        swhid.object_type.name.lower(),
+        swhid.object_id,
+        backend=backend,
+        storage=storage,
+    )
     cooker.use_fsck = fsck  # Some tests try edge-cases that git-fsck rejects
     cooker.fileobj = io.BytesIO()
     assert cooker.check_exists()
@@ -317,18 +322,25 @@
     with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir:
         clone_dir = pathlib.Path(clone_dir)
         subprocess.check_call(
-            [
-                "git",
-                "clone",
-                os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git"),
-                clone_dir,
-            ]
+            ["git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir]
         )
         test_repo = TestRepo(clone_dir)
         with test_repo:
             yield test_repo, clone_dir
 
 
+@contextlib.contextmanager
+def cook_extract_revision_git_bare(storage, obj_id, fsck=True):
+    with cook_extract_git_bare(
+        storage,
+        identifiers.CoreSWHID(
+            object_type=identifiers.ObjectType.REVISION, object_id=obj_id
+        ),
+        fsck=fsck,
+    ) as res:
+        yield res
+
+
 @pytest.fixture(
     scope="module",
     params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare],
 )
@@ -339,6 +351,27 @@
     return request.param
 
 
+@contextlib.contextmanager
+def cook_extract_snapshot_git_bare(storage, obj_id, fsck=True):
+    with cook_extract_git_bare(
+        storage,
+        identifiers.CoreSWHID(
+            object_type=identifiers.ObjectType.SNAPSHOT, object_id=obj_id
+        ),
+        fsck=fsck,
+    ) as res:
+        yield res
+
+
+@pytest.fixture(
+    scope="module", params=[cook_extract_snapshot_git_bare],
+)
+def cook_extract_snapshot(request):
+    """Equivalent to cook_extract_snapshot_git_bare; analogous to
+    cook_extract_revision in case we ever have more cookers supporting snapshots"""
+    return request.param
+
+
 TEST_CONTENT = (
     "    test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces  "
 )
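Both wrappers specialize the generic helper with a CoreSWHID: the cooker's object-type string is derived via swhid.object_type.name.lower(), and the bundle's directory name inside the tarball follows the SWHID string form (f"{swhid}.git"). A small illustration, with an arbitrary example id:

    from swh.model import identifiers

    obj_id = bytes.fromhex("17d6a9efd9625c4f4e1e45af2f80925e4eec0a07")
    swhid = identifiers.CoreSWHID(
        object_type=identifiers.ObjectType.SNAPSHOT, object_id=obj_id
    )
    print(swhid.object_type.name.lower())  # snapshot
    print(f"{swhid}.git")  # swh:1:snp:17d6a9efd9625c4f4e1e45af2f80925e4eec0a07.git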
@@ -612,6 +645,53 @@
     def check_revision_two_roots(self, ert, p, obj_id):
         assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
 
+        (c3,) = ert.repo[hashutil.hash_to_bytehex(obj_id)].parents
+        assert len(ert.repo[c3].parents) == 2
+
+    def load_repo_two_heads(self, git_loader):
+        #
+        # 1---2----4     <-- master and b1
+        #      \
+        #       ----3    <-- b2
+        #
+        repo = TestRepo()
+        with repo as rp:
+            (rp / "file1").write_text(TEST_CONTENT)
+            repo.commit("Add file1")
+
+            (rp / "file2").write_text(TEST_CONTENT)
+            c2 = repo.commit("Add file2")
+
+            repo.repo.refs[b"refs/heads/b2"] = c2  # branch b2 from master
+
+            (rp / "file3").write_text(TEST_CONTENT)
+            repo.commit("add file3", ref=b"refs/heads/b2")
+
+            (rp / "file4").write_text(TEST_CONTENT)
+            c4 = repo.commit("add file4", ref=b"refs/heads/master")
+            repo.repo.refs[b"refs/heads/b1"] = c4  # branch b1 from master
+
+            obj_id_hex = repo.repo.refs[b"HEAD"].decode()
+            obj_id = hashutil.hash_to_bytes(obj_id_hex)
+            loader = git_loader(str(rp))
+            loader.load()
+        return (loader, obj_id)
+
+    def check_snapshot_two_heads(self, ert, p, obj_id):
+        assert (
+            hashutil.hash_to_bytehex(obj_id)
+            == ert.repo.refs[b"HEAD"]
+            == ert.repo.refs[b"refs/heads/master"]
+            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+            == ert.repo.refs[b"refs/remotes/origin/master"]
+            == ert.repo.refs[b"refs/remotes/origin/b1"]
+        )
+
+        c4_id = hashutil.hash_to_bytehex(obj_id)
+        c3_id = ert.repo.refs[b"refs/remotes/origin/b2"]
+
+        assert ert.repo[c3_id].parents == ert.repo[c4_id].parents
+
     def load_repo_two_double_fork_merge(self, git_loader):
         #
         #     2---4---6
@@ -621,22 +701,22 @@
         repo = TestRepo()
         with repo as rp:
             (rp / "file1").write_text(TEST_CONTENT)
-            c1 = repo.commit("Add file1")
-            repo.repo.refs[b"refs/heads/c1"] = c1
+            c1 = repo.commit("Add file1")  # create commit 1
+            repo.repo.refs[b"refs/heads/c1"] = c1  # branch c1 from master
 
             (rp / "file2").write_text(TEST_CONTENT)
-            repo.commit("Add file2")
+            repo.commit("Add file2")  # create commit 2
 
             (rp / "file3").write_text(TEST_CONTENT)
-            c3 = repo.commit("Add file3", ref=b"refs/heads/c1")
-            repo.repo.refs[b"refs/heads/c3"] = c3
+            c3 = repo.commit("Add file3", ref=b"refs/heads/c1")  # create commit 3 on c1
+            repo.repo.refs[b"refs/heads/c3"] = c3  # branch c3 from c1
 
-            repo.merge([c3])
+            repo.merge([c3])  # create commit 4
 
             (rp / "file5").write_text(TEST_CONTENT)
-            c5 = repo.commit("Add file3", ref=b"refs/heads/c3")
+            c5 = repo.commit("Add file3", ref=b"refs/heads/c3")  # create commit 5 on c3
 
-            repo.merge([c5])
+            repo.merge([c5])  # create commit 6
 
             obj_id_hex = repo.repo.refs[b"HEAD"].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
@@ -647,6 +727,21 @@
     def check_revision_two_double_fork_merge(self, ert, p, obj_id):
         assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
 
+    def check_snapshot_two_double_fork_merge(self, ert, p, obj_id):
+        assert (
+            hashutil.hash_to_bytehex(obj_id)
+            == ert.repo.refs[b"HEAD"]
+            == ert.repo.refs[b"refs/heads/master"]
+            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+            == ert.repo.refs[b"refs/remotes/origin/master"]
+        )
+
+        (c4_id, c5_id) = ert.repo[obj_id.hex().encode()].parents
+        assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"]
+
+        (c2_id, c3_id) = ert.repo[c4_id].parents
+        assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"]
+
     def load_repo_triple_merge(self, git_loader):
         #
         #       .---.---5
@@ -676,6 +771,25 @@
     def check_revision_triple_merge(self, ert, p, obj_id):
         assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
 
+    def check_snapshot_triple_merge(self, ert, p, obj_id):
+        assert (
+            hashutil.hash_to_bytehex(obj_id)
+            == ert.repo.refs[b"HEAD"]
+            == ert.repo.refs[b"refs/heads/master"]
+            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+            == ert.repo.refs[b"refs/remotes/origin/master"]
+        )
+
+        (c2_id, c3_id, c4_id) = ert.repo[obj_id.hex().encode()].parents
+        assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"]
+        assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"]
+
+        assert (
+            ert.repo[c2_id].parents
+            == ert.repo[c3_id].parents
+            == ert.repo[c4_id].parents
+        )
+
     def load_repo_filtered_objects(self, git_loader):
         repo = TestRepo()
         with repo as rp:
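The check_snapshot_* helpers above read the clone through dulwich: ert.repo.refs[name] yields the 40-byte hex sha a ref resolves to, and ert.repo[sha].parents lists parent commit shas in order, which is what makes the tuple-unpacking assertions work. A self-contained version of the same pattern (the clone path is a placeholder):

    from dulwich.repo import Repo

    repo = Repo("/path/to/clone")  # placeholder: any local git clone
    head = repo.refs[b"HEAD"]      # hex sha that HEAD resolves to
    assert head == repo.refs[b"refs/heads/master"]
    for parent in repo[head].parents:  # parent shas, in commit order
        print(parent.decode())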
@@ -825,3 +939,43 @@
     with cook_stream_revision_gitfast(swh_storage, rev.id) as stream:
         pattern = "M 160000 {} submodule".format(target_rev).encode()
         assert pattern in stream.read()
+
+
+class TestSnapshotCooker(RepoFixtures):
+    def test_snapshot_simple(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_simple(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_revision_simple(ert, p, main_rev_id)
+
+    def test_snapshot_two_roots(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_two_roots(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_revision_two_roots(ert, p, main_rev_id)
+
+    def test_snapshot_two_heads(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_two_heads(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_snapshot_two_heads(ert, p, main_rev_id)
+
+    def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_revision_two_double_fork_merge(ert, p, main_rev_id)
+            self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id)
+
+    def test_snapshot_triple_merge(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_triple_merge(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_revision_triple_merge(ert, p, main_rev_id)
+            self.check_snapshot_triple_merge(ert, p, main_rev_id)
+
+    def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot):
+        (loader, main_rev_id) = self.load_repo_filtered_objects(git_loader)
+        snp_id = loader.loaded_snapshot_id
+        with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+            self.check_revision_filtered_objects(ert, p, main_rev_id)
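Finally, a hedged sketch of driving the new snapshot path end to end outside pytest, reusing the cook_extract_git_bare helper defined earlier (the storage object and snp_id are assumed to come from a loader run, as in TestSnapshotCooker):

    from swh.model import identifiers
    from swh.vault.tests.test_cookers import cook_extract_git_bare

    def clone_snapshot(storage, snp_id: bytes) -> None:
        swhid = identifiers.CoreSWHID(
            object_type=identifiers.ObjectType.SNAPSHOT, object_id=snp_id
        )
        with cook_extract_git_bare(storage, swhid) as (ert, clone_dir):
            # ert wraps a git clone of the cooked bare repository; every
            # snapshot branch is visible under refs/remotes/origin/.
            print(clone_dir, sorted(ert.repo.refs.keys()))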