D5999.id21775.diff

diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -22,6 +22,7 @@
"directory": DirectoryCooker,
"revision_flat": RevisionFlatCooker,
"revision_gitfast": RevisionGitfastCooker,
+ "snapshot_git_bare": GitBareCooker,
"revision_git_bare": GitBareCooker,
"directory_git_bare": GitBareCooker,
}
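
Vault bundle types are dispatched through the mapping above, so adding the
"snapshot_git_bare" key is all the backend needs to route snapshot cooking
requests to GitBareCooker, which already serves the revision and directory
flavors. A minimal sketch of the dispatch (the mapping's name is defined
outside this hunk, so COOKER_TYPES is an assumption):

    # Assumed name for the registry shown above.
    cooker_cls = COOKER_TYPES["snapshot_git_bare"]  # -> GitBareCooker
    # The same class handles three bundle types; it tells them apart
    # through the obj_type it is instantiated with.
    cooker = cooker_cls("snapshot", snapshot_id, backend=backend, storage=storage)
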
diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
--- a/swh/vault/cookers/git_bare.py
+++ b/swh/vault/cookers/git_bare.py
@@ -23,7 +23,7 @@
import subprocess
import tarfile
import tempfile
-from typing import Any, Dict, Iterable, List, Set
+from typing import Any, Dict, Iterable, List, Optional, Set
import zlib
from swh.core.api.classes import stream_results
@@ -34,9 +34,11 @@
Revision,
RevisionType,
Sha1Git,
+ TargetType,
TimestampWithTimezone,
)
from swh.storage.algos.revisions_walker import DFSRevisionsWalker
+from swh.storage.algos.snapshot import snapshot_get_all_branches
from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE
@@ -57,6 +59,8 @@
return not list(self.storage.revision_missing([self.obj_id]))
elif obj_type == "directory":
return not list(self.storage.directory_missing([self.obj_id]))
+ elif obj_type == "snapshot":
+ return not list(self.storage.snapshot_missing([self.obj_id]))
else:
raise NotImplementedError(f"GitBareCooker for {obj_type}")
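
The storage *_missing methods take a list of candidate ids and return the
subset that is absent from the archive, so an empty result means the object
exists. A minimal sketch of the check added above (obj_id illustrative):

    # snapshot_missing yields the ids, among those given, that are NOT stored.
    missing = list(storage.snapshot_missing([obj_id]))
    exists = not missing  # empty result -> the snapshot is present
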
@@ -85,6 +89,7 @@
# Set of objects already in any of the stacks:
self._seen: Set[Sha1Git] = set()
+ self._walker_state: Optional[Any] = None
# Set of errors we expect git-fsck to raise at the end:
self._expected_fsck_errors = set()
@@ -102,7 +107,7 @@
# Load and write all the objects to disk
self.load_objects()
- # Write the root object as a ref.
+ # Write the root object as a ref (skipped for snapshots, whose refs
+ # were already written by push_snapshot_subgraph).
# This must be done before repacking; git-repack ignores orphan objects.
self.write_refs()
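
The ordering in this hunk matters: objects are loaded first, refs are written
next, and only then is the repository repacked, because git-repack discards
objects unreachable from any ref. A rough sketch of the final step (the exact
flags used by the cooker are not shown in this hunk, so they are an
assumption):

    import subprocess

    # With the refs on disk, every cooked object is reachable and survives:
    subprocess.check_call(["git", "-C", gitdir, "repack", "-d"])
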
@@ -150,7 +155,8 @@
)
)
- def write_refs(self):
+ def write_refs(self, snapshot=None):
+ refs: Dict[bytes, bytes] # ref name -> target
obj_type = self.obj_type.split("_")[0]
if obj_type == "directory":
# We need a synthetic revision pointing to the directory
@@ -171,14 +177,29 @@
synthetic=True,
)
self.write_revision_node(revision.to_dict())
- head = revision.id
+ refs = {b"refs/heads/master": hash_to_bytehex(revision.id)}
elif obj_type == "revision":
- head = self.obj_id
+ refs = {b"refs/heads/master": hash_to_bytehex(self.obj_id)}
+ elif obj_type == "snapshot":
+ if snapshot is None:
+ # refs were already written in a previous step
+ return
+ refs = {
+ branch_name: (
+ b"ref: " + branch.target
+ if branch.target_type == TargetType.ALIAS
+ else hash_to_bytehex(branch.target)
+ )
+ for (branch_name, branch) in snapshot.branches.items()
+ }
else:
assert False, obj_type
- with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd:
- fd.write(hash_to_bytehex(head))
+ for (ref_name, ref_target) in refs.items():
+ path = os.path.join(self.gitdir.encode(), ref_name)
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ with open(path, "wb") as fd:
+ fd.write(ref_target)
def write_archive(self):
with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf:
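
Alias branches are materialized above as symbolic refs: the file body is the
literal bytes b"ref: " followed by the target ref name, the same on-disk
format git uses for .git/HEAD. A sketch of the resulting layout for a
hypothetical snapshot (ids made up):

    # <gitdir>/HEAD               contains b"ref: refs/heads/master"
    # <gitdir>/refs/heads/master  contains the 40-byte hex revision id
    refs = {
        b"HEAD": b"ref: refs/heads/master",  # ALIAS branch
        b"refs/heads/master": b"17" * 20,    # direct REVISION branch (fake id)
    }
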
@@ -213,6 +234,8 @@
self.push_revision_subgraph(obj_id)
elif obj_type == "directory":
self._push(self._dir_stack, [obj_id])
+ elif obj_type == "snapshot":
+ self.push_snapshot_subgraph(obj_id)
else:
raise NotImplementedError(
f"GitBareCooker.queue_subgraph({obj_type!r}, ...)"
@@ -261,10 +284,37 @@
# swh-graph, fall back to self.storage.revision_log.
# self.storage.revision_log also gives us the full revisions,
# so we load them right now instead of just pushing them on the stack.
- walker = DFSRevisionsWalker(self.storage, obj_id)
+ walker = DFSRevisionsWalker(self.storage, obj_id, state=self._walker_state)
for revision in walker:
self.write_revision_node(revision)
self._push(self._dir_stack, [revision["directory"]])
+ # Save the state, so the next call to the walker won't return the same
+ # revisions
+ self._walker_state = walker.export_state()
+
+ def push_snapshot_subgraph(self, obj_id: Sha1Git) -> None:
+ """Fetches a snapshot and all its children, and writes them to disk"""
+ loaded_from_graph = False
+
+ if self.graph:
+ pass # TODO
+
+ # TODO: when self.graph is available and supports edge labels, use it
+ # directly to get branch names.
+ snapshot = snapshot_get_all_branches(self.storage, obj_id)
+ assert snapshot, "Unknown snapshot" # should have been caught by check_exists()
+ for branch in snapshot.branches.values():
+ if not loaded_from_graph:
+ if branch.target_type == TargetType.REVISION:
+ self.push_revision_subgraph(branch.target)
+ elif branch.target_type == TargetType.ALIAS:
+ # Nothing to do, this for loop also iterates on the target branch
+ # (if it exists)
+ pass
+ else:
+ raise NotImplementedError(f"{branch.target_type} branches")
+
+ self.write_refs(snapshot=snapshot)
def load_revisions(self, obj_ids: List[Sha1Git]) -> None:
"""Given a list of revision ids, loads these revisions and their directories;
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -25,7 +25,7 @@
import pytest
from swh.loader.git.from_disk import GitLoaderFromDisk
-from swh.model import from_disk, hashutil
+from swh.model import from_disk, hashutil, identifiers
from swh.model.model import (
Directory,
DirectoryEntry,
@@ -294,14 +294,19 @@
@contextlib.contextmanager
-def cook_extract_revision_git_bare(storage, obj_id, fsck=True):
+def cook_extract_git_bare(storage, swhid, fsck=True):
"""Context manager that cooks a revision and extract it,
using GitBareCooker"""
backend = unittest.mock.MagicMock()
backend.storage = storage
# Cook the object
- cooker = GitBareCooker("revision", obj_id, backend=backend, storage=storage)
+ cooker = GitBareCooker(
+ swhid.object_type.name.lower(),
+ swhid.object_id,
+ backend=backend,
+ storage=storage,
+ )
cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects
cooker.fileobj = io.BytesIO()
assert cooker.check_exists()
@@ -317,18 +322,25 @@
with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir:
clone_dir = pathlib.Path(clone_dir)
subprocess.check_call(
- [
- "git",
- "clone",
- os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git"),
- clone_dir,
- ]
+ ["git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir,]
)
test_repo = TestRepo(clone_dir)
with test_repo:
yield test_repo, clone_dir
+@contextlib.contextmanager
+def cook_extract_revision_git_bare(storage, obj_id, fsck=True):
+ with cook_extract_git_bare(
+ storage,
+ identifiers.CoreSWHID(
+ object_type=identifiers.ObjectType.REVISION, object_id=obj_id
+ ),
+ fsck=fsck,
+ ) as res:
+ yield res
+
+
@pytest.fixture(
scope="module",
params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare],
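
The refactored helper derives both the cooker's obj_type and the name of the
cooked repository from a single CoreSWHID. A sketch (the object id is made
up):

    from swh.model import identifiers

    swhid = identifiers.CoreSWHID(
        object_type=identifiers.ObjectType.SNAPSHOT,
        object_id=bytes.fromhex("17" * 20),  # made-up 20-byte id
    )
    swhid.object_type.name.lower()  # "snapshot" -> GitBareCooker obj_type
    f"{swhid}.git"                  # "swh:1:snp:1717...17.git" -> clone source
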
@@ -339,6 +351,27 @@
return request.param
+@contextlib.contextmanager
+def cook_extract_snapshot_git_bare(storage, obj_id, fsck=True):
+ with cook_extract_git_bare(
+ storage,
+ identifiers.CoreSWHID(
+ object_type=identifiers.ObjectType.SNAPSHOT, object_id=obj_id
+ ),
+ fsck=fsck,
+ ) as res:
+ yield res
+
+
+@pytest.fixture(
+ scope="module", params=[cook_extract_snapshot_git_bare],
+)
+def cook_extract_snapshot(request):
+ """Equivalent to cook_extract_snapshot_git_bare; but analogous to
+ cook_extract_revision in case we ever have more cookers supporting snapshots"""
+ return request.param
+
+
TEST_CONTENT = (
" test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces "
)
@@ -612,6 +645,53 @@
def check_revision_two_roots(self, ert, p, obj_id):
assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
+ (c3,) = ert.repo[hashutil.hash_to_bytehex(obj_id)].parents
+ assert len(ert.repo[c3].parents) == 2
+
+ def load_repo_two_heads(self, git_loader):
+ #
+ # 1---2----4 <-- master and b1
+ # \
+ # ----3 <-- b2
+ #
+ repo = TestRepo()
+ with repo as rp:
+ (rp / "file1").write_text(TEST_CONTENT)
+ repo.commit("Add file1")
+
+ (rp / "file2").write_text(TEST_CONTENT)
+ c2 = repo.commit("Add file2")
+
+ repo.repo.refs[b"refs/heads/b2"] = c2 # branch b2 from master
+
+ (rp / "file3").write_text(TEST_CONTENT)
+ repo.commit("add file3", ref=b"refs/heads/b2")
+
+ (rp / "file4").write_text(TEST_CONTENT)
+ c4 = repo.commit("add file4", ref=b"refs/heads/master")
+ repo.repo.refs[b"refs/heads/b1"] = c4 # branch b1 from master
+
+ obj_id_hex = repo.repo.refs[b"HEAD"].decode()
+ obj_id = hashutil.hash_to_bytes(obj_id_hex)
+ loader = git_loader(str(rp))
+ loader.load()
+ return (loader, obj_id)
+
+ def check_snapshot_two_heads(self, ert, p, obj_id):
+ assert (
+ hashutil.hash_to_bytehex(obj_id)
+ == ert.repo.refs[b"HEAD"]
+ == ert.repo.refs[b"refs/heads/master"]
+ == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+ == ert.repo.refs[b"refs/remotes/origin/master"]
+ == ert.repo.refs[b"refs/remotes/origin/b1"]
+ )
+
+ c4_id = hashutil.hash_to_bytehex(obj_id)
+ c3_id = ert.repo.refs[b"refs/remotes/origin/b2"]
+
+ assert ert.repo[c3_id].parents == ert.repo[c4_id].parents
+
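
check_snapshot_two_heads runs against a clone of the cooked bare repository,
so every branch of the snapshot resurfaces under refs/remotes/origin/. A
sketch of the ref layout it asserts, with c4/c3 the tips from the ASCII graph
above (hex ids as dulwich returns them):

    expected = {
        b"HEAD": c4_hex,                     # master is checked out
        b"refs/heads/master": c4_hex,        # local branch made by clone
        b"refs/remotes/origin/HEAD": c4_hex,
        b"refs/remotes/origin/master": c4_hex,
        b"refs/remotes/origin/b1": c4_hex,   # b1 was branched at commit 4
        b"refs/remotes/origin/b2": c3_hex,   # b2 holds the side commit 3
    }
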
def load_repo_two_double_fork_merge(self, git_loader):
#
# 2---4---6
@@ -621,22 +701,22 @@
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
- c1 = repo.commit("Add file1")
- repo.repo.refs[b"refs/heads/c1"] = c1
+ c1 = repo.commit("Add file1") # create commit 1
+ repo.repo.refs[b"refs/heads/c1"] = c1 # branch c1 from master
(rp / "file2").write_text(TEST_CONTENT)
- repo.commit("Add file2")
+ repo.commit("Add file2") # create commit 2
(rp / "file3").write_text(TEST_CONTENT)
- c3 = repo.commit("Add file3", ref=b"refs/heads/c1")
- repo.repo.refs[b"refs/heads/c3"] = c3
+ c3 = repo.commit("Add file3", ref=b"refs/heads/c1") # create commit 3 on c1
+ repo.repo.refs[b"refs/heads/c3"] = c3 # branch c3 from c1
- repo.merge([c3])
+ repo.merge([c3]) # create commit 4
(rp / "file5").write_text(TEST_CONTENT)
- c5 = repo.commit("Add file3", ref=b"refs/heads/c3")
+ c5 = repo.commit("Add file3", ref=b"refs/heads/c3") # create commit 5 on c3
- repo.merge([c5])
+ repo.merge([c5]) # create commit 6
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
@@ -647,6 +727,21 @@
def check_revision_two_double_fork_merge(self, ert, p, obj_id):
assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
+ def check_snapshot_two_double_fork_merge(self, ert, p, obj_id):
+ assert (
+ hashutil.hash_to_bytehex(obj_id)
+ == ert.repo.refs[b"HEAD"]
+ == ert.repo.refs[b"refs/heads/master"]
+ == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+ == ert.repo.refs[b"refs/remotes/origin/master"]
+ )
+
+ (c4_id, c5_id) = ert.repo[obj_id.hex().encode()].parents
+ assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"]
+
+ (c2_id, c3_id) = ert.repo[c4_id].parents
+ assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"]
+
def load_repo_triple_merge(self, git_loader):
#
# .---.---5
@@ -676,6 +771,25 @@
def check_revision_triple_merge(self, ert, p, obj_id):
assert ert.repo.refs[b"HEAD"].decode() == obj_id.hex()
+ def check_snapshot_triple_merge(self, ert, p, obj_id):
+ assert (
+ hashutil.hash_to_bytehex(obj_id)
+ == ert.repo.refs[b"HEAD"]
+ == ert.repo.refs[b"refs/heads/master"]
+ == ert.repo.refs[b"refs/remotes/origin/HEAD"]
+ == ert.repo.refs[b"refs/remotes/origin/master"]
+ )
+
+ (c2_id, c3_id, c4_id) = ert.repo[obj_id.hex().encode()].parents
+ assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"]
+ assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"]
+
+ assert (
+ ert.repo[c2_id].parents
+ == ert.repo[c3_id].parents
+ == ert.repo[c4_id].parents
+ )
+
def load_repo_filtered_objects(self, git_loader):
repo = TestRepo()
with repo as rp:
@@ -825,3 +939,43 @@
with cook_stream_revision_gitfast(swh_storage, rev.id) as stream:
pattern = "M 160000 {} submodule".format(target_rev).encode()
assert pattern in stream.read()
+
+
+class TestSnapshotCooker(RepoFixtures):
+ def test_snapshot_simple(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_simple(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_revision_simple(ert, p, main_rev_id)
+
+ def test_snapshot_two_roots(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_two_roots(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_revision_two_roots(ert, p, main_rev_id)
+
+ def test_snapshot_two_heads(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_two_heads(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_snapshot_two_heads(ert, p, main_rev_id)
+
+ def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_revision_two_double_fork_merge(ert, p, main_rev_id)
+ self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id)
+
+ def test_snapshot_triple_merge(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_triple_merge(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_revision_triple_merge(ert, p, main_rev_id)
+ self.check_snapshot_triple_merge(ert, p, main_rev_id)
+
+ def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot):
+ (loader, main_rev_id) = self.load_repo_filtered_objects(git_loader)
+ snp_id = loader.loaded_snapshot_id
+ with cook_extract_snapshot(loader.storage, snp_id) as (ert, p):
+ self.check_revision_filtered_objects(ert, p, main_rev_id)
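
Taken together, each TestSnapshotCooker case cooks a loaded snapshot, clones
the resulting bare repository, and re-checks the revision-level invariants
plus the snapshot's refs. A condensed sketch of one run (prepare_bundle and
the mocked backend follow cook_extract_git_bare above and are assumptions to
the extent they are not shown in this diff):

    import io
    from swh.vault.cookers.git_bare import GitBareCooker

    cooker = GitBareCooker("snapshot", snp_id, backend=backend, storage=storage)
    cooker.fileobj = io.BytesIO()
    assert cooker.check_exists()
    cooker.prepare_bundle()                # tarball of a bare git repository
    tarball_bytes = cooker.fileobj.getvalue()
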
