diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py --- a/swh/vault/cookers/git_bare.py +++ b/swh/vault/cookers/git_bare.py @@ -19,6 +19,7 @@ import datetime import os.path +import re import subprocess import tarfile import tempfile @@ -38,6 +39,7 @@ ) from swh.storage.algos.revisions_walker import DFSRevisionsWalker from swh.vault.cookers.base import BaseVaultCooker +from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE REVISION_BATCH_SIZE = 10000 DIRECTORY_BATCH_SIZE = 10000 @@ -45,7 +47,7 @@ class GitBareCooker(BaseVaultCooker): - use_fsck = False + use_fsck = True def cache_type_key(self) -> str: return self.obj_type @@ -81,9 +83,13 @@ self._rev_stack: List[Sha1Git] = [] self._dir_stack: List[Sha1Git] = [] self._cnt_stack: List[Sha1Git] = [] + # Set of objects already in any of the stacks: self._seen: Set[Sha1Git] = set() + # Set of errors we expect git-fsck to raise at the end: + self._expected_fsck_errors = set() + with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir: # Initialize a Git directory self.workdir = workdir @@ -114,7 +120,7 @@ def repack(self) -> None: if self.use_fsck: - subprocess.run(["git", "-C", self.gitdir, "fsck"], check=True) + self.git_fsck() # Add objects we wrote in a pack subprocess.run(["git", "-C", self.gitdir, "repack"], check=True) @@ -122,6 +128,29 @@ # Remove their non-packed originals subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True) + def git_fsck(self) -> None: + proc = subprocess.run( + ["git", "-C", self.gitdir, "fsck"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env={"LANG": "C.utf8"}, + ) + if not self._expected_fsck_errors: + # All went well, there should not be any error + proc.check_returncode() + return + + # Split on newlines not followed by a space + errors = re.split("\n(?! )", proc.stdout.decode()) + + unexpected_errors = set(filter(bool, errors)) - self._expected_fsck_errors + if unexpected_errors: + raise Exception( + "\n".join( + ["Unexpected errors from git-fsck:"] + sorted(unexpected_errors) + ) + ) + def write_refs(self): obj_type = self.obj_type.split("_")[0] if obj_type == "directory": @@ -157,10 +186,13 @@ tf.add(self.gitdir, arcname=f"{self.obj_swhid()}.git", recursive=True) def _obj_path(self, obj_id: Sha1Git): + return os.path.join(self.gitdir, self._obj_relative_path(obj_id)) + + def _obj_relative_path(self, obj_id: Sha1Git): obj_id_hex = hash_to_hex(obj_id) directory = obj_id_hex[0:2] filename = obj_id_hex[2:] - return os.path.join(self.gitdir, "objects", directory, filename) + return os.path.join("objects", directory, filename) def object_exists(self, obj_id: Sha1Git) -> bool: return os.path.exists(self._obj_path(obj_id)) @@ -276,15 +308,43 @@ # the expected hash, so git-fsck *will* choke on it. contents = self.storage.content_get(obj_ids, "sha1_git") + visible_contents = [] + for (obj_id, content) in zip(obj_ids, contents): + if content is None: + # FIXME: this may also happen for missing content + self.write_content(obj_id, SKIPPED_MESSAGE) + self._expect_mismatched_object_error(obj_id) + elif content.status == "visible": + visible_contents.append(content) + elif content.status == "hidden": + self.write_content(obj_id, HIDDEN_MESSAGE) + self._expect_mismatched_object_error(obj_id) + else: + assert False, ( + f"unexpected status {content.status!r} " + f"for content {hash_to_hex(content.sha1_git)}" + ) + if self.objstorage is None: - for content in contents: + for content in visible_contents: data = self.storage.content_get_data(content.sha1) self.write_content(content.sha1_git, data) else: - content_data = self.objstorage.get_batch(c.sha1 for c in contents) + content_data = self.objstorage.get_batch(c.sha1 for c in visible_contents) for (content, data) in zip(contents, content_data): self.write_content(content.sha1_git, data) def write_content(self, obj_id: Sha1Git, content: bytes) -> None: header = identifiers.git_object_header("blob", len(content)) self.write_object(obj_id, header + content) + + def _expect_mismatched_object_error(self, obj_id): + obj_id_hex = hash_to_hex(obj_id) + obj_path = self._obj_relative_path(obj_id) + self._expected_fsck_errors.add( + f"error: sha1 mismatch for ./{obj_path} (expected {obj_id_hex})" + ) + self._expected_fsck_errors.add( + f"error: {obj_id_hex}: object corrupt or missing: ./{obj_path}" + ) + self._expected_fsck_errors.add(f"missing blob {obj_id_hex}") diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -44,10 +44,17 @@ functions to perform basic git stuff. """ + def __init__(self, repo_dir=None): + self.repo_dir = repo_dir + def __enter__(self): - self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") - self.repo_dir = self.tmp_dir.__enter__() - self.repo = dulwich.repo.Repo.init(self.repo_dir) + if self.repo_dir: + self.tmp_dir = None + self.repo = dulwich.repo.Repo(self.repo_dir) + else: + self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") + self.repo_dir = self.tmp_dir.__enter__() + self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b"Test Author" self.author_email = b"test@softwareheritage.org" self.author = b"%s <%s>" % (self.author_name, self.author_email) @@ -56,7 +63,9 @@ return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): - self.tmp_dir.__exit__(exc, value, tb) + if self.tmp_dir is not None: + self.tmp_dir.__exit__(exc, value, tb) + self.repo_dir = None def checkout(self, rev_sha): rev = self.repo[rev_sha] @@ -228,13 +237,18 @@ tar.extractall(td) # Clone it with Dulwich - test_repo = TestRepo() - with test_repo as p: - test_repo.git_shell( - "pull", os.path.join(td, f"swh:1:dir:{obj_id.hex()}.git") + with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: + clone_dir = pathlib.Path(clone_dir) + subprocess.check_call( + [ + "git", + "clone", + os.path.join(td, f"swh:1:dir:{obj_id.hex()}.git"), + clone_dir, + ] ) - shutil.rmtree(p / ".git") - yield p + shutil.rmtree(clone_dir / ".git") + yield clone_dir @pytest.fixture( @@ -300,12 +314,19 @@ tar.extractall(td) # Clone it with Dulwich - test_repo = TestRepo() - with test_repo as p: - test_repo.git_shell( - "pull", os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git") + with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: + clone_dir = pathlib.Path(clone_dir) + subprocess.check_call( + [ + "git", + "clone", + os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git"), + clone_dir, + ] ) - yield test_repo, p + test_repo = TestRepo(clone_dir) + with test_repo: + yield test_repo, clone_dir @pytest.fixture( @@ -355,8 +376,6 @@ assert obj_id_hex == hashutil.hash_to_hex(directory.hash) def test_directory_filtered_objects(self, git_loader, cook_extract_directory): - if cook_extract_directory is cook_extract_directory_git_bare: - pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") @@ -387,12 +406,20 @@ where sha1 = %s""", (id_2,), ) + cur.execute( - """update content set status = 'absent' - where sha1 = %s""", + """ + insert into skipped_content + (sha1, sha1_git, sha256, blake2s256, length, reason) + select sha1, sha1_git, sha256, blake2s256, length, 'no reason' + from content + where sha1 = %s + """, (id_3,), ) + cur.execute("delete from content where sha1 = %s", (id_3,)) + with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE @@ -643,8 +670,6 @@ assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex def test_revision_filtered_objects(self, git_loader, cook_extract_revision): - if cook_extract_revision is cook_extract_revision_git_bare: - pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") @@ -674,12 +699,20 @@ where sha1 = %s""", (id_2,), ) + cur.execute( - """update content set status = 'absent' - where sha1 = %s""", + """ + insert into skipped_content + (sha1, sha1_git, sha256, blake2s256, length, reason) + select sha1, sha1_git, sha256, blake2s256, length, 'no reason' + from content + where sha1 = %s + """, (id_3,), ) + cur.execute("delete from content where sha1 = %s", (id_3,)) + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1"