diff --git a/swh/vault/tests/conftest.py b/swh/vault/tests/conftest.py index 5c7cbf2..e91753f 100644 --- a/swh/vault/tests/conftest.py +++ b/swh/vault/tests/conftest.py @@ -1,93 +1,75 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import partial import os from typing import Any, Dict import pkg_resources.extern.packaging.version import pytest from pytest_postgresql import factories from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact -from swh.storage.postgresql.db import Db as StorageDb from swh.vault import get_vault from swh.vault.backend import VaultBackend os.environ["LC_ALL"] = "C.UTF-8" # needed for directory tests on git-cloned repositories # 022 is usually the default value, but some environments (e.g. Debian builds) have # a different one. os.umask(0o022) pytest_v = pkg_resources.get_distribution("pytest").parsed_version if pytest_v < pkg_resources.extern.packaging.version.parse("3.9"): @pytest.fixture def tmp_path(): import pathlib import tempfile with tempfile.TemporaryDirectory() as tmpdir: yield pathlib.Path(tmpdir) -storage_postgresql_proc = factories.postgresql_proc( - dbname="storage", - load=[ - partial(initialize_database_for_module, "storage", StorageDb.current_version) - ], -) - vault_postgresql_proc = factories.postgresql_proc( dbname="vault", load=[ partial(initialize_database_for_module, "vault", VaultBackend.current_version) ], ) postgres_vault = postgresql_fact("vault_postgresql_proc") -postgres_storage = postgresql_fact( - "storage_postgresql_proc", - no_db_drop=True, # keep the db for performance reasons -) @pytest.fixture -def swh_vault_config(postgres_vault, postgres_storage, tmp_path) -> Dict[str, Any]: +def swh_vault_config(postgres_vault, tmp_path) -> Dict[str, Any]: tmp_path = str(tmp_path) return { "db": postgres_vault.dsn, "storage": { - "cls": "postgresql", - "db": postgres_storage.dsn, - "objstorage": { - "cls": "pathslicing", - "root": tmp_path, - "slicing": "0:1/1:5", - }, + "cls": "memory", }, "cache": { "cls": "pathslicing", "root": tmp_path, "slicing": "0:1/1:5", "allow_delete": True, }, "scheduler": { "cls": "remote", "url": "http://swh-scheduler:5008", }, } @pytest.fixture def swh_vault(swh_vault_config): return get_vault("local", **swh_vault_config) @pytest.fixture def swh_storage(swh_vault): return swh_vault.storage
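The swh_vault_config fixture above now backs the vault with an in-memory storage instead of a postgresql one. A minimal sketch of what the new {"cls": "memory"} storage section instantiates through the standard swh.storage factory (the b"example" payload is made up for illustration, not taken from the diff):

```python
from swh.model.model import Content
from swh.storage import get_storage

# {"cls": "memory"} selects the in-memory storage backend; this is the
# factory call the vault performs internally, give or take the exact wiring.
storage = get_storage(cls="memory")

cnt = Content.from_data(b"example")  # throwaway payload
storage.content_add([cnt])
assert storage.content_get([cnt.sha1])[0].sha1_git == cnt.sha1_git
```

Each test gets a fresh, empty storage this way, which removes the need for the shared no_db_drop postgres instance the old fixtures kept around for performance.

diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py index 2d1a368..48f53cc 100644 --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -1,1211 +1,1200 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import datetime import glob import gzip import io import os import pathlib import shutil import subprocess import tarfile import tempfile import unittest import unittest.mock +import attrs import dulwich.fastexport import dulwich.index import dulwich.objects import dulwich.porcelain import dulwich.repo import pytest from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model import from_disk, hashutil from swh.model.model import ( Person, Release,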
Revision, RevisionType, + SkippedContent, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) from swh.model.model import Content, Directory, DirectoryEntry from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker from swh.vault.tests.vault_testing import hash_content from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE class TestRepo: """A tiny context manager for a test git repository, with some utility functions to perform basic git stuff. """ def __init__(self, repo_dir=None): self.repo_dir = repo_dir def __enter__(self): if self.repo_dir: self.tmp_dir = None self.repo = dulwich.repo.Repo(self.repo_dir) else: self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") self.repo_dir = self.tmp_dir.__enter__() self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b"Test Author" self.author_email = b"test@softwareheritage.org" self.author = b"%s <%s>" % (self.author_name, self.author_email) self.base_date = 258244200 self.counter = 0 return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): if self.tmp_dir is not None: self.tmp_dir.__exit__(exc, value, tb) self.repo_dir = None def checkout(self, rev_sha): rev = self.repo[rev_sha] dulwich.index.build_index_from_tree( str(self.repo_dir), self.repo.index_path(), self.repo.object_store, rev.tree ) def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs): name = self.author_name email = self.author_email date = "%d +0000" % (self.base_date + self.counter) env = { # Set git commit format "GIT_AUTHOR_NAME": name, "GIT_AUTHOR_EMAIL": email, "GIT_AUTHOR_DATE": date, "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "GIT_COMMITTER_DATE": date, # Ignore all the system-wide and user configurations "GIT_CONFIG_NOSYSTEM": "1", "HOME": str(self.tmp_dir), "XDG_CONFIG_HOME": str(self.tmp_dir), } kwargs.setdefault("env", {}).update(env) subprocess.check_call( ("git", "-C", self.repo_dir) + cmd, stdout=stdout, **kwargs ) def commit(self, message="Commit test\n", ref=b"HEAD"): """Commit the current working tree in a new commit with message on the branch 'ref'. At the end of the commit, the reference should stay the same and the index should be clean. 
""" paths = [ os.path.relpath(path, self.repo_dir) for path in glob.glob(self.repo_dir + "/**/*", recursive=True) ] self.repo.stage(paths) message = message.encode() + b"\n" ret = self.repo.do_commit( message=message, committer=self.author, commit_timestamp=self.base_date + self.counter, commit_timezone=0, ref=ref, ) self.counter += 1 # committing on another branch leaves # dangling files in index if ref != b"HEAD": # XXX this should work (but does not) # dulwich.porcelain.reset(self.repo, 'hard') self.git_shell("reset", "--hard", "HEAD") return ret def tag(self, name, target=b"HEAD", message=None): dulwich.porcelain.tag_create( self.repo, name, message=message, annotated=message is not None, objectish=target, ) def merge(self, parent_sha_list, message="Merge branches."): self.git_shell( "merge", "--allow-unrelated-histories", "-m", message, *[p.decode() for p in parent_sha_list], ) self.counter += 1 return self.repo.refs[b"HEAD"] def print_debug_graph(self, reflog=False): args = ["log", "--all", "--graph", "--decorate"] if reflog: args.append("--reflog") self.git_shell(*args, stdout=None) @pytest.fixture def git_loader( swh_storage, ): """Instantiate a Git Loader using the storage instance as storage.""" def _create_loader(directory): return GitLoaderFromDisk( swh_storage, "fake_origin", directory=directory, visit_date=datetime.datetime.now(datetime.timezone.utc), ) return _create_loader @contextlib.contextmanager def cook_extract_directory_dircooker(storage, swhid, fsck=True): """Context manager that cooks a directory and extract it.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = DirectoryCooker(swhid, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) yield pathlib.Path(td) / str(swhid) cooker.storage = None @contextlib.contextmanager def cook_extract_directory_gitfast(storage, swhid, fsck=True): """Context manager that cooks a revision containing a directory and extract it, using RevisionGitfastCooker""" test_repo = TestRepo() with test_repo as p: date = TimestampWithTimezone.from_datetime( datetime.datetime.now(datetime.timezone.utc) ) revision = Revision( directory=swhid.object_id, message=b"dummy message", author=Person.from_fullname(b"someone"), committer=Person.from_fullname(b"someone"), date=date, committer_date=date, type=RevisionType.GIT, synthetic=False, ) storage.revision_add([revision]) with cook_stream_revision_gitfast( storage, revision.swhid() ) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) test_repo.checkout(b"HEAD") shutil.rmtree(p / ".git") yield p @contextlib.contextmanager def cook_extract_directory_git_bare(storage, swhid, fsck=True, direct_objstorage=False): """Context manager that cooks a revision and extract it, using GitBareCooker""" backend = unittest.mock.MagicMock() backend.storage = storage # Cook the object cooker = GitBareCooker( swhid, backend=backend, storage=storage, objstorage=storage.objstorage if direct_objstorage else None, ) cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) # Extract it with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with 
tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) # Clone it with Dulwich with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: clone_dir = pathlib.Path(clone_dir) subprocess.check_call( [ "git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir, ] ) shutil.rmtree(clone_dir / ".git") yield clone_dir @pytest.fixture( scope="module", params=[ cook_extract_directory_dircooker, cook_extract_directory_gitfast, cook_extract_directory_git_bare, ], ) def cook_extract_directory(request): """A fixture that is instantiated as one of cook_extract_directory_dircooker, cook_extract_directory_gitfast, or cook_extract_directory_git_bare.""" return request.param @contextlib.contextmanager def cook_stream_revision_gitfast(storage, swhid): """Context manager that cooks a revision and streams its fastexport.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = RevisionGitfastCooker(swhid, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj) yield fastexport_stream cooker.storage = None @contextlib.contextmanager def cook_extract_revision_gitfast(storage, swhid, fsck=True): """Context manager that cooks a revision and extracts it, using RevisionGitfastCooker""" test_repo = TestRepo() with cook_stream_revision_gitfast(storage, swhid) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) yield test_repo, p @contextlib.contextmanager def cook_extract_git_bare(storage, swhid, fsck=True): """Context manager that cooks a revision and extracts it, using GitBareCooker""" backend = unittest.mock.MagicMock() backend.storage = storage # Cook the object cooker = GitBareCooker(swhid, backend=backend, storage=storage) cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) # Extract it with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) # Clone it with Dulwich with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: clone_dir = pathlib.Path(clone_dir) subprocess.check_call( [ "git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir, ] ) test_repo = TestRepo(clone_dir) with test_repo: yield test_repo, clone_dir @contextlib.contextmanager def cook_extract_revision_git_bare(storage, swhid, fsck=True): with cook_extract_git_bare( storage, swhid, fsck=fsck, ) as res: yield res @pytest.fixture( scope="module", params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare], ) def cook_extract_revision(request): """A fixture that is instantiated as either cook_extract_revision_gitfast or cook_extract_revision_git_bare.""" return request.param @contextlib.contextmanager def cook_extract_snapshot_git_bare(storage, swhid, fsck=True): with cook_extract_git_bare( storage, swhid, fsck=fsck, ) as res: yield res @pytest.fixture( scope="module", params=[cook_extract_snapshot_git_bare], ) def cook_extract_snapshot(request): """Equivalent to cook_extract_snapshot_git_bare, but analogous to cook_extract_revision in case we ever have more cookers supporting snapshots""" return request.param TEST_CONTENT = ( " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces " ) TEST_EXECUTABLE = b"\x42\x40\x00\x00\x05"
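The cook-and-extract helpers above drive the cookers by hand through a MagicMock backend. For reference, the same cooking can also be run end to end against the in-memory vault backend, as the tests in test_git_bare_cooker.py below do. A sketch of that flow on a made-up one-file directory (the b"hello" payload and entry name are illustrative, not from the diff):

```python
from swh.model.model import Content, Directory, DirectoryEntry
from swh.storage import get_storage
from swh.vault.cookers.git_bare import GitBareCooker
from swh.vault.in_memory_backend import InMemoryVaultBackend

# Build a one-file directory in an in-memory storage (throwaway fixtures).
storage = get_storage(cls="memory")
cnt = Content.from_data(b"hello")
storage.content_add([cnt])
directory = Directory(
    entries=(
        DirectoryEntry(
            name=b"file1", type="file", perms=0o100644, target=cnt.sha1_git
        ),
    )
)
storage.directory_add([directory])

# cook() checks the object exists, builds the bundle and hands it to the
# backend; fetch() then returns the tarball bytes of a bare git repository.
backend = InMemoryVaultBackend()
cooker = GitBareCooker(directory.swhid(), backend=backend, storage=storage)
cooker.cook()
bundle = backend.fetch("git_bare", directory.swhid())
assert len(bundle) > 0
```

class TestDirectoryCooker: def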
test_directory_simple(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink() assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) def test_directory_filtered_objects(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) - # FIXME: storage.content_update() should be changed to allow things - # like that - with loader.storage.get_db().transaction() as cur: - cur.execute( - """update content set status = 'visible' - where sha1 = %s""", - (id_1,), - ) - cur.execute( - """update content set status = 'hidden' - where sha1 = %s""", - (id_2,), - ) - - cur.execute( - """ - insert into skipped_content - (sha1, sha1_git, sha256, blake2s256, length, reason) - select sha1, sha1_git, sha256, blake2s256, length, 'no reason' - from content - where sha1 = %s - """, - (id_3,), - ) - - cur.execute("delete from content where sha1 = %s", (id_3,)) + # alter the content of the storage + # 1/ make file 2 a hidden file object + loader.storage._allow_overwrite = True + cnt2 = attrs.evolve( + loader.storage.content_get([id_2])[0], status="hidden", data=file_2 + ) + loader.storage.content_add([cnt2]) + assert loader.storage.content_get([id_2])[0].status == "hidden" + + # 2/ make file 3 a skipped file object + cnt3 = loader.storage.content_get([id_3])[0].to_dict() + cnt3["status"] = "absent" + cnt3["reason"] = "no reason" + sk_cnt3 = SkippedContent.from_dict(cnt3) + loader.storage.skipped_content_add([sk_cnt3]) + # drop the original content from the backend's private indexes: the + # in-memory equivalent of the DELETE the postgresql-backed test ran + for hashkey in loader.storage._cql_runner._content_indexes: + loader.storage._cql_runner._content_indexes[hashkey].pop(cnt3[hashkey]) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE
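The hide/skip dance above replaces the SQL UPDATEs, INSERT and DELETE that the postgresql-backed test used to run. A minimal standalone sketch of the same pattern, assuming a bare in-memory storage and made-up payloads:

```python
import attrs

from swh.model.model import Content, SkippedContent
from swh.storage import get_storage

storage = get_storage(cls="memory")
cnt = Content.from_data(b"to-hide")  # throwaway payload
storage.content_add([cnt])

# The memory backend refuses silent overwrites unless this private knob
# is set, exactly as in the tests above.
storage._allow_overwrite = True
hidden = attrs.evolve(
    storage.content_get([cnt.sha1])[0], status="hidden", data=b"to-hide"
)
storage.content_add([hidden])
assert storage.content_get([cnt.sha1])[0].status == "hidden"

# Turning a content into a SkippedContent: same hashes, status "absent",
# plus a reason.
cnt2 = Content.from_data(b"to-skip")  # throwaway payload
storage.content_add([cnt2])
skipped = storage.content_get([cnt2.sha1])[0].to_dict()
skipped["status"] = "absent"
skipped["reason"] = "no reason"
storage.skipped_content_add([SkippedContent.from_dict(skipped)])
```

Note there is no public delete API for the original Content row, which is why the tests also pop the entry from the private _cql_runner._content_indexes maps.

def test_directory_bogus_perms(self, git_loader, cook_extract_directory): # Some early git repositories have 664/775 permissions...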
let's check # if all the weird modes are properly normalized in the directory # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) # Disable mode cleanup with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode): c = repo.commit() # Make sure Dulwich didn't normalize the permissions itself. # (if it did, then the test can't check the cooker normalized them) tree_id = repo.repo[c].tree assert {entry.mode for entry in repo.repo[tree_id].items()} == { 0o100775, 0o100664, 0o100604, } # Disable mode checks with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None): loader = git_loader(str(rp)) loader.load() # Make sure swh-loader didn't normalize them either dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id)) assert {entry["perms"] for entry in dir_entries} == { 0o100664, 0o100775, 0o100604, } obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 @pytest.mark.parametrize("direct_objstorage", [True, False]) def test_directory_objstorage( self, swh_storage, git_loader, mocker, direct_objstorage ): """Like test_directory_simple, but using swh_objstorage directly, without going through swh_storage.content_get_data()""" repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) # Set-up spies storage_content_get_data = mocker.patch.object( swh_storage, "content_get_data", wraps=swh_storage.content_get_data ) objstorage_content_batch = mocker.patch.object( swh_storage.objstorage, "get_batch", wraps=swh_storage.objstorage.get_batch ) with cook_extract_directory_git_bare( loader.storage, swhid, direct_objstorage=direct_objstorage ) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink() assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) if direct_objstorage: storage_content_get_data.assert_not_called() objstorage_content_batch.assert_called() else: storage_content_get_data.assert_called() objstorage_content_batch.assert_not_called() def test_directory_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) with 
cook_extract_directory_dircooker( swh_storage, dir.swhid(), fsck=False ) as p: assert (p / "submodule").is_symlink() assert os.readlink(str(p / "submodule")) == target_rev class RepoFixtures: """Shared loading and checking methods that can be reused by different types of tests.""" def load_repo_simple(self, git_loader): # # 1--2--3--4--5--6--7 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("add file2") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) (rp / "bin1").write_bytes(TEST_EXECUTABLE) (rp / "bin1").chmod(0o755) repo.commit("add bin1") (rp / "link1").symlink_to("file1") repo.commit("link link1 to file1") (rp / "file2").unlink() repo.commit("remove file2") (rp / "bin1").rename(rp / "bin") repo.commit("rename bin1 to bin") loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) return (loader, swhid) def check_revision_simple(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT assert (p / "link1").is_symlink() assert os.readlink(str(p / "link1")) == "file1" assert (p / "bin").stat().st_mode == 0o100755 assert (p / "bin").read_bytes() == TEST_EXECUTABLE assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def load_repo_two_roots(self, git_loader): # # 1----3---4 # / # 2---- # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") del repo.repo.refs[b"refs/heads/master"] # git update-ref -d HEAD (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") repo.merge([c1]) (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_two_roots(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() (c3,) = ert.repo[hashutil.hash_to_bytehex(swhid.object_id)].parents assert len(ert.repo[c3].parents) == 2 def load_repo_two_heads(self, git_loader): # # 1---2----4 <-- master and b1 # \ # ----3 <-- b2 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("Add file1") (rp / "file2").write_text(TEST_CONTENT) c2 = repo.commit("Add file2") repo.repo.refs[b"refs/heads/b2"] = c2 # branch b2 from master (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3", ref=b"refs/heads/b2") (rp / "file4").write_text(TEST_CONTENT) c4 = repo.commit("add file4", ref=b"refs/heads/master") repo.repo.refs[b"refs/heads/b1"] = c4 # branch b1 from master obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_snapshot_two_heads(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] == 
ert.repo.refs[b"refs/remotes/origin/b1"] ) c4_id = hashutil.hash_to_bytehex(swhid.object_id) c3_id = ert.repo.refs[b"refs/remotes/origin/b2"] assert ert.repo[c3_id].parents == ert.repo[c4_id].parents def load_repo_two_double_fork_merge(self, git_loader): # # 2---4---6 # / / / # 1---3---5 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") # create commit 1 repo.repo.refs[b"refs/heads/c1"] = c1 # branch c1 from master (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") # create commit 2 (rp / "file3").write_text(TEST_CONTENT) c3 = repo.commit("Add file3", ref=b"refs/heads/c1") # create commit 3 on c1 repo.repo.refs[b"refs/heads/c3"] = c3 # branch c3 from c1 repo.merge([c3]) # create commit 4 (rp / "file5").write_text(TEST_CONTENT) c5 = repo.commit("Add file3", ref=b"refs/heads/c3") # create commit 5 on c3 repo.merge([c5]) # create commit 6 obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_two_double_fork_merge(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def check_snapshot_two_double_fork_merge(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] ) (c4_id, c5_id) = ert.repo[swhid.object_id.hex().encode()].parents assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"] (c2_id, c3_id) = ert.repo[c4_id].parents assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"] def load_repo_triple_merge(self, git_loader): # # .---.---5 # / / / # 2 3 4 # / / / # 1---.---. 
# repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Commit 1") repo.repo.refs[b"refs/heads/b1"] = c1 repo.repo.refs[b"refs/heads/b2"] = c1 repo.commit("Commit 2") c3 = repo.commit("Commit 3", ref=b"refs/heads/b1") c4 = repo.commit("Commit 4", ref=b"refs/heads/b2") repo.merge([c3, c4]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_triple_merge(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def check_snapshot_triple_merge(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] ) (c2_id, c3_id, c4_id) = ert.repo[swhid.object_id.hex().encode()].parents assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"] assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"] assert ( ert.repo[c2_id].parents == ert.repo[c3_id].parents == ert.repo[c4_id].parents ) def load_repo_filtered_objects(self, git_loader): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) repo.commit() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() - # FIXME: storage.content_update() should be changed to allow things - # like that - with loader.storage.get_db().transaction() as cur: - cur.execute( - """update content set status = 'visible' - where sha1 = %s""", - (id_1,), - ) - cur.execute( - """update content set status = 'hidden' - where sha1 = %s""", - (id_2,), - ) - - cur.execute( - """ - insert into skipped_content - (sha1, sha1_git, sha256, blake2s256, length, reason) - select sha1, sha1_git, sha256, blake2s256, length, 'no reason' - from content - where sha1 = %s - """, - (id_3,), - ) + # alter the content of the storage + # 1/ make file 2 a hidden file object + loader.storage._allow_overwrite = True + cnt2 = attrs.evolve( + loader.storage.content_get([id_2])[0], status="hidden", data=file_2 + ) + loader.storage.content_add([cnt2]) + assert loader.storage.content_get([id_2])[0].status == "hidden" + + # 2/ make file 3 a skipped file object + cnt3 = loader.storage.content_get([id_3])[0].to_dict() + cnt3["status"] = "absent" + cnt3["reason"] = "no reason" + sk_cnt3 = SkippedContent.from_dict(cnt3) + loader.storage.skipped_content_add([sk_cnt3]) + # drop the original content from the backend's private indexes: the + # in-memory equivalent of the DELETE the postgresql-backed test ran + for hashkey in loader.storage._cql_runner._content_indexes: + loader.storage._cql_runner._content_indexes[hashkey].pop(cnt3[hashkey]) - cur.execute("delete from content where sha1 = %s", (id_3,)) return (loader, swhid) def check_revision_filtered_objects(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE def load_repo_null_fields(self, git_loader): # Our schema doesn't enforce a lot of non-null revision fields. We need
We need # to check these cases don't break the cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) c = repo.commit("initial commit") loader = git_loader(str(rp)) loader.load() repo.repo.refs[b"HEAD"].decode() dir_id_hex = repo.repo[c].tree.decode() dir_id = hashutil.hash_to_bytes(dir_id_hex) test_revision = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir_id, metadata={}, synthetic=True, ) storage = loader.storage storage.revision_add([test_revision]) return (loader, test_revision.swhid()) def check_revision_null_fields(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 def load_repo_tags(self, git_loader): # v-- t2 # # 1---2----5 <-- master, t5, and t5a (annotated) # \ # ----3----4 <-- t4a (annotated) # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("Add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") # create c2 repo.tag(b"t2") (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") (rp / "file4").write_text(TEST_CONTENT) repo.commit("add file4") repo.tag(b"t4a", message=b"tag 4") # Go back to c2 repo.git_shell("reset", "--hard", "HEAD^^") (rp / "file5").write_text(TEST_CONTENT) repo.commit("add file5") # create c5 repo.tag(b"t5") repo.tag(b"t5a", message=b"tag 5") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_snapshot_tags(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] == ert.repo.refs[b"refs/tags/t5"] ) c2_id = ert.repo.refs[b"refs/tags/t2"] c5_id = hashutil.hash_to_bytehex(swhid.object_id) assert ert.repo[c5_id].parents == [c2_id] t5a = ert.repo[ert.repo.refs[b"refs/tags/t5a"]] # TODO: investigate why new dulwich adds \n assert t5a.message in (b"tag 5", b"tag 5\n") assert t5a.object == (dulwich.objects.Commit, c5_id) t4a = ert.repo[ert.repo.refs[b"refs/tags/t4a"]] (_, c4_id) = t4a.object assert ert.repo[c4_id].message == b"add file4\n" # TODO: ditto (c3_id,) = ert.repo[c4_id].parents assert ert.repo[c3_id].message == b"add file3\n" # TODO: ditto assert ert.repo[c3_id].parents == [c2_id] class TestRevisionCooker(RepoFixtures): def test_revision_simple(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_simple(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_simple(ert, p, swhid) def test_revision_two_roots(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_two_roots(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_two_roots(ert, p, swhid) def test_revision_two_double_fork_merge(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_two_double_fork_merge(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_two_double_fork_merge(ert, p, swhid) def test_revision_triple_merge(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_triple_merge(git_loader) with cook_extract_revision(loader.storage, swhid) as 
(ert, p): self.check_revision_triple_merge(ert, p, swhid) def test_revision_filtered_objects(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_filtered_objects(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_filtered_objects(ert, p, swhid) def test_revision_null_fields(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_null_fields(git_loader) with cook_extract_revision(loader.storage, swhid, fsck=False) as (ert, p): self.check_revision_null_fields(ert, p, swhid) @pytest.mark.parametrize("ingest_target_revision", [False, True]) def test_revision_submodule( self, swh_storage, cook_extract_revision, ingest_target_revision ): date = TimestampWithTimezone.from_datetime( datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) ) target_rev = Revision( message=b"target_rev", author=Person.from_fullname(b"me "), date=date, committer=Person.from_fullname(b"me "), committer_date=date, parents=(), type=RevisionType.GIT, directory=bytes.fromhex("3333333333333333333333333333333333333333"), metadata={}, synthetic=True, ) if ingest_target_revision: swh_storage.revision_add([target_rev]) dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=target_rev.id, perms=0o160000, ), ), ) swh_storage.directory_add([dir]) rev = Revision( message=b"msg", author=Person.from_fullname(b"me "), date=date, committer=Person.from_fullname(b"me "), committer_date=date, parents=(), type=RevisionType.GIT, directory=dir.id, metadata={}, synthetic=True, ) swh_storage.revision_add([rev]) with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p): ert.checkout(b"HEAD") pattern = b"160000 submodule\x00%s" % target_rev.id tree = ert.repo[b"HEAD"].tree assert pattern in ert.repo[tree].as_raw_string() class TestSnapshotCooker(RepoFixtures): def test_snapshot_simple(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_simple(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_simple(ert, p, main_rev_id) def test_snapshot_two_roots(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_roots(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_two_roots(ert, p, main_rev_id) def test_snapshot_two_heads(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_heads(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_snapshot_two_heads(ert, p, main_rev_id) def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_two_double_fork_merge(ert, p, main_rev_id) self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id) def test_snapshot_triple_merge(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_triple_merge(git_loader) snp_id = loader.loaded_snapshot_id swhid = 
CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_triple_merge(ert, p, main_rev_id) self.check_snapshot_triple_merge(ert, p, main_rev_id) def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_filtered_objects(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_filtered_objects(ert, p, main_rev_id) def test_snapshot_tags(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_tags(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_snapshot_tags(ert, p, main_rev_id) def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot): """Tests that objects that were originally malformed: * are still interpreted somewhat correctly (if the loader could make sense of them), especially that they still have links to children * have their original manifest in the bundle """ date = TimestampWithTimezone.from_numeric_offset( Timestamp(1643819927, 0), 0, False ) content = Content.from_data(b"foo") swh_storage.content_add([content]) # disordered # fmt: off malformed_dir_manifest = ( b"" + b"100644 file2\x00" + content.sha1_git + b"100644 file1\x00" + content.sha1_git ) # fmt: on directory = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=0o100644, target=content.sha1_git ), DirectoryEntry( name=b"file2", type="file", perms=0o100644, target=content.sha1_git ), ), raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() + malformed_dir_manifest, ) swh_storage.directory_add([directory]) # 'committer' and 'author' swapped # fmt: off malformed_rev_manifest = ( b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" + b"committer me 1643819927 +0000\n" + b"author me 1643819927 +0000\n" + b"\n" + b"rev" ) # fmt: on revision = Revision( message=b"rev", author=Person.from_fullname(b"me "), date=date, committer=Person.from_fullname(b"me "), committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() + malformed_rev_manifest, ) swh_storage.revision_add([revision]) # 'tag' and 'tagger' swapped # fmt: off malformed_rel_manifest = ( b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" + b"type commit\n" + b"tagger me 1643819927 +0000\n" + b"tag v1.1.0\n" ) # fmt: on release = Release( name=b"v1.1.0", message=None, author=Person.from_fullname(b"me "), date=date, target=revision.id, target_type=ModelObjectType.REVISION, synthetic=True, raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() + malformed_rel_manifest, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), b"HEAD": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION ), } ) swh_storage.snapshot_add([snapshot]) with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p): tag = ert.repo[b"refs/tags/v1.1.0"] assert tag.as_raw_string() == malformed_rel_manifest commit = ert.repo[tag.object[1]] assert commit.as_raw_string() == malformed_rev_manifest tree = ert.repo[commit.tree] assert tree.as_raw_string() == 
malformed_dir_manifest diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py index f3c27e9..138dd4d 100644 --- a/swh/vault/tests/test_git_bare_cooker.py +++ b/swh/vault/tests/test_git_bare_cooker.py @@ -1,681 +1,702 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ This module contains additional tests for the bare cooker. Generic cooker tests (e.g. without swh-graph) in test_cookers.py also run on the bare cooker. """ import datetime import enum +from functools import partial import io import subprocess import tarfile import tempfile import unittest.mock import attr import dulwich.repo import pytest from pytest import param +from pytest_postgresql import factories +from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact from swh.model.from_disk import DentryPerms from swh.model.model import ( Content, Directory, DirectoryEntry, ObjectType, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) +from swh.storage import get_storage +from swh.storage.postgresql.db import Db as StorageBackend from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.in_memory_backend import InMemoryVaultBackend +storage_postgresql_proc = factories.postgresql_proc( + dbname="storage", + load=[ + partial( + initialize_database_for_module, "storage", StorageBackend.current_version + ) + ], +) + +storage_postgresql = postgresql_fact("storage_postgresql_proc", no_db_drop=True) + + +@pytest.fixture +def swh_storage(storage_postgresql): + return get_storage("local", db=storage_postgresql.dsn, objstorage={"cls": "memory"}) + class RootObjects(enum.Enum): REVISION = enum.auto() SNAPSHOT = enum.auto() RELEASE = enum.auto() WEIRD_RELEASE = enum.auto() # has a : in the name + points to another release @pytest.mark.graph @pytest.mark.parametrize( "root_object,up_to_date_graph,tag,weird_branches", [ param( RootObjects.REVISION, False, False, False, id="rev, outdated graph, no tag/tree/blob", ), param( RootObjects.REVISION, True, False, False, id="rev, updated graph, no tag/tree/blob", ), param( RootObjects.RELEASE, False, False, False, id="rel, outdated graph, no tag/tree/blob", ), param( RootObjects.RELEASE, True, False, False, id="rel, updated graph, no tag/tree/blob", ), param( RootObjects.WEIRD_RELEASE, True, False, False, id="weird rel, updated graph, no tag/tree/blob", ), param( RootObjects.SNAPSHOT, False, False, False, id="snp, outdated graph, no tag/tree/blob", ), param( RootObjects.SNAPSHOT, True, False, False, id="snp, updated graph, no tag/tree/blob", ), param( RootObjects.SNAPSHOT, False, True, False, id="snp, outdated graph, w/ tag, no tree/blob", ), param( RootObjects.SNAPSHOT, True, True, False, id="snp, updated graph, w/ tag, no tree/blob", ), param( RootObjects.SNAPSHOT, False, True, True, id="snp, outdated graph, w/ tag, tree, and blob", ), param( RootObjects.SNAPSHOT, True, True, True, id="snp, updated graph, w/ tag, tree, and blob", ), ], ) def test_graph_revisions( swh_storage, up_to_date_graph, root_object, tag, weird_branches ): r""" Build objects:: snp /|||\ / ||| \ rel2 <----° /|\ \----> rel4 | / | \ | v / v \ v rev1 <------ rev2 <----° dir4 \ rel3 | | | \ | v v v \ | dir1 dir2 dir3 | | | / | | | | v /
v v v v cnt1 <----° cnt2 cnt3 cnt4 cnt5 If up_to_date_graph is true, then swh-graph contains all objects. Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph. If tag is False, rel2 is excluded. If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded. """ from swh.graph.naive_client import NaiveClient as GraphClient # Create objects: date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc) ) author = Person.from_fullname(b"Foo ") cnt1 = Content.from_data(b"correct") cnt2 = Content.from_data(b"horse") cnt3 = Content.from_data(b"battery") cnt4 = Content.from_data(b"staple") cnt5 = Content.from_data(b"Tr0ub4dor&3") dir1 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), ) ) dir2 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), DirectoryEntry( name=b"file2", type="file", perms=DentryPerms.content, target=cnt2.sha1_git, ), ) ) dir3 = Directory( entries=( DirectoryEntry( name=b"file3", type="file", perms=DentryPerms.content, target=cnt3.sha1_git, ), ) ) dir4 = Directory( entries=( DirectoryEntry( name=b"directory3", type="dir", perms=DentryPerms.directory, target=dir3.id, ), ) ) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir2.id, parents=(rev1.id,), type=RevisionType.GIT, synthetic=True, ) rel2 = Release( name=b"1.0.0", message=b"tag2", target_type=ObjectType.REVISION, target=rev2.id, synthetic=True, ) rel3 = Release( name=b"1.0.0-blob", message=b"tagged-blob", target_type=ObjectType.CONTENT, target=cnt5.sha1_git, synthetic=True, ) rel4 = Release( name=b"1.0.0-weird", message=b"weird release", target_type=ObjectType.RELEASE, target=rel3.id, synthetic=True, ) rel5 = Release( name=b"1.0.0:weirdname", message=b"weird release", target_type=ObjectType.RELEASE, target=rel2.id, synthetic=True, ) # Create snapshot: branches = { b"refs/heads/master": SnapshotBranch( target=rev2.id, target_type=TargetType.REVISION ), } if tag: branches[b"refs/tags/1.0.0"] = SnapshotBranch( target=rel2.id, target_type=TargetType.RELEASE ) if weird_branches: branches[b"refs/heads/tree-ref"] = SnapshotBranch( target=dir4.id, target_type=TargetType.DIRECTORY ) branches[b"refs/heads/blob-ref"] = SnapshotBranch( target=cnt4.sha1_git, target_type=TargetType.CONTENT ) branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch( target=rel4.id, target_type=TargetType.RELEASE ) snp = Snapshot(branches=branches) # "Fill" swh-graph if up_to_date_graph: nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1), (rev2, dir2), (rev2, rev1), (snp, rev2), ] if tag: nodes.append(rel2) edges.append((rel2, rev2)) edges.append((snp, rel2)) if weird_branches: nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5]) edges.extend( [ (dir3, cnt3), (dir4, dir3), (snp, dir4), (snp, cnt4), (snp, rel4), (rel4, rel3), (rel3, cnt5), (rel5, rev2), ] ) else: nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (dir3, cnt3), (rev1, dir1), ] if tag: nodes.append(rel2) if weird_branches: nodes.extend([cnt3, dir3]) edges.extend([(dir3, cnt3)]) nodes = [str(n.swhid()) for n in nodes] edges = 
[(str(s.swhid()), str(d.swhid())) for (s, d) in edges] # Add all objects to storage swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5]) swh_storage.directory_add([dir1, dir2, dir3, dir4]) swh_storage.revision_add([rev1, rev2]) swh_storage.release_add([rel2, rel3, rel4, rel5]) swh_storage.snapshot_add([snp]) # Add spy on swh_storage, to make sure revision_log is not called # (the graph must be used instead) swh_storage = unittest.mock.MagicMock(wraps=swh_storage) # Add all objects to graph swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) # Cook backend = InMemoryVaultBackend() cooked_swhid = { RootObjects.SNAPSHOT: snp.swhid(), RootObjects.REVISION: rev2.swhid(), RootObjects.RELEASE: rel2.swhid(), RootObjects.WEIRD_RELEASE: rel5.swhid(), }[root_object] cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) if weird_branches: # git-fsck now rejects refs pointing to trees and blobs, # but some old git repos have them. cooker.use_fsck = False cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION): log_head = "master" elif root_object == RootObjects.RELEASE: log_head = "1.0.0" elif root_object == RootObjects.WEIRD_RELEASE: log_head = "release" else: assert False, root_object output = subprocess.check_output( [ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", log_head, ] ) assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" # Make sure the graph was used instead of swh_storage.revision_log if root_object == RootObjects.SNAPSHOT: if up_to_date_graph: # The graph has everything, so the first call succeeds and returns # all objects transitively pointed by the snapshot swh_graph.visit_nodes.assert_has_calls( [ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), ] ) else: # The graph does not have everything, so the first call returns nothing. 
# However, the second call (on the top rev) succeeds and returns # all objects but the rev and the rel swh_graph.visit_nodes.assert_has_calls( [ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), unittest.mock.call(str(rev2.swhid()), edges="rev:rev"), ] ) elif root_object in ( RootObjects.REVISION, RootObjects.RELEASE, RootObjects.WEIRD_RELEASE, ): swh_graph.visit_nodes.assert_has_calls( [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")] ) else: assert False, root_object if up_to_date_graph: swh_storage.revision_log.assert_not_called() swh_storage.revision_shortlog.assert_not_called() else: swh_storage.revision_log.assert_called() @pytest.mark.parametrize( "mismatch_on", ["content", "directory", "revision1", "revision2", "none"] ) def test_checksum_mismatch(swh_storage, mismatch_on): date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc) ) author = Person.from_fullname(b"Foo ") wrong_hash = b"\x12\x34" * 10 cnt1 = Content.from_data(b"Tr0ub4dor&3") if mismatch_on == "content": cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash) dir1 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), ) ) if mismatch_on == "directory": dir1 = attr.evolve(dir1, id=wrong_hash) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision1": rev1 = attr.evolve(rev1, id=wrong_hash) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, parents=(rev1.id,), type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision2": rev2 = attr.evolve(rev2, id=wrong_hash) cooked_swhid = rev2.swhid() swh_storage.content_add([cnt1]) swh_storage.directory_add([dir1]) swh_storage.revision_add([rev1, rev2]) backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=None, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if mismatch_on != "revision2": # git-log fails if the head revision is corrupted # TODO: we need to find a way to make this somewhat usable output = subprocess.check_output( [ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", ] ) assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" @pytest.mark.parametrize( "use_graph", [ pytest.param(False, id="without-graph"), pytest.param(True, id="with-graph", marks=pytest.mark.graph), ], ) def test_ignore_displayname(swh_storage, use_graph): """Tests the original authorship information is used instead of configured display names; otherwise objects would not match their hash, and git-fsck/git-clone would fail. This tests both with and without swh-graph, as both configurations use different code paths to fetch revisions. 
""" date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False) legacy_person = Person.from_fullname(b"old me ") current_person = Person.from_fullname(b"me ") content = Content.from_data(b"foo") swh_storage.content_add([content]) directory = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=0o100644, target=content.sha1_git ), ), ) swh_storage.directory_add([directory]) revision = Revision( message=b"rev", author=legacy_person, date=date, committer=legacy_person, committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, ) swh_storage.revision_add([revision]) release = Release( name=b"v1.1.0", message=None, author=legacy_person, date=date, target=revision.id, target_type=ObjectType.REVISION, synthetic=True, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), b"HEAD": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION ), } ) swh_storage.snapshot_add([snapshot]) # Add all objects to graph if use_graph: from swh.graph.naive_client import NaiveClient as GraphClient nodes = [ str(x.swhid()) for x in [content, directory, revision, release, snapshot] ] edges = [ (str(x.swhid()), str(y.swhid())) for (x, y) in [ (directory, content), (revision, directory), (release, revision), (snapshot, release), (snapshot, revision), ] ] swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) else: swh_graph = None # Set a display name with swh_storage.db() as db: with db.transaction() as cur: cur.execute( "UPDATE person set displayname = %s where fullname = %s", (current_person.fullname, legacy_person.fullname), ) # Check the display name did apply in the storage assert swh_storage.revision_get([revision.id])[0] == attr.evolve( revision, author=current_person, committer=current_person, ) # Cook cooked_swhid = snapshot.swhid() backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) # If we are here, it means git-fsck succeeded when called by cooker.cook(), # so we already know the original person was used. Let's double-check. repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git") tag = repo[b"refs/tags/v1.1.0"] assert tag.tagger == legacy_person.fullname commit = repo[tag.object[1]] assert commit.author == legacy_person.fullname