
diff --git a/swh/vault/tests/conftest.py b/swh/vault/tests/conftest.py
index 5c7cbf2..e91753f 100644
--- a/swh/vault/tests/conftest.py
+++ b/swh/vault/tests/conftest.py
@@ -1,93 +1,75 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from functools import partial
import os
from typing import Any, Dict
import pkg_resources.extern.packaging.version
import pytest
from pytest_postgresql import factories
from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact
-from swh.storage.postgresql.db import Db as StorageDb
from swh.vault import get_vault
from swh.vault.backend import VaultBackend
os.environ["LC_ALL"] = "C.UTF-8"
# needed for directory tests on git-cloned repositories
# 022 is usually the default value, but some environments (e.g. Debian builds)
# have a different one.
os.umask(0o022)
pytest_v = pkg_resources.get_distribution("pytest").parsed_version
if pytest_v < pkg_resources.extern.packaging.version.parse("3.9"):
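# pytest only ships the tmp_path fixture since version 3.9; provide a
# minimal backport for older versions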
@pytest.fixture
def tmp_path():
import pathlib
import tempfile
with tempfile.TemporaryDirectory() as tmpdir:
yield pathlib.Path(tmpdir)
-storage_postgresql_proc = factories.postgresql_proc(
- dbname="storage",
- load=[
- partial(initialize_database_for_module, "storage", StorageDb.current_version)
- ],
-)
-
vault_postgresql_proc = factories.postgresql_proc(
dbname="vault",
load=[
partial(initialize_database_for_module, "vault", VaultBackend.current_version)
],
)
postgres_vault = postgresql_fact("vault_postgresql_proc")
-postgres_storage = postgresql_fact(
- "storage_postgresql_proc",
- no_db_drop=True, # keep the db for performance reasons
-)
@pytest.fixture
-def swh_vault_config(postgres_vault, postgres_storage, tmp_path) -> Dict[str, Any]:
+def swh_vault_config(postgres_vault, tmp_path) -> Dict[str, Any]:
tmp_path = str(tmp_path)
return {
"db": postgres_vault.dsn,
"storage": {
- "cls": "postgresql",
- "db": postgres_storage.dsn,
- "objstorage": {
- "cls": "pathslicing",
- "root": tmp_path,
- "slicing": "0:1/1:5",
- },
+ "cls": "memory",
},
"cache": {
"cls": "pathslicing",
"root": tmp_path,
"slicing": "0:1/1:5",
"allow_delete": True,
},
"scheduler": {
"cls": "remote",
"url": "http://swh-scheduler:5008",
},
}
@pytest.fixture
def swh_vault(swh_vault_config):
return get_vault("local", **swh_vault_config)
@pytest.fixture
def swh_storage(swh_vault):
return swh_vault.storage
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
index 2d1a368..48f53cc 100644
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -1,1211 +1,1200 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import contextlib
import datetime
import glob
import gzip
import io
import os
import pathlib
import shutil
import subprocess
import tarfile
import tempfile
import unittest
import unittest.mock
+import attrs
import dulwich.fastexport
import dulwich.index
import dulwich.objects
import dulwich.porcelain
import dulwich.repo
import pytest
from swh.loader.git.from_disk import GitLoaderFromDisk
from swh.model import from_disk, hashutil
from swh.model.model import (
Person,
Release,
Revision,
RevisionType,
+ SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
Timestamp,
TimestampWithTimezone,
)
from swh.model.model import Content, Directory, DirectoryEntry
from swh.model.model import ObjectType as ModelObjectType
from swh.model.swhids import CoreSWHID, ObjectType
from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker
from swh.vault.tests.vault_testing import hash_content
from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE
class TestRepo:
"""A tiny context manager for a test git repository, with some utility
functions to perform basic git stuff.
"""
def __init__(self, repo_dir=None):
self.repo_dir = repo_dir
def __enter__(self):
if self.repo_dir:
self.tmp_dir = None
self.repo = dulwich.repo.Repo(self.repo_dir)
else:
self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-")
self.repo_dir = self.tmp_dir.__enter__()
self.repo = dulwich.repo.Repo.init(self.repo_dir)
self.author_name = b"Test Author"
self.author_email = b"test@softwareheritage.org"
self.author = b"%s <%s>" % (self.author_name, self.author_email)
self.base_date = 258244200
self.counter = 0
return pathlib.Path(self.repo_dir)
def __exit__(self, exc, value, tb):
if self.tmp_dir is not None:
self.tmp_dir.__exit__(exc, value, tb)
self.repo_dir = None
def checkout(self, rev_sha):
rev = self.repo[rev_sha]
dulwich.index.build_index_from_tree(
str(self.repo_dir), self.repo.index_path(), self.repo.object_store, rev.tree
)
def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs):
name = self.author_name
email = self.author_email
date = "%d +0000" % (self.base_date + self.counter)
env = {
# Set git commit format
"GIT_AUTHOR_NAME": name,
"GIT_AUTHOR_EMAIL": email,
"GIT_AUTHOR_DATE": date,
"GIT_COMMITTER_NAME": name,
"GIT_COMMITTER_EMAIL": email,
"GIT_COMMITTER_DATE": date,
# Ignore all the system-wide and user configurations
"GIT_CONFIG_NOSYSTEM": "1",
"HOME": str(self.tmp_dir),
"XDG_CONFIG_HOME": str(self.tmp_dir),
}
kwargs.setdefault("env", {}).update(env)
subprocess.check_call(
("git", "-C", self.repo_dir) + cmd, stdout=stdout, **kwargs
)
def commit(self, message="Commit test\n", ref=b"HEAD"):
"""Commit the current working tree in a new commit with message on
the branch 'ref'.
At the end of the commit, the reference should stay the same
and the index should be clean.
"""
paths = [
os.path.relpath(path, self.repo_dir)
for path in glob.glob(self.repo_dir + "/**/*", recursive=True)
]
self.repo.stage(paths)
message = message.encode() + b"\n"
ret = self.repo.do_commit(
message=message,
committer=self.author,
commit_timestamp=self.base_date + self.counter,
commit_timezone=0,
ref=ref,
)
self.counter += 1
# committing on another branch leaves
# dangling files in index
if ref != b"HEAD":
# XXX this should work (but does not)
# dulwich.porcelain.reset(self.repo, 'hard')
self.git_shell("reset", "--hard", "HEAD")
return ret
def tag(self, name, target=b"HEAD", message=None):
dulwich.porcelain.tag_create(
self.repo,
name,
message=message,
annotated=message is not None,
objectish=target,
)
def merge(self, parent_sha_list, message="Merge branches."):
self.git_shell(
"merge",
"--allow-unrelated-histories",
"-m",
message,
*[p.decode() for p in parent_sha_list],
)
self.counter += 1
return self.repo.refs[b"HEAD"]
def print_debug_graph(self, reflog=False):
args = ["log", "--all", "--graph", "--decorate"]
if reflog:
args.append("--reflog")
self.git_shell(*args, stdout=None)
@pytest.fixture
def git_loader(
swh_storage,
):
"""Instantiate a Git Loader using the storage instance as storage."""
def _create_loader(directory):
return GitLoaderFromDisk(
swh_storage,
"fake_origin",
directory=directory,
visit_date=datetime.datetime.now(datetime.timezone.utc),
)
return _create_loader
@contextlib.contextmanager
def cook_extract_directory_dircooker(storage, swhid, fsck=True):
"""Context manager that cooks a directory and extract it."""
backend = unittest.mock.MagicMock()
backend.storage = storage
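# a MagicMock stands in for the vault backend: it silently absorbs the
# cooker's backend calls, and only its .storage attribute needs to be real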
cooker = DirectoryCooker(swhid, backend=backend, storage=storage)
cooker.fileobj = io.BytesIO()
assert cooker.check_exists()
cooker.prepare_bundle()
cooker.fileobj.seek(0)
with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
tar.extractall(td)
yield pathlib.Path(td) / str(swhid)
cooker.storage = None
@contextlib.contextmanager
def cook_extract_directory_gitfast(storage, swhid, fsck=True):
"""Context manager that cooks a revision containing a directory and extract it,
using RevisionGitfastCooker"""
test_repo = TestRepo()
with test_repo as p:
date = TimestampWithTimezone.from_datetime(
datetime.datetime.now(datetime.timezone.utc)
)
revision = Revision(
directory=swhid.object_id,
message=b"dummy message",
author=Person.from_fullname(b"someone"),
committer=Person.from_fullname(b"someone"),
date=date,
committer_date=date,
type=RevisionType.GIT,
synthetic=False,
)
storage.revision_add([revision])
with cook_stream_revision_gitfast(
storage, revision.swhid()
) as stream, test_repo as p:
processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
processor.import_stream(stream)
test_repo.checkout(b"HEAD")
shutil.rmtree(p / ".git")
yield p
@contextlib.contextmanager
def cook_extract_directory_git_bare(storage, swhid, fsck=True, direct_objstorage=False):
"""Context manager that cooks a revision and extract it,
using GitBareCooker"""
backend = unittest.mock.MagicMock()
backend.storage = storage
# Cook the object
cooker = GitBareCooker(
swhid,
backend=backend,
storage=storage,
objstorage=storage.objstorage if direct_objstorage else None,
)
cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects
cooker.fileobj = io.BytesIO()
assert cooker.check_exists()
cooker.prepare_bundle()
cooker.fileobj.seek(0)
# Extract it
with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
tar.extractall(td)
# Clone it with Dulwich
with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir:
clone_dir = pathlib.Path(clone_dir)
subprocess.check_call(
[
"git",
"clone",
os.path.join(td, f"{swhid}.git"),
clone_dir,
]
)
shutil.rmtree(clone_dir / ".git")
yield clone_dir
@pytest.fixture(
scope="module",
params=[
cook_extract_directory_dircooker,
cook_extract_directory_gitfast,
cook_extract_directory_git_bare,
],
)
def cook_extract_directory(request):
"""A fixture that is instantiated as either cook_extract_directory_dircooker or
cook_extract_directory_git_bare."""
return request.param
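# Tests consume whichever variant was selected in the same way, e.g.:
#
#     with cook_extract_directory(loader.storage, swhid) as p:
#         assert (p / "file").read_text() == TEST_CONTENT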
@contextlib.contextmanager
def cook_stream_revision_gitfast(storage, swhid):
"""Context manager that cooks a revision and stream its fastexport."""
backend = unittest.mock.MagicMock()
backend.storage = storage
cooker = RevisionGitfastCooker(swhid, backend=backend, storage=storage)
cooker.fileobj = io.BytesIO()
assert cooker.check_exists()
cooker.prepare_bundle()
cooker.fileobj.seek(0)
fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj)
yield fastexport_stream
cooker.storage = None
@contextlib.contextmanager
def cook_extract_revision_gitfast(storage, swhid, fsck=True):
"""Context manager that cooks a revision and extract it,
using RevisionGitfastCooker"""
test_repo = TestRepo()
with cook_stream_revision_gitfast(storage, swhid) as stream, test_repo as p:
processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
processor.import_stream(stream)
yield test_repo, p
@contextlib.contextmanager
def cook_extract_git_bare(storage, swhid, fsck=True):
"""Context manager that cooks a revision and extract it,
using GitBareCooker"""
backend = unittest.mock.MagicMock()
backend.storage = storage
# Cook the object
cooker = GitBareCooker(swhid, backend=backend, storage=storage)
cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects
cooker.fileobj = io.BytesIO()
assert cooker.check_exists()
cooker.prepare_bundle()
cooker.fileobj.seek(0)
# Extract it
with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td:
with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar:
tar.extractall(td)
# Clone it with Dulwich
with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir:
clone_dir = pathlib.Path(clone_dir)
subprocess.check_call(
[
"git",
"clone",
os.path.join(td, f"{swhid}.git"),
clone_dir,
]
)
test_repo = TestRepo(clone_dir)
with test_repo:
yield test_repo, clone_dir
@contextlib.contextmanager
def cook_extract_revision_git_bare(storage, swhid, fsck=True):
with cook_extract_git_bare(
storage,
swhid,
fsck=fsck,
) as res:
yield res
@pytest.fixture(
scope="module",
params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare],
)
def cook_extract_revision(request):
"""A fixture that is instantiated as either cook_extract_revision_gitfast or
cook_extract_revision_git_bare."""
return request.param
@contextlib.contextmanager
def cook_extract_snapshot_git_bare(storage, swhid, fsck=True):
with cook_extract_git_bare(
storage,
swhid,
fsck=fsck,
) as res:
yield res
@pytest.fixture(
scope="module",
params=[cook_extract_snapshot_git_bare],
)
def cook_extract_snapshot(request):
"""Equivalent to cook_extract_snapshot_git_bare; but analogous to
cook_extract_revision in case we ever have more cookers supporting snapshots"""
return request.param
TEST_CONTENT = (
" test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces "
)
TEST_EXECUTABLE = b"\x42\x40\x00\x00\x05"
class TestDirectoryCooker:
def test_directory_simple(self, git_loader, cook_extract_directory):
repo = TestRepo()
with repo as rp:
(rp / "file").write_text(TEST_CONTENT)
(rp / "executable").write_bytes(TEST_EXECUTABLE)
(rp / "executable").chmod(0o755)
(rp / "link").symlink_to("file")
(rp / "dir1/dir2").mkdir(parents=True)
(rp / "dir1/dir2/file").write_text(TEST_CONTENT)
c = repo.commit()
loader = git_loader(str(rp))
loader.load()
obj_id_hex = repo.repo[c].tree.decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)
with cook_extract_directory(loader.storage, swhid) as p:
assert (p / "file").stat().st_mode == 0o100644
assert (p / "file").read_text() == TEST_CONTENT
assert (p / "executable").stat().st_mode == 0o100755
assert (p / "executable").read_bytes() == TEST_EXECUTABLE
assert (p / "link").is_symlink()
assert os.readlink(str(p / "link")) == "file"
assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT
directory = from_disk.Directory.from_disk(path=bytes(p))
assert obj_id_hex == hashutil.hash_to_hex(directory.hash)
def test_directory_filtered_objects(self, git_loader, cook_extract_directory):
repo = TestRepo()
with repo as rp:
file_1, id_1 = hash_content(b"test1")
file_2, id_2 = hash_content(b"test2")
file_3, id_3 = hash_content(b"test3")
(rp / "file").write_bytes(file_1)
(rp / "hidden_file").write_bytes(file_2)
(rp / "absent_file").write_bytes(file_3)
c = repo.commit()
loader = git_loader(str(rp))
loader.load()
obj_id_hex = repo.repo[c].tree.decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)
- # FIXME: storage.content_update() should be changed to allow things
- # like that
- with loader.storage.get_db().transaction() as cur:
- cur.execute(
- """update content set status = 'visible'
- where sha1 = %s""",
- (id_1,),
- )
- cur.execute(
- """update content set status = 'hidden'
- where sha1 = %s""",
- (id_2,),
- )
-
- cur.execute(
- """
- insert into skipped_content
- (sha1, sha1_git, sha256, blake2s256, length, reason)
- select sha1, sha1_git, sha256, blake2s256, length, 'no reason'
- from content
- where sha1 = %s
- """,
- (id_3,),
- )
-
- cur.execute("delete from content where sha1 = %s", (id_3,))
+ # alter the content of the storage
+ # 1/ make file 2 a hidden file object
+ loader.storage._allow_overwrite = True
+ cnt2 = attrs.evolve(
+ loader.storage.content_get([id_2])[0], status="hidden", data=file_2
+ )
+ loader.storage.content_add([cnt2])
+ assert loader.storage.content_get([id_2])[0].status == "hidden"
+
+ # 2/ make file 3 a skipped file object
+ cnt3 = loader.storage.content_get([id_3])[0].to_dict()
+ cnt3["status"] = "absent"
+ cnt3["reason"] = "no reason"
+ sk_cnt3 = SkippedContent.from_dict(cnt3)
+ loader.storage.skipped_content_add([sk_cnt3])
+ # dirty dirty dirty... let's pretend it is the equivalent of writing sql
+ # queries in the postgresql backend
+ for hashkey in loader.storage._cql_runner._content_indexes:
+ loader.storage._cql_runner._content_indexes[hashkey].pop(cnt3[hashkey])
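+ # popping the per-hash index entries makes content lookups miss cnt3,
+ # mirroring the former "delete from content" SQL statement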
with cook_extract_directory(loader.storage, swhid) as p:
assert (p / "file").read_bytes() == b"test1"
assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE
assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE
def test_directory_bogus_perms(self, git_loader, cook_extract_directory):
# Some early git repositories have 664/775 permissions... let's check
# if all the weird modes are properly normalized in the directory
# cooker.
repo = TestRepo()
with repo as rp:
(rp / "file").write_text(TEST_CONTENT)
(rp / "file").chmod(0o664)
(rp / "executable").write_bytes(TEST_EXECUTABLE)
(rp / "executable").chmod(0o775)
(rp / "wat").write_text(TEST_CONTENT)
(rp / "wat").chmod(0o604)
# Disable mode cleanup
with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode):
c = repo.commit()
# Make sure Dulwich didn't normalize the permissions itself.
# (if it did, then the test can't check the cooker normalized them)
tree_id = repo.repo[c].tree
assert {entry.mode for entry in repo.repo[tree_id].items()} == {
0o100775,
0o100664,
0o100604,
}
# Disable mode checks
with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None):
loader = git_loader(str(rp))
loader.load()
# Make sure swh-loader didn't normalize them either
dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id))
assert {entry["perms"] for entry in dir_entries} == {
0o100664,
0o100775,
0o100604,
}
obj_id_hex = repo.repo[c].tree.decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)
with cook_extract_directory(loader.storage, swhid) as p:
assert (p / "file").stat().st_mode == 0o100644
assert (p / "executable").stat().st_mode == 0o100755
assert (p / "wat").stat().st_mode == 0o100644
@pytest.mark.parametrize("direct_objstorage", [True, False])
def test_directory_objstorage(
self, swh_storage, git_loader, mocker, direct_objstorage
):
"""Like test_directory_simple, but using swh_objstorage directly, without
going through swh_storage.content_get_data()"""
repo = TestRepo()
with repo as rp:
(rp / "file").write_text(TEST_CONTENT)
(rp / "executable").write_bytes(TEST_EXECUTABLE)
(rp / "executable").chmod(0o755)
(rp / "link").symlink_to("file")
(rp / "dir1/dir2").mkdir(parents=True)
(rp / "dir1/dir2/file").write_text(TEST_CONTENT)
c = repo.commit()
loader = git_loader(str(rp))
loader.load()
obj_id_hex = repo.repo[c].tree.decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)
# Set-up spies
storage_content_get_data = mocker.patch.object(
swh_storage, "content_get_data", wraps=swh_storage.content_get_data
)
objstorage_content_batch = mocker.patch.object(
swh_storage.objstorage, "get_batch", wraps=swh_storage.objstorage.get_batch
)
with cook_extract_directory_git_bare(
loader.storage, swhid, direct_objstorage=direct_objstorage
) as p:
assert (p / "file").stat().st_mode == 0o100644
assert (p / "file").read_text() == TEST_CONTENT
assert (p / "executable").stat().st_mode == 0o100755
assert (p / "executable").read_bytes() == TEST_EXECUTABLE
assert (p / "link").is_symlink()
assert os.readlink(str(p / "link")) == "file"
assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT
directory = from_disk.Directory.from_disk(path=bytes(p))
assert obj_id_hex == hashutil.hash_to_hex(directory.hash)
if direct_objstorage:
storage_content_get_data.assert_not_called()
objstorage_content_batch.assert_called()
else:
storage_content_get_data.assert_called()
objstorage_content_batch.assert_not_called()
def test_directory_revision_data(self, swh_storage):
target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd"
dir = Directory(
entries=(
DirectoryEntry(
name=b"submodule",
type="rev",
target=hashutil.hash_to_bytes(target_rev),
perms=0o100644,
),
),
)
swh_storage.directory_add([dir])
with cook_extract_directory_dircooker(
swh_storage, dir.swhid(), fsck=False
) as p:
assert (p / "submodule").is_symlink()
assert os.readlink(str(p / "submodule")) == target_rev
class RepoFixtures:
"""Shared loading and checking methods that can be reused by different types
of tests."""
def load_repo_simple(self, git_loader):
#
# 1--2--3--4--5--6--7
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
repo.commit("add file1")
(rp / "file2").write_text(TEST_CONTENT)
repo.commit("add file2")
(rp / "dir1/dir2").mkdir(parents=True)
(rp / "dir1/dir2/file").write_text(TEST_CONTENT)
(rp / "bin1").write_bytes(TEST_EXECUTABLE)
(rp / "bin1").chmod(0o755)
repo.commit("add bin1")
(rp / "link1").symlink_to("file1")
repo.commit("link link1 to file1")
(rp / "file2").unlink()
repo.commit("remove file2")
(rp / "bin1").rename(rp / "bin")
repo.commit("rename bin1 to bin")
loader = git_loader(str(rp))
loader.load()
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
return (loader, swhid)
def check_revision_simple(self, ert, p, swhid):
ert.checkout(b"HEAD")
assert (p / "file1").stat().st_mode == 0o100644
assert (p / "file1").read_text() == TEST_CONTENT
assert (p / "link1").is_symlink()
assert os.readlink(str(p / "link1")) == "file1"
assert (p / "bin").stat().st_mode == 0o100755
assert (p / "bin").read_bytes() == TEST_EXECUTABLE
assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT
assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex()
def load_repo_two_roots(self, git_loader):
#
# 1----3---4
# /
# 2----
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
c1 = repo.commit("Add file1")
del repo.repo.refs[b"refs/heads/master"] # git update-ref -d HEAD
(rp / "file2").write_text(TEST_CONTENT)
repo.commit("Add file2")
repo.merge([c1])
(rp / "file3").write_text(TEST_CONTENT)
repo.commit("add file3")
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
return (loader, swhid)
def check_revision_two_roots(self, ert, p, swhid):
assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex()
(c3,) = ert.repo[hashutil.hash_to_bytehex(swhid.object_id)].parents
assert len(ert.repo[c3].parents) == 2
def load_repo_two_heads(self, git_loader):
#
# 1---2----4 <-- master and b1
# \
# ----3 <-- b2
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
repo.commit("Add file1")
(rp / "file2").write_text(TEST_CONTENT)
c2 = repo.commit("Add file2")
repo.repo.refs[b"refs/heads/b2"] = c2 # branch b2 from master
(rp / "file3").write_text(TEST_CONTENT)
repo.commit("add file3", ref=b"refs/heads/b2")
(rp / "file4").write_text(TEST_CONTENT)
c4 = repo.commit("add file4", ref=b"refs/heads/master")
repo.repo.refs[b"refs/heads/b1"] = c4 # branch b1 from master
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
return (loader, swhid)
def check_snapshot_two_heads(self, ert, p, swhid):
assert (
hashutil.hash_to_bytehex(swhid.object_id)
== ert.repo.refs[b"HEAD"]
== ert.repo.refs[b"refs/heads/master"]
== ert.repo.refs[b"refs/remotes/origin/HEAD"]
== ert.repo.refs[b"refs/remotes/origin/master"]
== ert.repo.refs[b"refs/remotes/origin/b1"]
)
c4_id = hashutil.hash_to_bytehex(swhid.object_id)
c3_id = ert.repo.refs[b"refs/remotes/origin/b2"]
assert ert.repo[c3_id].parents == ert.repo[c4_id].parents
def load_repo_two_double_fork_merge(self, git_loader):
#
# 2---4---6
# / / /
# 1---3---5
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
c1 = repo.commit("Add file1") # create commit 1
repo.repo.refs[b"refs/heads/c1"] = c1 # branch c1 from master
(rp / "file2").write_text(TEST_CONTENT)
repo.commit("Add file2") # create commit 2
(rp / "file3").write_text(TEST_CONTENT)
c3 = repo.commit("Add file3", ref=b"refs/heads/c1") # create commit 3 on c1
repo.repo.refs[b"refs/heads/c3"] = c3 # branch c3 from c1
repo.merge([c3]) # create commit 4
(rp / "file5").write_text(TEST_CONTENT)
c5 = repo.commit("Add file3", ref=b"refs/heads/c3") # create commit 5 on c3
repo.merge([c5]) # create commit 6
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
return (loader, swhid)
def check_revision_two_double_fork_merge(self, ert, p, swhid):
assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex()
def check_snapshot_two_double_fork_merge(self, ert, p, swhid):
assert (
hashutil.hash_to_bytehex(swhid.object_id)
== ert.repo.refs[b"HEAD"]
== ert.repo.refs[b"refs/heads/master"]
== ert.repo.refs[b"refs/remotes/origin/HEAD"]
== ert.repo.refs[b"refs/remotes/origin/master"]
)
(c4_id, c5_id) = ert.repo[swhid.object_id.hex().encode()].parents
assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"]
(c2_id, c3_id) = ert.repo[c4_id].parents
assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"]
def load_repo_triple_merge(self, git_loader):
#
# .---.---5
# / / /
# 2 3 4
# / / /
# 1---.---.
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
c1 = repo.commit("Commit 1")
repo.repo.refs[b"refs/heads/b1"] = c1
repo.repo.refs[b"refs/heads/b2"] = c1
repo.commit("Commit 2")
c3 = repo.commit("Commit 3", ref=b"refs/heads/b1")
c4 = repo.commit("Commit 4", ref=b"refs/heads/b2")
repo.merge([c3, c4])
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
return (loader, swhid)
def check_revision_triple_merge(self, ert, p, swhid):
assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex()
def check_snapshot_triple_merge(self, ert, p, swhid):
assert (
hashutil.hash_to_bytehex(swhid.object_id)
== ert.repo.refs[b"HEAD"]
== ert.repo.refs[b"refs/heads/master"]
== ert.repo.refs[b"refs/remotes/origin/HEAD"]
== ert.repo.refs[b"refs/remotes/origin/master"]
)
(c2_id, c3_id, c4_id) = ert.repo[swhid.object_id.hex().encode()].parents
assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"]
assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"]
assert (
ert.repo[c2_id].parents
== ert.repo[c3_id].parents
== ert.repo[c4_id].parents
)
def load_repo_filtered_objects(self, git_loader):
repo = TestRepo()
with repo as rp:
file_1, id_1 = hash_content(b"test1")
file_2, id_2 = hash_content(b"test2")
file_3, id_3 = hash_content(b"test3")
(rp / "file").write_bytes(file_1)
(rp / "hidden_file").write_bytes(file_2)
(rp / "absent_file").write_bytes(file_3)
repo.commit()
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
- # FIXME: storage.content_update() should be changed to allow things
- # like that
- with loader.storage.get_db().transaction() as cur:
- cur.execute(
- """update content set status = 'visible'
- where sha1 = %s""",
- (id_1,),
- )
- cur.execute(
- """update content set status = 'hidden'
- where sha1 = %s""",
- (id_2,),
- )
-
- cur.execute(
- """
- insert into skipped_content
- (sha1, sha1_git, sha256, blake2s256, length, reason)
- select sha1, sha1_git, sha256, blake2s256, length, 'no reason'
- from content
- where sha1 = %s
- """,
- (id_3,),
- )
+ # alter the content of the storage
+ # 1/ make file 2 a hidden file object
+ loader.storage._allow_overwrite = True
+ cnt2 = attrs.evolve(
+ loader.storage.content_get([id_2])[0], status="hidden", data=file_2
+ )
+ loader.storage.content_add([cnt2])
+ assert loader.storage.content_get([id_2])[0].status == "hidden"
+
+ # 2/ make file 3 a skipped file object
+ cnt3 = loader.storage.content_get([id_3])[0].to_dict()
+ cnt3["status"] = "absent"
+ cnt3["reason"] = "no reason"
+ sk_cnt3 = SkippedContent.from_dict(cnt3)
+ loader.storage.skipped_content_add([sk_cnt3])
+ # dirty dirty dirty... let's pretend it is the equivalent of writing sql
+ # queries in the postgresql backend
+ for hashkey in loader.storage._cql_runner._content_indexes:
+ loader.storage._cql_runner._content_indexes[hashkey].pop(cnt3[hashkey])
- cur.execute("delete from content where sha1 = %s", (id_3,))
return (loader, swhid)
def check_revision_filtered_objects(self, ert, p, swhid):
ert.checkout(b"HEAD")
assert (p / "file").read_bytes() == b"test1"
assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE
assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE
def load_repo_null_fields(self, git_loader):
# Our schema doesn't enforce a lot of non-null revision fields. We need
# to check these cases don't break the cooker.
repo = TestRepo()
with repo as rp:
(rp / "file").write_text(TEST_CONTENT)
c = repo.commit("initial commit")
loader = git_loader(str(rp))
loader.load()
repo.repo.refs[b"HEAD"].decode()
dir_id_hex = repo.repo[c].tree.decode()
dir_id = hashutil.hash_to_bytes(dir_id_hex)
test_revision = Revision(
message=b"",
author=Person(name=None, email=None, fullname=b""),
date=None,
committer=Person(name=None, email=None, fullname=b""),
committer_date=None,
parents=(),
type=RevisionType.GIT,
directory=dir_id,
metadata={},
synthetic=True,
)
storage = loader.storage
storage.revision_add([test_revision])
return (loader, test_revision.swhid())
def check_revision_null_fields(self, ert, p, swhid):
ert.checkout(b"HEAD")
assert (p / "file").stat().st_mode == 0o100644
def load_repo_tags(self, git_loader):
# v-- t2
#
# 1---2----5 <-- master, t5, and t5a (annotated)
# \
# ----3----4 <-- t4a (annotated)
#
repo = TestRepo()
with repo as rp:
(rp / "file1").write_text(TEST_CONTENT)
repo.commit("Add file1")
(rp / "file2").write_text(TEST_CONTENT)
repo.commit("Add file2") # create c2
repo.tag(b"t2")
(rp / "file3").write_text(TEST_CONTENT)
repo.commit("add file3")
(rp / "file4").write_text(TEST_CONTENT)
repo.commit("add file4")
repo.tag(b"t4a", message=b"tag 4")
# Go back to c2
repo.git_shell("reset", "--hard", "HEAD^^")
(rp / "file5").write_text(TEST_CONTENT)
repo.commit("add file5") # create c5
repo.tag(b"t5")
repo.tag(b"t5a", message=b"tag 5")
obj_id_hex = repo.repo.refs[b"HEAD"].decode()
obj_id = hashutil.hash_to_bytes(obj_id_hex)
swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
loader = git_loader(str(rp))
loader.load()
return (loader, swhid)
def check_snapshot_tags(self, ert, p, swhid):
assert (
hashutil.hash_to_bytehex(swhid.object_id)
== ert.repo.refs[b"HEAD"]
== ert.repo.refs[b"refs/heads/master"]
== ert.repo.refs[b"refs/remotes/origin/HEAD"]
== ert.repo.refs[b"refs/remotes/origin/master"]
== ert.repo.refs[b"refs/tags/t5"]
)
c2_id = ert.repo.refs[b"refs/tags/t2"]
c5_id = hashutil.hash_to_bytehex(swhid.object_id)
assert ert.repo[c5_id].parents == [c2_id]
t5a = ert.repo[ert.repo.refs[b"refs/tags/t5a"]]
# TODO: investigate why new dulwich adds \n
assert t5a.message in (b"tag 5", b"tag 5\n")
assert t5a.object == (dulwich.objects.Commit, c5_id)
t4a = ert.repo[ert.repo.refs[b"refs/tags/t4a"]]
(_, c4_id) = t4a.object
assert ert.repo[c4_id].message == b"add file4\n" # TODO: ditto
(c3_id,) = ert.repo[c4_id].parents
assert ert.repo[c3_id].message == b"add file3\n" # TODO: ditto
assert ert.repo[c3_id].parents == [c2_id]
class TestRevisionCooker(RepoFixtures):
def test_revision_simple(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_simple(git_loader)
with cook_extract_revision(loader.storage, swhid) as (ert, p):
self.check_revision_simple(ert, p, swhid)
def test_revision_two_roots(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_two_roots(git_loader)
with cook_extract_revision(loader.storage, swhid) as (ert, p):
self.check_revision_two_roots(ert, p, swhid)
def test_revision_two_double_fork_merge(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_two_double_fork_merge(git_loader)
with cook_extract_revision(loader.storage, swhid) as (ert, p):
self.check_revision_two_double_fork_merge(ert, p, swhid)
def test_revision_triple_merge(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_triple_merge(git_loader)
with cook_extract_revision(loader.storage, swhid) as (ert, p):
self.check_revision_triple_merge(ert, p, swhid)
def test_revision_filtered_objects(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_filtered_objects(git_loader)
with cook_extract_revision(loader.storage, swhid) as (ert, p):
self.check_revision_filtered_objects(ert, p, swhid)
def test_revision_null_fields(self, git_loader, cook_extract_revision):
(loader, swhid) = self.load_repo_null_fields(git_loader)
with cook_extract_revision(loader.storage, swhid, fsck=False) as (ert, p):
self.check_revision_null_fields(ert, p, swhid)
@pytest.mark.parametrize("ingest_target_revision", [False, True])
def test_revision_submodule(
self, swh_storage, cook_extract_revision, ingest_target_revision
):
date = TimestampWithTimezone.from_datetime(
datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
)
target_rev = Revision(
message=b"target_rev",
author=Person.from_fullname(b"me <test@example.org>"),
date=date,
committer=Person.from_fullname(b"me <test@example.org>"),
committer_date=date,
parents=(),
type=RevisionType.GIT,
directory=bytes.fromhex("3333333333333333333333333333333333333333"),
metadata={},
synthetic=True,
)
if ingest_target_revision:
swh_storage.revision_add([target_rev])
dir = Directory(
entries=(
DirectoryEntry(
name=b"submodule",
type="rev",
target=target_rev.id,
perms=0o160000,
),
),
)
swh_storage.directory_add([dir])
rev = Revision(
message=b"msg",
author=Person.from_fullname(b"me <test@example.org>"),
date=date,
committer=Person.from_fullname(b"me <test@example.org>"),
committer_date=date,
parents=(),
type=RevisionType.GIT,
directory=dir.id,
metadata={},
synthetic=True,
)
swh_storage.revision_add([rev])
with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
ert.checkout(b"HEAD")
pattern = b"160000 submodule\x00%s" % target_rev.id
tree = ert.repo[b"HEAD"].tree
assert pattern in ert.repo[tree].as_raw_string()
class TestSnapshotCooker(RepoFixtures):
def test_snapshot_simple(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_simple(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_revision_simple(ert, p, main_rev_id)
def test_snapshot_two_roots(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_two_roots(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_revision_two_roots(ert, p, main_rev_id)
def test_snapshot_two_heads(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_two_heads(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_snapshot_two_heads(ert, p, main_rev_id)
def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_revision_two_double_fork_merge(ert, p, main_rev_id)
self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id)
def test_snapshot_triple_merge(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_triple_merge(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_revision_triple_merge(ert, p, main_rev_id)
self.check_snapshot_triple_merge(ert, p, main_rev_id)
def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_filtered_objects(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_revision_filtered_objects(ert, p, main_rev_id)
def test_snapshot_tags(self, git_loader, cook_extract_snapshot):
(loader, main_rev_id) = self.load_repo_tags(git_loader)
snp_id = loader.loaded_snapshot_id
swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)
with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
self.check_snapshot_tags(ert, p, main_rev_id)
def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot):
"""Tests that objects that were originally malformed:
* are still interpreted somewhat correctly (if the loader could make sense of
them), especially that they still have links to children
* have their original manifest in the bundle
"""
date = TimestampWithTimezone.from_numeric_offset(
Timestamp(1643819927, 0), 0, False
)
content = Content.from_data(b"foo")
swh_storage.content_add([content])
# disordered
# fmt: off
malformed_dir_manifest = (
b""
+ b"100644 file2\x00" + content.sha1_git
+ b"100644 file1\x00" + content.sha1_git
)
# fmt: on
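# (a git tree entry is "<mode> <name>\x00<20-byte sha1>"; listing file2
# before file1 breaks git's canonical ordering, hence the raw_manifest below)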
directory = Directory(
entries=(
DirectoryEntry(
name=b"file1", type="file", perms=0o100644, target=content.sha1_git
),
DirectoryEntry(
name=b"file2", type="file", perms=0o100644, target=content.sha1_git
),
),
raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode()
+ malformed_dir_manifest,
)
swh_storage.directory_add([directory])
# 'committer' and 'author' swapped
# fmt: off
malformed_rev_manifest = (
b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n"
+ b"committer me <test@example.org> 1643819927 +0000\n"
+ b"author me <test@example.org> 1643819927 +0000\n"
+ b"\n"
+ b"rev"
)
# fmt: on
revision = Revision(
message=b"rev",
author=Person.from_fullname(b"me <test@example.org>"),
date=date,
committer=Person.from_fullname(b"me <test@example.org>"),
committer_date=date,
parents=(),
type=RevisionType.GIT,
directory=directory.id,
synthetic=True,
raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode()
+ malformed_rev_manifest,
)
swh_storage.revision_add([revision])
# 'tag' and 'tagger' swapped
# fmt: off
malformed_rel_manifest = (
b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n"
+ b"type commit\n"
+ b"tagger me <test@example.org> 1643819927 +0000\n"
+ b"tag v1.1.0\n"
)
# fmt: on
release = Release(
name=b"v1.1.0",
message=None,
author=Person.from_fullname(b"me <test@example.org>"),
date=date,
target=revision.id,
target_type=ModelObjectType.REVISION,
synthetic=True,
raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode()
+ malformed_rel_manifest,
)
swh_storage.release_add([release])
snapshot = Snapshot(
branches={
b"refs/tags/v1.1.0": SnapshotBranch(
target=release.id, target_type=TargetType.RELEASE
),
b"HEAD": SnapshotBranch(
target=revision.id, target_type=TargetType.REVISION
),
}
)
swh_storage.snapshot_add([snapshot])
with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
tag = ert.repo[b"refs/tags/v1.1.0"]
assert tag.as_raw_string() == malformed_rel_manifest
commit = ert.repo[tag.object[1]]
assert commit.as_raw_string() == malformed_rev_manifest
tree = ert.repo[commit.tree]
assert tree.as_raw_string() == malformed_dir_manifest
diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py
index f3c27e9..138dd4d 100644
--- a/swh/vault/tests/test_git_bare_cooker.py
+++ b/swh/vault/tests/test_git_bare_cooker.py
@@ -1,681 +1,702 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
This module contains additional tests for the bare cooker.
Generic cooker tests (e.g. without swh-graph) in test_cookers.py also
run on the bare cooker.
"""
import datetime
import enum
+from functools import partial
import io
import subprocess
import tarfile
import tempfile
import unittest.mock
import attr
import dulwich.repo
import pytest
from pytest import param
+from pytest_postgresql import factories
+from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact
from swh.model.from_disk import DentryPerms
from swh.model.model import (
Content,
Directory,
DirectoryEntry,
ObjectType,
Person,
Release,
Revision,
RevisionType,
Snapshot,
SnapshotBranch,
TargetType,
Timestamp,
TimestampWithTimezone,
)
+from swh.storage import get_storage
+from swh.storage.postgresql.db import Db as StorageBackend
from swh.vault.cookers.git_bare import GitBareCooker
from swh.vault.in_memory_backend import InMemoryVaultBackend
+storage_postgresql_proc = factories.postgresql_proc(
+ dbname="storage",
+ load=[
+ partial(
+ initialize_database_for_module, "storage", StorageBackend.current_version
+ )
+ ],
+)
+
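+# no_db_drop=True keeps the database between tests, for performance reasons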
+storage_postgresql = postgresql_fact("storage_postgresql_proc", no_db_drop=True)
+
+
+@pytest.fixture
+def swh_storage(storage_postgresql):
+ return get_storage("local", db=storage_postgresql.dsn, objstorage={"cls": "memory"})
+
class RootObjects(enum.Enum):
REVISION = enum.auto()
SNAPSHOT = enum.auto()
RELEASE = enum.auto()
WEIRD_RELEASE = enum.auto() # has a : in the name + points to another release
@pytest.mark.graph
@pytest.mark.parametrize(
"root_object,up_to_date_graph,tag,weird_branches",
[
param(
RootObjects.REVISION,
False,
False,
False,
id="rev, outdated graph, no tag/tree/blob",
),
param(
RootObjects.REVISION,
True,
False,
False,
id="rev, updated graph, no tag/tree/blob",
),
param(
RootObjects.RELEASE,
False,
False,
False,
id="rel, outdated graph, no tag/tree/blob",
),
param(
RootObjects.RELEASE,
True,
False,
False,
id="rel, updated graph, no tag/tree/blob",
),
param(
RootObjects.WEIRD_RELEASE,
True,
False,
False,
id="weird rel, updated graph, no tag/tree/blob",
),
param(
RootObjects.SNAPSHOT,
False,
False,
False,
id="snp, outdated graph, no tag/tree/blob",
),
param(
RootObjects.SNAPSHOT,
True,
False,
False,
id="snp, updated graph, no tag/tree/blob",
),
param(
RootObjects.SNAPSHOT,
False,
True,
False,
id="snp, outdated graph, w/ tag, no tree/blob",
),
param(
RootObjects.SNAPSHOT,
True,
True,
False,
id="snp, updated graph, w/ tag, no tree/blob",
),
param(
RootObjects.SNAPSHOT,
False,
True,
True,
id="snp, outdated graph, w/ tag, tree, and blob",
),
param(
RootObjects.SNAPSHOT,
True,
True,
True,
id="snp, updated graph, w/ tag, tree, and blob",
),
],
)
def test_graph_revisions(
swh_storage, up_to_date_graph, root_object, tag, weird_branches
):
r"""
Build objects::
snp
/|||\
/ ||| \
rel2 <----° /|\ \----> rel4
| / | \ |
v / v \ v
rev1 <------ rev2 <----° dir4 \ rel3
| | | \ |
v v v \ |
dir1 dir2 dir3 | |
| / | | | |
v / v v v v
cnt1 <----° cnt2 cnt3 cnt4 cnt5
If up_to_date_graph is true, then swh-graph contains all objects.
Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.
If tag is False, rel2 is excluded.
If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
"""
from swh.graph.naive_client import NaiveClient as GraphClient
# Create objects:
date = TimestampWithTimezone.from_datetime(
datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
)
author = Person.from_fullname(b"Foo <foo@example.org>")
cnt1 = Content.from_data(b"correct")
cnt2 = Content.from_data(b"horse")
cnt3 = Content.from_data(b"battery")
cnt4 = Content.from_data(b"staple")
cnt5 = Content.from_data(b"Tr0ub4dor&3")
dir1 = Directory(
entries=(
DirectoryEntry(
name=b"file1",
type="file",
perms=DentryPerms.content,
target=cnt1.sha1_git,
),
)
)
dir2 = Directory(
entries=(
DirectoryEntry(
name=b"file1",
type="file",
perms=DentryPerms.content,
target=cnt1.sha1_git,
),
DirectoryEntry(
name=b"file2",
type="file",
perms=DentryPerms.content,
target=cnt2.sha1_git,
),
)
)
dir3 = Directory(
entries=(
DirectoryEntry(
name=b"file3",
type="file",
perms=DentryPerms.content,
target=cnt3.sha1_git,
),
)
)
dir4 = Directory(
entries=(
DirectoryEntry(
name=b"directory3",
type="dir",
perms=DentryPerms.directory,
target=dir3.id,
),
)
)
rev1 = Revision(
message=b"msg1",
date=date,
committer_date=date,
author=author,
committer=author,
directory=dir1.id,
type=RevisionType.GIT,
synthetic=True,
)
rev2 = Revision(
message=b"msg2",
date=date,
committer_date=date,
author=author,
committer=author,
directory=dir2.id,
parents=(rev1.id,),
type=RevisionType.GIT,
synthetic=True,
)
rel2 = Release(
name=b"1.0.0",
message=b"tag2",
target_type=ObjectType.REVISION,
target=rev2.id,
synthetic=True,
)
rel3 = Release(
name=b"1.0.0-blob",
message=b"tagged-blob",
target_type=ObjectType.CONTENT,
target=cnt5.sha1_git,
synthetic=True,
)
rel4 = Release(
name=b"1.0.0-weird",
message=b"weird release",
target_type=ObjectType.RELEASE,
target=rel3.id,
synthetic=True,
)
rel5 = Release(
name=b"1.0.0:weirdname",
message=b"weird release",
target_type=ObjectType.RELEASE,
target=rel2.id,
synthetic=True,
)
# Create snapshot:
branches = {
b"refs/heads/master": SnapshotBranch(
target=rev2.id, target_type=TargetType.REVISION
),
}
if tag:
branches[b"refs/tags/1.0.0"] = SnapshotBranch(
target=rel2.id, target_type=TargetType.RELEASE
)
if weird_branches:
branches[b"refs/heads/tree-ref"] = SnapshotBranch(
target=dir4.id, target_type=TargetType.DIRECTORY
)
branches[b"refs/heads/blob-ref"] = SnapshotBranch(
target=cnt4.sha1_git, target_type=TargetType.CONTENT
)
branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
target=rel4.id, target_type=TargetType.RELEASE
)
snp = Snapshot(branches=branches)
# "Fill" swh-graph
if up_to_date_graph:
nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
edges = [
(dir1, cnt1),
(dir2, cnt1),
(dir2, cnt2),
(rev1, dir1),
(rev2, dir2),
(rev2, rev1),
(snp, rev2),
]
if tag:
nodes.append(rel2)
edges.append((rel2, rev2))
edges.append((snp, rel2))
if weird_branches:
nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
edges.extend(
[
(dir3, cnt3),
(dir4, dir3),
(snp, dir4),
(snp, cnt4),
(snp, rel4),
(rel4, rel3),
(rel3, cnt5),
(rel5, rev2),
]
)
else:
nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
edges = [
(dir1, cnt1),
(dir2, cnt1),
(dir2, cnt2),
(dir3, cnt3),
(rev1, dir1),
]
if tag:
nodes.append(rel2)
if weird_branches:
nodes.extend([cnt3, dir3])
edges.extend([(dir3, cnt3)])
nodes = [str(n.swhid()) for n in nodes]
edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]
# Add all objects to storage
swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
swh_storage.directory_add([dir1, dir2, dir3, dir4])
swh_storage.revision_add([rev1, rev2])
swh_storage.release_add([rel2, rel3, rel4, rel5])
swh_storage.snapshot_add([snp])
# Add spy on swh_storage, to make sure revision_log is not called
# (the graph must be used instead)
swh_storage = unittest.mock.MagicMock(wraps=swh_storage)
# Add all objects to graph
swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
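# NaiveClient implements the swh-graph API over the explicit node and edge
# lists built above; wrapping it in a Mock lets the test assert on the
# visit_nodes() calls made by the cooker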
# Cook
backend = InMemoryVaultBackend()
cooked_swhid = {
RootObjects.SNAPSHOT: snp.swhid(),
RootObjects.REVISION: rev2.swhid(),
RootObjects.RELEASE: rel2.swhid(),
RootObjects.WEIRD_RELEASE: rel5.swhid(),
}[root_object]
cooker = GitBareCooker(
cooked_swhid,
backend=backend,
storage=swh_storage,
graph=swh_graph,
)
if weird_branches:
# git-fsck now rejects refs pointing to trees and blobs,
# but some old git repos have them.
cooker.use_fsck = False
cooker.cook()
# Get bundle
bundle = backend.fetch("git_bare", cooked_swhid)
# Extract bundle and make sure both revisions are in it
with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
tf.extractall(tempdir)
if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
log_head = "master"
elif root_object == RootObjects.RELEASE:
log_head = "1.0.0"
elif root_object == RootObjects.WEIRD_RELEASE:
log_head = "release"
else:
assert False, root_object
output = subprocess.check_output(
[
"git",
"-C",
f"{tempdir}/{cooked_swhid}.git",
"log",
"--format=oneline",
"--decorate=",
log_head,
]
)
assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
# Make sure the graph was used instead of swh_storage.revision_log
if root_object == RootObjects.SNAPSHOT:
if up_to_date_graph:
# The graph has everything, so the first call succeeds and returns
# all objects transitively pointed by the snapshot
swh_graph.visit_nodes.assert_has_calls(
[
unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
]
)
else:
# The graph does not have everything, so the first call returns nothing.
# However, the second call (on the top rev) succeeds and returns
# all objects but the rev and the rel
swh_graph.visit_nodes.assert_has_calls(
[
unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
]
)
elif root_object in (
RootObjects.REVISION,
RootObjects.RELEASE,
RootObjects.WEIRD_RELEASE,
):
swh_graph.visit_nodes.assert_has_calls(
[unittest.mock.call(str(rev2.swhid()), edges="rev:rev")]
)
else:
assert False, root_object
if up_to_date_graph:
swh_storage.revision_log.assert_not_called()
swh_storage.revision_shortlog.assert_not_called()
else:
swh_storage.revision_log.assert_called()
@pytest.mark.parametrize(
"mismatch_on", ["content", "directory", "revision1", "revision2", "none"]
)
def test_checksum_mismatch(swh_storage, mismatch_on):
date = TimestampWithTimezone.from_datetime(
datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
)
author = Person.from_fullname(b"Foo <foo@example.org>")
wrong_hash = b"\x12\x34" * 10
cnt1 = Content.from_data(b"Tr0ub4dor&3")
if mismatch_on == "content":
cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)
dir1 = Directory(
entries=(
DirectoryEntry(
name=b"file1",
type="file",
perms=DentryPerms.content,
target=cnt1.sha1_git,
),
)
)
if mismatch_on == "directory":
dir1 = attr.evolve(dir1, id=wrong_hash)
rev1 = Revision(
message=b"msg1",
date=date,
committer_date=date,
author=author,
committer=author,
directory=dir1.id,
type=RevisionType.GIT,
synthetic=True,
)
if mismatch_on == "revision1":
rev1 = attr.evolve(rev1, id=wrong_hash)
rev2 = Revision(
message=b"msg2",
date=date,
committer_date=date,
author=author,
committer=author,
directory=dir1.id,
parents=(rev1.id,),
type=RevisionType.GIT,
synthetic=True,
)
if mismatch_on == "revision2":
rev2 = attr.evolve(rev2, id=wrong_hash)
cooked_swhid = rev2.swhid()
swh_storage.content_add([cnt1])
swh_storage.directory_add([dir1])
swh_storage.revision_add([rev1, rev2])
backend = InMemoryVaultBackend()
cooker = GitBareCooker(
cooked_swhid,
backend=backend,
storage=swh_storage,
graph=None,
)
cooker.cook()
# Get bundle
bundle = backend.fetch("git_bare", cooked_swhid)
# Extract bundle and make sure both revisions are in it
with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
tf.extractall(tempdir)
if mismatch_on != "revision2":
# git-log fails if the head revision is corrupted
# TODO: we need to find a way to make this somewhat usable
output = subprocess.check_output(
[
"git",
"-C",
f"{tempdir}/{cooked_swhid}.git",
"log",
"--format=oneline",
"--decorate=",
]
)
assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
@pytest.mark.parametrize(
"use_graph",
[
pytest.param(False, id="without-graph"),
pytest.param(True, id="with-graph", marks=pytest.mark.graph),
],
)
def test_ignore_displayname(swh_storage, use_graph):
"""Tests the original authorship information is used instead of
configured display names; otherwise objects would not match their hash,
and git-fsck/git-clone would fail.
This tests both with and without swh-graph, as both configurations use different
code paths to fetch revisions.
"""
date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False)
legacy_person = Person.from_fullname(b"old me <old@example.org>")
current_person = Person.from_fullname(b"me <me@example.org>")
content = Content.from_data(b"foo")
swh_storage.content_add([content])
directory = Directory(
entries=(
DirectoryEntry(
name=b"file1", type="file", perms=0o100644, target=content.sha1_git
),
),
)
swh_storage.directory_add([directory])
revision = Revision(
message=b"rev",
author=legacy_person,
date=date,
committer=legacy_person,
committer_date=date,
parents=(),
type=RevisionType.GIT,
directory=directory.id,
synthetic=True,
)
swh_storage.revision_add([revision])
release = Release(
name=b"v1.1.0",
message=None,
author=legacy_person,
date=date,
target=revision.id,
target_type=ObjectType.REVISION,
synthetic=True,
)
swh_storage.release_add([release])
snapshot = Snapshot(
branches={
b"refs/tags/v1.1.0": SnapshotBranch(
target=release.id, target_type=TargetType.RELEASE
),
b"HEAD": SnapshotBranch(
target=revision.id, target_type=TargetType.REVISION
),
}
)
swh_storage.snapshot_add([snapshot])
# Add all objects to graph
if use_graph:
from swh.graph.naive_client import NaiveClient as GraphClient
nodes = [
str(x.swhid()) for x in [content, directory, revision, release, snapshot]
]
edges = [
(str(x.swhid()), str(y.swhid()))
for (x, y) in [
(directory, content),
(revision, directory),
(release, revision),
(snapshot, release),
(snapshot, revision),
]
]
swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
else:
swh_graph = None
# Set a display name
with swh_storage.db() as db:
with db.transaction() as cur:
cur.execute(
"UPDATE person set displayname = %s where fullname = %s",
(current_person.fullname, legacy_person.fullname),
)
# Check the display name did apply in the storage
assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
revision,
author=current_person,
committer=current_person,
)
# Cook
cooked_swhid = snapshot.swhid()
backend = InMemoryVaultBackend()
cooker = GitBareCooker(
cooked_swhid,
backend=backend,
storage=swh_storage,
graph=swh_graph,
)
cooker.cook()
# Get bundle
bundle = backend.fetch("git_bare", cooked_swhid)
# Extract bundle and make sure both revisions are in it
with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
tf.extractall(tempdir)
# If we are here, it means git-fsck succeeded when called by cooker.cook(),
# so we already know the original person was used. Let's double-check.
repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")
tag = repo[b"refs/tags/v1.1.0"]
assert tag.tagger == legacy_person.fullname
commit = repo[tag.object[1]]
assert commit.author == legacy_person.fullname
