diff --git a/swh/vault/cli.py b/swh/vault/cli.py index b9808aa..2f038d2 100644 --- a/swh/vault/cli.py +++ b/swh/vault/cli.py @@ -1,165 +1,165 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations # WARNING: do not import unnecessary things here to keep cli startup time under # control import logging from typing import TYPE_CHECKING, Optional import click from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.core.cli import swh as swh_cli_group if TYPE_CHECKING: import io from swh.model.identifiers import CoreSWHID class SwhidParamType(click.ParamType): name = "swhid" def convert(self, value, param, ctx): from swh.model.exceptions import ValidationError from swh.model.identifiers import CoreSWHID try: return CoreSWHID.from_string(value) except ValidationError: self.fail(f"expected core SWHID, got {value!r}", param, ctx) @swh_cli_group.group(name="vault", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.pass_context def vault(ctx): """Software Heritage Vault tools.""" @vault.command() @click.option( "--config-file", "-C", default=None, metavar="CONFIGFILE", type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.argument("swhid", type=SwhidParamType()) @click.argument("outfile", type=click.File("wb")) @click.option( "--cooker-type", - type=click.Choice(["flat", "gitfast"]), + type=click.Choice(["flat", "gitfast", "git_bare"]), help="Selects which cooker to use, when there is more than one available " "for the given object type.", ) @click.pass_context def cook( ctx, config_file: str, swhid: CoreSWHID, outfile: io.RawIOBase, cooker_type: Optional[str], ): """ Runs a vault cooker for a single object (identified by a SWHID), and outputs it to the given file. 
""" from swh.core import config from swh.storage import get_storage from .cookers import COOKER_TYPES, get_cooker_cls from .in_memory_backend import InMemoryVaultBackend conf = config.read(config_file) supported_object_types = {name.split("_")[0] for name in COOKER_TYPES} if swhid.object_type.name.lower() not in supported_object_types: raise click.ClickException( f"No cooker available for {swhid.object_type.name} objects." ) cooker_name = swhid.object_type.name.lower() if cooker_type: cooker_name = f"{cooker_name}_{cooker_type}" if cooker_name not in COOKER_TYPES: raise click.ClickException( f"{swhid.object_type.name.lower()} objects do not have " f"a {cooker_type} cooker." ) else: if cooker_name not in COOKER_TYPES: raise click.ClickException( f"{swhid.object_type.name.lower()} objects need " f"an explicit --cooker-type." ) backend = InMemoryVaultBackend() storage = get_storage(**conf["storage"]) cooker_cls = get_cooker_cls(cooker_name) cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage) cooker.cook() bundle = backend.fetch(cooker_name, swhid.object_id) assert bundle, "Cooker did not write a bundle to the backend." 
outfile.write(bundle) @vault.command(name="rpc-serve") @click.option( "--config-file", "-C", default=None, metavar="CONFIGFILE", type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.option( "--host", default="0.0.0.0", metavar="IP", show_default=True, help="Host ip address to bind the server on", ) @click.option( "--port", default=5005, type=click.INT, metavar="PORT", help="Binding port of the server", ) @click.option( "--debug/--no-debug", default=True, help="Indicates if the server should run in debug mode", ) @click.pass_context def serve(ctx, config_file, host, port, debug): """Software Heritage Vault RPC server.""" import aiohttp from swh.vault.api.server import make_app_from_configfile ctx.ensure_object(dict) try: app = make_app_from_configfile(config_file, debug=debug) except EnvironmentError as e: click.echo(e.msg, err=True) ctx.exit(1) aiohttp.web.run_app(app, host=host, port=int(port)) def main(): logging.basicConfig() return serve(auto_envvar_prefix="SWH_VAULT") if __name__ == "__main__": main() diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py index 0a9df00..fc665ab 100644 --- a/swh/vault/cookers/__init__.py +++ b/swh/vault/cookers/__init__.py @@ -1,96 +1,99 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import os from typing import Any, Dict from swh.core.config import load_named_config from swh.core.config import read as read_config from swh.storage import get_storage from swh.vault import get_vault from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH from swh.vault.cookers.directory import DirectoryCooker +from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.cookers.revision_flat import RevisionFlatCooker from 
swh.vault.cookers.revision_gitfast import RevisionGitfastCooker COOKER_TYPES = { "directory": DirectoryCooker, "revision_flat": RevisionFlatCooker, "revision_gitfast": RevisionGitfastCooker, + "revision_git_bare": GitBareCooker, + "directory_git_bare": GitBareCooker, } def get_cooker_cls(obj_type): return COOKER_TYPES[obj_type] def check_config(cfg: Dict[str, Any]) -> Dict[str, Any]: """Ensure the configuration is ok to run a vault worker, and propagate defaults Raises: EnvironmentError if the configuration is not for remote instance ValueError if one of the following keys is missing: vault, storage Returns: New configuration dict to instantiate a vault worker instance """ cfg = cfg.copy() if "vault" not in cfg: raise ValueError("missing 'vault' configuration") vcfg = cfg["vault"] if vcfg["cls"] != "remote": raise EnvironmentError( "This vault backend can only be a 'remote' configuration" ) # TODO: Soft-deprecation of args key. Remove when ready. vcfg.update(vcfg.get("args", {})) # Default to top-level value if any if "storage" not in vcfg: vcfg["storage"] = cfg.get("storage") if not vcfg.get("storage"): raise ValueError("invalid configuration: missing 'storage' config entry.") return cfg def get_cooker(obj_type: str, obj_id: str): """Instantiate a cooker class of type obj_type. Returns: Cooker class in charge of cooking the obj_type with id obj_id. Raises: ValueError in case of a missing top-level vault key configuration or a storage key. EnvironmentError in case the vault configuration reference a non remote class. 
""" if "SWH_CONFIG_FILENAME" in os.environ: cfg = read_config(os.environ["SWH_CONFIG_FILENAME"], DEFAULT_CONFIG) else: cfg = load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG) cooker_cls = get_cooker_cls(obj_type) cfg = check_config(cfg) vcfg = cfg["vault"] storage = get_storage(**vcfg.pop("storage")) backend = get_vault(**vcfg) return cooker_cls( obj_type, obj_id, backend=backend, storage=storage, max_bundle_size=cfg["max_bundle_size"], ) diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py index a9c32f5..9425c82 100644 --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -1,136 +1,140 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import io import logging from typing import Optional from psycopg2.extensions import QueryCanceledError from swh.model import hashutil MAX_BUNDLE_SIZE = 2 ** 29 # 512 MiB DEFAULT_CONFIG_PATH = "vault/cooker" DEFAULT_CONFIG = { "max_bundle_size": ("int", MAX_BUNDLE_SIZE), } class PolicyError(Exception): """Raised when the bundle violates the cooking policy.""" pass class BundleTooLargeError(PolicyError): """Raised when the bundle is too large to be cooked.""" pass class BytesIOBundleSizeLimit(io.BytesIO): def __init__(self, *args, size_limit=None, **kwargs): super().__init__(*args, **kwargs) self.size_limit = size_limit def write(self, chunk): if ( self.size_limit is not None and self.getbuffer().nbytes + len(chunk) > self.size_limit ): raise BundleTooLargeError( "The requested bundle exceeds the maximum allowed " "size of {} bytes.".format(self.size_limit) ) return super().write(chunk) class BaseVaultCooker(metaclass=abc.ABCMeta): """Abstract base class for the vault's bundle creators This class describes a common API for the cookers. 
To define a new cooker, inherit from this class and override: - CACHE_TYPE_KEY: key to use for the bundle to reference in cache - def cook(): cook the object into a bundle """ CACHE_TYPE_KEY = None # type: Optional[str] def __init__( self, obj_type, obj_id, backend, storage, max_bundle_size=MAX_BUNDLE_SIZE ): """Initialize the cooker. The type of the object represented by the id depends on the concrete class. Very likely, each type of bundle will have its own cooker class. Args: obj_type: type of the object to be cooked into a bundle (directory, revision_flat or revision_gitfast; see swh.vault.cooker.COOKER_TYPES). obj_id: id of the object to be cooked into a bundle. backend: the vault backend (swh.vault.backend.VaultBackend). """ self.obj_type = obj_type self.obj_id = hashutil.hash_to_bytes(obj_id) self.backend = backend self.storage = storage self.max_bundle_size = max_bundle_size @abc.abstractmethod def check_exists(self): """Checks that the requested object exists and can be cooked. Override this in the cooker implementation. """ raise NotImplementedError @abc.abstractmethod def prepare_bundle(self): """Implementation of the cooker. Yields chunks of the bundle bytes. Override this with the cooker implementation. 
""" raise NotImplementedError + def cache_type_key(self) -> str: + assert self.CACHE_TYPE_KEY + return self.CACHE_TYPE_KEY + def write(self, chunk): self.fileobj.write(chunk) def cook(self): """Cook the requested object into a bundle """ self.backend.set_status(self.obj_type, self.obj_id, "pending") self.backend.set_progress(self.obj_type, self.obj_id, "Processing...") self.fileobj = BytesIOBundleSizeLimit(size_limit=self.max_bundle_size) try: try: self.prepare_bundle() except QueryCanceledError: raise PolicyError( "Timeout reached while assembling the requested bundle" ) bundle = self.fileobj.getvalue() # TODO: use proper content streaming instead of put_bundle() - self.backend.put_bundle(self.CACHE_TYPE_KEY, self.obj_id, bundle) + self.backend.put_bundle(self.cache_type_key(), self.obj_id, bundle) except PolicyError as e: self.backend.set_status(self.obj_type, self.obj_id, "failed") self.backend.set_progress(self.obj_type, self.obj_id, str(e)) except Exception: self.backend.set_status(self.obj_type, self.obj_id, "failed") self.backend.set_progress( self.obj_type, self.obj_id, "Internal Server Error. 
This incident will be reported.", ) logging.exception("Bundle cooking failed.") else: self.backend.set_status(self.obj_type, self.obj_id, "done") self.backend.set_progress(self.obj_type, self.obj_id, None) finally: self.backend.send_notif(self.obj_type, self.obj_id) diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py new file mode 100644 index 0000000..1df6986 --- /dev/null +++ b/swh/vault/cookers/git_bare.py @@ -0,0 +1,184 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime +import os.path +import subprocess +import tarfile +import tempfile +from typing import Any, Dict +import zlib + +from swh.model import identifiers +from swh.model.hashutil import hash_to_bytehex, hash_to_hex +from swh.model.model import ( + Person, + Revision, + RevisionType, + Sha1Git, + TimestampWithTimezone, +) +from swh.storage.algos.revisions_walker import DFSRevisionsWalker +from swh.vault.cookers.base import BaseVaultCooker + + +class GitBareCooker(BaseVaultCooker): + use_fsck = True + + def cache_type_key(self) -> str: + return self.obj_type + + def check_exists(self): + obj_type = self.obj_type.split("_")[0] + if obj_type == "revision": + return not list(self.storage.revision_missing([self.obj_id])) + elif obj_type == "directory": + return not list(self.storage.directory_missing([self.obj_id])) + else: + raise NotImplementedError(f"GitBareCooker for {obj_type}") + + def obj_swhid(self) -> identifiers.CoreSWHID: + obj_type = self.obj_type.split("_")[0] + return identifiers.CoreSWHID( + object_type=identifiers.ObjectType[obj_type.upper()], object_id=self.obj_id, + ) + + def prepare_bundle(self): + with tempfile.TemporaryDirectory() as workdir: + # Initialize a Git directory + self.workdir = workdir + self.gitdir = os.path.join(workdir, "clone.git") 
+ os.mkdir(self.gitdir) + self.init_git() + + # Load and write all the objects to disk + self.load_subgraph(self.obj_type.split("_")[0], self.obj_id) + + # Write the root object as a ref. + # This must be done before repacking; git-repack ignores orphan objects. + self.write_refs() + + self.repack() + self.write_archive() + + def init_git(self) -> None: + subprocess.run(["git", "-C", self.gitdir, "init", "--bare"], check=True) + + # Create all possible dirs ahead of time, so we don't have to check for + # existence every time. + for byte in range(256): + os.mkdir(os.path.join(self.gitdir, "objects", f"{byte:02x}")) + + def repack(self) -> None: + if self.use_fsck: + subprocess.run(["git", "-C", self.gitdir, "fsck"], check=True) + + # Add objects we wrote in a pack + subprocess.run(["git", "-C", self.gitdir, "repack"], check=True) + + # Remove their non-packed originals + subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True) + + def write_refs(self): + obj_type = self.obj_type.split("_")[0] + if obj_type == "directory": + # We need a synthetic revision pointing to the directory + author = Person.from_fullname( + b"swh-vault, git-bare cooker " + ) + dt = datetime.datetime.now(tz=datetime.timezone.utc) + dt = dt.replace(microsecond=0) # not supported by git + date = TimestampWithTimezone.from_datetime(dt) + revision = Revision( + author=author, + committer=author, + date=date, + committer_date=date, + message=b"Initial commit", + type=RevisionType.GIT, + directory=self.obj_id, + synthetic=True, + ) + self.write_revision_node(revision.to_dict()) + head = revision.id + elif obj_type == "revision": + head = self.obj_id + else: + assert False, obj_type + + with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd: + fd.write(hash_to_bytehex(head)) + + def write_archive(self): + with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf: + tf.add(self.gitdir, arcname=f"{self.obj_swhid()}.git", recursive=True) + + def write_object(self, 
obj_id: Sha1Git, obj: bytes) -> bool: + """Writes a git object on disk. + + Returns False if the object was already written (and skipped), True if it + was newly written.""" + obj_id_hex = hash_to_hex(obj_id) + directory = obj_id_hex[0:2] + filename = obj_id_hex[2:] + path = os.path.join(self.gitdir, "objects", directory, filename) + if os.path.exists(path): + # Already written + return False + + # Git requires objects to be zlib-compressed; but repacking decompresses and + # removes them, so we don't need to compress them too much. + data = zlib.compress(obj, level=1) + + with open(path, "wb") as fd: + fd.write(data) + return True + + def load_subgraph(self, obj_type, obj_id) -> None: + if obj_type == "revision": + self.load_revision_subgraph(obj_id) + elif obj_type == "directory": + self.load_directory_subgraph(obj_id) + else: + raise NotImplementedError(f"GitBareCooker.load_subgraph({obj_type!r}, ...)") + + def load_revision_subgraph(self, obj_id: Sha1Git) -> None: + """Fetches a revision and all its children, and writes them to disk""" + walker = DFSRevisionsWalker(self.storage, obj_id) + for revision in walker: + self.write_revision_node(revision) + self.load_directory_subgraph(revision["directory"]) + + def write_revision_node(self, revision: Dict[str, Any]) -> bool: + """Writes a revision object to disk""" + git_object = identifiers.revision_git_object(revision) + return self.write_object(revision["id"], git_object) + + def load_directory_subgraph(self, obj_id: Sha1Git) -> None: + """Fetches a directory and all its children, and writes them to disk""" + directory = self.load_directory_node(obj_id) + entry_loaders = { + "file": self.load_content, + "dir": self.load_directory_subgraph, + "rev": self.load_revision_subgraph, + } + for entry in directory["entries"]: + entry_loader = entry_loaders[entry["type"]] + entry_loader(entry["target"]) + + def load_directory_node(self, obj_id: Sha1Git) -> Dict[str, Any]: + """Fetches a directory, writes it to disk (non-recursively), and returns it.""" + entries = 
list(self.storage.directory_ls(obj_id, recursive=False)) + directory = {"id": obj_id, "entries": entries} + git_object = identifiers.directory_git_object(directory) + self.write_object(obj_id, git_object) + return directory + + def load_content(self, obj_id: Sha1Git) -> None: + # TODO: add support of filtered objects, somehow? + # It's tricky, because, by definition, we can't write a git object with + # the expected hash, so git-fsck *will* choke on it. + content_sha1 = self.storage.content_find({"sha1_git": obj_id})[0].sha1 + content = self.storage.content_get_data(content_sha1) + self.write_object(obj_id, f"blob {len(content)}\0".encode("ascii") + content) diff --git a/swh/vault/in_memory_backend.py b/swh/vault/in_memory_backend.py index c8c93cb..e7cbb2a 100644 --- a/swh/vault/in_memory_backend.py +++ b/swh/vault/in_memory_backend.py @@ -1,53 +1,53 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List, Optional, Tuple, Union from swh.model.hashutil import hash_to_bytes from .cache import VaultCache ObjectId = Union[str, bytes] class InMemoryVaultBackend: """Stub vault backend, for use in the CLI.""" def __init__(self): self._cache = VaultCache(cls="memory") def fetch(self, obj_type: str, obj_id: ObjectId) -> Optional[bytes]: - return self._cache.get(obj_type, obj_id) + return self._cache.get(obj_type, hash_to_bytes(obj_id)) def cook( self, obj_type: str, obj_id: ObjectId, email: Optional[str] = None ) -> Dict[str, Any]: raise NotImplementedError("InMemoryVaultBackend.cook()") def progress(self, obj_type: str, obj_id: ObjectId): raise NotImplementedError("InMemoryVaultBackend.progress()") # Cookers endpoints def set_progress(self, obj_type: str, obj_id: ObjectId, progress: str) -> None: pass def set_status(self, obj_type: str, obj_id: 
ObjectId, status: str) -> None: pass def put_bundle(self, obj_type: str, obj_id: ObjectId, bundle) -> bool: self._cache.add(obj_type, hash_to_bytes(obj_id), bundle) return True def send_notif(self, obj_type: str, obj_id: ObjectId): pass # Batch endpoints def batch_cook(self, batch: List[Tuple[str, str]]) -> int: raise NotImplementedError("InMemoryVaultBackend.batch_cook()") def batch_progress(self, batch_id: int) -> Dict[str, Any]: pass diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py index bd162c5..63f518f 100644 --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -1,617 +1,704 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import datetime import glob import gzip import io import os import pathlib +import shutil import subprocess import tarfile import tempfile import unittest import unittest.mock import dulwich.fastexport import dulwich.index import dulwich.objects import dulwich.porcelain import dulwich.repo import pytest from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model import from_disk, hashutil from swh.model.model import Directory, DirectoryEntry, Person, Revision, RevisionType -from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker +from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker from swh.vault.tests.vault_testing import hash_content from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE class TestRepo: """A tiny context manager for a test git repository, with some utility functions to perform basic git stuff. 
""" def __enter__(self): self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") self.repo_dir = self.tmp_dir.__enter__() self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b"Test Author" self.author_email = b"test@softwareheritage.org" self.author = b"%s <%s>" % (self.author_name, self.author_email) self.base_date = 258244200 self.counter = 0 return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): self.tmp_dir.__exit__(exc, value, tb) def checkout(self, rev_sha): rev = self.repo[rev_sha] dulwich.index.build_index_from_tree( self.repo_dir, self.repo.index_path(), self.repo.object_store, rev.tree ) def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs): name = self.author_name email = self.author_email date = "%d +0000" % (self.base_date + self.counter) env = { # Set git commit format "GIT_AUTHOR_NAME": name, "GIT_AUTHOR_EMAIL": email, "GIT_AUTHOR_DATE": date, "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "GIT_COMMITTER_DATE": date, # Ignore all the system-wide and user configurations "GIT_CONFIG_NOSYSTEM": "1", "HOME": str(self.tmp_dir), "XDG_CONFIG_HOME": str(self.tmp_dir), } kwargs.setdefault("env", {}).update(env) subprocess.check_call( ("git", "-C", self.repo_dir) + cmd, stdout=stdout, **kwargs ) def commit(self, message="Commit test\n", ref=b"HEAD"): """Commit the current working tree in a new commit with message on the branch 'ref'. At the end of the commit, the reference should stay the same and the index should be clean. 
""" paths = [ os.path.relpath(path, self.repo_dir) for path in glob.glob(self.repo_dir + "/**/*", recursive=True) ] self.repo.stage(paths) message = message.encode() + b"\n" ret = self.repo.do_commit( message=message, committer=self.author, commit_timestamp=self.base_date + self.counter, commit_timezone=0, ref=ref, ) self.counter += 1 # committing on another branch leaves # dangling files in index if ref != b"HEAD": # XXX this should work (but does not) # dulwich.porcelain.reset(self.repo, 'hard') self.git_shell("reset", "--hard", "HEAD") return ret def merge(self, parent_sha_list, message="Merge branches."): self.git_shell( "merge", "--allow-unrelated-histories", "-m", message, *[p.decode() for p in parent_sha_list], ) self.counter += 1 return self.repo.refs[b"HEAD"] def print_debug_graph(self, reflog=False): args = ["log", "--all", "--graph", "--decorate"] if reflog: args.append("--reflog") self.git_shell(*args, stdout=None) @pytest.fixture def git_loader(swh_storage,): """Instantiate a Git Loader using the storage instance as storage. 
""" def _create_loader(directory): return GitLoaderFromDisk( swh_storage, "fake_origin", directory=directory, visit_date=datetime.datetime.now(datetime.timezone.utc), ) return _create_loader @contextlib.contextmanager -def cook_extract_directory(storage, obj_id): +def cook_extract_directory_dircooker(storage, obj_id, fsck=True): """Context manager that cooks a directory and extract it.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = DirectoryCooker("directory", obj_id, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) yield pathlib.Path(td) / hashutil.hash_to_hex(obj_id) cooker.storage = None +@contextlib.contextmanager +def cook_extract_directory_git_bare(storage, obj_id, fsck=True): + """Context manager that cooks a directory and extract it, + using GitBareCooker""" + backend = unittest.mock.MagicMock() + backend.storage = storage + + # Cook the object + cooker = GitBareCooker("directory", obj_id, backend=backend, storage=storage) + cooker.use_fsck = fsck  # Some tests try edge-cases that git-fsck rejects + cooker.fileobj = io.BytesIO() + assert cooker.check_exists() + cooker.prepare_bundle() + cooker.fileobj.seek(0) + + # Extract it + with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: + with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: + tar.extractall(td) + + # Clone it with Dulwich + test_repo = TestRepo() + with test_repo as p: + test_repo.git_shell( + "pull", os.path.join(td, f"swh:1:dir:{obj_id.hex()}.git") + ) + shutil.rmtree(p / ".git") + yield p + + +@pytest.fixture( + scope="module", + params=[cook_extract_directory_dircooker, cook_extract_directory_git_bare], +) +def cook_extract_directory(request): + """A fixture that is instantiated as either 
cook_extract_directory_dircooker or + cook_extract_directory_git_bare.""" + return request.param + + @contextlib.contextmanager def cook_stream_revision_gitfast(storage, obj_id): """Context manager that cooks a revision and stream its fastexport.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = RevisionGitfastCooker( "revision_gitfast", obj_id, backend=backend, storage=storage ) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj) yield fastexport_stream cooker.storage = None @contextlib.contextmanager -def cook_extract_revision_gitfast(storage, obj_id): - """Context manager that cooks a revision and extract it.""" +def cook_extract_revision_gitfast(storage, obj_id, fsck=True): + """Context manager that cooks a revision and extract it, + using RevisionGitfastCooker""" test_repo = TestRepo() with cook_stream_revision_gitfast(storage, obj_id) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) yield test_repo, p +@contextlib.contextmanager +def cook_extract_revision_git_bare(storage, obj_id, fsck=True): + """Context manager that cooks a revision and extract it, + using GitBareCooker""" + backend = unittest.mock.MagicMock() + backend.storage = storage + + # Cook the object + cooker = GitBareCooker("revision", obj_id, backend=backend, storage=storage) + cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects + cooker.fileobj = io.BytesIO() + assert cooker.check_exists() + cooker.prepare_bundle() + cooker.fileobj.seek(0) + + # Extract it + with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: + with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: + tar.extractall(td) + + # Clone it with Dulwich + test_repo = TestRepo() + with test_repo as p: + test_repo.git_shell( + "pull", os.path.join(td, f"swh:1:rev:{obj_id.hex()}.git") + ) 
+ yield test_repo, p + + +@pytest.fixture( + scope="module", + params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare], +) +def cook_extract_revision(request): + """A fixture that is instantiated as either cook_extract_revision_gitfast or + cook_extract_revision_git_bare.""" + return request.param + + TEST_CONTENT = ( " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces " ) TEST_EXECUTABLE = b"\x42\x40\x00\x00\x05" class TestDirectoryCooker: - def test_directory_simple(self, git_loader): + def test_directory_simple(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) - def test_directory_filtered_objects(self, git_loader): + def test_directory_filtered_objects(self, git_loader, cook_extract_directory): + if cook_extract_directory is cook_extract_directory_git_bare: + pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = 
hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) # FIXME: storage.content_update() should be changed to allow things # like that with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """update content set status = 'absent' where sha1 = %s""", (id_3,), ) with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE - def test_directory_bogus_perms(self, git_loader): + def test_directory_bogus_perms(self, git_loader, cook_extract_directory): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the directory # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) # Disable mode cleanup with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode): c = repo.commit() # Make sure Dulwich didn't normalize the permissions itself. 
# (if it did, then the test can't check the cooker normalized them) tree_id = repo.repo[c].tree assert {entry.mode for entry in repo.repo[tree_id].items()} == { 0o100775, 0o100664, 0o100604, } # Disable mode checks with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None): loader = git_loader(str(rp)) loader.load() # Make sure swh-loader didn't normalize them either dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id)) assert {entry["perms"] for entry in dir_entries} == { 0o100664, 0o100775, 0o100604, } obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 - def test_directory_revision_data(self, swh_storage): + def test_directory_revision_data(self, swh_storage, cook_extract_directory): + if cook_extract_directory is cook_extract_directory_git_bare: + pytest.xfail("GitBareCooker does not support submodules yet") target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) - with cook_extract_directory(swh_storage, dir.id) as p: + with cook_extract_directory(swh_storage, dir.id, fsck=False) as p: assert (p / "submodule").is_symlink() assert os.readlink(str(p / "submodule")) == target_rev -class TestRevisionGitfastCooker: - def test_revision_simple(self, git_loader): +class TestRevisionCooker: + def test_revision_simple(self, git_loader, cook_extract_revision): # # 1--2--3--4--5--6--7 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("add file2") (rp / "dir1/dir2").mkdir(parents=True) (rp / 
"dir1/dir2/file").write_text(TEST_CONTENT) repo.commit("add dir1/dir2/file") (rp / "bin1").write_bytes(TEST_EXECUTABLE) (rp / "bin1").chmod(0o755) repo.commit("add bin1") (rp / "link1").symlink_to("file1") repo.commit("link link1 to file1") (rp / "file2").unlink() repo.commit("remove file2") (rp / "bin1").rename(rp / "bin") repo.commit("rename bin1 to bin") loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT assert (p / "link1").is_symlink assert os.readlink(str(p / "link1")) == "file1" assert (p / "bin").stat().st_mode == 0o100755 assert (p / "bin").read_bytes() == TEST_EXECUTABLE assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_roots(self, git_loader): + def test_revision_two_roots(self, git_loader, cook_extract_revision): # # 1----3---4 # / # 2---- # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") del repo.repo.refs[b"refs/heads/master"] # git update-ref -d HEAD (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") repo.merge([c1]) (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_double_fork_merge(self, git_loader): + def test_revision_two_double_fork_merge(self, 
git_loader, cook_extract_revision): # # 2---4---6 # / / / # 1---3---5 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") repo.repo.refs[b"refs/heads/c1"] = c1 (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") (rp / "file3").write_text(TEST_CONTENT) c3 = repo.commit("Add file3", ref=b"refs/heads/c1") repo.repo.refs[b"refs/heads/c3"] = c3 repo.merge([c3]) (rp / "file5").write_text(TEST_CONTENT) c5 = repo.commit("Add file3", ref=b"refs/heads/c3") repo.merge([c5]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_triple_merge(self, git_loader): + def test_revision_triple_merge(self, git_loader, cook_extract_revision): # # .---.---5 # / / / # 2 3 4 # / / / # 1---.---. 
# repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Commit 1") repo.repo.refs[b"refs/heads/b1"] = c1 repo.repo.refs[b"refs/heads/b2"] = c1 repo.commit("Commit 2") c3 = repo.commit("Commit 3", ref=b"refs/heads/b1") c4 = repo.commit("Commit 4", ref=b"refs/heads/b2") repo.merge([c3, c4]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_filtered_objects(self, git_loader): + def test_revision_filtered_objects(self, git_loader, cook_extract_revision): + if cook_extract_revision is cook_extract_revision_git_bare: + pytest.xfail("GitBareCooker does not support filtered objects (yet?)") repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) repo.commit() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) loader = git_loader(str(rp)) loader.load() # FIXME: storage.content_update() should be changed to allow things # like that with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """update content set status = 'absent' where sha1 = %s""", (id_3,), ) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / 
"absent_file").read_bytes() == SKIPPED_MESSAGE - def test_revision_bogus_perms(self, git_loader): + def test_revision_bogus_perms(self, git_loader, cook_extract_revision): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the revision # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) # Disable mode cleanup with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode): c = repo.commit("initial commit") # Make sure Dulwich didn't normalize the permissions itself. # (if it did, then the test can't check the cooker normalized them) tree_id = repo.repo[c].tree assert {entry.mode for entry in repo.repo[tree_id].items()} == { 0o100775, 0o100664, 0o100604, } # Disable mode checks with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None): loader = git_loader(str(rp)) loader.load() # Make sure swh-loader didn't normalize them either dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id)) assert {entry["perms"] for entry in dir_entries} == { 0o100664, 0o100775, 0o100604, } obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): + with cook_extract_revision(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 - def test_revision_null_fields(self, git_loader): + def test_revision_null_fields(self, git_loader, cook_extract_revision): # Our schema doesn't enforce a lot of non-null revision fields. We need # to check these cases don't break the cooker. 
repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) c = repo.commit("initial commit") loader = git_loader(str(rp)) loader.load() repo.repo.refs[b"HEAD"].decode() dir_id_hex = repo.repo[c].tree.decode() dir_id = hashutil.hash_to_bytes(dir_id_hex) test_revision = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir_id, metadata={}, synthetic=True, ) storage = loader.storage storage.revision_add([test_revision]) - with cook_extract_revision_gitfast(storage, test_revision.id) as (ert, p): + with cook_extract_revision(storage, test_revision.id, fsck=False) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 def test_revision_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) rev = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir.id, metadata={}, synthetic=True, ) swh_storage.revision_add([rev]) with cook_stream_revision_gitfast(swh_storage, rev.id) as stream: pattern = "M 160000 {} submodule".format(target_rev).encode() assert pattern in stream.read()