diff --git a/requirements-swh.txt b/requirements-swh.txt index 47cf1f6..da20312 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,6 @@ swh.core[db,http] >= 0.14.0 +swh.graph >= v0.3.2 swh.model >= 0.3 swh.objstorage >= 0.0.17 swh.scheduler >= 0.7.0 swh.storage >= 0.0.106 diff --git a/swh/vault/cli.py b/swh/vault/cli.py index 2f038d2..f1f649b 100644 --- a/swh/vault/cli.py +++ b/swh/vault/cli.py @@ -1,165 +1,167 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations # WARNING: do not import unnecessary things here to keep cli startup time under # control import logging from typing import TYPE_CHECKING, Optional import click from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.core.cli import swh as swh_cli_group if TYPE_CHECKING: import io from swh.model.identifiers import CoreSWHID class SwhidParamType(click.ParamType): name = "swhid" def convert(self, value, param, ctx): from swh.model.exceptions import ValidationError from swh.model.identifiers import CoreSWHID try: return CoreSWHID.from_string(value) except ValidationError: self.fail(f"expected core SWHID, got {value!r}", param, ctx) @swh_cli_group.group(name="vault", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.pass_context def vault(ctx): """Software Heritage Vault tools.""" @vault.command() @click.option( "--config-file", "-C", default=None, metavar="CONFIGFILE", type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.argument("swhid", type=SwhidParamType()) @click.argument("outfile", type=click.File("wb")) @click.option( "--cooker-type", type=click.Choice(["flat", "gitfast", "git_bare"]), help="Selects which cooker to use, when there is more than one available " "for the given object type.", ) @click.pass_context def cook( ctx, config_file: str, swhid: CoreSWHID, outfile: io.RawIOBase, cooker_type: Optional[str], ): """ Runs a vault cooker for a single object (identified by a SWHID), and outputs it to the given file. """ from swh.core import config + from swh.graph.client import RemoteGraphClient from swh.storage import get_storage from .cookers import COOKER_TYPES, get_cooker_cls from .in_memory_backend import InMemoryVaultBackend conf = config.read(config_file) supported_object_types = {name.split("_")[0] for name in COOKER_TYPES} if swhid.object_type.name.lower() not in supported_object_types: raise click.ClickException( f"No cooker available for {swhid.object_type.name} objects." ) cooker_name = swhid.object_type.name.lower() if cooker_type: cooker_name = f"{cooker_name}_{cooker_type}" if cooker_name not in COOKER_TYPES: raise click.ClickException( f"{swhid.object_type.name.lower()} objects do not have " f"a {cooker_type} cooker." ) else: if cooker_name not in COOKER_TYPES: raise click.ClickException( f"{swhid.object_type.name.lower()} objects need " f"an explicit --cooker-type." ) backend = InMemoryVaultBackend() storage = get_storage(**conf["storage"]) + graph = RemoteGraphClient(**conf["graph"]) if "graph" in conf else None cooker_cls = get_cooker_cls(cooker_name) - cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage) + cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage, graph) cooker.cook() bundle = backend.fetch(cooker_name, swhid.object_id) assert bundle, "Cooker did not write a bundle to the backend." outfile.write(bundle) @vault.command(name="rpc-serve") @click.option( "--config-file", "-C", default=None, metavar="CONFIGFILE", type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.option( "--host", default="0.0.0.0", metavar="IP", show_default=True, help="Host ip address to bind the server on", ) @click.option( "--port", default=5005, type=click.INT, metavar="PORT", help="Binding port of the server", ) @click.option( "--debug/--no-debug", default=True, help="Indicates if the server should run in debug mode", ) @click.pass_context def serve(ctx, config_file, host, port, debug): """Software Heritage Vault RPC server.""" import aiohttp from swh.vault.api.server import make_app_from_configfile ctx.ensure_object(dict) try: app = make_app_from_configfile(config_file, debug=debug) except EnvironmentError as e: click.echo(e.msg, err=True) ctx.exit(1) aiohttp.web.run_app(app, host=host, port=int(port)) def main(): logging.basicConfig() return serve(auto_envvar_prefix="SWH_VAULT") if __name__ == "__main__": main() diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py index fc665ab..8ef8b36 100644 --- a/swh/vault/cookers/__init__.py +++ b/swh/vault/cookers/__init__.py @@ -1,99 +1,102 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import os from typing import Any, Dict from swh.core.config import load_named_config from swh.core.config import read as read_config +from swh.graph.client import RemoteGraphClient from swh.storage import get_storage from swh.vault import get_vault from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH from swh.vault.cookers.directory import DirectoryCooker from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.cookers.revision_flat import RevisionFlatCooker from swh.vault.cookers.revision_gitfast import RevisionGitfastCooker COOKER_TYPES = { "directory": DirectoryCooker, "revision_flat": RevisionFlatCooker, "revision_gitfast": RevisionGitfastCooker, "revision_git_bare": GitBareCooker, "directory_git_bare": GitBareCooker, } def get_cooker_cls(obj_type): return COOKER_TYPES[obj_type] def check_config(cfg: Dict[str, Any]) -> Dict[str, Any]: """Ensure the configuration is ok to run a vault worker, and propagate defaults Raises: EnvironmentError if the configuration is not for remote instance ValueError if one of the following keys is missing: vault, storage Returns: New configuration dict to instantiate a vault worker instance """ cfg = cfg.copy() if "vault" not in cfg: raise ValueError("missing 'vault' configuration") vcfg = cfg["vault"] if vcfg["cls"] != "remote": raise EnvironmentError( "This vault backend can only be a 'remote' configuration" ) # TODO: Soft-deprecation of args key. Remove when ready. vcfg.update(vcfg.get("args", {})) # Default to top-level value if any if "storage" not in vcfg: vcfg["storage"] = cfg.get("storage") if not vcfg.get("storage"): raise ValueError("invalid configuration: missing 'storage' config entry.") return cfg def get_cooker(obj_type: str, obj_id: str): """Instantiate a cooker class of type obj_type. Returns: Cooker class in charge of cooking the obj_type with id obj_id. Raises: ValueError in case of a missing top-level vault key configuration or a storage key. EnvironmentError in case the vault configuration reference a non remote class. """ if "SWH_CONFIG_FILENAME" in os.environ: cfg = read_config(os.environ["SWH_CONFIG_FILENAME"], DEFAULT_CONFIG) else: cfg = load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG) cooker_cls = get_cooker_cls(obj_type) cfg = check_config(cfg) vcfg = cfg["vault"] storage = get_storage(**vcfg.pop("storage")) backend = get_vault(**vcfg) + graph = RemoteGraphClient(**vcfg["graph"]) if "graph" in vcfg else None return cooker_cls( obj_type, obj_id, backend=backend, storage=storage, + graph=graph, max_bundle_size=cfg["max_bundle_size"], ) diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py index 9425c82..e36341e 100644 --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -1,140 +1,147 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import io import logging from typing import Optional from psycopg2.extensions import QueryCanceledError from swh.model import hashutil MAX_BUNDLE_SIZE = 2 ** 29 # 512 MiB DEFAULT_CONFIG_PATH = "vault/cooker" DEFAULT_CONFIG = { "max_bundle_size": ("int", MAX_BUNDLE_SIZE), } class PolicyError(Exception): """Raised when the bundle violates the cooking policy.""" pass class BundleTooLargeError(PolicyError): """Raised when the bundle is too large to be cooked.""" pass class BytesIOBundleSizeLimit(io.BytesIO): def __init__(self, *args, size_limit=None, **kwargs): super().__init__(*args, **kwargs) self.size_limit = size_limit def write(self, chunk): if ( self.size_limit is not None and self.getbuffer().nbytes + len(chunk) > self.size_limit ): raise BundleTooLargeError( "The requested bundle exceeds the maximum allowed " "size of {} bytes.".format(self.size_limit) ) return super().write(chunk) class BaseVaultCooker(metaclass=abc.ABCMeta): """Abstract base class for the vault's bundle creators This class describes a common API for the cookers. To define a new cooker, inherit from this class and override: - CACHE_TYPE_KEY: key to use for the bundle to reference in cache - def cook(): cook the object into a bundle """ CACHE_TYPE_KEY = None # type: Optional[str] def __init__( - self, obj_type, obj_id, backend, storage, max_bundle_size=MAX_BUNDLE_SIZE + self, + obj_type, + obj_id, + backend, + storage, + graph=None, + max_bundle_size=MAX_BUNDLE_SIZE, ): """Initialize the cooker. The type of the object represented by the id depends on the concrete class. Very likely, each type of bundle will have its own cooker class. Args: obj_type: type of the object to be cooked into a bundle (directory, revision_flat or revision_gitfast; see swh.vault.cooker.COOKER_TYPES). obj_id: id of the object to be cooked into a bundle. backend: the vault backend (swh.vault.backend.VaultBackend). """ self.obj_type = obj_type self.obj_id = hashutil.hash_to_bytes(obj_id) self.backend = backend self.storage = storage + self.graph = graph self.max_bundle_size = max_bundle_size @abc.abstractmethod def check_exists(self): """Checks that the requested object exists and can be cooked. Override this in the cooker implementation. """ raise NotImplementedError @abc.abstractmethod def prepare_bundle(self): """Implementation of the cooker. Yields chunks of the bundle bytes. Override this with the cooker implementation. """ raise NotImplementedError def cache_type_key(self) -> str: assert self.CACHE_TYPE_KEY return self.CACHE_TYPE_KEY def write(self, chunk): self.fileobj.write(chunk) def cook(self): """Cook the requested object into a bundle """ self.backend.set_status(self.obj_type, self.obj_id, "pending") self.backend.set_progress(self.obj_type, self.obj_id, "Processing...") self.fileobj = BytesIOBundleSizeLimit(size_limit=self.max_bundle_size) try: try: self.prepare_bundle() except QueryCanceledError: raise PolicyError( "Timeout reached while assembling the requested bundle" ) bundle = self.fileobj.getvalue() # TODO: use proper content streaming instead of put_bundle() self.backend.put_bundle(self.cache_type_key(), self.obj_id, bundle) except PolicyError as e: self.backend.set_status(self.obj_type, self.obj_id, "failed") self.backend.set_progress(self.obj_type, self.obj_id, str(e)) except Exception: self.backend.set_status(self.obj_type, self.obj_id, "failed") self.backend.set_progress( self.obj_type, self.obj_id, "Internal Server Error. This incident will be reported.", ) logging.exception("Bundle cooking failed.") else: self.backend.set_status(self.obj_type, self.obj_id, "done") self.backend.set_progress(self.obj_type, self.obj_id, None) finally: self.backend.send_notif(self.obj_type, self.obj_id) diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py index 47b4cdd..6d7e89f 100644 --- a/swh/vault/cookers/git_bare.py +++ b/swh/vault/cookers/git_bare.py @@ -1,213 +1,252 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ This cooker creates tarballs containing a bare .git directory, that can be unpacked and cloned like any git repository. It works in three steps: 1. Write objects one by one in :file:`.git/objects/` 2. Calls ``git repack`` to pack all these objects into git packfiles. 3. Creates a tarball of the resulting repository To avoid downloading and writing the same objects twice, it checks the existence of the object file in the temporary directory. To avoid sending a syscall every time, it also uses ``functools.lru_cache``, as a first layer of cache before checking the file's existence. """ import datetime import functools import os.path import subprocess import tarfile import tempfile -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, List import zlib +from swh.core.utils import grouper +from swh.graph.client import GraphArgumentException from swh.model import identifiers from swh.model.hashutil import hash_to_bytehex, hash_to_hex from swh.model.model import ( Person, Revision, RevisionType, Sha1Git, TimestampWithTimezone, ) from swh.storage.algos.revisions_walker import DFSRevisionsWalker from swh.vault.cookers.base import BaseVaultCooker +REVISION_BATCH_SIZE = 10000 + class GitBareCooker(BaseVaultCooker): use_fsck = True def cache_type_key(self) -> str: return self.obj_type def check_exists(self): obj_type = self.obj_type.split("_")[0] if obj_type == "revision": return not list(self.storage.revision_missing([self.obj_id])) elif obj_type == "directory": return not list(self.storage.directory_missing([self.obj_id])) else: raise NotImplementedError(f"GitBareCooker for {obj_type}") def obj_swhid(self) -> identifiers.CoreSWHID: obj_type = self.obj_type.split("_")[0] return identifiers.CoreSWHID( object_type=identifiers.ObjectType[obj_type.upper()], object_id=self.obj_id, ) def prepare_bundle(self): - with tempfile.TemporaryDirectory() as workdir: + with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir: # Initialize a Git directory self.workdir = workdir self.gitdir = os.path.join(workdir, "clone.git") os.mkdir(self.gitdir) self.init_git() # Load and write all the objects to disk self.load_subgraph(self.obj_type.split("_")[0], self.obj_id) # Write the root object as a ref. # This must be done before repacking; git-repack ignores orphan objects. self.write_refs() self.repack() self.write_archive() def init_git(self) -> None: subprocess.run(["git", "-C", self.gitdir, "init", "--bare"], check=True) # Create all possible dirs ahead of time, so we don't have to check for # existence every time. for byte in range(256): os.mkdir(os.path.join(self.gitdir, "objects", f"{byte:02x}")) def repack(self) -> None: if self.use_fsck: subprocess.run(["git", "-C", self.gitdir, "fsck"], check=True) # Add objects we wrote in a pack subprocess.run(["git", "-C", self.gitdir, "repack"], check=True) # Remove their non-packed originals subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True) def write_refs(self): obj_type = self.obj_type.split("_")[0] if obj_type == "directory": # We need a synthetic revision pointing to the directory author = Person.from_fullname( b"swh-vault, git-bare cooker " ) dt = datetime.datetime.now(tz=datetime.timezone.utc) dt = dt.replace(microsecond=0) # not supported by git date = TimestampWithTimezone.from_datetime(dt) revision = Revision( author=author, committer=author, date=date, committer_date=date, message=b"Initial commit", type=RevisionType.GIT, directory=self.obj_id, synthetic=True, ) self.write_revision_node(revision.to_dict()) head = revision.id elif obj_type == "revision": head = self.obj_id else: assert False, obj_type with open(os.path.join(self.gitdir, "refs", "heads", "master"), "wb") as fd: fd.write(hash_to_bytehex(head)) def write_archive(self): with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf: tf.add(self.gitdir, arcname=f"{self.obj_swhid()}.git", recursive=True) def _obj_path(self, obj_id: Sha1Git): obj_id_hex = hash_to_hex(obj_id) directory = obj_id_hex[0:2] filename = obj_id_hex[2:] return os.path.join(self.gitdir, "objects", directory, filename) def object_exists(self, obj_id: Sha1Git) -> bool: return os.path.exists(self._obj_path(obj_id)) def write_object(self, obj_id: Sha1Git, obj: bytes) -> bool: """Writes a git object on disk. Returns whether it was already written.""" # Git requires objects to be zlib-compressed; but repacking decompresses and # removes them, so we don't need to compress them too much. data = zlib.compress(obj, level=1) with open(self._obj_path(obj_id), "wb") as fd: fd.write(data) return True def load_subgraph(self, obj_type, obj_id) -> None: if obj_type == "revision": self.load_revision_subgraph(obj_id) elif obj_type == "directory": self.load_directory_subgraph(obj_id) else: raise NotImplementedError(f"GitBareCooker.load_subgraph({obj_type!r}, ...)") def load_revision_subgraph(self, obj_id: Sha1Git) -> None: """Fetches a revision and all its children, and writes them to disk""" - walker = DFSRevisionsWalker(self.storage, obj_id) - for revision in walker: - self.write_revision_node(revision) - self.load_directory_subgraph(revision["directory"]) + loaded_from_graph = False + + if self.graph: + # First, try to cook using swh-graph, as it is more efficient than + # swh-storage for querying the history + obj_swhid = identifiers.CoreSWHID( + object_type=identifiers.ObjectType.REVISION, object_id=obj_id, + ) + try: + revision_ids = ( + swhid.object_id + for swhid in map( + identifiers.CoreSWHID.from_string, + self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"), + ) + ) + for revision_id_group in grouper(revision_ids, REVISION_BATCH_SIZE): + self.load_revisions_and_directory_subgraphs(revision_id_group) + except GraphArgumentException: + # Revision not found in the graph + pass + else: + loaded_from_graph = True + + if not loaded_from_graph: + # If swh-graph is not available, or the revision is not yet in + # swh-graph, fall back to self.storage.revision_log. + walker = DFSRevisionsWalker(self.storage, obj_id) + for revision in walker: + self.write_revision_node(revision) + self.load_directory_subgraph(revision["directory"]) + + def load_revisions_and_directory_subgraphs(self, obj_ids: List[Sha1Git]) -> None: + """Given a list of revision ids, loads these revisions and their directories; + but not their parent revisions.""" + revisions = self.storage.revision_get(obj_ids) + for revision in revisions: + self.write_revision_node(revision.to_dict()) + self.load_directory_subgraph(revision.directory) def write_revision_node(self, revision: Dict[str, Any]) -> bool: """Writes a revision object to disk""" git_object = identifiers.revision_git_object(revision) return self.write_object(revision["id"], git_object) @functools.lru_cache(10240) def load_directory_subgraph(self, obj_id: Sha1Git) -> None: """Fetches a directory and all its children, and writes them to disk""" if self.object_exists(obj_id): # Checks if the object is already written on disk. # This rarely happens thanks to @lru_cache() return directory = self.load_directory_node(obj_id) entry_loaders: Dict[str, Callable[[Sha1Git], None]] = { "file": self.load_content, "dir": self.load_directory_subgraph, "rev": self.load_revision_subgraph, } for entry in directory["entries"]: entry_loader = entry_loaders[entry["type"]] entry_loader(entry["target"]) def load_directory_node(self, obj_id: Sha1Git) -> Dict[str, Any]: """Fetches a directory, writes it to disk (non-recursively), and returns it.""" entries = list(self.storage.directory_ls(obj_id, recursive=False)) directory = {"id": obj_id, "entries": entries} git_object = identifiers.directory_git_object(directory) self.write_object(obj_id, git_object) return directory @functools.lru_cache(10240) def load_content(self, obj_id: Sha1Git) -> None: if self.object_exists(obj_id): # Checks if the object is already written on disk. # This rarely happens thanks to @lru_cache() return # TODO: add support of filtered objects, somehow? # It's tricky, because, by definition, we can't write a git object with # the expected hash, so git-fsck *will* choke on it. content_sha1 = self.storage.content_find({"sha1_git": obj_id})[0].sha1 content = self.storage.content_get_data(content_sha1) self.write_object(obj_id, f"blob {len(content)}\0".encode("ascii") + content) diff --git a/swh/vault/tests/test_cli.py b/swh/vault/tests/test_cli.py index 53f484b..4c35ff6 100644 --- a/swh/vault/tests/test_cli.py +++ b/swh/vault/tests/test_cli.py @@ -1,102 +1,103 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile from unittest.mock import MagicMock import click import click.testing import pytest from swh.vault.cli import vault as vault_cli_group from swh.vault.cookers.base import BaseVaultCooker from swh.vault.in_memory_backend import InMemoryVaultBackend def test_cook_unsupported_swhid(): runner = click.testing.CliRunner() result = runner.invoke(vault_cli_group, ["cook", "swh:1:dir:f00b4r", "-"]) assert isinstance(result.exception, SystemExit) assert "expected core SWHID" in result.stdout result = runner.invoke(vault_cli_group, ["cook", "swh:1:ori:" + "0" * 40, "-"]) assert isinstance(result.exception, SystemExit) assert "expected core SWHID" in result.stdout result = runner.invoke(vault_cli_group, ["cook", "swh:1:cnt:" + "0" * 40, "-"]) assert isinstance(result.exception, SystemExit) assert "No cooker available for CONTENT" in result.stdout def test_cook_unknown_cooker(): runner = click.testing.CliRunner() result = runner.invoke( vault_cli_group, ["cook", "swh:1:dir:" + "0" * 40, "-", "--cooker-type", "gitfast"], ) assert isinstance(result.exception, SystemExit) assert "do not have a gitfast cooker" in result.stdout result = runner.invoke(vault_cli_group, ["cook", "swh:1:rev:" + "0" * 40, "-"]) assert isinstance(result.exception, SystemExit) assert "explicit --cooker-type" in result.stdout @pytest.mark.parametrize( "obj_type,cooker_name_suffix,swhid_type", [("directory", "", "dir"), ("revision", "gitfast", "rev"),], ) def test_cook_directory(obj_type, cooker_name_suffix, swhid_type, mocker): storage = object() mocker.patch("swh.storage.get_storage", return_value=storage) backend = MagicMock(spec=InMemoryVaultBackend) backend.fetch.return_value = b"bundle content" mocker.patch( "swh.vault.in_memory_backend.InMemoryVaultBackend", return_value=backend ) cooker = MagicMock(spec=BaseVaultCooker) cooker_cls = MagicMock(return_value=cooker) mocker.patch("swh.vault.cookers.get_cooker_cls", return_value=cooker_cls) runner = click.testing.CliRunner() with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd: config_fd.write('{"storage": {}}') config_fd.seek(0) if cooker_name_suffix: result = runner.invoke( vault_cli_group, [ "cook", f"swh:1:{swhid_type}:{'0'*40}", "-", "-C", config_fd.name, "--cooker-type", cooker_name_suffix, ], ) else: result = runner.invoke( vault_cli_group, ["cook", f"swh:1:{swhid_type}:{'0'*40}", "-", "-C", config_fd.name], ) if result.exception is not None: raise result.exception cooker_cls.assert_called_once_with( f"{obj_type}_{cooker_name_suffix}" if cooker_name_suffix else obj_type, b"\x00" * 20, backend, storage, + None, ) cooker.cook.assert_called_once_with() assert result.stdout_bytes == b"bundle content" diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py new file mode 100644 index 0000000..78d5aff --- /dev/null +++ b/swh/vault/tests/test_git_bare_cooker.py @@ -0,0 +1,181 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +""" +This module contains additional tests for the bare cooker. +Generic cooker tests (eg. without swh-graph) in test_cookers.py also +run on the bare cooker. +""" + +import datetime +import io +import subprocess +import tarfile +import tempfile +import unittest.mock + +import pytest + +from swh.graph.naive_client import NaiveClient as GraphClient +from swh.model.from_disk import DentryPerms +from swh.model.model import ( + Content, + Directory, + DirectoryEntry, + Person, + Revision, + RevisionType, + TimestampWithTimezone, +) +from swh.vault.cookers.git_bare import GitBareCooker +from swh.vault.in_memory_backend import InMemoryVaultBackend + + +def get_objects(last_revision_in_graph): + """ + Build objects:: + + rev1 <------ rev2 + | | + v v + dir1 dir2 + | / | + v / v + cnt1 <----° cnt2 + """ + date = TimestampWithTimezone.from_datetime( + datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc) + ) + author = Person.from_fullname(b"Foo ") + cnt1 = Content.from_data(b"hello") + cnt2 = Content.from_data(b"world") + dir1 = Directory( + entries=( + DirectoryEntry( + name=b"file1", + type="file", + perms=DentryPerms.content, + target=cnt1.sha1_git, + ), + ) + ) + dir2 = Directory( + entries=( + DirectoryEntry( + name=b"file1", + type="file", + perms=DentryPerms.content, + target=cnt1.sha1_git, + ), + DirectoryEntry( + name=b"file2", + type="file", + perms=DentryPerms.content, + target=cnt2.sha1_git, + ), + ) + ) + rev1 = Revision( + message=b"msg1", + date=date, + committer_date=date, + author=author, + committer=author, + directory=dir1.id, + type=RevisionType.GIT, + synthetic=True, + ) + rev2 = Revision( + message=b"msg2", + date=date, + committer_date=date, + author=author, + committer=author, + directory=dir2.id, + parents=(rev1.id,), + type=RevisionType.GIT, + synthetic=True, + ) + + if last_revision_in_graph: + nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1, rev2]] + edges = [ + (str(s.swhid()), str(d.swhid())) + for (s, d) in [ + (dir1, cnt1), + (dir2, cnt1), + (dir2, cnt2), + (rev1, dir1), + (rev2, dir2), + (rev2, rev1), + ] + ] + else: + nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1]] + edges = [ + (str(s.swhid()), str(d.swhid())) + for (s, d) in [(dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1),] + ] + + return (cnt1, cnt2, dir1, dir2, rev1, rev2, nodes, edges) + + +@pytest.mark.parametrize("last_revision_in_graph", [True, False]) +def test_graph_revisions(swh_storage, last_revision_in_graph): + (cnt1, cnt2, dir1, dir2, rev1, rev2, nodes, edges) = get_objects( + last_revision_in_graph + ) + + # Add all objects to storage + swh_storage.content_add([cnt1, cnt2]) + swh_storage.directory_add([dir1, dir2]) + swh_storage.revision_add([rev1, rev2]) + + # Add spy on swh_storage, to make sure revision_log is not called + # (the graph must be used instead) + swh_storage = unittest.mock.MagicMock(wraps=swh_storage) + + # Add all objects to graph + swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) + + # Cook + backend = InMemoryVaultBackend() + cooker = GitBareCooker( + "revision_gitbare", + rev2.id, + backend=backend, + storage=swh_storage, + graph=swh_graph, + ) + cooker.cook() + + # Get bundle + bundle = backend.fetch("revision_gitbare", rev2.id) + + # Extract bundle and make sure both revisions are in it + with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: + with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: + tf.extractall(tempdir) + + output = subprocess.check_output( + [ + "git", + "-C", + f"{tempdir}/{rev2.swhid()}.git", + "log", + "--format=oneline", + "--decorate=", + ] + ) + + assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" + + # Make sure the graph was used instead of swh_storage.revision_log + swh_graph.visit_nodes.assert_called_once_with(str(rev2.swhid()), edges="rev:rev") + if last_revision_in_graph: + swh_storage.revision_log.assert_not_called() + swh_storage.revision_shortlog.assert_not_called() + else: + swh_storage.revision_log.assert_called()