diff --git a/swh/vault/cli.py b/swh/vault/cli.py
--- a/swh/vault/cli.py
+++ b/swh/vault/cli.py
@@ -70,6 +70,7 @@
     and outputs it to the given file.
     """
     from swh.core import config
+    from swh.graph.client import RemoteGraphClient
     from swh.storage import get_storage
 
     from .cookers import COOKER_TYPES, get_cooker_cls
@@ -101,8 +102,9 @@
 
     backend = InMemoryVaultBackend()
     storage = get_storage(**conf["storage"])
+    graph = RemoteGraphClient(**conf["graph"]) if "graph" in conf else None
     cooker_cls = get_cooker_cls(cooker_name)
-    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage)
+    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage, graph)
     cooker.cook()
 
     bundle = backend.fetch(cooker_name, swhid.object_id)
diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -10,6 +10,7 @@
 
 from swh.core.config import load_named_config
 from swh.core.config import read as read_config
+from swh.graph.client import RemoteGraphClient
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH
@@ -89,11 +90,13 @@
 
     storage = get_storage(**vcfg.pop("storage"))
     backend = get_vault(**vcfg)
+    graph = RemoteGraphClient(**vcfg["graph"]) if "graph" in vcfg else None
 
     return cooker_cls(
         obj_type,
         obj_id,
         backend=backend,
         storage=storage,
+        graph=graph,
         max_bundle_size=cfg["max_bundle_size"],
     )
diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py
--- a/swh/vault/cookers/base.py
+++ b/swh/vault/cookers/base.py
@@ -61,7 +61,13 @@
     CACHE_TYPE_KEY = None  # type: Optional[str]
 
     def __init__(
-        self, obj_type, obj_id, backend, storage, max_bundle_size=MAX_BUNDLE_SIZE
+        self,
+        obj_type,
+        obj_id,
+        backend,
+        storage,
+        graph=None,
+        max_bundle_size=MAX_BUNDLE_SIZE,
     ):
         """Initialize the cooker.
 
@@ -80,6 +86,7 @@
         self.obj_id = hashutil.hash_to_bytes(obj_id)
         self.backend = backend
         self.storage = storage
+        self.graph = graph
         self.max_bundle_size = max_bundle_size
 
     @abc.abstractmethod
diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
--- a/swh/vault/cookers/git_bare.py
+++ b/swh/vault/cookers/git_bare.py
@@ -25,9 +25,10 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Any, Callable, Dict
+from typing import Any, Callable, Dict, List
 import zlib
 
+from swh.core.utils import grouper
 from swh.model import identifiers
 from swh.model.hashutil import hash_to_bytehex, hash_to_hex
 from swh.model.model import (
@@ -40,6 +41,9 @@
 from swh.storage.algos.revisions_walker import DFSRevisionsWalker
 from swh.vault.cookers.base import BaseVaultCooker
 
+# Number of revision ids passed to storage.revision_get() in one batch
+REVISION_BATCH_SIZE = 10000
+
 
 class GitBareCooker(BaseVaultCooker):
     use_fsck = True
@@ -63,7 +67,7 @@
         )
 
     def prepare_bundle(self):
-        with tempfile.TemporaryDirectory() as workdir:
+        with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir:
             # Initialize a Git directory
             self.workdir = workdir
             self.gitdir = os.path.join(workdir, "clone.git")
@@ -163,10 +167,42 @@
 
     def load_revision_subgraph(self, obj_id: Sha1Git) -> None:
         """Fetches a revision and all its children, and writes them to disk"""
-        walker = DFSRevisionsWalker(self.storage, obj_id)
-        for revision in walker:
-            self.write_revision_node(revision)
-            self.load_directory_subgraph(revision["directory"])
+        loaded_from_graph = False
+
+        if self.graph:
+            # First, try to cook using swh-graph, as it is more efficient than
+            # swh-storage for querying the history
+            obj_swhid = identifiers.CoreSWHID(
+                object_type=identifiers.ObjectType.REVISION, object_id=obj_id,
+            )
+            revision_ids = (
+                swhid.object_id
+                for swhid in map(
+                    identifiers.CoreSWHID.from_string,
+                    self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
+                )
+            )
+            for revision_id_group in grouper(revision_ids, REVISION_BATCH_SIZE):
+                loaded_from_graph = True
+                # Materialize each batch: the loader below is annotated as
+                # taking a List, so hand it a concrete list of ids.
+                self.load_revisions_and_directory_subgraphs(list(revision_id_group))
+
+        if not loaded_from_graph:
+            # If swh-graph is not available, or the revision is not yet in
+            # swh-graph, fall back to self.storage.revision_log.
+            walker = DFSRevisionsWalker(self.storage, obj_id)
+            for revision in walker:
+                self.write_revision_node(revision)
+                self.load_directory_subgraph(revision["directory"])
+
+    def load_revisions_and_directory_subgraphs(self, obj_ids: List[Sha1Git]) -> None:
+        """Given a list of revision ids, loads these revisions and their directories;
+        but not their parent revisions."""
+        revisions = self.storage.revision_get(obj_ids)
+        for revision in revisions:
+            self.write_revision_node(revision.to_dict())
+            self.load_directory_subgraph(revision.directory)
 
     def write_revision_node(self, revision: Dict[str, Any]) -> bool:
         """Writes a revision object to disk"""
diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py
new file mode 100644
--- /dev/null
+++ b/swh/vault/tests/test_git_bare_cooker.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2021  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+This module contains additional tests for the bare cooker.
+Generic cooker tests (e.g. without swh-graph) in test_cookers.py also
+run on the bare cooker.
+"""
+
+import datetime
+import io
+import subprocess
+import tarfile
+import tempfile
+import unittest.mock
+
+from swh.graph.naive_client import NaiveClient as GraphClient
+from swh.model.from_disk import DentryPerms
+from swh.model.model import (
+    Content,
+    Directory,
+    DirectoryEntry,
+    Person,
+    Revision,
+    RevisionType,
+    TimestampWithTimezone,
+)
+from swh.vault.cookers.git_bare import GitBareCooker
+from swh.vault.in_memory_backend import InMemoryVaultBackend
+
+
+def test_graph_revisions(swh_storage):
+    """Checks that revision history is read from swh-graph, not revision_log."""
+    # Build objects:
+    #     rev1  <------ rev2
+    #      |             |
+    #      v             v
+    #     dir1          dir2
+    #      |           /   |
+    #      v          /    v
+    #     cnt1  <----°    cnt2
+    date = TimestampWithTimezone.from_datetime(
+        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
+    )
+    author = Person.from_fullname(b"Foo ")
+    cnt1 = Content.from_data(b"hello")
+    cnt2 = Content.from_data(b"world")
+    dir1 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+        )
+    )
+    dir2 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+            DirectoryEntry(
+                name=b"file2",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt2.sha1_git,
+            ),
+        )
+    )
+    rev1 = Revision(
+        message=b"msg1",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir1.id,
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+    rev2 = Revision(
+        message=b"msg2",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir2.id,
+        parents=(rev1.id,),
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+
+    # Add all objects to storage
+    swh_storage.content_add([cnt1, cnt2])
+    swh_storage.directory_add([dir1, dir2])
+    swh_storage.revision_add([rev1, rev2])
+
+    # Add spy on swh_storage, to make sure revision_log is not called
+    # (the graph must be used instead)
+    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)
+
+    # Add all objects to graph
+    swh_graph = unittest.mock.Mock(
+        wraps=GraphClient(
+            nodes=[str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1, rev2]],
+            edges=[
+                (str(s.swhid()), str(d.swhid()))
+                for (s, d) in [
+                    (dir1, cnt1),
+                    (dir2, cnt1),
+                    (dir2, cnt2),
+                    (rev1, dir1),
+                    (rev2, dir2),
+                    (rev2, rev1),
+                ]
+            ],
+        )
+    )
+
+    # Cook
+    backend = InMemoryVaultBackend()
+    cooker = GitBareCooker(
+        "revision_gitbare",
+        rev2.id,
+        backend=backend,
+        storage=swh_storage,
+        graph=swh_graph,
+    )
+    cooker.cook()
+
+    # Get bundle
+    bundle = backend.fetch("revision_gitbare", rev2.id)
+
+    # Extract bundle and make sure both revisions are in it
+    with tempfile.TemporaryDirectory(prefix="swh-vault-test-bare") as tempdir:
+        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
+            tf.extractall(tempdir)
+
+        output = subprocess.check_output(
+            [
+                "git",
+                "-C",
+                f"{tempdir}/{rev2.swhid()}.git",
+                "log",
+                "--format=oneline",
+                "--decorate=",
+            ]
+        )
+
+        assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
+
+    # Make sure the graph was used instead of swh_storage.revision_log
+    swh_storage.revision_log.assert_not_called()
+    swh_storage.revision_shortlog.assert_not_called()
+    swh_graph.visit_nodes.assert_called_once_with(str(rev2.swhid()), edges="rev:rev")