diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,5 @@
 swh.core[db,http] >= 0.14.0
+swh.graph >= v0.3.2
 swh.model >= 0.3
 swh.objstorage >= 0.0.17
 swh.scheduler >= 0.7.0
diff --git a/swh/vault/cli.py b/swh/vault/cli.py
--- a/swh/vault/cli.py
+++ b/swh/vault/cli.py
@@ -70,6 +70,7 @@
     and outputs it to the given file.
     """
     from swh.core import config
+    from swh.graph.client import RemoteGraphClient
     from swh.storage import get_storage
 
     from .cookers import COOKER_TYPES, get_cooker_cls
@@ -101,8 +102,9 @@
     backend = InMemoryVaultBackend()
 
     storage = get_storage(**conf["storage"])
+    graph = RemoteGraphClient(**conf["graph"]) if "graph" in conf else None
     cooker_cls = get_cooker_cls(cooker_name)
-    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage)
+    cooker = cooker_cls(cooker_name, swhid.object_id, backend, storage, graph)
     cooker.cook()
 
     bundle = backend.fetch(cooker_name, swhid.object_id)
diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -10,6 +10,7 @@
 
 from swh.core.config import load_named_config
 from swh.core.config import read as read_config
+from swh.graph.client import RemoteGraphClient
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH
@@ -89,11 +90,13 @@
 
     storage = get_storage(**vcfg.pop("storage"))
     backend = get_vault(**vcfg)
+    graph = RemoteGraphClient(**vcfg["graph"]) if "graph" in vcfg else None
 
     return cooker_cls(
         obj_type,
         obj_id,
         backend=backend,
         storage=storage,
+        graph=graph,
         max_bundle_size=cfg["max_bundle_size"],
     )
diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py
--- a/swh/vault/cookers/base.py
+++ b/swh/vault/cookers/base.py
@@ -61,7 +61,13 @@
     CACHE_TYPE_KEY = None  # type: Optional[str]
 
     def __init__(
-        self, obj_type, obj_id, backend, storage, max_bundle_size=MAX_BUNDLE_SIZE
+        self,
+        obj_type,
+        obj_id,
+        backend,
+        storage,
+        graph=None,
+        max_bundle_size=MAX_BUNDLE_SIZE,
     ):
         """Initialize the cooker.
 
@@ -80,6 +86,7 @@
         self.obj_id = hashutil.hash_to_bytes(obj_id)
         self.backend = backend
         self.storage = storage
+        self.graph = graph
         self.max_bundle_size = max_bundle_size
 
     @abc.abstractmethod
diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py
--- a/swh/vault/cookers/git_bare.py
+++ b/swh/vault/cookers/git_bare.py
@@ -25,9 +25,11 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Any, Callable, Dict
+from typing import Any, Callable, Dict, List
 import zlib
 
+from swh.core.utils import grouper
+from swh.graph.client import GraphArgumentException
 from swh.model import identifiers
 from swh.model.hashutil import hash_to_bytehex, hash_to_hex
 from swh.model.model import (
@@ -40,6 +42,8 @@
 from swh.storage.algos.revisions_walker import DFSRevisionsWalker
 from swh.vault.cookers.base import BaseVaultCooker
 
+REVISION_BATCH_SIZE = 10000
+
 
 class GitBareCooker(BaseVaultCooker):
     use_fsck = True
@@ -63,7 +67,7 @@
         )
 
     def prepare_bundle(self):
-        with tempfile.TemporaryDirectory() as workdir:
+        with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir:
             # Initialize a Git directory
             self.workdir = workdir
             self.gitdir = os.path.join(workdir, "clone.git")
@@ -163,10 +167,45 @@
 
     def load_revision_subgraph(self, obj_id: Sha1Git) -> None:
         """Fetches a revision and all its children, and writes them to disk"""
-        walker = DFSRevisionsWalker(self.storage, obj_id)
-        for revision in walker:
-            self.write_revision_node(revision)
-            self.load_directory_subgraph(revision["directory"])
+        loaded_from_graph = False
+
+        if self.graph:
+            # First, try to cook using swh-graph, as it is more efficient than
+            # swh-storage for querying the history
+            obj_swhid = identifiers.CoreSWHID(
+                object_type=identifiers.ObjectType.REVISION, object_id=obj_id,
+            )
+            try:
+                revision_ids = (
+                    swhid.object_id
+                    for swhid in map(
+                        identifiers.CoreSWHID.from_string,
+                        self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
+                    )
+                )
+                for revision_id_group in grouper(revision_ids, REVISION_BATCH_SIZE):
+                    self.load_revisions_and_directory_subgraphs(revision_id_group)
+            except GraphArgumentException:
+                # Revision not found in the graph
+                pass
+            else:
+                loaded_from_graph = True
+
+        if not loaded_from_graph:
+            # If swh-graph is not available, or the revision is not yet in
+            # swh-graph, fall back to self.storage.revision_log.
+            walker = DFSRevisionsWalker(self.storage, obj_id)
+            for revision in walker:
+                self.write_revision_node(revision)
+                self.load_directory_subgraph(revision["directory"])
+
+    def load_revisions_and_directory_subgraphs(self, obj_ids: List[Sha1Git]) -> None:
+        """Given a list of revision ids, loads these revisions and their directories;
+        but not their parent revisions."""
+        revisions = self.storage.revision_get(obj_ids)
+        for revision in revisions:
+            self.write_revision_node(revision.to_dict())
+            self.load_directory_subgraph(revision.directory)
 
     def write_revision_node(self, revision: Dict[str, Any]) -> bool:
         """Writes a revision object to disk"""
diff --git a/swh/vault/tests/test_cli.py b/swh/vault/tests/test_cli.py
--- a/swh/vault/tests/test_cli.py
+++ b/swh/vault/tests/test_cli.py
@@ -96,6 +96,7 @@
         b"\x00" * 20,
         backend,
         storage,
+        None,
     )
 
     cooker.cook.assert_called_once_with()
diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py
new file mode 100644
--- /dev/null
+++ b/swh/vault/tests/test_git_bare_cooker.py
@@ -0,0 +1,181 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+This module contains additional tests for the bare cooker.
+Generic cooker tests (e.g. without swh-graph) in test_cookers.py also
+run on the bare cooker.
+"""
+
+import datetime
+import io
+import subprocess
+import tarfile
+import tempfile
+import unittest.mock
+
+import pytest
+
+from swh.graph.naive_client import NaiveClient as GraphClient
+from swh.model.from_disk import DentryPerms
+from swh.model.model import (
+    Content,
+    Directory,
+    DirectoryEntry,
+    Person,
+    Revision,
+    RevisionType,
+    TimestampWithTimezone,
+)
+from swh.vault.cookers.git_bare import GitBareCooker
+from swh.vault.in_memory_backend import InMemoryVaultBackend
+
+
+def get_objects(last_revision_in_graph):
+    """
+    Build objects::
+
+          rev1  <------ rev2
+           |             |
+           v             v
+          dir1          dir2
+           |           /   |
+           v          /    v
+          cnt1  <----°    cnt2
+    """
+    date = TimestampWithTimezone.from_datetime(
+        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
+    )
+    author = Person.from_fullname(b"Foo <foo@example.org>")
+    cnt1 = Content.from_data(b"hello")
+    cnt2 = Content.from_data(b"world")
+    dir1 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+        )
+    )
+    dir2 = Directory(
+        entries=(
+            DirectoryEntry(
+                name=b"file1",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt1.sha1_git,
+            ),
+            DirectoryEntry(
+                name=b"file2",
+                type="file",
+                perms=DentryPerms.content,
+                target=cnt2.sha1_git,
+            ),
+        )
+    )
+    rev1 = Revision(
+        message=b"msg1",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir1.id,
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+    rev2 = Revision(
+        message=b"msg2",
+        date=date,
+        committer_date=date,
+        author=author,
+        committer=author,
+        directory=dir2.id,
+        parents=(rev1.id,),
+        type=RevisionType.GIT,
+        synthetic=True,
+    )
+
+    if last_revision_in_graph:
+        nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1, rev2]]
+        edges = [
+            (str(s.swhid()), str(d.swhid()))
+            for (s, d) in [
+                (dir1, cnt1),
+                (dir2, cnt1),
+                (dir2, cnt2),
+                (rev1, dir1),
+                (rev2, dir2),
+                (rev2, rev1),
+            ]
+        ]
+    else:
+        nodes = [str(n.swhid()) for n in [cnt1, cnt2, dir1, dir2, rev1]]
+        edges = [
+            (str(s.swhid()), str(d.swhid()))
+            for (s, d) in [(dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1),]
+        ]
+
+    return (cnt1, cnt2, dir1, dir2, rev1, rev2, nodes, edges)
+
+
+@pytest.mark.parametrize("last_revision_in_graph", [True, False])
+def test_graph_revisions(swh_storage, last_revision_in_graph):
+    (cnt1, cnt2, dir1, dir2, rev1, rev2, nodes, edges) = get_objects(
+        last_revision_in_graph
+    )
+
+    # Add all objects to storage
+    swh_storage.content_add([cnt1, cnt2])
+    swh_storage.directory_add([dir1, dir2])
+    swh_storage.revision_add([rev1, rev2])
+
+    # Add spy on swh_storage, to make sure revision_log is not called
+    # (the graph must be used instead)
+    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)
+
+    # Add all objects to graph
+    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
+
+    # Cook
+    backend = InMemoryVaultBackend()
+    cooker = GitBareCooker(
+        "revision_gitbare",
+        rev2.id,
+        backend=backend,
+        storage=swh_storage,
+        graph=swh_graph,
+    )
+    cooker.cook()
+
+    # Get bundle
+    bundle = backend.fetch("revision_gitbare", rev2.id)
+
+    # Extract bundle and make sure both revisions are in it
+    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
+        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
+            tf.extractall(tempdir)
+
+        output = subprocess.check_output(
+            [
+                "git",
+                "-C",
+                f"{tempdir}/{rev2.swhid()}.git",
+                "log",
+                "--format=oneline",
+                "--decorate=",
+            ]
+        )
+
+    assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
+
+    # Make sure the graph was used instead of swh_storage.revision_log
+    swh_graph.visit_nodes.assert_called_once_with(str(rev2.swhid()), edges="rev:rev")
+    if last_revision_in_graph:
+        swh_storage.revision_log.assert_not_called()
+        swh_storage.revision_shortlog.assert_not_called()
+    else:
+        swh_storage.revision_log.assert_called()
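
Usage sketch (not part of the patch): with this change a cooker only queries swh-graph when it is handed a graph client; both swh/vault/cli.py and swh/vault/cookers/__init__.py build one with RemoteGraphClient(**conf["graph"]), so the optional "graph" section of the vault configuration simply maps to the client's constructor arguments (typically a url). The snippet below shows how the same wiring could be done by hand; the graph URL, the in-memory storage and backend, and the revision id are placeholders for illustration, not values taken from the patch.

    # Hypothetical sketch: cook a bare git bundle for one revision, using a
    # swh-graph server if one is reachable at the placeholder URL below.
    from swh.graph.client import RemoteGraphClient
    from swh.storage import get_storage
    from swh.vault.cookers import get_cooker_cls
    from swh.vault.in_memory_backend import InMemoryVaultBackend

    storage = get_storage("memory")  # placeholder storage backend
    graph = RemoteGraphClient(url="http://localhost:5009/graph/")  # placeholder URL
    obj_id = bytes.fromhex("00" * 20)  # placeholder revision sha1_git

    cooker_cls = get_cooker_cls("revision_gitbare")
    cooker = cooker_cls(
        "revision_gitbare",
        obj_id,
        backend=InMemoryVaultBackend(),
        storage=storage,
        # graph may be None; the cooker then falls back to storage.revision_log
        graph=graph,
    )
    cooker.cook()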