diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict from dataclasses import dataclass import datetime import logging @@ -326,8 +327,8 @@ if self.dumb: self.dumb_fetcher = dumb.GitObjectsFetcher(self.origin.url, base_repo) self.dumb_fetcher.fetch_object_ids() - self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) # type: ignore - self.symbolic_refs = self.dumb_fetcher.head + self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) + self.symbolic_refs = utils.filter_refs(self.dumb_fetcher.head) else: self.pack_buffer = fetch_info.pack_buffer self.pack_size = fetch_info.pack_size @@ -496,6 +497,47 @@ if not branch: unknown_objects[ref_name] = target + if unknown_objects and self.base_snapshots: + # The remote has sent us a partial packfile. It will have skipped + # objects that it knows are ancestors of the heads we have sent as + # known. We can look these objects up in the archive, as they should + # have had all their ancestors loaded when the previous snapshot was + # loaded. + refs_for_target = defaultdict(list) + for ref_name, target in unknown_objects.items(): + refs_for_target[target].append(ref_name) + + targets_unknown = set(refs_for_target) + + for method, target_type in ( + (self.storage.revision_missing, TargetType.REVISION), + (self.storage.release_missing, TargetType.RELEASE), + (self.storage.directory_missing, TargetType.DIRECTORY), + (self.storage.content_missing_per_sha1_git, TargetType.CONTENT), + ): + missing = set(method(list(targets_unknown))) + known = targets_unknown - missing + + for target in known: + for ref_name in refs_for_target[target]: + logger.debug( + "Inferred type %s for branch %s pointing at unfetched %s", + target_type.name, + ref_name.decode(), + hashutil.hash_to_hex(target), + extra={ + "swh_type": "swh_loader_git_inferred_target_type" + }, + ) + branches[ref_name] = SnapshotBranch( + target=target, target_type=target_type + ) + del unknown_objects[ref_name] + + targets_unknown = missing + if not targets_unknown: + break + if unknown_objects: # This object was referenced by the server; We did not fetch # it, and we do not know it from the previous snapshot. This is diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -6,12 +6,14 @@ import copy import datetime import os.path +from unittest.mock import patch import dulwich.objects import dulwich.porcelain import dulwich.repo import pytest +from swh.loader.git import utils from swh.loader.git.from_disk import GitLoaderFromArchive, GitLoaderFromDisk from swh.loader.tests import ( assert_last_visit_matches, @@ -207,6 +209,52 @@ snapshot=None, ) + def test_load_incremental_partial_history(self): + """Check that loading a partial snapshot, then negotiating a full snapshot, works.""" + + # We pick this branch because it contains the target of the refs/heads/master + # branch of the full snapshot in its history + + interesting_branch = b"refs/tags/branch2-before-delete" + partial_snapshot = Snapshot( + branches={interesting_branch: SNAPSHOT1.branches[interesting_branch]} + ) + + with patch.object( + utils, + "ignore_branch_name", + lambda name: name != interesting_branch, + ), patch.object( + utils, + "filter_refs", + lambda refs: { + ref_name: utils.HexBytes(target) + for ref_name, target in refs.items() + if ref_name == interesting_branch + }, + ): + # Ensure that only the interesting branch is loaded + res = self.loader.load() + + assert res == {"status": "eventful"} + + assert self.loader.storage.snapshot_get_branches(partial_snapshot.id) + + res = self.loader.load() + assert res == {"status": "eventful"} + + stats = get_stats(self.loader.storage) + assert stats == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 2, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 2, + } + class FullGitLoaderTests(CommonGitLoaderTests): """Tests for GitLoader (from disk or not). Includes the common ones, and diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -6,6 +6,7 @@ import datetime from functools import partial from http.server import HTTPServer, SimpleHTTPRequestHandler +import logging import os import subprocess import sys @@ -26,14 +27,7 @@ get_stats, prepare_repository_from_archive, ) -from swh.model.model import ( - Origin, - OriginVisit, - OriginVisitStatus, - Snapshot, - SnapshotBranch, - TargetType, -) +from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot class CommonGitLoaderNotFound: @@ -225,6 +219,26 @@ "has_parent_origins": False, } + def test_load_incremental_partial_history(self, caplog): + with caplog.at_level(logging.DEBUG, logger="swh.loader.git.loader"): + super().test_load_incremental_partial_history() + + # Check that we've indeed inferred the target type for one of the snapshot + # branches + for record in caplog.records: + if ( + hasattr(record, "swh_type") + and record.swh_type == "swh_loader_git_inferred_target_type" + ): + assert record.args == ( + "REVISION", + "refs/heads/master", + SNAPSHOT1.branches[b"refs/heads/master"].target.hex(), + ) + break + else: + assert False, "did not find log message for inferred branch target type" + class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound): """Mostly the same loading scenario but with a ``parent_origin`` different from the @@ -564,56 +578,6 @@ call("git_known_refs_percent", "h", expected_git_known_refs_percent, {}, 1), ] - def test_load_incremental_negotiation(self): - """Check that the packfile negotiated when running an incremental load only - contains the "new" commits, and not all objects.""" - - snapshot_id = b"\x01" * 20 - now = datetime.datetime.now(tz=datetime.timezone.utc) - - def ovgl(origin_url, allowed_statuses, require_snapshot, type): - if origin_url == f"base://{self.repo_url}": - return OriginVisit(origin=origin_url, visit=42, date=now, type="git") - else: - return None - - self.loader.storage.origin_visit_get_latest.side_effect = ovgl - self.loader.storage.origin_visit_status_get_latest.return_value = ( - OriginVisitStatus( - origin=f"base://{self.repo_url}", - visit=42, - snapshot=snapshot_id, - date=now, - status="full", - ) - ) - self.loader.storage.snapshot_get_branches.return_value = { - "id": snapshot_id, - "branches": { - b"refs/heads/master": SnapshotBranch( - # id of the initial commit in the git repository fixture - target=bytes.fromhex("b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a"), - target_type=TargetType.REVISION, - ), - }, - "next_branch": None, - } - - res = self.loader.load() - assert res == {"status": "eventful"} - - stats = get_stats(self.loader.storage) - assert stats == { - "content": 3, # instead of 4 for the full repository - "directory": 6, # instead of 7 - "origin": 1, - "origin_visit": 1, - "release": 0, - "revision": 6, # instead of 7 - "skipped_content": 0, - "snapshot": 1, - } - class DumbGitLoaderTestBase(FullGitLoaderTests): """Prepare a git repository to be loaded using the HTTP dumb transfer protocol.""" diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py --- a/swh/loader/git/utils.py +++ b/swh/loader/git/utils.py @@ -10,7 +10,7 @@ import os import shutil import tempfile -from typing import Dict, NewType, Optional +from typing import Dict, Mapping, NewType, Optional, Union from swh.core import tarball from swh.model.model import SnapshotBranch @@ -93,7 +93,7 @@ return False -def filter_refs(refs: Dict[bytes, bytes]) -> Dict[bytes, HexBytes]: +def filter_refs(refs: Mapping[bytes, Union[bytes, HexBytes]]) -> Dict[bytes, HexBytes]: """Filter the refs dictionary using the policy set in `ignore_branch_name`""" return { name: HexBytes(target)