diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py --- a/swh/loader/git/from_disk.py +++ b/swh/loader/git/from_disk.py @@ -232,6 +232,8 @@ branches: Dict[bytes, Optional[SnapshotBranch]] = {} for ref, target in self.repo.refs.as_dict().items(): + if utils.ignore_branch_name(ref): + continue obj = self.get_object(target) if obj: target_type = converters.DULWICH_TARGET_TYPES[obj.type_name] @@ -243,6 +245,8 @@ dangling_branches = {} for ref, target in self.repo.refs.get_symrefs().items(): + if utils.ignore_branch_name(ref): + continue branches[ref] = SnapshotBranch(target=target, target_type=TargetType.ALIAS) if target not in branches: # This handles the case where the pointer is "dangling". diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -35,25 +35,6 @@ from . import converters, utils -def ignore_branch_name(branch_name: bytes) -> bool: - """Should the git loader ignore the branch named `branch_name`?""" - if branch_name.endswith(b"^{}"): - # Peeled refs make the git protocol explode - return True - elif branch_name.startswith(b"refs/pull/") and branch_name.endswith(b"/merge"): - # We filter-out auto-merged GitHub pull requests - return True - - return False - - -def filter_refs(refs: Dict[bytes, bytes]) -> Dict[bytes, bytes]: - """Filter the refs dictionary using the policy set in `ignore_branch_name`""" - return { - name: target for name, target in refs.items() if not ignore_branch_name(name) - } - - class RepoRepresentation: """Repository representation for a Software Heritage origin.""" @@ -99,7 +80,7 @@ # Get the remote heads that we want to fetch remote_heads: Set[bytes] = set() for ref_name, ref_target in refs.items(): - if ignore_branch_name(ref_name): + if utils.ignore_branch_name(ref_name): continue remote_heads.add(ref_target) @@ -203,8 +184,8 @@ pack_buffer.seek(0) return FetchPackReturn( - remote_refs=filter_refs(remote_refs), - symbolic_refs=filter_refs(symbolic_refs), + remote_refs=utils.filter_refs(remote_refs), + symbolic_refs=utils.filter_refs(symbolic_refs), pack_buffer=pack_buffer, pack_size=pack_size, ) diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -7,10 +7,13 @@ import dulwich.repo +from swh.model.model import Snapshot, SnapshotBranch, TargetType +from swh.model.hashutil import hash_to_bytes + +from swh.loader.core.tests import BaseLoaderTest + from swh.loader.git.from_disk import GitLoaderFromDisk as OrigGitLoaderFromDisk from swh.loader.git.from_disk import GitLoaderFromArchive as OrigGitLoaderFromArchive -from swh.loader.core.tests import BaseLoaderTest -from swh.model.hashutil import hash_to_bytes from . import TEST_LOADER_CONFIG @@ -314,6 +317,51 @@ "target_type": "revision", } + def test_load_filter_branches(self): + filtered_branches = {b"refs/pull/42/merge"} + unfiltered_branches = {b"refs/pull/42/head"} + + # Add branches to the repository on disk; some should be filtered by + # the loader, some should not. + for branch_name in filtered_branches | unfiltered_branches: + self.repo[branch_name] = self.repo[b"refs/heads/master"] + + # Generate the expected snapshot from SNAPSHOT1 (which is the original + # state of the git repo)... + branches = {} + + for branch_name, branch_dict in SNAPSHOT1["branches"].items(): + target_type_name = branch_dict["target_type"] + target_obj = branch_dict["target"] + + if target_type_name != "alias": + target = bytes.fromhex(target_obj) + else: + target = target_obj.encode() + + branch = SnapshotBranch( + target=target, target_type=TargetType(target_type_name) + ) + branches[branch_name.encode()] = branch + + # ... and the unfiltered_branches, which are all pointing to the same + # commit as "refs/heads/master". + for branch_name in unfiltered_branches: + branches[branch_name] = branches[b"refs/heads/master"] + + expected_snapshot = Snapshot(branches=branches) + + # Load the modified repository + res = self.load() + assert res["status"] == "eventful" + + assert self.loader.load_status() == {"status": "eventful"} + assert self.loader.visit_status() == "full" + + visit = self.storage.origin_visit_get_latest(self.repo_url) + assert visit["snapshot"] == expected_snapshot.id + assert visit["status"] == "full" + def test_load_dangling_symref(self): with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f: f.write(b"ref: refs/heads/dangling-branch\n") diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,12 +1,10 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.model.model import Snapshot, SnapshotBranch, TargetType - from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import SNAPSHOT1, DirGitLoaderTest +from swh.loader.git.tests.test_from_disk import DirGitLoaderTest from . import TEST_LOADER_CONFIG @@ -26,48 +24,3 @@ def load(self): return self.loader.load() - - def test_load_filter_branches(self): - filtered_branches = {b"refs/pull/42/merge"} - unfiltered_branches = {b"refs/pull/42/head"} - - # Add branches to the repository on disk; some should be filtered by - # the loader, some should not. - for branch_name in filtered_branches | unfiltered_branches: - self.repo[branch_name] = self.repo[b"refs/heads/master"] - - # Generate the expected snapshot from SNAPSHOT1 (which is the original - # state of the git repo)... - branches = {} - - for branch_name, branch_dict in SNAPSHOT1["branches"].items(): - target_type_name = branch_dict["target_type"] - target_obj = branch_dict["target"] - - if target_type_name != "alias": - target = bytes.fromhex(target_obj) - else: - target = target_obj.encode() - - branch = SnapshotBranch( - target=target, target_type=TargetType(target_type_name) - ) - branches[branch_name.encode()] = branch - - # ... and the unfiltered_branches, which are all pointing to the same - # commit as "refs/heads/master". - for branch_name in unfiltered_branches: - branches[branch_name] = branches[b"refs/heads/master"] - - expected_snapshot = Snapshot(branches=branches) - - # Load the modified repository - res = self.load() - assert res["status"] == "eventful" - - assert self.loader.load_status() == {"status": "eventful"} - assert self.loader.visit_status() == "full" - - visit = self.storage.origin_visit_get_latest(self.repo_url) - assert visit["snapshot"] == expected_snapshot.id - assert visit["status"] == "full" diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py --- a/swh/loader/git/utils.py +++ b/swh/loader/git/utils.py @@ -78,6 +78,25 @@ datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) +def ignore_branch_name(branch_name: bytes) -> bool: + """Should the git loader ignore the branch named `branch_name`?""" + if branch_name.endswith(b"^{}"): + # Peeled refs make the git protocol explode + return True + elif branch_name.startswith(b"refs/pull/") and branch_name.endswith(b"/merge"): + # We filter-out auto-merged GitHub pull requests + return True + + return False + + +def filter_refs(refs: Dict[bytes, bytes]) -> Dict[bytes, bytes]: + """Filter the refs dictionary using the policy set in `ignore_branch_name`""" + return { + name: target for name, target in refs.items() if not ignore_branch_name(name) + } + + def warn_dangling_branches( branches: Dict[bytes, Optional[SnapshotBranch]], dangling_branches: Dict[bytes, bytes],