diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -304,6 +304,15 @@
             except KeyError:
                 # the node does not exist anymore
                 pass
+        # Mercurial can have more than one head per branch, so we need to exclude
+        # local heads that have already been loaded as revisions but don't
+        # correspond to a SnapshotBranch.
+        # In the future, if the SnapshotBranch model evolves to support multiple
+        # heads per branch (or anything else that fixes this issue) this might
+        # become useless.
+        extids = self.storage.extid_get_from_extid(EXTID_TYPE, repo.heads())
+        known_heads = {extid.extid for extid in extids}
+        existing_heads.extend([repo[head].rev() for head in known_heads])
         # select revisions that are not ancestors of heads
         # and not the heads themselves
         new_revs = repo.revs("not ::(%ld)", existing_heads)
@@ -411,7 +420,7 @@
         from_storage = self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid])

         msg = "Expected 1 match from storage for hg node %r, got %d"
-        assert len(from_storage) == 1, msg % (hg_nodeid, len(from_storage))
+        assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage))
         return from_storage[0].target.object_id

     def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -11,6 +11,7 @@
 import attr
 import pytest

+from swh.loader.mercurial.loader import HgBundle20Loader
 from swh.loader.mercurial.utils import parse_visit_date
 from swh.loader.tests import (
     assert_last_visit_matches,
@@ -408,6 +409,36 @@
     assert get_stats(loader.storage) == {**expected_stats, "origin_visit": 2 + 1}


+def test_old_loader_new_loader(swh_storage, datadir, tmp_path):
+    archive_name = "example"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+    repo_path = repo_url.replace("file://", "")
+
+    old_loader = HgBundle20Loader(swh_storage, repo_path)
+    assert old_loader.load() == {"status": "eventful"}
+
+    expected_stats = {
+        "content": 7,
+        "directory": 16,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 9,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert get_stats(old_loader.storage) == expected_stats
+
+    # Shouldn't pick up anything
+    loader = HgLoaderFromDisk(swh_storage, repo_path)
+    assert loader.load() == {"status": "uneventful"}
+
+    # Shouldn't pick up anything either
+    loader = HgLoaderFromDisk(swh_storage, repo_path)
+    assert loader.load() == {"status": "uneventful"}
+
+
 def test_load_unchanged_repo__dangling_extid(swh_storage, datadir, tmp_path):
     """Checks the loader will load revisions targeted by an ExtID if the
     revisions are missing from the storage"""