diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -188,11 +188,11 @@
 
         # hg node id of the latest snapshot branch heads
         # used to find what are the new revisions since last snapshot
-        self._latest_heads: List[bytes] = []
+        self._latest_heads: List[HgNodeId] = []
         # hg node ids of all the tags recorded on previous runs
         # Used to compute which tags need to be added, even across incremental runs
         # that might separate a changeset introducing a tag from its target.
-        self._saved_tags: Set[bytes] = set()
+        self._saved_tags: Set[HgNodeId] = set()
 
         self._load_status = "eventful"
         # If set, will override the default value
@@ -262,21 +262,24 @@
                 tags.append(branch.target)
 
         self._latest_heads.extend(
-            extid.extid for extid in self._get_extids_for_targets(heads)
+            HgNodeId(extid.extid) for extid in self._get_extids_for_targets(heads)
         )
         self._saved_tags.update(
-            extid.extid for extid in self._get_extids_for_targets(tags)
+            HgNodeId(extid.extid) for extid in self._get_extids_for_targets(tags)
         )
 
-    def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+    def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
         """Get all Mercurial ExtIDs for the targets in the latest snapshot"""
-        extids = [
-            extid
-            for extid in self.storage.extid_get_from_target(
-                identifiers.ObjectType.REVISION, targets
-            )
-            if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION
-        ]
+        extids = []
+        for extid in self.storage.extid_get_from_target(
+            identifiers.ObjectType.REVISION, targets
+        ):
+            if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
+                continue
+            extids.append(extid)
+            self._revision_nodeid_to_sha1git[
+                HgNodeId(extid.extid)
+            ] = extid.target.object_id
 
         if extids:
             # Filter out dangling extids, we need to load their target again
@@ -290,17 +293,21 @@
             ]
         return extids
 
-    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]:
         """Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
         a known revision.
 
         """
         extids = []
+
         for group_ids in grouper(hgnode_ids, n=1000):
             for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids):
                 if extid.extid_version != EXTID_VERSION:
                     continue
                 extids.append(extid)
+                self._revision_nodeid_to_sha1git[
+                    HgNodeId(extid.extid)
+                ] = extid.target.object_id
 
         if extids:
             # Filter out dangling extids, we need to load their target again
@@ -342,7 +349,7 @@
 
         return False
 
-    def _new_revs(self, heads: List[bytes]) -> Union[HgFilteredSet, HgSpanSet]:
+    def _new_revs(self, heads: List[HgNodeId]) -> Union[HgFilteredSet, HgSpanSet]:
         """Return unseen revisions. That is, filter out revisions that are not ancestors of
         heads"""
         assert self._repo is not None
@@ -384,7 +391,7 @@
             hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
             yield from self._new_revs(
                 [
-                    extid.extid
+                    HgNodeId(extid.extid)
                     for extid in self._get_extids_for_hgnodes(hg_nodeids)
                 ]
             )
@@ -411,9 +418,9 @@
                 ignored_revs.update(descendents)
 
         if length_ingested_revs == 0:
-            # The repository has nothing to ingest (either empty or broken repository)
+            # no new revision ingested, so uneventful
+            # still we'll make a snapshot, so we continue
            self._load_status = "uneventful"
-            return
 
         branching_info = hgutil.branching_info(repo, ignored_revs)
         tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -177,18 +177,19 @@
     assert stats == expected_stats
 
     loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
-    assert loader2.load() == {"status": "uneventful"}
+    assert loader2.load() == {"status": "uneventful"}  # nothing new happened
 
     stats2 = get_stats(loader2.storage)
     expected_stats2 = expected_stats.copy()
     expected_stats2["origin_visit"] = 2  # one new visit recorded
     assert stats2 == expected_stats2
-    visit_status = assert_last_visit_matches(
-        loader2.storage, repo_url, status="full", type="hg",
-    )
-    assert visit_status.snapshot is None
-    # FIXME: Already seen objects are filtered out, so no new snapshot.
-    # Current behavior but is it ok?
+    assert_last_visit_matches(
+        loader2.storage,
+        repo_url,
+        status="full",
+        type="hg",
+        snapshot=expected_snapshot.id,
+    )  # but we got a snapshot nonetheless
 
 
 # This test has as been adapted from the historical `HgBundle20Loader` tests
@@ -342,9 +343,8 @@
     return new_storage
 
 
-@pytest.mark.parametrize("mechanism", ("extid", "same storage"))
 def test_load_unchanged_repo_should_be_uneventful(
-    swh_storage, datadir, tmp_path, mechanism
+    swh_storage, datadir, tmp_path,
 ):
     """Checks the loader can find which revisions it already loaded, using ExtIDs."""
     archive_name = "hello"
@@ -365,45 +365,31 @@
         "skipped_content": 0,
         "snapshot": 1,
     }
-
-    old_storage = swh_storage
-
-    # Create a new storage, and only copy ExtIDs or head revisions to it.
-    # This should be enough for the loader to know revisions were already loaded
-    new_storage = _partial_copy_storage(
-        old_storage, repo_path, mechanism=mechanism, copy_revisions=True
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
     )
+    assert visit_status.snapshot is not None
 
     # Create a new loader (to start with a clean slate, eg. remove the caches),
     # with the new, partial, storage
-    loader = HgLoaderFromDisk(new_storage, repo_path)
-    assert loader.load() == {"status": "uneventful"}
+    loader2 = HgLoaderFromDisk(swh_storage, repo_path)
+    assert loader2.load() == {"status": "uneventful"}
 
-    if mechanism == "same storage":
-        # Should have all the objects
-        assert get_stats(loader.storage) == {
-            "content": 3,
-            "directory": 3,
-            "origin": 1,
-            "origin_visit": 2,
-            "release": 1,
-            "revision": 3,
-            "skipped_content": 0,
-            "snapshot": 1,
-        }
-    else:
-        # Should have only the objects we directly inserted from the test, plus
-        # a new visit
-        assert get_stats(loader.storage) == {
-            "content": 0,
-            "directory": 0,
-            "origin": 1,
-            "origin_visit": 2,
-            "release": 0,
-            "revision": 1,
-            "skipped_content": 0,
-            "snapshot": 1,
-        }
+    # Should have all the objects
+    assert get_stats(loader.storage) == {
+        "content": 3,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 2,
+        "release": 1,
+        "revision": 3,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    visit_status2 = assert_last_visit_matches(
+        loader2.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
+    )
+    assert visit_status2.snapshot == visit_status.snapshot
 
 
 def test_closed_branch_incremental(swh_storage, datadir, tmp_path):
@@ -735,11 +721,5 @@
     visit_status2 = assert_last_visit_matches(
         loader.storage, fork_url, status="full", type="hg",
     )
-    assert visit_status.snapshot is not None
-    assert visit_status2.snapshot is None
-
-    # FIXME: Consistent behavior with filtering data out from already seen snapshot (on
-    # a given origin). But, the other fork origin has no snapshot at all. We should
-    # though, shouldn't we? Otherwise, that would mean fork could end up with no
-    # snapshot at all.
+    assert visit_status2.snapshot == visit_status.snapshot
 
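
Note for reviewers: the two patched helpers now share the same side effect, recording each Mercurial node id's SWH revision id while filtering ExtIDs by type and version, which is what lets an uneventful visit (no new revisions ingested) still assemble a full snapshot. Below is a minimal sketch of that pattern under stated assumptions: `StubExtID` and `filter_extids_and_map` are hypothetical stand-ins, not the loader's real classes, and `EXTID_VERSION = 1` is assumed for illustration.

```python
from typing import Dict, Iterable, List, NamedTuple


# Hypothetical stand-in for swh.model's ExtID, for illustration only;
# the real loader uses swh.model.model.ExtID and a CoreSWHID target.
class StubExtID(NamedTuple):
    extid_type: str
    extid_version: int
    extid: bytes   # Mercurial node id (20 bytes)
    target: bytes  # sha1_git of the already-loaded SWH revision


EXTID_TYPE = "hg-nodeid"  # ExtID type used by the Mercurial loader
EXTID_VERSION = 1         # version assumed for this sketch


def filter_extids_and_map(
    candidates: Iterable[StubExtID],
    nodeid_to_sha1git: Dict[bytes, bytes],
) -> List[StubExtID]:
    """Keep ExtIDs of the expected type/version and, as a side effect,
    record which SWH revision each hg node id maps to."""
    kept: List[StubExtID] = []
    for extid in candidates:
        if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
            continue  # a different mapping flavor: ignore it
        kept.append(extid)
        # This mapping is what lets an uneventful run still point
        # snapshot branches at existing revisions.
        nodeid_to_sha1git[extid.extid] = extid.target
    return kept


# Usage sketch: an unchanged repo yields no new revisions, but the map is
# rebuilt from stored ExtIDs, so snapshot branches can still be resolved.
mapping: Dict[bytes, bytes] = {}
known = [StubExtID("hg-nodeid", 1, b"\x01" * 20, b"\xaa" * 20)]
assert filter_extids_and_map(known, mapping) == known
assert mapping[b"\x01" * 20] == b"\xaa" * 20
```

Filling the mapping as a side effect of the filtering pass that both `_get_extids_for_targets` and `_get_extids_for_hgnodes` already perform avoids a second storage round-trip when the snapshot branches are resolved later.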