diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -187,11 +187,11 @@ # hg node id of the latest snapshot branch heads # used to find what are the new revisions since last snapshot - self._latest_heads: List[bytes] = [] + self._latest_heads: List[HgNodeId] = [] # hg node ids of all the tags recorded on previous runs # Used to compute which tags need to be added, even across incremental runs # that might separate a changeset introducing a tag from its target. - self._saved_tags: Set[bytes] = set() + self._saved_tags: Set[HgNodeId] = set() self._load_status = "eventful" # If set, will override the default value @@ -267,15 +267,16 @@ extid.extid for extid in self._get_extids_for_targets(tags) ) - def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]: + def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]: """Get all Mercurial ExtIDs for the targets in the latest snapshot""" - extids = [ - extid - for extid in self.storage.extid_get_from_target( - identifiers.ObjectType.REVISION, targets - ) - if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION - ] + extids = [] + for extid in self.storage.extid_get_from_target( + identifiers.ObjectType.REVISION, targets + ): + if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION: + continue + extids.append(extid) + self._revision_nodeid_to_sha1git[extid.extid] = extid.target.object_id if extids: # Filter out dangling extids, we need to load their target again @@ -289,16 +290,18 @@ ] return extids - def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]: + def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]: """Get all Mercurial ExtIDs for the mercurial nodes in the list which point to a known revision. """ - extids = [ - extid - for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids) - if extid.extid_version == EXTID_VERSION - ] + extids = [] + for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids): + if extid.extid_version != EXTID_VERSION: + continue + extids.append(extid) + self._revision_nodeid_to_sha1git[extid.extid] = extid.target.object_id + if extids: # Filter out dangling extids, we need to load their target again revisions_missing = self.storage.revision_missing( @@ -339,7 +342,7 @@ return False - def _new_revs(self, heads: List[bytes]): + def _new_revs(self, heads: List[HgNodeId]): """Return unseen revisions. That is, filter out revisions that are not ancestors of heads""" assert self._repo is not None @@ -406,9 +409,9 @@ ignored_revs.update(descendents) if length_ingested_revs == 0: - # The repository has nothing to ingest (either empty or broken repository) + # no new revision ingested, so uneventful + # still we'll make a snapshot, so we continue self._load_status = "uneventful" - return branching_info = hgutil.branching_info(repo, ignored_revs) tags_by_name: Dict[bytes, HgNodeId] = repo.tags() diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py --- a/swh/loader/mercurial/hgutil.py +++ b/swh/loader/mercurial/hgutil.py @@ -10,7 +10,7 @@ import signal import time import traceback -from typing import Dict, List, Mapping, NewType, Optional, Set +from typing import Dict, List, Mapping, Optional, Set from billiard import Process, Queue @@ -19,7 +19,7 @@ import mercurial.ui # type: ignore NULLID = mercurial.node.nullid -HgNodeId = NewType("HgNodeId", bytes) +HgNodeId = bytes Repository = hg.localrepo BaseContext = context.basectx LRUCacheDict = util.lrucachedict diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py --- a/swh/loader/mercurial/tests/test_from_disk.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -177,7 +177,7 @@ assert stats == expected_stats loader2 = HgLoaderFromDisk(swh_storage, url=repo_url) - assert loader2.load() == {"status": "uneventful"} + assert loader2.load() == {"status": "uneventful"} # nothing new happened stats2 = get_stats(loader2.storage) expected_stats2 = expected_stats.copy() @@ -185,12 +185,13 @@ {"origin_visit": 1 + 1,} ) assert stats2 == expected_stats2 - visit_status = assert_last_visit_matches( - loader2.storage, repo_url, status="full", type="hg", - ) - assert visit_status.snapshot is None - # FIXME: Already seen objects are filtered out, so no new snapshot. - # Current behavior but is it ok? + assert_last_visit_matches( + loader2.storage, + repo_url, + status="full", + type="hg", + snapshot=expected_snapshot.id, + ) # but we got a snapshot nonetheless # This test has as been adapted from the historical `HgBundle20Loader` tests @@ -344,9 +345,8 @@ return new_storage -@pytest.mark.parametrize("mechanism", ("extid", "same storage")) def test_load_unchanged_repo_should_be_uneventful( - swh_storage, datadir, tmp_path, mechanism + swh_storage, datadir, tmp_path, ): """Checks the loader can find which revisions it already loaded, using ExtIDs.""" archive_name = "hello" @@ -367,45 +367,31 @@ "skipped_content": 0, "snapshot": 1, } - - old_storage = swh_storage - - # Create a new storage, and only copy ExtIDs or head revisions to it. - # This should be enough for the loader to know revisions were already loaded - new_storage = _partial_copy_storage( - old_storage, repo_path, mechanism=mechanism, copy_revisions=True + visit_status = assert_last_visit_matches( + loader.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full", ) + assert visit_status.snapshot is not None # Create a new loader (to start with a clean slate, eg. remove the caches), # with the new, partial, storage - loader = HgLoaderFromDisk(new_storage, repo_path) - assert loader.load() == {"status": "uneventful"} + loader2 = HgLoaderFromDisk(swh_storage, repo_path) + assert loader2.load() == {"status": "uneventful"} - if mechanism == "same storage": - # Should have all the objects - assert get_stats(loader.storage) == { - "content": 3, - "directory": 3, - "origin": 1, - "origin_visit": 2, - "release": 1, - "revision": 3, - "skipped_content": 0, - "snapshot": 1, - } - else: - # Should have only the objects we directly inserted from the test, plus - # a new visit - assert get_stats(loader.storage) == { - "content": 0, - "directory": 0, - "origin": 1, - "origin_visit": 2, - "release": 0, - "revision": 1, - "skipped_content": 0, - "snapshot": 1, - } + # Should have all the objects + assert get_stats(loader.storage) == { + "content": 3, + "directory": 3, + "origin": 1, + "origin_visit": 2, + "release": 1, + "revision": 3, + "skipped_content": 0, + "snapshot": 1, + } + visit_status2 = assert_last_visit_matches( + loader2.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full", + ) + assert visit_status2.snapshot == visit_status.snapshot def test_closed_branch_incremental(swh_storage, datadir, tmp_path): @@ -737,11 +723,5 @@ visit_status2 = assert_last_visit_matches( loader.storage, fork_url, status="full", type="hg", ) - assert visit_status.snapshot is not None - assert visit_status2.snapshot is None - - # FIXME: Consistent behavior with filtering data out from already seen snapshot (on - # a given origin). But, the other fork origin has no snapshot at all. We should - # though, shouldn't we? Otherwise, that would mean fork could end up with no - # snapshot at all. + assert visit_status2.snapshot == visit_status.snapshot