diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -187,11 +187,11 @@
 
         # hg node id of the latest snapshot branch heads
         # used to find what are the new revisions since last snapshot
-        self._latest_heads: List[bytes] = []
+        self._latest_heads: List[HgNodeId] = []
         # hg node ids of all the tags recorded on previous runs
         # Used to compute which tags need to be added, even across incremental runs
         # that might separate a changeset introducing a tag from its target.
-        self._saved_tags: Set[bytes] = set()
+        self._saved_tags: Set[HgNodeId] = set()
 
         self._load_status = "eventful"
         # If set, will override the default value
@@ -267,15 +267,16 @@
             extid.extid for extid in self._get_extids_for_targets(tags)
         )
 
-    def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+    def _get_extids_for_targets(self, targets: List[HgNodeId]) -> List[ExtID]:
         """Get all Mercurial ExtIDs for the targets in the latest snapshot"""
-        extids = [
-            extid
-            for extid in self.storage.extid_get_from_target(
-                identifiers.ObjectType.REVISION, targets
-            )
-            if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION
-        ]
+        extids = []
+        for extid in self.storage.extid_get_from_target(
+            identifiers.ObjectType.REVISION, targets
+        ):
+            if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
+                continue
+            extids.append(extid)
+            self._revision_nodeid_to_sha1git[extid.extid] = extid.target.object_id
 
         if extids:
             # Filter out dangling extids, we need to load their target again
@@ -289,16 +290,18 @@
         ]
         return extids
 
-    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]:
         """Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
         a known revision.
 
         """
-        extids = [
-            extid
-            for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
-            if extid.extid_version == EXTID_VERSION
-        ]
+        extids = []
+        for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids):
+            if extid.extid_version != EXTID_VERSION:
+                continue
+            extids.append(extid)
+            self._revision_nodeid_to_sha1git[extid.extid] = extid.target.object_id
+
         if extids:
             # Filter out dangling extids, we need to load their target again
             revisions_missing = self.storage.revision_missing(
@@ -339,7 +342,7 @@
 
         return False
 
-    def _new_revs(self, heads: List[bytes]):
+    def _new_revs(self, heads: List[HgNodeId]):
         """Return unseen revisions. That is, filter out revisions that are not
         ancestors of heads"""
         assert self._repo is not None
@@ -406,13 +409,17 @@
                 ignored_revs.update(descendents)
 
         if length_ingested_revs == 0:
-            # The repository has nothing to ingest (either empty or broken repository)
             self._load_status = "uneventful"
-            return
+            # uneventful but still we need to make a snapshot if any, so we continue
+
+        self.log.debug(
+            "Revision ingested: %s (0 means no new revision)", length_ingested_revs
+        )
+        self.log.debug("Ignored revisions: %s", ignored_revs)
 
         branching_info = hgutil.branching_info(repo, ignored_revs)
-        tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
+        tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
 
         snapshot_branches: Dict[bytes, SnapshotBranch] = {}
 
         for tag_name, hg_nodeid in tags_by_name.items():
@@ -465,11 +472,11 @@
             snapshot_branches[b"HEAD"] = SnapshotBranch(
                 target=default_branch_alias, target_type=TargetType.ALIAS,
             )
-        snapshot = Snapshot(branches=snapshot_branches)
-        self.storage.snapshot_add([snapshot])
+        snapshot = Snapshot(branches=snapshot_branches)
+        self.storage.snapshot_add([snapshot])
+        self.loaded_snapshot_id = snapshot.id
 
         self.flush()
-        self.loaded_snapshot_id = snapshot.id
 
     def load_status(self) -> Dict[str, str]:
         """Detailed loading status.
@@ -511,7 +518,7 @@
         ]
 
         msg = "Expected 1 match from storage for hg node %r, got %d"
-        assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage))
+        assert len(from_storage) == 1, msg % (hg_nodeid, len(from_storage))
         return from_storage[0].target.object_id
 
     def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py
--- a/swh/loader/mercurial/hgutil.py
+++ b/swh/loader/mercurial/hgutil.py
@@ -10,7 +10,7 @@
 import signal
 import time
 import traceback
-from typing import Dict, List, Mapping, NewType, Optional, Set
+from typing import Dict, List, Mapping, Optional, Set
 
 from billiard import Process, Queue
 
@@ -19,7 +19,7 @@
 import mercurial.ui  # type: ignore
 
 NULLID = mercurial.node.nullid
-HgNodeId = NewType("HgNodeId", bytes)
+HgNodeId = bytes
 Repository = hg.localrepo
 BaseContext = context.basectx
 LRUCacheDict = util.lrucachedict
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -177,7 +177,7 @@
     assert stats == expected_stats
 
     loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
-    assert loader2.load() == {"status": "uneventful"}
+    assert loader2.load() == {"status": "uneventful"}  # nothing new happened
 
     stats2 = get_stats(loader2.storage)
     expected_stats2 = expected_stats.copy()
@@ -185,12 +185,13 @@
         {"origin_visit": 1 + 1,}
     )
     assert stats2 == expected_stats2
-    visit_status = assert_last_visit_matches(
-        loader2.storage, repo_url, status="full", type="hg",
-    )
-    assert visit_status.snapshot is None
-    # FIXME: Already seen objects are filtered out, so no new snapshot.
-    # Current behavior but is it ok?
+    assert_last_visit_matches(
+        loader2.storage,
+        repo_url,
+        status="full",
+        type="hg",
+        snapshot=expected_snapshot.id,
+    )  # but we got a snapshot nonetheless
 
 
 # This test has as been adapted from the historical `HgBundle20Loader` tests
@@ -308,7 +309,7 @@
 
 
 def _partial_copy_storage(
-    old_storage, origin_url: str, mechanism: str, copy_revisions: bool
+    old_storage, origin_url: str, mechanism: str, copy_revisions: bool = False
 ):
     """Create a new storage, and only copy ExtIDs or head revisions to it."""
     new_storage = get_storage(cls="memory")
@@ -344,9 +345,8 @@
     return new_storage
 
 
-@pytest.mark.parametrize("mechanism", ("extid", "same storage"))
 def test_load_unchanged_repo_should_be_uneventful(
-    swh_storage, datadir, tmp_path, mechanism
+    swh_storage, datadir, tmp_path,
 ):
     """Checks the loader can find which revisions it already loaded, using ExtIDs."""
     archive_name = "hello"
@@ -367,45 +367,31 @@
         "skipped_content": 0,
         "snapshot": 1,
     }
-
-    old_storage = swh_storage
-
-    # Create a new storage, and only copy ExtIDs or head revisions to it.
-    # This should be enough for the loader to know revisions were already loaded
-    new_storage = _partial_copy_storage(
-        old_storage, repo_path, mechanism=mechanism, copy_revisions=True
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
     )
+    assert visit_status.snapshot is not None
 
     # Create a new loader (to start with a clean slate, eg. remove the caches),
     # with the new, partial, storage
-    loader = HgLoaderFromDisk(new_storage, repo_path)
-    assert loader.load() == {"status": "uneventful"}
+    loader2 = HgLoaderFromDisk(swh_storage, repo_path)
+    assert loader2.load() == {"status": "uneventful"}
 
-    if mechanism == "same storage":
-        # Should have all the objects
-        assert get_stats(loader.storage) == {
-            "content": 3,
-            "directory": 3,
-            "origin": 1,
-            "origin_visit": 2,
-            "release": 1,
-            "revision": 3,
-            "skipped_content": 0,
-            "snapshot": 1,
-        }
-    else:
-        # Should have only the objects we directly inserted from the test, plus
-        # a new visit
-        assert get_stats(loader.storage) == {
-            "content": 0,
-            "directory": 0,
-            "origin": 1,
-            "origin_visit": 2,
-            "release": 0,
-            "revision": 1,
-            "skipped_content": 0,
-            "snapshot": 1,
-        }
+    # Should have all the objects
+    assert get_stats(loader.storage) == {
+        "content": 3,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 2,
+        "release": 1,
+        "revision": 3,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    visit_status2 = assert_last_visit_matches(
+        loader2.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
+    )
+    assert visit_status2.snapshot == visit_status.snapshot
 
 
 def test_closed_branch_incremental(swh_storage, datadir, tmp_path):