Page Menu · Home · Software Heritage

D6268.diff
No One · Temporary

D6268.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -188,11 +188,11 @@
# hg node id of the latest snapshot branch heads
# used to find what are the new revisions since last snapshot
- self._latest_heads: List[bytes] = []
+ self._latest_heads: List[HgNodeId] = []
# hg node ids of all the tags recorded on previous runs
# Used to compute which tags need to be added, even across incremental runs
# that might separate a changeset introducing a tag from its target.
- self._saved_tags: Set[bytes] = set()
+ self._saved_tags: Set[HgNodeId] = set()
self._load_status = "eventful"
# If set, will override the default value
@@ -262,21 +262,24 @@
tags.append(branch.target)
self._latest_heads.extend(
- extid.extid for extid in self._get_extids_for_targets(heads)
+ HgNodeId(extid.extid) for extid in self._get_extids_for_targets(heads)
)
self._saved_tags.update(
- extid.extid for extid in self._get_extids_for_targets(tags)
+ HgNodeId(extid.extid) for extid in self._get_extids_for_targets(tags)
)
- def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+ def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the targets in the latest snapshot"""
- extids = [
- extid
- for extid in self.storage.extid_get_from_target(
- identifiers.ObjectType.REVISION, targets
- )
- if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION
- ]
+ extids = []
+ for extid in self.storage.extid_get_from_target(
+ identifiers.ObjectType.REVISION, targets
+ ):
+ if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
+ continue
+ extids.append(extid)
+ self._revision_nodeid_to_sha1git[
+ HgNodeId(extid.extid)
+ ] = extid.target.object_id
if extids:
# Filter out dangling extids, we need to load their target again
@@ -290,17 +293,21 @@
]
return extids
- def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+ def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
a known revision.
"""
extids = []
+
for group_ids in grouper(hgnode_ids, n=1000):
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids):
if extid.extid_version != EXTID_VERSION:
continue
extids.append(extid)
+ self._revision_nodeid_to_sha1git[
+ HgNodeId(extid.extid)
+ ] = extid.target.object_id
if extids:
# Filter out dangling extids, we need to load their target again
@@ -342,7 +349,7 @@
return False
- def _new_revs(self, heads: List[bytes]) -> Union[HgFilteredSet, HgSpanSet]:
+ def _new_revs(self, heads: List[HgNodeId]) -> Union[HgFilteredSet, HgSpanSet]:
"""Return unseen revisions. That is, filter out revisions that are not ancestors of
heads"""
assert self._repo is not None
@@ -384,7 +391,7 @@
hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
yield from self._new_revs(
[
- extid.extid
+ HgNodeId(extid.extid)
for extid in self._get_extids_for_hgnodes(hg_nodeids)
]
)
@@ -411,9 +418,9 @@
ignored_revs.update(descendents)
if length_ingested_revs == 0:
- # The repository has nothing to ingest (either empty or broken repository)
+ # no new revision ingested, so uneventful
+ # still we'll make a snapshot, so we continue
self._load_status = "uneventful"
- return
branching_info = hgutil.branching_info(repo, ignored_revs)
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -177,18 +177,19 @@
assert stats == expected_stats
loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
- assert loader2.load() == {"status": "uneventful"}
+ assert loader2.load() == {"status": "uneventful"} # nothing new happened
stats2 = get_stats(loader2.storage)
expected_stats2 = expected_stats.copy()
expected_stats2["origin_visit"] = 2 # one new visit recorded
assert stats2 == expected_stats2
- visit_status = assert_last_visit_matches(
- loader2.storage, repo_url, status="full", type="hg",
- )
- assert visit_status.snapshot is None
- # FIXME: Already seen objects are filtered out, so no new snapshot.
- # Current behavior but is it ok?
+ assert_last_visit_matches(
+ loader2.storage,
+ repo_url,
+ status="full",
+ type="hg",
+ snapshot=expected_snapshot.id,
+ ) # but we got a snapshot nonetheless
# This test has as been adapted from the historical `HgBundle20Loader` tests
@@ -342,9 +343,8 @@
return new_storage
-@pytest.mark.parametrize("mechanism", ("extid", "same storage"))
def test_load_unchanged_repo_should_be_uneventful(
- swh_storage, datadir, tmp_path, mechanism
+ swh_storage, datadir, tmp_path,
):
"""Checks the loader can find which revisions it already loaded, using ExtIDs."""
archive_name = "hello"
@@ -365,45 +365,31 @@
"skipped_content": 0,
"snapshot": 1,
}
-
- old_storage = swh_storage
-
- # Create a new storage, and only copy ExtIDs or head revisions to it.
- # This should be enough for the loader to know revisions were already loaded
- new_storage = _partial_copy_storage(
- old_storage, repo_path, mechanism=mechanism, copy_revisions=True
+ visit_status = assert_last_visit_matches(
+ loader.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
)
+ assert visit_status.snapshot is not None
# Create a new loader (to start with a clean slate, eg. remove the caches),
# with the new, partial, storage
- loader = HgLoaderFromDisk(new_storage, repo_path)
- assert loader.load() == {"status": "uneventful"}
+ loader2 = HgLoaderFromDisk(swh_storage, repo_path)
+ assert loader2.load() == {"status": "uneventful"}
- if mechanism == "same storage":
- # Should have all the objects
- assert get_stats(loader.storage) == {
- "content": 3,
- "directory": 3,
- "origin": 1,
- "origin_visit": 2,
- "release": 1,
- "revision": 3,
- "skipped_content": 0,
- "snapshot": 1,
- }
- else:
- # Should have only the objects we directly inserted from the test, plus
- # a new visit
- assert get_stats(loader.storage) == {
- "content": 0,
- "directory": 0,
- "origin": 1,
- "origin_visit": 2,
- "release": 0,
- "revision": 1,
- "skipped_content": 0,
- "snapshot": 1,
- }
+ # Should have all the objects
+ assert get_stats(loader.storage) == {
+ "content": 3,
+ "directory": 3,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 1,
+ "revision": 3,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+ visit_status2 = assert_last_visit_matches(
+ loader2.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
+ )
+ assert visit_status2.snapshot == visit_status.snapshot
def test_closed_branch_incremental(swh_storage, datadir, tmp_path):
@@ -735,11 +721,5 @@
visit_status2 = assert_last_visit_matches(
loader.storage, fork_url, status="full", type="hg",
)
-
assert visit_status.snapshot is not None
- assert visit_status2.snapshot is None
-
- # FIXME: Consistent behavior with filtering data out from already seen snapshot (on
- # a given origin). But, the other fork origin has no snapshot at all. We should
- # though, shouldn't we? Otherwise, that would mean fork could end up with no
- # snapshot at all.
+ assert visit_status2.snapshot == visit_status.snapshot

File Metadata

Mime Type
text/plain
Expires
Jan 30 2025, 11:57 AM (6 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217650

Event Timeline