D6268: mercurial: Build snapshot on visits
D6268.diff (8 KB, text/plain)
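The diff below replaces raw bytes annotations with the HgNodeId type for Mercurial node ids, and records an HgNodeId-to-Sha1Git mapping while ExtIDs are read back from storage. As a minimal sketch of that typing pattern (assuming, as the usage in the diff suggests, that HgNodeId and Sha1Git are typing.NewType wrappers over 20-byte values; record_extid is a hypothetical helper, not part of the diff):

from typing import Dict, NewType

HgNodeId = NewType("HgNodeId", bytes)   # 20-byte Mercurial changeset id
Sha1Git = NewType("Sha1Git", bytes)     # 20-byte intrinsic SWH identifier

# Mirrors the self._revision_nodeid_to_sha1git mapping the diff populates
# while reading ExtIDs back from storage.
revision_nodeid_to_sha1git: Dict[HgNodeId, Sha1Git] = {}

def record_extid(extid: bytes, target_object_id: bytes) -> None:
    """Remember which SWH revision a Mercurial node id was mapped to."""
    revision_nodeid_to_sha1git[HgNodeId(extid)] = Sha1Git(target_object_id)

record_extid(b"\x12" * 20, b"\x34" * 20)  # toy 20-byte ids

At type-checking time, NewType makes it an error to pass an arbitrary bytes value where an HgNodeId is expected, which is exactly what the annotations in the hunks below enforce.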
diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -188,11 +188,11 @@
# hg node id of the latest snapshot branch heads
# used to find what are the new revisions since last snapshot
- self._latest_heads: List[bytes] = []
+ self._latest_heads: List[HgNodeId] = []
# hg node ids of all the tags recorded on previous runs
# Used to compute which tags need to be added, even across incremental runs
# that might separate a changeset introducing a tag from its target.
- self._saved_tags: Set[bytes] = set()
+ self._saved_tags: Set[HgNodeId] = set()
self._load_status = "eventful"
# If set, will override the default value
@@ -262,21 +262,24 @@
tags.append(branch.target)
self._latest_heads.extend(
- extid.extid for extid in self._get_extids_for_targets(heads)
+ HgNodeId(extid.extid) for extid in self._get_extids_for_targets(heads)
)
self._saved_tags.update(
- extid.extid for extid in self._get_extids_for_targets(tags)
+ HgNodeId(extid.extid) for extid in self._get_extids_for_targets(tags)
)
- def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+ def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the targets in the latest snapshot"""
- extids = [
- extid
- for extid in self.storage.extid_get_from_target(
- identifiers.ObjectType.REVISION, targets
- )
- if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION
- ]
+ extids = []
+ for extid in self.storage.extid_get_from_target(
+ identifiers.ObjectType.REVISION, targets
+ ):
+ if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
+ continue
+ extids.append(extid)
+ self._revision_nodeid_to_sha1git[
+ HgNodeId(extid.extid)
+ ] = extid.target.object_id
if extids:
# Filter out dangling extids, we need to load their target again
@@ -290,17 +293,21 @@
]
return extids
- def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+ def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
a known revision.
"""
extids = []
+
for group_ids in grouper(hgnode_ids, n=1000):
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids):
if extid.extid_version != EXTID_VERSION:
continue
extids.append(extid)
+ self._revision_nodeid_to_sha1git[
+ HgNodeId(extid.extid)
+ ] = extid.target.object_id
if extids:
# Filter out dangling extids, we need to load their target again
@@ -342,7 +349,7 @@
return False
- def _new_revs(self, heads: List[bytes]) -> Union[HgFilteredSet, HgSpanSet]:
+ def _new_revs(self, heads: List[HgNodeId]) -> Union[HgFilteredSet, HgSpanSet]:
"""Return unseen revisions. That is, filter out revisions that are not ancestors of
heads"""
assert self._repo is not None
@@ -384,7 +391,7 @@
hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
yield from self._new_revs(
[
- extid.extid
+ HgNodeId(extid.extid)
for extid in self._get_extids_for_hgnodes(hg_nodeids)
]
)
@@ -411,9 +418,9 @@
ignored_revs.update(descendents)
if length_ingested_revs == 0:
- # The repository has nothing to ingest (either empty or broken repository)
+ # no new revision ingested, so uneventful
+ # still we'll make a snapshot, so we continue
self._load_status = "uneventful"
- return
branching_info = hgutil.branching_info(repo, ignored_revs)
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -177,18 +177,19 @@
assert stats == expected_stats
loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
- assert loader2.load() == {"status": "uneventful"}
+ assert loader2.load() == {"status": "uneventful"} # nothing new happened
stats2 = get_stats(loader2.storage)
expected_stats2 = expected_stats.copy()
expected_stats2["origin_visit"] = 2 # one new visit recorded
assert stats2 == expected_stats2
- visit_status = assert_last_visit_matches(
- loader2.storage, repo_url, status="full", type="hg",
- )
- assert visit_status.snapshot is None
- # FIXME: Already seen objects are filtered out, so no new snapshot.
- # Current behavior but is it ok?
+ assert_last_visit_matches(
+ loader2.storage,
+ repo_url,
+ status="full",
+ type="hg",
+ snapshot=expected_snapshot.id,
+ ) # but we got a snapshot nonetheless
# This test has been adapted from the historical `HgBundle20Loader` tests
@@ -342,9 +343,8 @@
return new_storage
-@pytest.mark.parametrize("mechanism", ("extid", "same storage"))
def test_load_unchanged_repo_should_be_uneventful(
- swh_storage, datadir, tmp_path, mechanism
+ swh_storage, datadir, tmp_path,
):
"""Checks the loader can find which revisions it already loaded, using ExtIDs."""
archive_name = "hello"
@@ -365,45 +365,31 @@
"skipped_content": 0,
"snapshot": 1,
}
-
- old_storage = swh_storage
-
- # Create a new storage, and only copy ExtIDs or head revisions to it.
- # This should be enough for the loader to know revisions were already loaded
- new_storage = _partial_copy_storage(
- old_storage, repo_path, mechanism=mechanism, copy_revisions=True
+ visit_status = assert_last_visit_matches(
+ loader.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
)
+ assert visit_status.snapshot is not None
# Create a new loader (to start with a clean slate, eg. remove the caches),
# with the new, partial, storage
- loader = HgLoaderFromDisk(new_storage, repo_path)
- assert loader.load() == {"status": "uneventful"}
+ loader2 = HgLoaderFromDisk(swh_storage, repo_path)
+ assert loader2.load() == {"status": "uneventful"}
- if mechanism == "same storage":
- # Should have all the objects
- assert get_stats(loader.storage) == {
- "content": 3,
- "directory": 3,
- "origin": 1,
- "origin_visit": 2,
- "release": 1,
- "revision": 3,
- "skipped_content": 0,
- "snapshot": 1,
- }
- else:
- # Should have only the objects we directly inserted from the test, plus
- # a new visit
- assert get_stats(loader.storage) == {
- "content": 0,
- "directory": 0,
- "origin": 1,
- "origin_visit": 2,
- "release": 0,
- "revision": 1,
- "skipped_content": 0,
- "snapshot": 1,
- }
+ # Should have all the objects
+ assert get_stats(loader.storage) == {
+ "content": 3,
+ "directory": 3,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 1,
+ "revision": 3,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+ visit_status2 = assert_last_visit_matches(
+ loader2.storage, repo_path, type=RevisionType.MERCURIAL.value, status="full",
+ )
+ assert visit_status2.snapshot == visit_status.snapshot
def test_closed_branch_incremental(swh_storage, datadir, tmp_path):
@@ -735,11 +721,5 @@
visit_status2 = assert_last_visit_matches(
loader.storage, fork_url, status="full", type="hg",
)
-
assert visit_status.snapshot is not None
- assert visit_status2.snapshot is None
-
- # FIXME: Consistent behavior with filtering data out from already seen snapshot (on
- # a given origin). But, the other fork origin has no snapshot at all. We should
- # though, shouldn't we? Otherwise, that would mean fork could end up with no
- # snapshot at all.
+ assert visit_status2.snapshot == visit_status.snapshot
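Taken together, the test updates pin down the new contract: revisiting an unchanged origin still reports "uneventful", but the visit now ends with a full status and a snapshot equal to the previous visit's, rather than no snapshot at all. A minimal sketch of that contract, reusing the loader and test helper seen above (repo_url and swh_storage are assumed fixtures, as in the test module):

from swh.loader.mercurial.from_disk import HgLoaderFromDisk
from swh.loader.tests import assert_last_visit_matches

loader = HgLoaderFromDisk(swh_storage, url=repo_url)
assert loader.load() == {"status": "eventful"}     # first visit ingests data
first = assert_last_visit_matches(
    loader.storage, repo_url, status="full", type="hg"
)
assert first.snapshot is not None

loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
assert loader2.load() == {"status": "uneventful"}  # nothing new to ingest...
second = assert_last_visit_matches(
    loader2.storage, repo_url, status="full", type="hg"
)
assert second.snapshot == first.snapshot           # ...yet still snapshotted

This is what the removal of the early `return` on the uneventful path achieves: the loader falls through to snapshot building in both cases, so forks and repeat visits can no longer end up without a snapshot.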