D6240.id22652.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -289,6 +289,25 @@
         ]
         return extids
 
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+        """Get all Mercurial ExtIDs for the mercurial nodes in the list."""
+        extids = [
+            extid
+            for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
+            if extid.extid_version == EXTID_VERSION
+        ]
+        if extids:
+            # Filter out dangling extids; we need to load their targets again
+            revisions_missing = self.storage.revision_missing(
+                [extid.target.object_id for extid in extids]
+            )
+            extids = [
+                extid
+                for extid in extids
+                if extid.target.object_id not in revisions_missing
+            ]
+        return extids
+
     def fetch_data(self) -> bool:
         """Fetch the data from the source the loader is currently loading
@@ -317,28 +336,46 @@
         return False
 
+    def _new_revs(self, heads: List[bytes]):
+        """Return unseen revisions. That is, filter out revisions that are
+        ancestors of the given heads."""
+        assert self._repo is not None
+        existing_heads = []
+
+        for hg_nodeid in heads:
+            try:
+                rev = self._repo[hg_nodeid].rev()
+                existing_heads.append(rev)
+            except KeyError:  # the node does not exist anymore
+                pass
+
+        # select revisions that are not ancestors of heads
+        # and not the heads themselves
+        new_revs = self._repo.revs("not ::(%ld)", existing_heads)
+
+        if new_revs:
+            self.log.info("New revisions found: %d", len(new_revs))
+
+        return new_revs
+
     def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
         """Return the hg revision numbers to load."""
         assert self._repo is not None
         repo: hgutil.Repository = self._repo
+
         if self._latest_heads:
-            existing_heads = []  # heads that still exist in the repository
-            for hg_nodeid in self._latest_heads:
-                try:
-                    rev = repo[hg_nodeid].rev()
-                    existing_heads.append(rev)
-                except KeyError:  # the node does not exist anymore
-                    pass
-
-            # select revisions that are not ancestors of heads
-            # and not the heads themselves
-            new_revs = repo.revs("not ::(%ld)", existing_heads)
-
-            if new_revs:
-                self.log.info("New revisions found: %d", len(new_revs))
-            return new_revs
+            # Reuse the heads already seen in the last snapshot
+            new_revs = self._new_revs(self._latest_heads)
         else:
-            return repo.revs("all()")
+            # Otherwise, filter out all revisions already seen across hg
+            # origins (through extids)
+            all_revs = repo.revs("all()")
+            hg_nodeids = [repo[nodeid].node() for nodeid in all_revs]
+            new_revs = self._new_revs(
+                [extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]
+            )
+
+        return new_revs
 
     def store_data(self):
         """Store fetched data in the database."""
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -164,7 +164,7 @@
     check_snapshot(expected_snapshot, loader.storage)
 
     stats = get_stats(loader.storage)
-    assert stats == {
+    expected_stats = {
         "content": 2,
         "directory": 3,
         "origin": 1,
@@ -175,6 +175,20 @@
         "snapshot": 1,
     }
+    assert stats == expected_stats
+    loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+    assert loader2.load() == {"status": "uneventful"}
+
+    stats2 = get_stats(loader2.storage)
+    expected_stats2 = expected_stats.copy()
+    expected_stats2.update(
+        {"origin_visit": 1 + 1,}
+    )
+    assert stats2 == expected_stats2
+    # FIXME: Already seen objects are filtered out, so there is no new
+    # snapshot. That is the current behavior, but is it OK?
+
 
 
 # This test has been adapted from the historical `HgBundle20Loader` tests
 # to ensure compatibility of `HgLoaderFromDisk`.
@@ -670,3 +684,61 @@
     loader = HgLoaderFromDisk(swh_storage, repo_path)
 
     assert loader.load() == {"status": "uneventful"}
+
+
+def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
+    """The first visit of a fork should filter out already seen revisions (through extids)
+
+    """
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+    assert loader.load() == {"status": "eventful"}
+    stats = get_stats(loader.storage)
+    expected_stats = {
+        "content": 2,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 58,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert stats == expected_stats
+
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="hg",
+    )
+
+    # Make a fork of the first repository we ingested
+    fork_url = prepare_repository_from_archive(
+        archive_path, "the-sandbox-reloaded", tmp_path
+    )
+    loader2 = HgLoaderFromDisk(
+        swh_storage, url=fork_url, directory=str(tmp_path / archive_name)
+    )
+
+    assert loader2.load() == {"status": "uneventful"}
+
+    stats = get_stats(loader.storage)
+    expected_stats2 = expected_stats.copy()
+    expected_stats2.update(
+        {"origin": 1 + 1, "origin_visit": 1 + 1,}
+    )
+    assert stats == expected_stats2
+
+    visit_status2 = assert_last_visit_matches(
+        loader.storage, fork_url, status="full", type="hg",
+    )
+
+    assert visit_status.snapshot is not None
+    assert visit_status2.snapshot is None
+
+    # FIXME: This is consistent with filtering out data already seen in a
+    # previous snapshot (on a given origin). But the fork origin has no
+    # snapshot at all; it should have one, shouldn't it? Otherwise, forks
+    # can end up with no snapshot at all.
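
Aside (not part of the patch): the new test checks the net effect of the
extid filtering; the sketch below condenses the first-visit-of-a-fork path
added in this diff. `storage` stands for a swh.storage instance and `repo`
for an open Mercurial repository, both set up as in `HgLoaderFromDisk`; the
EXTID_TYPE/EXTID_VERSION values are assumptions mirroring the loader's
module-level constants.

    from typing import List

    EXTID_TYPE = "hg-nodeid"  # assumed value
    EXTID_VERSION = 1         # assumed value

    def revs_to_load_on_first_visit(storage, repo):
        all_revs = repo.revs("all()")
        hg_nodeids: List[bytes] = [repo[rev].node() for rev in all_revs]

        # keep only ExtIDs whose target revision really is in the archive
        extids = [
            extid
            for extid in storage.extid_get_from_extid(EXTID_TYPE, hg_nodeids)
            if extid.extid_version == EXTID_VERSION
        ]
        missing = set(
            storage.revision_missing([e.target.object_id for e in extids])
        )
        seen = [e.extid for e in extids if e.target.object_id not in missing]

        # Treat every already archived changeset as a known head: only the
        # revisions outside their ancestry remain, so a byte-identical fork
        # loads nothing and its first visit is "uneventful".
        seen_revs = [repo[nodeid].rev() for nodeid in seen]
        return repo.revs("not ::(%ld)", seen_revs)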
