
D6240.id22652.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -289,6 +289,25 @@
         ]
         return extids
 
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+        """Get all Mercurial ExtIDs for the mercurial nodes in the list."""
+        extids = [
+            extid
+            for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
+            if extid.extid_version == EXTID_VERSION
+        ]
+        if extids:
+            # Filter out dangling extids; their targets must be loaded again.
+            # revision_missing returns an iterable, so materialize it before
+            # the repeated membership tests below.
+            revisions_missing = set(
+                self.storage.revision_missing(
+                    [extid.target.object_id for extid in extids]
+                )
+            )
+            extids = [
+                extid
+                for extid in extids
+                if extid.target.object_id not in revisions_missing
+            ]
+        return extids
+
     def fetch_data(self) -> bool:
         """Fetch the data from the source the loader is currently loading
@@ -317,28 +336,46 @@
         return False
 
+    def _new_revs(self, heads: List[bytes]) -> Union[HgFilteredSet, HgSpanSet]:
+        """Return unseen revisions, i.e. filter out the revisions that are
+        already ancestors of the given heads."""
+        assert self._repo is not None
+        existing_heads = []
+
+        for hg_nodeid in heads:
+            try:
+                rev = self._repo[hg_nodeid].rev()
+                existing_heads.append(rev)
+            except KeyError:  # the node does not exist anymore
+                pass
+
+        # select revisions that are not ancestors of heads
+        # and not the heads themselves
+        new_revs = self._repo.revs("not ::(%ld)", existing_heads)
+
+        if new_revs:
+            self.log.info("New revisions found: %d", len(new_revs))
+
+        return new_revs
+
     def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
         """Return the hg revision numbers to load."""
         assert self._repo is not None
         repo: hgutil.Repository = self._repo
+
         if self._latest_heads:
-            existing_heads = []  # heads that still exist in the repository
-            for hg_nodeid in self._latest_heads:
-                try:
-                    rev = repo[hg_nodeid].rev()
-                    existing_heads.append(rev)
-                except KeyError:  # the node does not exist anymore
-                    pass
-
-            # select revisions that are not ancestors of heads
-            # and not the heads themselves
-            new_revs = repo.revs("not ::(%ld)", existing_heads)
-
-            if new_revs:
-                self.log.info("New revisions found: %d", len(new_revs))
-            return new_revs
+            # Reuse the heads seen at the last snapshot
+            new_revs = self._new_revs(self._latest_heads)
         else:
-            return repo.revs("all()")
+            # Otherwise, filter out all revisions already seen across hg
+            # origins (through extids)
+            all_revs = repo.revs("all()")
+            hg_nodeids = [repo[nodeid].node() for nodeid in all_revs]
+            new_revs = self._new_revs(
+                [extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]
+            )
+
+        return new_revs
 
     def store_data(self):
         """Store fetched data in the database."""
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -164,7 +164,7 @@
     check_snapshot(expected_snapshot, loader.storage)
 
     stats = get_stats(loader.storage)
-    assert stats == {
+    expected_stats = {
         "content": 2,
         "directory": 3,
         "origin": 1,
@@ -175,6 +175,20 @@
"snapshot": 1,
}
+ assert stats == expected_stats
+ loader2 = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+ assert loader2.load() == {"status": "uneventful"}
+
+ stats2 = get_stats(loader2.storage)
+ expected_stats2 = expected_stats.copy()
+ expected_stats2.update(
+ {"origin_visit": 1 + 1,}
+ )
+ assert stats2 == expected_stats2
+ # FIXME: Already seen objects are filtered out, so no new snapshot.
+ # Current behavior but is it ok?
+
# This test has as been adapted from the historical `HgBundle20Loader` tests
# to ensure compatibility of `HgLoaderFromDisk`.
@@ -670,3 +684,61 @@
     loader = HgLoaderFromDisk(swh_storage, repo_path)
     assert loader.load() == {"status": "uneventful"}
+
+
+def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
+    """The first visit of a fork should filter out already seen revisions
+    (through extids)."""
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+    assert loader.load() == {"status": "eventful"}
+    stats = get_stats(loader.storage)
+    expected_stats = {
+        "content": 2,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 58,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert stats == expected_stats
+
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="hg",
+    )
+
+    # Make a fork of the first repository we ingested
+    fork_url = prepare_repository_from_archive(
+        archive_path, "the-sandbox-reloaded", tmp_path
+    )
+    loader2 = HgLoaderFromDisk(
+        swh_storage, url=fork_url, directory=str(tmp_path / archive_name)
+    )
+
+    assert loader2.load() == {"status": "uneventful"}
+
+    stats2 = get_stats(loader.storage)
+    expected_stats2 = expected_stats.copy()
+    expected_stats2.update({"origin": 1 + 1, "origin_visit": 1 + 1})
+    assert stats2 == expected_stats2
+
+    visit_status2 = assert_last_visit_matches(
+        loader.storage, fork_url, status="full", type="hg",
+    )
+
+    assert visit_status.snapshot is not None
+    assert visit_status2.snapshot is None
+
+    # FIXME: filtering out already seen data is consistent with what happens
+    # across visits of a single origin, but here the fork origin gets no
+    # snapshot at all. It probably should get one; otherwise a fork can end
+    # up with no snapshot whatsoever.
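
Aside: the asymmetry the FIXME describes can be checked directly against storage. A sketch reusing the fixtures from the test above; snapshot_get_latest is the standard swh.storage helper and nothing below is part of the diff.

    from swh.storage.algos.snapshot import snapshot_get_latest

    # The first origin got a snapshot from its eventful visit...
    assert snapshot_get_latest(loader.storage, repo_url) is not None
    # ...while the fork, whose revisions were all filtered out via extids,
    # ends up with no snapshot at all; this is the behavior the FIXME
    # questions.
    assert snapshot_get_latest(loader2.storage, fork_url) is None
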
