diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
 import os
 from shutil import rmtree
 from tempfile import mkdtemp
-from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union
+from typing import Deque, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union
 
 from swh.loader.core.loader import BaseLoader
 from swh.loader.core.utils import clean_dangling_folders
@@ -36,7 +36,7 @@
 from . import hgutil
 from .archive_extract import tmp_extract
-from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet
+from .hgutil import HgNodeId
 
 FLAG_PERMS = {
     b"l": DentryPerms.symlink,
@@ -289,6 +289,28 @@
         ]
         return extids
 
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+        """Return all Mercurial ExtIDs for the mercurial nodes in the list that point
+        to a known revision.
+
+        """
+        extids = [
+            extid
+            for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
+            if extid.extid_version == EXTID_VERSION
+        ]
+        if extids:
+            # Filter out dangling extids; we need to load their targets again
+            revisions_missing = self.storage.revision_missing(
+                [extid.target.object_id for extid in extids]
+            )
+            extids = [
+                extid
+                for extid in extids
+                if extid.target.object_id not in revisions_missing
+            ]
+        return extids
+
     def fetch_data(self) -> bool:
         """Fetch the data from the source the loader is currently loading
 
@@ -317,35 +339,55 @@
 
         return False
 
-    def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
-        """Return the hg revision numbers to load."""
+    def _new_revs(self, heads: List[bytes]):
+        """Return unseen revisions. That is, filter out revisions that are ancestors
+        of the given heads, as well as the heads themselves."""
+        assert self._repo is not None
+        existing_heads = []
+
+        for hg_nodeid in heads:
+            try:
+                rev = self._repo[hg_nodeid].rev()
+                existing_heads.append(rev)
+            except KeyError:  # the node does not exist anymore
+                pass
+
+        # select revisions that are not ancestors of heads
+        # and not the heads themselves
+        new_revs = self._repo.revs("not ::(%ld)", existing_heads)
+
+        if new_revs:
+            self.log.info("New revisions found: %d", len(new_revs))
+
+        return new_revs
+
+    def get_hg_revs_to_load(self) -> Iterator[int]:
+        """Yield hg revision numbers to load.
+
+        """
         assert self._repo is not None
         repo: hgutil.Repository = self._repo
+
+        seen_revs: Set[int] = set()
+        # 1. use the heads of the latest snapshot to filter out already seen revisions
         if self._latest_heads:
-            existing_heads = []  # heads that still exist in the repository
-            for hg_nodeid in self._latest_heads:
-                try:
-                    rev = repo[hg_nodeid].rev()
-                    existing_heads.append(rev)
-                except KeyError:  # the node does not exist anymore
-                    pass
-
-            # select revisions that are not ancestors of heads
-            # and not the heads themselves
-            new_revs = repo.revs("not ::(%ld)", existing_heads)
-
-            if new_revs:
-                self.log.info("New revisions found: %d", len(new_revs))
-            return new_revs
-        else:
-            return repo.revs("all()")
+            for rev in self._new_revs(self._latest_heads):
+                seen_revs.add(rev)
+                yield rev
+
+        # 2. Then filter out the remaining revisions through the extid mappings
+        # shared across all hg origins
+        revs_left = repo.revs("all() - ::(%ld)", seen_revs)
+        hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
+        for rev in self._new_revs(
+            [extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]
+        ):
+            yield rev
 
     def store_data(self):
         """Store fetched data in the database."""
         revs = self.get_hg_revs_to_load()
-        if not revs:
-            self._load_status = "uneventful"
-            return
+        length_ingested_revs = 0
 
         assert self._repo is not None
         repo = self._repo
@@ -356,14 +398,15 @@
                 continue
             try:
                 self.store_revision(repo[rev])
+                length_ingested_revs += 1
             except CorruptedRevision as e:
                 self._visit_status = "partial"
                 self.log.warning("Corrupted revision %s", e)
                 descendents = repo.revs("(%ld)::", [rev])
                 ignored_revs.update(descendents)
 
-        if len(ignored_revs) == len(revs):
-            # The repository is completely broken, nothing can be loaded
+        if length_ingested_revs == 0:
+            # The repository has nothing to ingest (it is either empty or broken)
             self._load_status = "uneventful"
             return
 
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -685,3 +685,61 @@
     loader = HgLoaderFromDisk(swh_storage, repo_path)
 
     assert loader.load() == {"status": "uneventful"}
+
+
+def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
+    """The first visit of a fork should filter out already seen revisions (via extids)
+
+    """
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+    assert loader.load() == {"status": "eventful"}
+    stats = get_stats(loader.storage)
+    expected_stats = {
+        "content": 2,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 58,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert stats == expected_stats
+
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="hg",
+    )
+
+    # Make a fork of the first repository we ingested
+    fork_url = prepare_repository_from_archive(
+        archive_path, "the-sandbox-reloaded", tmp_path
+    )
+    loader2 = HgLoaderFromDisk(
+        swh_storage, url=fork_url, directory=str(tmp_path / archive_name)
+    )
+
+    assert loader2.load() == {"status": "uneventful"}
+
+    stats = get_stats(loader.storage)
+    expected_stats2 = expected_stats.copy()
+    expected_stats2.update(
+        {"origin": 1 + 1, "origin_visit": 1 + 1,}
+    )
+    assert stats == expected_stats2
+
+    visit_status2 = assert_last_visit_matches(
+        loader.storage, fork_url, status="full", type="hg",
+    )
+
+    assert visit_status.snapshot is not None
+    assert visit_status2.snapshot is None
+
+    # FIXME: Filtering out already seen revisions is consistent with the behavior of
+    # repeated visits of a single origin, but it leaves the fork origin with no
+    # snapshot at all. The fork should probably still get a snapshot; otherwise a
+    # fork whose revisions are all already known would never reference one.
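
For reviewers unfamiliar with revsets, here is a small self-contained sketch (not part of the patch) of the two-pass selection that get_hg_revs_to_load() now performs, restated over a toy DAG of integer revisions. The dict-based repository, the closure() helper standing in for the ::(%ld) revset, and the restriction of the second pass to not-yet-yielded revisions are all simplifications invented for this illustration.

from typing import Dict, Iterator, List, Set


def closure(parents: Dict[int, List[int]], heads: List[int]) -> Set[int]:
    # Stand-in for the hg revset "::heads": the heads plus all their ancestors.
    todo, reached = list(heads), set()  # type: List[int], Set[int]
    while todo:
        rev = todo.pop()
        if rev not in reached:
            reached.add(rev)
            todo.extend(parents.get(rev, []))
    return reached


def revs_to_load(
    parents: Dict[int, List[int]],  # toy repo: revision -> parent revisions
    latest_heads: List[int],        # heads recorded in the previous snapshot
    extid_known: Set[int],          # revisions already loaded via other origins
) -> Iterator[int]:
    all_revs = set(parents)
    seen: Set[int] = set()
    # pass 1: with a previous snapshot, yield everything outside ::(heads)
    if latest_heads:
        seen = all_revs - closure(parents, latest_heads)
        yield from sorted(seen)
    # pass 2: of the revisions left over, drop those already covered by
    # extid-known revisions (loaded through some other origin)
    remaining = all_revs - seen
    known_heads = [rev for rev in remaining if rev in extid_known]
    yield from sorted(remaining - closure(parents, known_heads))


# Linear history 0 <- 1 <- 2 <- 3. First visit of a fork: no previous
# snapshot, but revision 2 (hence 0 and 1 as well) was already loaded
# through another origin, so only revision 3 remains to load.
parents = {0: [], 1: [0], 2: [1], 3: [2]}
assert list(revs_to_load(parents, latest_heads=[], extid_known={2})) == [3]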