diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -217,11 +217,12 @@ for revision in self.storage.revision_get(snapshot_branches) if revision ] - self._latest_heads = [ - hash_to_bytes(revision.metadata["node"]) - for revision in revisions - if revision.metadata - ] + for revision in revisions: + if not revision.metadata: + continue + hg_nodeid = hash_to_bytes(revision.metadata["node"]) + self._latest_heads.append(hg_nodeid) + self._revision_nodeid_to_swhid[hg_nodeid] = revision.id def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading @@ -265,15 +266,9 @@ # select revisions that are not ancestors of heads # and not the heads themselves - new_revs = repo.revs("not ::(%ld)", existing_heads) - - # for now, reload all revisions if there are new commits - # otherwise the loader will crash on missing parents - # incremental loading will come in next commits - if new_revs: - return repo.revs("all()") - else: - return new_revs + revs = repo.revs("not ::(%ld)", existing_heads) + self.log.info(f"New revisions: {len(revs)}") + return revs else: return repo.revs("all()") @@ -340,7 +335,7 @@ "status": self._load_status, } - def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: + def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Optional[Sha1Git]: """Return the swhid of a revision given its hg nodeid. Args: @@ -349,7 +344,9 @@ Returns: the swhid of the revision. """ - return self._revision_nodeid_to_swhid[hg_nodeid] + # TODO load from storage when lookup by hg node id is available + # Need: https://forge.softwareheritage.org/T2849 + return self._revision_nodeid_to_swhid.get(hg_nodeid) def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]: """Return the swhids of the parent revisions. 
@@ -363,10 +360,20 @@ parents = [] for parent_ctx in rev_ctx.parents(): parent_hg_nodeid = parent_ctx.node() + # nullid is the value of a parent that does not exist if parent_hg_nodeid == hgutil.NULLID: continue - parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)) + + sha1_git = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid) + + # parent not in storage from previous run + # and not previously loaded in current run + if sha1_git is None: + hg_nodeid = parent_hg_nodeid.hex() + raise ValueError(f"Cannot load revision with hg nodeid: {hg_nodeid}") + + parents.append(sha1_git) return tuple(parents) diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py --- a/swh/loader/mercurial/tests/test_from_disk.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import os +import subprocess from datetime import datetime from hashlib import sha1 @@ -22,6 +23,12 @@ from .loader_checker import ExpectedSwhids, LoaderChecker +def hg_strip(repo: str, rev: str) -> None: + subprocess.check_call( + ["hg", "--config", "extensions.strip=", "strip", rev], cwd=repo + ) + + def random_content() -> Content: """Create minimal content object.""" data = str(datetime.now()).encode() @@ -275,3 +282,42 @@ "skipped_content": 0, "snapshot": 1, } + + +def test_load_repo_with_new_commits(swh_config, datadir, tmp_path): + archive_name = "hello" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + json_path = os.path.join(datadir, f"{archive_name}.json") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + + # first load with missing commits + hg_strip(repo_url.replace("file://", ""), "tip") + loader = HgLoaderFromDisk(repo_url) + assert loader.load() == {"status": "eventful"} + assert get_stats(loader.storage) == { + "content": 2, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 0, + "revision": 2, + 
"skipped_content": 0, + "snapshot": 1, + } + + # second load with all commits + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + loader = HgLoaderFromDisk(repo_url) + LoaderChecker( + loader=HgLoaderFromDisk(repo_url), expected=ExpectedSwhids.load(json_path), + ).check() + assert get_stats(loader.storage) == { + "content": 3, + "directory": 3, + "origin": 1, + "origin_visit": 2, + "release": 1, + "revision": 3, + "skipped_content": 0, + "snapshot": 2, + }