diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -48,6 +48,14 @@ T = TypeVar("T") +class CorruptedRevision(ValueError): + """Raised when a revision is corrupted; ``hg_nodeid`` identifies it.""" + + def __init__(self, hg_nodeid: HgNodeId) -> None: + super().__init__(hg_nodeid.hex()) + self.hg_nodeid = hg_nodeid + + class HgDirectory(Directory): """A more practical directory. @@ -152,6 +160,8 @@ self._latest_heads: List[HgNodeId] = [] self._load_status = "eventful" + # When not None, visit_status() returns this instead of the parent's status + self._visit_status = None def pre_cleanup(self) -> None: """As a first step, will try and check for dangling data to cleanup. @@ -261,13 +271,25 @@ self._load_status = "uneventful" return + assert self._repo is not None + repo = self._repo + + blacklisted_revs: set = set() for rev in revs: - self.store_revision(self._repo[rev]) + if rev in blacklisted_revs: + continue + try: + self.store_revision(repo[rev]) + except CorruptedRevision as e: + self._visit_status = "partial" + self.log.warning("Corrupted revision %s", e) + descendents = repo.revs("(%ld)::", [rev]) + blacklisted_revs.update(descendents) branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { - hg_nodeid: name for name, hg_nodeid in hgutil.branches(self._repo).items() + hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() } - tags_by_name: Dict[bytes, HgNodeId] = self._repo.tags() + tags_by_name: Dict[bytes, HgNodeId] = repo.tags() tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { hg_nodeid: name for name, hg_nodeid in tags_by_name.items() } @@ -317,6 +339,12 @@ "status": self._load_status, } + def visit_status(self) -> str: + """Return the overridden visit status if one was set, else the parent's.""" + if self._visit_status is not None: + return self._visit_status + return super().visit_status() + def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: """Return the swhid of a revision given its hg 
nodeid. @@ -446,7 +474,18 @@ hg_nodeid = rev_ctx.node() file_ctx = rev_ctx[file_path] - file_nodeid = file_ctx.filenode() + try: + file_nodeid = file_ctx.filenode() + except hgutil.LookupError: + # TODO + # Raising CorruptedRevision avoids crashing the whole load + # but can lead to a lot of missing revisions. + # SkippedContent could be used but needs the content to compute its id. + # Maybe the hg_nodeid could be used instead. + # Another option would be to just ignore the missing content. + # This decision is left to future commits. + raise CorruptedRevision(hg_nodeid) + perms = FLAG_PERMS[file_ctx.flags()] # Key is file_nodeid + perms because permissions does not participate diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py --- a/swh/loader/mercurial/hgutil.py +++ b/swh/loader/mercurial/hgutil.py @@ -9,7 +9,7 @@ from typing import Dict, NewType # The internal Mercurial API is not guaranteed to be stable. -from mercurial import context, hg, smartset, util # type: ignore +from mercurial import context, error, hg, smartset, util # type: ignore import mercurial.ui # type: ignore NULLID = mercurial.node.nullid @@ -19,6 +19,7 @@ LRUCacheDict = util.lrucachedict HgSpanSet = smartset._spanset HgFilteredSet = smartset.filteredset +LookupError = error.LookupError def repository(path: str) -> hg.localrepo: diff --git a/swh/loader/mercurial/tests/data/missing-filelog.sh b/swh/loader/mercurial/tests/data/missing-filelog.sh new file mode 100755 --- /dev/null +++ b/swh/loader/mercurial/tests/data/missing-filelog.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +# TODO HG_REPO from $1 else from environment +if [ ! 
-z "${1:-}" ]; then + HG_REPO="$1" +fi + +# prepare repository +hg init "$HG_REPO" +cd "$HG_REPO" +cat > .hg/hgrc << EOL +[ui] +username = Full Name +EOL + +echo "foo" >> foo +hg add foo +hg commit -m "Add foo" + +echo "bar" >> bar +hg add bar +hg commit -m "Add bar" + +echo "fizz" >> fizz +hg add fizz +hg commit -m "Add fizz" + +# corrupt repository: delete the filelog of "bar" +rm .hg/store/data/bar.i diff --git a/swh/loader/mercurial/tests/data/missing-filelog.tgz b/swh/loader/mercurial/tests/data/missing-filelog.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@