diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -101,7 +101,33 @@ class HgLoaderFromDisk(BaseLoader): - """Load a mercurial repository from a local repository.""" + """Load a mercurial repository from a local repository. + + Mercurial's branching model is more complete than Git's; it allows for multiple + heads per branch, closed heads and bookmarks. The following mapping is used to + represent the branching state of a Mercurial project in a given snapshot: + + - `HEAD` (optional) either the node pointed by the `@` bookmark or the tip of + the `default` branch + - `branch-tip/<branch_name>` (required) the first head of the branch, sorted by + nodeid if there are multiple heads. + - `bookmarks/<bookmark_name>` (optional) holds the bookmarks mapping if any + - `branch-heads/<branch_name>/0..n` (optional) for any branch with multiple open + heads, list all *open* heads + - `branch-closed-heads/<branch_name>/0..n` (optional) for any branch with at least + one closed head, list all *closed* heads + - `tags/<tag_name>` (optional) record tags + + The format is not ambiguous regardless of branch name since we know it ends with a + `/<index>`, as long as we have a stable sorting of the heads (we sort by nodeid). + There may be some overlap between the refs, but it's simpler not to try to figure + out de-duplication. + However, to reduce the redundancy between snapshot branches in the most common case, + when a branch has a single open head, it will only be referenced as + `branch-tip/<branch_name>`. The `branch-heads/` hierarchy only appears when a branch + has multiple open heads, which we consistently sort by increasing nodeid. + The `branch-closed-heads/` hierarchy is also sorted by increasing nodeid.
+ """ CONFIG_BASE_FILENAME = "loader/mercurial" @@ -304,15 +330,6 @@ except KeyError: # the node does not exist anymore pass - # Mercurial can have more than one head per branch, so we need to exclude - # local heads that have already been loaded as revisions but don't - # correspond to a SnapshotBranch. - # In the future, if the SnapshotBranch model evolves to support multiple - # heads per branch (or anything else that fixes this issue) this might - # become useless. - extids = self.storage.extid_get_from_extid(EXTID_TYPE, repo.heads()) - known_heads = {extid.extid for extid in extids} - existing_heads.extend([repo[head].rev() for head in known_heads]) # select revisions that are not ancestors of heads # and not the heads themselves new_revs = repo.revs("not ::(%ld)", existing_heads) @@ -333,9 +350,9 @@ assert self._repo is not None repo = self._repo - blacklisted_revs: List[int] = [] + ignored_revs: Set[int] = set() for rev in revs: - if rev in blacklisted_revs: + if rev in ignored_revs: continue try: self.store_revision(repo[rev]) @@ -343,41 +360,68 @@ self._visit_status = "partial" self.log.warning("Corrupted revision %s", e) descendents = repo.revs("(%ld)::", [rev]) - blacklisted_revs.extend(descendents) + ignored_revs.update(descendents) - branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { - hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() - } + if len(ignored_revs) == len(revs): + # The repository is completely broken, nothing can be loaded + self._load_status = "uneventful" + return + + branching_info = hgutil.branching_info(repo, ignored_revs) tags_by_name: Dict[bytes, HgNodeId] = repo.tags() snapshot_branches: Dict[bytes, SnapshotBranch] = {} for tag_name, hg_nodeid in tags_by_name.items(): if tag_name == b"tip": - # tip is listed in the tags by the mercurial api - # but its not a tag defined by the user in `.hgtags` + # `tip` is listed in the tags by the Mercurial API but its not a tag + # defined by the user in `.hgtags`. 
continue if hg_nodeid not in self._saved_tags: - revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid) - snapshot_branches[tag_name] = SnapshotBranch( - target=self.store_release(tag_name, revision_sha1git), + label = b"tags/%s" % tag_name + target = self.get_revision_id_from_hg_nodeid(hg_nodeid) + snapshot_branches[label] = SnapshotBranch( + target=self.store_release(tag_name, target), target_type=TargetType.RELEASE, ) - for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items(): - if hg_nodeid in branch_by_hg_nodeid: - name = branch_by_hg_nodeid[hg_nodeid] + for branch_name, node_id in branching_info.tips.items(): + name = b"branch-tip/%s" % branch_name + target = self.get_revision_id_from_hg_nodeid(node_id) + snapshot_branches[name] = SnapshotBranch( + target=target, target_type=TargetType.REVISION + ) + + for bookmark_name, node_id in branching_info.bookmarks.items(): + name = b"bookmarks/%s" % bookmark_name + target = self.get_revision_id_from_hg_nodeid(node_id) + snapshot_branches[name] = SnapshotBranch( + target=target, target_type=TargetType.REVISION + ) + + for branch_name, branch_heads in branching_info.open_heads.items(): + for index, head in enumerate(branch_heads): + name = b"branch-heads/%s/%d" % (branch_name, index) + target = self.get_revision_id_from_hg_nodeid(head) snapshot_branches[name] = SnapshotBranch( - target=revision_sha1git, target_type=TargetType.REVISION, + target=target, target_type=TargetType.REVISION ) - # The tip is mapped to `HEAD` to match - # the historical implementation - if hg_nodeid == tags_by_name[b"tip"]: - snapshot_branches[b"HEAD"] = SnapshotBranch( - target=name, target_type=TargetType.ALIAS, + for branch_name, closed_heads in branching_info.closed_heads.items(): + for index, head in enumerate(closed_heads): + name = b"branch-closed-heads/%s/%d" % (branch_name, index) + target = self.get_revision_id_from_hg_nodeid(head) + snapshot_branches[name] = SnapshotBranch( + target=target, 
target_type=TargetType.REVISION ) + # If the repo is broken enough or if it has none of the "normal" default + # mechanisms, we ignore `HEAD`. + default_branch_alias = branching_info.default_branch_alias + if default_branch_alias is not None: + snapshot_branches[b"HEAD"] = SnapshotBranch( + target=default_branch_alias, target_type=TargetType.ALIAS, + ) snapshot = Snapshot(branches=snapshot_branches) self.storage.snapshot_add([snapshot]) diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py --- a/swh/loader/mercurial/hgutil.py +++ b/swh/loader/mercurial/hgutil.py @@ -2,18 +2,19 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from collections import defaultdict +from dataclasses import dataclass import io import os import signal import time import traceback -from typing import Dict, NewType +from typing import Dict, List, Mapping, NewType, Optional, Set from billiard import Process, Queue # The internal Mercurial API is not guaranteed to be stable. 
-from mercurial import context, error, hg, smartset, util # type: ignore +from mercurial import bookmarks, context, error, hg, smartset, util # type: ignore import mercurial.ui # type: ignore NULLID = mercurial.node.nullid @@ -31,12 +32,80 @@ return hg.repository(ui, path.encode()) -def branches(repo: hg.localrepo) -> Dict[bytes, HgNodeId]: - """List repository named branches and their tip node.""" - result = {} - for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): - result[tag] = tip - return result +@dataclass +class BranchingInfo: + tips: Mapping[bytes, HgNodeId] + """The first head of the branch, sorted by nodeid if there are multiple heads.""" + bookmarks: Mapping[bytes, HgNodeId] + """all bookmarks in the repository (except local divergent ones)""" + open_heads: Mapping[bytes, List[HgNodeId]] + """All *open* heads of a given branch, sorted by nodeid""" + closed_heads: Mapping[bytes, List[HgNodeId]] + """All *closed* heads of a given branch, sorted by nodeid, if any""" + default_branch_alias: Optional[bytes] + """The default snapshot branch to show in the UI""" + + +def branching_info(repo: hg.localrepo, ignored: Set[int]) -> BranchingInfo: + """Lists all relevant information about branch heads and bookmarks, grouped by type. + + `ignored`: Revisions that we ignore during loading because they are corrupted or + have a corrupted ancestor. + + Categories may have overlapping nodes: a branch tip can be a closed branch head + and have a bookmark on it, for example. + """ + branch_tips: Dict[bytes, HgNodeId] = {} + branch_open_heads = defaultdict(list) + branch_closed_heads = defaultdict(list) + all_bookmarks = bookmarks.listbookmarks(repo) + + for branch_name, heads in repo.branchmap().items(): + # Sort the heads by node id since it's stable and doesn't depend on local + # topology like cloning order. 
+ for head in sorted(heads): + head = repo[head] + if head.rev() in ignored: + # This revision or one of its ancestors is corrupted, ignore it + continue + node_id = head.node() + if head.closesbranch(): + branch_closed_heads[branch_name].append(node_id) + else: + if not branch_tips.get(branch_name): + branch_tips[branch_name] = node_id + branch_open_heads[branch_name].append(node_id) + + # The default revision is where the "@" bookmark is, or failing that the tip of the + # `default` branch. For our purposes we're trying to find a branch tip to alias to, + # so only return those if they are branch tips, otherwise don't bother. + default_rev_alias = None + at_bookmark = all_bookmarks.get(b"@") + if at_bookmark is not None: + bookmark_at_branch = repo[at_bookmark].branch() + if branch_tips.get(bookmark_at_branch) is not None: + default_rev_alias = b"bookmarks/@" + if default_rev_alias is None and branch_tips.get(b"default") is not None: + default_rev_alias = b"branch-tip/default" + + branches_with_one_head = set() + for branch, heads in branch_open_heads.items(): + if len(heads) == 1: + branches_with_one_head.add(branch) + + # The most common case is one head per branch. Simplifying this means we have + # less duplicate data, because open heads are the same as open branch tips. + # We don't do more complex deduplication, this is just a simple optimization. 
+ for branch in branches_with_one_head: + del branch_open_heads[branch] + + return BranchingInfo( + tips=branch_tips, + bookmarks=all_bookmarks, + open_heads=branch_open_heads, + closed_heads=branch_closed_heads, + default_branch_alias=default_rev_alias, + ) class CloneTimeout(Exception): diff --git a/swh/loader/mercurial/tests/data/example.json b/swh/loader/mercurial/tests/data/example.json --- a/swh/loader/mercurial/tests/data/example.json +++ b/swh/loader/mercurial/tests/data/example.json @@ -1 +1 @@ -{"directories": ["048960a9eff9a9f22ce2fc2e2bc9b5f73cdfc26a", "09a1bb68db049b4e37540e52ebde76f59126b3a8", "0dad640e1eb9f31cb9d874158318f1f180be9b3a", "181a22e7ad8bbad9bb5846f51c377a7597a0c914", "218ccb1594f7026492c72309974b44aba353d7dc", "93e88b135dc8c3420cd4984e21d8d1eb2781ddce", "d476a11ddfcfce07236a0a03f78e3c1a73bc20ae", "ecf37a29314efe473b399b700c7e5eacc063ba6e", "fa5e6af79e30fc26ab4acbd96388fde22b4c2f36"], "revisions": ["1171aa960a675f8b8327199ff084b6e7c879361d", "23459c9c498542cde67d8d130bc4c0b3084edf5f", "486d227d252ee8a3a01ef40348964f68e21018a5", "65de9d553502aa1f1cb20df179a6ea04e6d2039e", "8f392d7f64419bf4672a75a07f61ce243a7f2c67", "acd77cd84bfca51b3d4f928109a9de52a45618f2", "bb1befca13ceb1a8ebde25cec05966be3eed9bca", "bf91ae31bdb938c2927e741b53af815380340ea7", "c88ea3f5892a5e726739a10eca3afe5d7fa648ce"], "releases": [], "snapshot": "ac42dfd0f2308197c5b6e5653ad13c8da23d5040"} +{"directories": ["048960a9eff9a9f22ce2fc2e2bc9b5f73cdfc26a", "09a1bb68db049b4e37540e52ebde76f59126b3a8", "0dad640e1eb9f31cb9d874158318f1f180be9b3a", "181a22e7ad8bbad9bb5846f51c377a7597a0c914", "218ccb1594f7026492c72309974b44aba353d7dc", "93e88b135dc8c3420cd4984e21d8d1eb2781ddce", "d476a11ddfcfce07236a0a03f78e3c1a73bc20ae", "ecf37a29314efe473b399b700c7e5eacc063ba6e", "fa5e6af79e30fc26ab4acbd96388fde22b4c2f36"], "revisions": ["1171aa960a675f8b8327199ff084b6e7c879361d", "23459c9c498542cde67d8d130bc4c0b3084edf5f", "486d227d252ee8a3a01ef40348964f68e21018a5", 
"65de9d553502aa1f1cb20df179a6ea04e6d2039e", "8f392d7f64419bf4672a75a07f61ce243a7f2c67", "acd77cd84bfca51b3d4f928109a9de52a45618f2", "bb1befca13ceb1a8ebde25cec05966be3eed9bca", "bf91ae31bdb938c2927e741b53af815380340ea7", "c88ea3f5892a5e726739a10eca3afe5d7fa648ce"], "releases": [], "snapshot": "3c636151bf2fc069e5f933840e7b9444b02c1965"} diff --git a/swh/loader/mercurial/tests/data/hello.json b/swh/loader/mercurial/tests/data/hello.json --- a/swh/loader/mercurial/tests/data/hello.json +++ b/swh/loader/mercurial/tests/data/hello.json @@ -1 +1 @@ -{"directories": ["43d727f2f3f2f7cb3b098ddad1d7038464a4cee2", "8f2be433c945384c85920a8e60f2a68d2c0f20fb", "b3f85f210ff86d334575f64cb01c5bf49895b63e"], "revisions": ["8dd3db5d5519e4947f035d141581d304565372d2", "93b48d515580522a05f389bec93227fc8e43d940", "c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27"], "releases": ["515c4d72e089404356d0f4b39d60f948b8999140"], "snapshot": "d35668e02e2ba4321dc951cd308cf883786f918a"} \ No newline at end of file +{"directories": ["43d727f2f3f2f7cb3b098ddad1d7038464a4cee2", "8f2be433c945384c85920a8e60f2a68d2c0f20fb", "b3f85f210ff86d334575f64cb01c5bf49895b63e"], "revisions": ["8dd3db5d5519e4947f035d141581d304565372d2", "93b48d515580522a05f389bec93227fc8e43d940", "c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27"], "releases": ["515c4d72e089404356d0f4b39d60f948b8999140"], "snapshot": "7ef082aa8b53136b1bed97f734504be32679bbec"} diff --git a/swh/loader/mercurial/tests/data/multiple-heads.json b/swh/loader/mercurial/tests/data/multiple-heads.json new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/data/multiple-heads.json @@ -0,0 +1 @@ +{"directories": ["496d6428b9cf92981dc9495211e6e1120fb6f2ba", "296e56023cdc034d2735fee8c0d85a659d1b07f4", "765b32c65d38f04c4f287abda055818ec0f26912", "bd1709e84a443bbf3246f89d324ef33445fb5363"], "revisions": ["b36711a46efd2e3d7e63955b2a63308cfa7e2fb4", "374ac7264eee97142146386bfebb0a1336d4d159", "ad9d756cc3613b142f6fbdc05b27433398085a49", 
"530ddb720ec9b2f6baee64e572ffed746c85476e"], "releases": [], "snapshot": "5fda0ffbb7a530ffb975e7f80a448650b0ed8171"} \ No newline at end of file diff --git a/swh/loader/mercurial/tests/data/multiple-heads.sh b/swh/loader/mercurial/tests/data/multiple-heads.sh new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/data/multiple-heads.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -eox pipefail + +# TODO HG_REPO from $1 else from environment +if [ -n "$1" ]; then + HG_REPO="$1" +fi + +# prepare repository +hg init "$HG_REPO" +cd "$HG_REPO" +cat > .hg/hgrc << EOL +[ui] +username = Full Name +EOL + +touch a +hg commit -Aqm "Initial commit" + +touch b +hg commit -Aqm "Forking point" + +touch c +hg commit -Aqm "First head" + +# Go to the forking point +hg up 1 -q + +touch d +hg commit -Aqm "Second head" diff --git a/swh/loader/mercurial/tests/data/multiple-heads.tgz b/swh/loader/mercurial/tests/data/multiple-heads.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ None: """Removes `revset` and all of their descendants from the local repository.""" # Previously called `hg strip`, it was renamed to `hg debugstrip` in Mercurial 5.7