Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines | ) -> Optional[Union[Content, "HgDirectory", T]]: | ||||
# TODO move to swh.model.from_disk.Directory | # TODO move to swh.model.from_disk.Directory | ||||
try: | try: | ||||
return self[path] | return self[path] | ||||
except KeyError: | except KeyError: | ||||
return default | return default | ||||
class HgLoaderFromDisk(BaseLoader): | class HgLoaderFromDisk(BaseLoader): | ||||
"""Load a mercurial repository from a local repository.""" | """Load a mercurial repository from a local repository. | ||||
Mercurial's branching model is more complete than Git's; it allows for multiple | |||||
heads per branch, closed heads and bookmarks. The following mapping is used to | |||||
represent the branching state of a Mercurial project in a given snapshot: | |||||
olasd: I guess it's not required anymore ! | |||||
- `HEAD` (optional) either the node pointed by the `@` bookmark or the tip of | |||||
the `default` branch | |||||
Done Inline Actions: I see that "tipmost head" is a Mercurial concept readily referenced in its documentation. If you can describe it in a few words, it would help make the description more generally understandable. olasd: I see that "tipmost head" is a Mercurial concept readily referenced in its documentation. If… | |||||
- `branch-tip/<branch-name>` (required) the first head of the branch, sorted by | |||||
Done Inline Actions: extra space olasd: extra space | |||||
nodeid if there are multiple heads. | |||||
- `bookmarks/<bookmark_name>` (optional) holds the bookmarks mapping if any | |||||
- `branch-heads/<branch_name>/0..n` (optional) for any branch with multiple open | |||||
heads, list all *open* heads | |||||
- `branch-closed-heads/<branch_name>/0..n` (optional) for any branch with at least | |||||
one closed head, list all *closed* heads | |||||
- `tags/<tag-name>` (optional) record tags | |||||
The format is not ambiguous regardless of branch name since we know it ends with a | |||||
`/<index>`, as long as we have a stable sorting of the heads (we sort by nodeid). | |||||
There may be some overlap between the refs, but it's simpler not to try to figure | |||||
out de-duplication. | |||||
However, to reduce the redundancy between snapshot branches in the most common case, | |||||
when a branch has a single open head, it will only be referenced as | |||||
Done Inline Actions: "To reduce the redundancy between snapshot branches in the most common case, when a branch has a single open head, it will only be referenced as branch-tip/<branch-name>. The branch-heads/ hierarchy only appears when a branch has multiple open heads, which we consistently sort by increasing nodeid. The branch-closed-heads/ hierarchy is also sorted by increasing nodeid." olasd: "To reduce the redundancy between snapshot branches in the most common case, when a branch has… | |||||
`branch-tip/<branch-name>`. The `branch-heads/` hierarchy only appears when a branch | |||||
has multiple open heads, which we consistently sort by increasing nodeid. | |||||
The `branch-closed-heads/` hierarchy is also sorted by increasing nodeid. | |||||
""" | |||||
CONFIG_BASE_FILENAME = "loader/mercurial" | CONFIG_BASE_FILENAME = "loader/mercurial" | ||||
visit_type = "hg" | visit_type = "hg" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
▲ Show 20 Lines • Show All 186 Lines • ▼ Show 20 Lines | def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]: | ||||
existing_heads = [] # heads that still exist in the repository | existing_heads = [] # heads that still exist in the repository | ||||
for hg_nodeid in self._latest_heads: | for hg_nodeid in self._latest_heads: | ||||
try: | try: | ||||
rev = repo[hg_nodeid].rev() | rev = repo[hg_nodeid].rev() | ||||
existing_heads.append(rev) | existing_heads.append(rev) | ||||
except KeyError: # the node does not exist anymore | except KeyError: # the node does not exist anymore | ||||
pass | pass | ||||
# Mercurial can have more than one head per branch, so we need to exclude | |||||
# local heads that have already been loaded as revisions but don't | |||||
# correspond to a SnapshotBranch. | |||||
# In the future, if the SnapshotBranch model evolves to support multiple | |||||
# heads per branch (or anything else that fixes this issue) this might | |||||
# become useless. | |||||
extids = self.storage.extid_get_from_extid(EXTID_TYPE, repo.heads()) | |||||
known_heads = {extid.extid for extid in extids} | |||||
existing_heads.extend([repo[head].rev() for head in known_heads]) | |||||
# select revisions that are not ancestors of heads | # select revisions that are not ancestors of heads | ||||
# and not the heads themselves | # and not the heads themselves | ||||
new_revs = repo.revs("not ::(%ld)", existing_heads) | new_revs = repo.revs("not ::(%ld)", existing_heads) | ||||
if new_revs: | if new_revs: | ||||
self.log.info("New revisions found: %d", len(new_revs)) | self.log.info("New revisions found: %d", len(new_revs)) | ||||
return new_revs | return new_revs | ||||
else: | else: | ||||
return repo.revs("all()") | return repo.revs("all()") | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
if not revs: | if not revs: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
return | return | ||||
assert self._repo is not None | assert self._repo is not None | ||||
repo = self._repo | repo = self._repo | ||||
blacklisted_revs: List[int] = [] | ignored_revs: Set[int] = set() | ||||
for rev in revs: | for rev in revs: | ||||
if rev in blacklisted_revs: | if rev in ignored_revs: | ||||
continue | continue | ||||
try: | try: | ||||
self.store_revision(repo[rev]) | self.store_revision(repo[rev]) | ||||
except CorruptedRevision as e: | except CorruptedRevision as e: | ||||
self._visit_status = "partial" | self._visit_status = "partial" | ||||
self.log.warning("Corrupted revision %s", e) | self.log.warning("Corrupted revision %s", e) | ||||
descendents = repo.revs("(%ld)::", [rev]) | descendents = repo.revs("(%ld)::", [rev]) | ||||
blacklisted_revs.extend(descendents) | ignored_revs.update(descendents) | ||||
branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { | if len(ignored_revs) == len(revs): | ||||
hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() | # The repository is completely broken, nothing can be loaded | ||||
} | self._load_status = "uneventful" | ||||
return | |||||
branching_info = hgutil.branching_info(repo, ignored_revs) | |||||
tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | ||||
Done Inline Actions: I wouldn't mind blacklisted_revs being renamed to ignored_revs olasd: I wouldn't mind `blacklisted_revs` being renamed to `ignored_revs` | |||||
snapshot_branches: Dict[bytes, SnapshotBranch] = {} | snapshot_branches: Dict[bytes, SnapshotBranch] = {} | ||||
for tag_name, hg_nodeid in tags_by_name.items(): | for tag_name, hg_nodeid in tags_by_name.items(): | ||||
if tag_name == b"tip": | if tag_name == b"tip": | ||||
# tip is listed in the tags by the mercurial api | # `tip` is listed in the tags by the Mercurial API but it's not a tag | ||||
# but its not a tag defined by the user in `.hgtags` | # defined by the user in `.hgtags`. | ||||
continue | continue | ||||
if hg_nodeid not in self._saved_tags: | if hg_nodeid not in self._saved_tags: | ||||
revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid) | label = b"tags/%s" % tag_name | ||||
snapshot_branches[tag_name] = SnapshotBranch( | target = self.get_revision_id_from_hg_nodeid(hg_nodeid) | ||||
target=self.store_release(tag_name, revision_sha1git), | snapshot_branches[label] = SnapshotBranch( | ||||
target=self.store_release(tag_name, target), | |||||
target_type=TargetType.RELEASE, | target_type=TargetType.RELEASE, | ||||
) | ) | ||||
for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items(): | for branch_name, node_id in branching_info.tips.items(): | ||||
if hg_nodeid in branch_by_hg_nodeid: | name = b"branch-tip/%s" % branch_name | ||||
name = branch_by_hg_nodeid[hg_nodeid] | target = self.get_revision_id_from_hg_nodeid(node_id) | ||||
snapshot_branches[name] = SnapshotBranch( | snapshot_branches[name] = SnapshotBranch( | ||||
target=revision_sha1git, target_type=TargetType.REVISION, | target=target, target_type=TargetType.REVISION | ||||
) | ) | ||||
# The tip is mapped to `HEAD` to match | for bookmark_name, node_id in branching_info.bookmarks.items(): | ||||
# the historical implementation | name = b"bookmarks/%s" % bookmark_name | ||||
if hg_nodeid == tags_by_name[b"tip"]: | target = self.get_revision_id_from_hg_nodeid(node_id) | ||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | snapshot_branches[name] = SnapshotBranch( | ||||
target=name, target_type=TargetType.ALIAS, | target=target, target_type=TargetType.REVISION | ||||
) | ) | ||||
for branch_name, branch_heads in branching_info.open_heads.items(): | |||||
for index, head in enumerate(branch_heads): | |||||
Done Inline Actions: Doesn't bytes formatting support %d? olasd: Doesn't bytes formatting support `%d`? | |||||
name = b"branch-heads/%s/%d" % (branch_name, index) | |||||
target = self.get_revision_id_from_hg_nodeid(head) | |||||
snapshot_branches[name] = SnapshotBranch( | |||||
target=target, target_type=TargetType.REVISION | |||||
) | |||||
for branch_name, closed_heads in branching_info.closed_heads.items(): | |||||
for index, head in enumerate(closed_heads): | |||||
name = b"branch-closed-heads/%s/%d" % (branch_name, index) | |||||
target = self.get_revision_id_from_hg_nodeid(head) | |||||
snapshot_branches[name] = SnapshotBranch( | |||||
target=target, target_type=TargetType.REVISION | |||||
) | |||||
# If the repo is broken enough or if it has none of the "normal" default | |||||
# mechanisms, we ignore `HEAD`. | |||||
default_branch_alias = branching_info.default_branch_alias | |||||
if default_branch_alias is not None: | |||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | |||||
target=default_branch_alias, target_type=TargetType.ALIAS, | |||||
) | |||||
snapshot = Snapshot(branches=snapshot_branches) | snapshot = Snapshot(branches=snapshot_branches) | ||||
Not Done Inline Actions: Multiple questions here:
olasd: Multiple questions here:
- Is the @ bookmark guaranteed to be the tip of a branch? | |||||
Done Inline Actions
Nope, you can put it anywhere.
Sure, if that differs, should we just not register HEAD? Alphare: > Multiple questions here:
> - Is the @ bookmark guaranteed to be the tip of a branch?
Nope… | |||||
Not Done Inline Actions: Instead of going through a rev (and losing which heuristic we've used to find the default rev), I think the other function should just return the name of the branch or bookmark that we've used to determine the default. olasd: Instead of going through a rev (and losing which heuristic we've used to find the default rev)… | |||||
Done Inline Actions: I'm not sure what you're suggesting. Should hgutil.branching_info return the name of the branch (or bookmark) on which the default revision is found if and only if said revision is a branch tip, allowing us to add an alias to it? Alphare: I'm not sure what you're suggesting. Should `hgutil.branching_info` return the name of the… | |||||
Not Done Inline Actions: Yeah, that's what I would suggest (rather than applying a heuristic in hgutil.branching_info, then having to reverse-engineer it when generating the snapshot). olasd: Yeah, that's what I would suggest (rather than applying a heuristic in hgutil.branching_info… | |||||
self.storage.snapshot_add([snapshot]) | self.storage.snapshot_add([snapshot]) | ||||
self.flush() | self.flush() | ||||
self.loaded_snapshot_id = snapshot.id | self.loaded_snapshot_id = snapshot.id | ||||
def load_status(self) -> Dict[str, str]: | def load_status(self) -> Dict[str, str]: | ||||
"""Detailed loading status. | """Detailed loading status. | ||||
▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines |
I guess it's not required anymore !