Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines | |||||
class HgLoaderFromDisk(BaseLoader): | class HgLoaderFromDisk(BaseLoader): | ||||
"""Load a mercurial repository from a local repository.""" | """Load a mercurial repository from a local repository.""" | ||||
CONFIG_BASE_FILENAME = "loader/mercurial" | CONFIG_BASE_FILENAME = "loader/mercurial" | ||||
visit_type = "hg" | visit_type = "hg" | ||||
olasd: I guess it's not required anymore! | |||||
def __init__( | def __init__( | ||||
self, | self, | ||||
Done Inline Actions — I see that "tipmost head" is a Mercurial concept readily referenced in its documentation. If you can describe it in a few words, it would help make the description more generally understandable. olasd: I see that "tipmost head" is a Mercurial concept readily referenced in its documentation. If… | |||||
storage: StorageInterface, | storage: StorageInterface, | ||||
Done Inline Actions — extra space. olasd: extra space | |||||
url: str, | url: str, | ||||
directory: Optional[str] = None, | directory: Optional[str] = None, | ||||
logging_class: str = "swh.loader.mercurial.LoaderFromDisk", | logging_class: str = "swh.loader.mercurial.LoaderFromDisk", | ||||
visit_date: Optional[datetime] = None, | visit_date: Optional[datetime] = None, | ||||
temp_directory: str = "/tmp", | temp_directory: str = "/tmp", | ||||
clone_timeout_seconds: int = 7200, | clone_timeout_seconds: int = 7200, | ||||
content_cache_size: int = 10_000, | content_cache_size: int = 10_000, | ||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
): | ): | ||||
"""Initialize the loader. | """Initialize the loader. | ||||
Args: | Args: | ||||
url: url of the repository. | url: url of the repository. | ||||
directory: directory of the local repository. | directory: directory of the local repository. | ||||
Done Inline Actions — "To reduce the redundancy between snapshot branches in the most common case, when a branch has a single open head, it will only be referenced as branch-tip/<branch-name>. The branch-heads/ hierarchy only appears when a branch has multiple open heads, which we consistently sort by increasing nodeid. The branch-closed-heads/ hierarchy is also sorted by increasing nodeid." olasd: "To reduce the redundancy between snapshot branches in the most common case, when a branch has… | |||||
logging_class: class of the loader logger. | logging_class: class of the loader logger. | ||||
visit_date: visit date of the repository | visit_date: visit date of the repository | ||||
config: loader configuration | config: loader configuration | ||||
""" | """ | ||||
super().__init__( | super().__init__( | ||||
storage=storage, | storage=storage, | ||||
logging_class=logging_class, | logging_class=logging_class, | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines | def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]: | ||||
existing_heads = [] # heads that still exist in the repository | existing_heads = [] # heads that still exist in the repository | ||||
for hg_nodeid in self._latest_heads: | for hg_nodeid in self._latest_heads: | ||||
try: | try: | ||||
rev = repo[hg_nodeid].rev() | rev = repo[hg_nodeid].rev() | ||||
existing_heads.append(rev) | existing_heads.append(rev) | ||||
except KeyError: # the node does not exist anymore | except KeyError: # the node does not exist anymore | ||||
pass | pass | ||||
# Mercurial can have more than one head per branch, so we need to exclude | |||||
# local heads that have already been loaded as revisions but don't | |||||
# correspond to a SnapshotBranch. | |||||
# In the future, if the SnapshotBranch model evolves to support multiple | |||||
# heads per branch (or anything else that fixes this issue) this might | |||||
# become useless. | |||||
extids = self.storage.extid_get_from_extid(EXTID_TYPE, repo.heads()) | |||||
known_heads = {extid.extid for extid in extids} | |||||
existing_heads.extend([repo[head].rev() for head in known_heads]) | |||||
# select revisions that are not ancestors of heads | # select revisions that are not ancestors of heads | ||||
# and not the heads themselves | # and not the heads themselves | ||||
new_revs = repo.revs("not ::(%ld)", existing_heads) | new_revs = repo.revs("not ::(%ld)", existing_heads) | ||||
if new_revs: | if new_revs: | ||||
self.log.info("New revisions found: %d", len(new_revs)) | self.log.info("New revisions found: %d", len(new_revs)) | ||||
return new_revs | return new_revs | ||||
else: | else: | ||||
return repo.revs("all()") | return repo.revs("all()") | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
if not revs: | if not revs: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
return | return | ||||
assert self._repo is not None | assert self._repo is not None | ||||
repo = self._repo | repo = self._repo | ||||
blacklisted_revs: List[int] = [] | blacklisted_revs: Set[int] = set() | ||||
for rev in revs: | for rev in revs: | ||||
if rev in blacklisted_revs: | if rev in blacklisted_revs: | ||||
continue | continue | ||||
try: | try: | ||||
self.store_revision(repo[rev]) | self.store_revision(repo[rev]) | ||||
except CorruptedRevision as e: | except CorruptedRevision as e: | ||||
self._visit_status = "partial" | self._visit_status = "partial" | ||||
self.log.warning("Corrupted revision %s", e) | self.log.warning("Corrupted revision %s", e) | ||||
descendents = repo.revs("(%ld)::", [rev]) | descendents = repo.revs("(%ld)::", [rev]) | ||||
blacklisted_revs.extend(descendents) | blacklisted_revs.update(descendents) | ||||
branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { | if len(blacklisted_revs) == len(revs): | ||||
hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() | # The repository is completely broken, nothing can be loaded | ||||
} | self._load_status = "uneventful" | ||||
return | |||||
tips, heads, closed_heads, bookmarks = hgutil.branches_info( | |||||
repo, blacklisted_revs | |||||
) | |||||
tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | ||||
Done Inline Actions — I wouldn't mind `blacklisted_revs` being renamed to `ignored_revs`. olasd: I wouldn't mind `blacklisted_revs` to be renamed to `ignored_revs` | |||||
snapshot_branches: Dict[bytes, SnapshotBranch] = {} | snapshot_branches: Dict[bytes, SnapshotBranch] = {} | ||||
for tag_name, hg_nodeid in tags_by_name.items(): | for tag_name, hg_nodeid in tags_by_name.items(): | ||||
if tag_name == b"tip": | if tag_name == b"tip": | ||||
# tip is listed in the tags by the mercurial api | # `tip` is listed in the tags by the Mercurial API but its not a tag | ||||
# but its not a tag defined by the user in `.hgtags` | # defined by the user in `.hgtags`. | ||||
continue | continue | ||||
if hg_nodeid not in self._saved_tags: | if hg_nodeid not in self._saved_tags: | ||||
revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid) | target = self.get_revision_id_from_hg_nodeid(hg_nodeid) | ||||
snapshot_branches[tag_name] = SnapshotBranch( | snapshot_branches[tag_name] = SnapshotBranch( | ||||
target=self.store_release(tag_name, revision_sha1git), | target=self.store_release(tag_name, target), | ||||
target_type=TargetType.RELEASE, | target_type=TargetType.RELEASE, | ||||
) | ) | ||||
for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items(): | for branch_name, node_id in tips.items(): | ||||
if hg_nodeid in branch_by_hg_nodeid: | name = b"refs/hg/branch-tip/%s" % branch_name | ||||
name = branch_by_hg_nodeid[hg_nodeid] | target = self.get_revision_id_from_hg_nodeid(node_id) | ||||
snapshot_branches[name] = SnapshotBranch( | snapshot_branches[name] = SnapshotBranch( | ||||
target=revision_sha1git, target_type=TargetType.REVISION, | target=target, target_type=TargetType.REVISION | ||||
) | ) | ||||
# The tip is mapped to `HEAD` to match | for bookmark_name, node_id in bookmarks.items(): | ||||
# the historical implementation | name = b"refs/hg/bookmarks/%s" % bookmark_name | ||||
if hg_nodeid == tags_by_name[b"tip"]: | target = self.get_revision_id_from_hg_nodeid(node_id) | ||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | snapshot_branches[name] = SnapshotBranch( | ||||
target=name, target_type=TargetType.ALIAS, | target=target, target_type=TargetType.REVISION | ||||
) | ) | ||||
for branch_name, branch_heads in heads.items(): | |||||
for index, head in enumerate(branch_heads): | |||||
Done Inline Actions — Doesn't bytes formatting support `%d`? olasd: Doesn't bytes formatting support `%d`? | |||||
index = str(index).encode() | |||||
name = b"refs/hg/branch-heads/%s/%s" % (branch_name, index) | |||||
target = self.get_revision_id_from_hg_nodeid(head) | |||||
snapshot_branches[name] = SnapshotBranch( | |||||
target=target, target_type=TargetType.REVISION | |||||
) | |||||
for branch_name, closed_heads in closed_heads.items(): | |||||
for index, head in enumerate(closed_heads): | |||||
index = str(index).encode() | |||||
name = b"refs/hg/branch-closed-heads/%s/%s" % (branch_name, index) | |||||
target = self.get_revision_id_from_hg_nodeid(head) | |||||
snapshot_branches[name] = SnapshotBranch( | |||||
target=target, target_type=TargetType.REVISION | |||||
) | |||||
# `tip` is mapped to `HEAD` to match the historical implementation. | |||||
tip_node_id = tags_by_name[b"tip"] | |||||
branch_name = repo[tip_node_id].branch() | |||||
target = b"refs/hg/branch-tip/%s" % branch_name | |||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | |||||
target=target, target_type=TargetType.ALIAS, | |||||
) | |||||
snapshot = Snapshot(branches=snapshot_branches) | snapshot = Snapshot(branches=snapshot_branches) | ||||
Not Done Inline Actions — Multiple questions here:
olasd: Multiple questions here:
- Is the `@` bookmark guaranteed to be the tip of a branch? | |||||
Done Inline Actions
Nope, you can put it anywhere.
Sure, if that differs, should we just not register HEAD? Alphare: > Multiple questions here:
> - Is the @ bookmark guaranteed to be the tip of a branch?
Nope… | |||||
Not Done Inline Actions — Instead of going through a rev (and losing which heuristic we've used to find the default rev), I think the other function should just return the name of the branch or bookmark that we've used to determine the default. olasd: Instead of going through a rev (and losing which heuristic we've used to find the default rev)… | |||||
Done Inline Actions — I'm not sure what you're suggesting. Should `hgutil.branching_info` return the name of the branch (or bookmark) on which the default revision is found if and only if said revision is a branch tip, allowing us to add an alias to it? Alphare: I'm not sure what you're suggesting. Should `hgutil.branching_info` return the name of the… | |||||
Not Done Inline Actions — Yeah, that's what I would suggest (rather than applying a heuristic in `hgutil.branching_info`, then having to reverse-engineer it when generating the snapshot). olasd: Yeah, that's what I would suggest (rather than applying a heuristic in hgutil.branching_info… | |||||
self.storage.snapshot_add([snapshot]) | self.storage.snapshot_add([snapshot]) | ||||
self.flush() | self.flush() | ||||
self.loaded_snapshot_id = snapshot.id | self.loaded_snapshot_id = snapshot.id | ||||
def load_status(self) -> Dict[str, str]: | def load_status(self) -> Dict[str, str]: | ||||
"""Detailed loading status. | """Detailed loading status. | ||||
▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines |
I guess it's not required anymore!