Differential D5816 Diff 20762 swh/loader/mercurial/from_disk.py

Changeset View

Standalone View

swh/loader/mercurial/from_disk.py

Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines


class HgLoaderFromDisk(BaseLoader):		class HgLoaderFromDisk(BaseLoader):
"""Load a mercurial repository from a local repository."""		"""Load a mercurial repository from a local repository."""

CONFIG_BASE_FILENAME = "loader/mercurial"		CONFIG_BASE_FILENAME = "loader/mercurial"

visit_type = "hg"		visit_type = "hg"

		olasd (Unsubmitted, Not Done): I guess it's not required anymore!
def __init__(		def __init__(
self,		self,
		olasd (Unsubmitted, Done): I see that "tipmost head" is a Mercurial concept readily referenced in its documentation. If you can describe it in a few words, it would help make the description more generally understandable.
storage: StorageInterface,		storage: StorageInterface,
		olasd (Unsubmitted, Done): extra space
url: str,		url: str,
directory: Optional[str] = None,		directory: Optional[str] = None,
logging_class: str = "swh.loader.mercurial.LoaderFromDisk",		logging_class: str = "swh.loader.mercurial.LoaderFromDisk",
visit_date: Optional[datetime] = None,		visit_date: Optional[datetime] = None,
temp_directory: str = "/tmp",		temp_directory: str = "/tmp",
clone_timeout_seconds: int = 7200,		clone_timeout_seconds: int = 7200,
content_cache_size: int = 10_000,		content_cache_size: int = 10_000,
max_content_size: Optional[int] = None,		max_content_size: Optional[int] = None,
):		):
"""Initialize the loader.		"""Initialize the loader.

Args:		Args:
url: url of the repository.		url: url of the repository.
directory: directory of the local repository.		directory: directory of the local repository.
		olasd (Unsubmitted, Done): "To reduce the redundancy between snapshot branches in the most common case, when a branch has a single open head, it will only be referenced as `branch-tip/<branch-name>`. The `branch-heads/` hierarchy only appears when a branch has multiple open heads, which we consistently sort by increasing nodeid. The `branch-closed-heads/` hierarchy is also sorted by increasing nodeid."
logging_class: class of the loader logger.		logging_class: class of the loader logger.
visit_date: visit date of the repository		visit_date: visit date of the repository
config: loader configuration		config: loader configuration
"""		"""
super().__init__(		super().__init__(
storage=storage,		storage=storage,
logging_class=logging_class,		logging_class=logging_class,
max_content_size=max_content_size,		max_content_size=max_content_size,
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
existing_heads = [] # heads that still exist in the repository		existing_heads = [] # heads that still exist in the repository
for hg_nodeid in self._latest_heads:		for hg_nodeid in self._latest_heads:
try:		try:
rev = repo[hg_nodeid].rev()		rev = repo[hg_nodeid].rev()
existing_heads.append(rev)		existing_heads.append(rev)
except KeyError: # the node does not exist anymore		except KeyError: # the node does not exist anymore
pass		pass

# Mercurial can have more than one head per branch, so we need to exclude
# local heads that have already been loaded as revisions but don't
# correspond to a SnapshotBranch.
# In the future, if the SnapshotBranch model evolves to support multiple
# heads per branch (or anything else that fixes this issue) this might
# become useless.
extids = self.storage.extid_get_from_extid(EXTID_TYPE, repo.heads())
known_heads = {extid.extid for extid in extids}
existing_heads.extend([repo[head].rev() for head in known_heads])
# select revisions that are not ancestors of heads		# select revisions that are not ancestors of heads
# and not the heads themselves		# and not the heads themselves
new_revs = repo.revs("not ::(%ld)", existing_heads)		new_revs = repo.revs("not ::(%ld)", existing_heads)

if new_revs:		if new_revs:
self.log.info("New revisions found: %d", len(new_revs))		self.log.info("New revisions found: %d", len(new_revs))
return new_revs		return new_revs
else:		else:
return repo.revs("all()")		return repo.revs("all()")

def store_data(self):		def store_data(self):
"""Store fetched data in the database."""		"""Store fetched data in the database."""
revs = self.get_hg_revs_to_load()		revs = self.get_hg_revs_to_load()
if not revs:		if not revs:
self._load_status = "uneventful"		self._load_status = "uneventful"
return		return

assert self._repo is not None		assert self._repo is not None
repo = self._repo		repo = self._repo

blacklisted_revs: List[int] = []		blacklisted_revs: Set[int] = set()
for rev in revs:		for rev in revs:
if rev in blacklisted_revs:		if rev in blacklisted_revs:
continue		continue
try:		try:
self.store_revision(repo[rev])		self.store_revision(repo[rev])
except CorruptedRevision as e:		except CorruptedRevision as e:
self._visit_status = "partial"		self._visit_status = "partial"
self.log.warning("Corrupted revision %s", e)		self.log.warning("Corrupted revision %s", e)
descendents = repo.revs("(%ld)::", [rev])		descendents = repo.revs("(%ld)::", [rev])
blacklisted_revs.extend(descendents)		blacklisted_revs.update(descendents)

branch_by_hg_nodeid: Dict[HgNodeId, bytes] = {		if len(blacklisted_revs) == len(revs):
hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items()		# The repository is completely broken, nothing can be loaded
}		self._load_status = "uneventful"
		return
		tips, heads, closed_heads, bookmarks = hgutil.branches_info(
		repo, blacklisted_revs
		)
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()		tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
		olasd (Unsubmitted, Done): I wouldn't mind `blacklisted_revs` being renamed to `ignored_revs`.

snapshot_branches: Dict[bytes, SnapshotBranch] = {}		snapshot_branches: Dict[bytes, SnapshotBranch] = {}

for tag_name, hg_nodeid in tags_by_name.items():		for tag_name, hg_nodeid in tags_by_name.items():
if tag_name == b"tip":		if tag_name == b"tip":
# tip is listed in the tags by the mercurial api		# `tip` is listed in the tags by the Mercurial API but its not a tag
# but its not a tag defined by the user in `.hgtags`		# defined by the user in `.hgtags`.
continue		continue
if hg_nodeid not in self._saved_tags:		if hg_nodeid not in self._saved_tags:
revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid)		target = self.get_revision_id_from_hg_nodeid(hg_nodeid)
snapshot_branches[tag_name] = SnapshotBranch(		snapshot_branches[tag_name] = SnapshotBranch(
target=self.store_release(tag_name, revision_sha1git),		target=self.store_release(tag_name, target),
target_type=TargetType.RELEASE,		target_type=TargetType.RELEASE,
)		)

for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items():		for branch_name, node_id in tips.items():
if hg_nodeid in branch_by_hg_nodeid:		name = b"refs/hg/branch-tip/%s" % branch_name
name = branch_by_hg_nodeid[hg_nodeid]		target = self.get_revision_id_from_hg_nodeid(node_id)
snapshot_branches[name] = SnapshotBranch(		snapshot_branches[name] = SnapshotBranch(
target=revision_sha1git, target_type=TargetType.REVISION,		target=target, target_type=TargetType.REVISION
)		)

# The tip is mapped to `HEAD` to match		for bookmark_name, node_id in bookmarks.items():
# the historical implementation		name = b"refs/hg/bookmarks/%s" % bookmark_name
if hg_nodeid == tags_by_name[b"tip"]:		target = self.get_revision_id_from_hg_nodeid(node_id)
snapshot_branches[b"HEAD"] = SnapshotBranch(		snapshot_branches[name] = SnapshotBranch(
target=name, target_type=TargetType.ALIAS,		target=target, target_type=TargetType.REVISION
)		)

		for branch_name, branch_heads in heads.items():
		for index, head in enumerate(branch_heads):
		olasd (Unsubmitted, Done): Doesn't bytes formatting support `%d`?
		index = str(index).encode()
		name = b"refs/hg/branch-heads/%s/%s" % (branch_name, index)
		target = self.get_revision_id_from_hg_nodeid(head)
		snapshot_branches[name] = SnapshotBranch(
		target=target, target_type=TargetType.REVISION
		)

		for branch_name, closed_heads in closed_heads.items():
		for index, head in enumerate(closed_heads):
		index = str(index).encode()
		name = b"refs/hg/branch-closed-heads/%s/%s" % (branch_name, index)
		target = self.get_revision_id_from_hg_nodeid(head)
		snapshot_branches[name] = SnapshotBranch(
		target=target, target_type=TargetType.REVISION
		)

		# `tip` is mapped to `HEAD` to match the historical implementation.
		tip_node_id = tags_by_name[b"tip"]
		branch_name = repo[tip_node_id].branch()
		target = b"refs/hg/branch-tip/%s" % branch_name
		snapshot_branches[b"HEAD"] = SnapshotBranch(
		target=target, target_type=TargetType.ALIAS,
		)
snapshot = Snapshot(branches=snapshot_branches)		snapshot = Snapshot(branches=snapshot_branches)
		olasd (Unsubmitted, Not Done): Multiple questions here: (1) Is the `@` bookmark guaranteed to be the tip of a branch? (2) Shouldn't we check that default_rev is indeed the tip of the branch that we're considering?
		Alphare (Author, Unsubmitted, Done): Re "Is the `@` bookmark guaranteed to be the tip of a branch?": Nope, you can put it anywhere. Re "Shouldn't we check that default_rev is indeed the tip of the branch that we're considering?": Sure; if that differs, should we just not register `HEAD`?
		olasd (Unsubmitted, Not Done): Instead of going through a rev (and losing which heuristic we've used to find the default rev), I think the other function should just return the name of the branch or bookmark that we've used to determine the default.
		Alphare (Author, Unsubmitted, Done): I'm not sure what you're suggesting. Should `hgutil.branching_info` return the name of the branch (or bookmark) on which the default revision is found if and only if said revision is a branch tip, allowing us to add an alias to it?
		olasd (Unsubmitted, Not Done): Yeah, that's what I would suggest (rather than applying a heuristic in hgutil.branching_info, then having to reverse engineer it when generating the snapshot).
self.storage.snapshot_add([snapshot])		self.storage.snapshot_add([snapshot])

self.flush()		self.flush()
self.loaded_snapshot_id = snapshot.id		self.loaded_snapshot_id = snapshot.id

def load_status(self) -> Dict[str, str]:		def load_status(self) -> Dict[str, str]:
"""Detailed loading status.		"""Detailed loading status.

▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines