Differential D6268 Diff 22723 swh/loader/mercurial/from_disk.py

Changeset View

Standalone View

swh/loader/mercurial/from_disk.py

Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	):

# Cache the content hash across revisions to avoid recalculation.		# Cache the content hash across revisions to avoid recalculation.
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(		self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
content_cache_size,		content_cache_size,
)		)

# hg node id of the latest snapshot branch heads		# hg node id of the latest snapshot branch heads
# used to find what are the new revisions since last snapshot		# used to find what are the new revisions since last snapshot
self._latest_heads: List[bytes] = []		self._latest_heads: List[HgNodeId] = []
# hg node ids of all the tags recorded on previous runs		# hg node ids of all the tags recorded on previous runs
# Used to compute which tags need to be added, even across incremental runs		# Used to compute which tags need to be added, even across incremental runs
# that might separate a changeset introducing a tag from its target.		# that might separate a changeset introducing a tag from its target.
self._saved_tags: Set[bytes] = set()		self._saved_tags: Set[HgNodeId] = set()

self._load_status = "eventful"		self._load_status = "eventful"
# If set, will override the default value		# If set, will override the default value
self._visit_status = None		self._visit_status = None

def pre_cleanup(self) -> None:		def pre_cleanup(self) -> None:
"""As a first step, will try and check for dangling data to cleanup.		"""As a first step, will try and check for dangling data to cleanup.
This should do its best to avoid raising issues.		This should do its best to avoid raising issues.
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:

for branch in latest_snapshot.branches.values():		for branch in latest_snapshot.branches.values():
if branch.target_type == TargetType.REVISION:		if branch.target_type == TargetType.REVISION:
heads.append(branch.target)		heads.append(branch.target)
elif branch.target_type == TargetType.RELEASE:		elif branch.target_type == TargetType.RELEASE:
tags.append(branch.target)		tags.append(branch.target)

self._latest_heads.extend(		self._latest_heads.extend(
extid.extid for extid in self._get_extids_for_targets(heads)		HgNodeId(extid.extid) for extid in self._get_extids_for_targets(heads)
)		)
self._saved_tags.update(		self._saved_tags.update(
extid.extid for extid in self._get_extids_for_targets(tags)		HgNodeId(extid.extid) for extid in self._get_extids_for_targets(tags)
)		)

def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:		def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the targets in the latest snapshot"""		"""Get all Mercurial ExtIDs for the targets in the latest snapshot"""
extids = [		extids = []
extid
for extid in self.storage.extid_get_from_target(		for extid in self.storage.extid_get_from_target(
identifiers.ObjectType.REVISION, targets		identifiers.ObjectType.REVISION, targets
)		):
if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION		if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION:
]		continue
		extids.append(extid)
		self._revision_nodeid_to_sha1git[
		HgNodeId(extid.extid)
		] = extid.target.object_id

		olasdUnsubmitted Done Inline Actions `targets` are swhids/revision ids, not `HgNodeId`s. olasd: `targets` are swhids/revision ids, not `HgNodeId`s.
		ardumontAuthorUnsubmitted Done Inline Actions Right, i fixed the signature, thanks. ardumont: Right, i fixed the signature, thanks.
if extids:		if extids:
# Filter out dangling extids, we need to load their target again		# Filter out dangling extids, we need to load their target again
revisions_missing = self.storage.revision_missing(		revisions_missing = self.storage.revision_missing(
[extid.target.object_id for extid in extids]		[extid.target.object_id for extid in extids]
)		)
extids = [		extids = [
extid		extid
for extid in extids		for extid in extids
if extid.target.object_id not in revisions_missing		if extid.target.object_id not in revisions_missing
]		]
return extids		return extids

def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:		def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]:
"""Get all Mercurial ExtIDs for the mercurial nodes in the list which point to		"""Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
a known revision.		a known revision.

"""		"""
extids = []		extids = []

for group_ids in grouper(hgnode_ids, n=1000):		for group_ids in grouper(hgnode_ids, n=1000):
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids):		for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids):
if extid.extid_version != EXTID_VERSION:		if extid.extid_version != EXTID_VERSION:
continue		continue
extids.append(extid)		extids.append(extid)
		self._revision_nodeid_to_sha1git[
		HgNodeId(extid.extid)
		] = extid.target.object_id

if extids:		if extids:
# Filter out dangling extids, we need to load their target again		# Filter out dangling extids, we need to load their target again
revisions_missing = self.storage.revision_missing(		revisions_missing = self.storage.revision_missing(
[extid.target.object_id for extid in extids]		[extid.target.object_id for extid in extids]
)		)
extids = [		extids = [
extid		extid
Show All 25 Lines	def fetch_data(self) -> bool:
# Allow to load on disk repository without cloning		# Allow to load on disk repository without cloning
# for testing purpose.		# for testing purpose.
self._repo_directory = self.directory		self._repo_directory = self.directory

self._repo = hgutil.repository(self._repo_directory)		self._repo = hgutil.repository(self._repo_directory)

return False		return False

def _new_revs(self, heads: List[bytes]):		def _new_revs(self, heads: List[HgNodeId]):
"""Return unseen revisions. That is, filter out revisions that are not ancestors of		"""Return unseen revisions. That is, filter out revisions that are not ancestors of
heads"""		heads"""
assert self._repo is not None		assert self._repo is not None
existing_heads = []		existing_heads = []

for hg_nodeid in heads:		for hg_nodeid in heads:
try:		try:
rev = self._repo[hg_nodeid].rev()		rev = self._repo[hg_nodeid].rev()
Show All 24 Lines	def get_hg_revs_to_load(self) -> Iterator[int]:
seen_revs.add(rev)		seen_revs.add(rev)
yield rev		yield rev

# 2. Then filter out remaining revisions through the overall extid mappings		# 2. Then filter out remaining revisions through the overall extid mappings
# across hg origins		# across hg origins
revs_left = repo.revs("all() - ::(%ld)", seen_revs)		revs_left = repo.revs("all() - ::(%ld)", seen_revs)
hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]		hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
for rev in self._new_revs(		for rev in self._new_revs(
[extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]		[
		HgNodeId(extid.extid)
		for extid in self._get_extids_for_hgnodes(hg_nodeids)
		]
):		):
yield rev		yield rev

def store_data(self):		def store_data(self):
"""Store fetched data in the database."""		"""Store fetched data in the database."""
revs = self.get_hg_revs_to_load()		revs = self.get_hg_revs_to_load()
length_ingested_revs = 0		length_ingested_revs = 0

Show All 9 Lines	def store_data(self):
length_ingested_revs += 1		length_ingested_revs += 1
except CorruptedRevision as e:		except CorruptedRevision as e:
self._visit_status = "partial"		self._visit_status = "partial"
self.log.warning("Corrupted revision %s", e)		self.log.warning("Corrupted revision %s", e)
descendents = repo.revs("(%ld)::", [rev])		descendents = repo.revs("(%ld)::", [rev])
ignored_revs.update(descendents)		ignored_revs.update(descendents)

if length_ingested_revs == 0:		if length_ingested_revs == 0:
# The repository has nothing to ingest (either empty or broken repository)		# no new revision ingested, so uneventful
		# still we'll make a snapshot, so we continue
self._load_status = "uneventful"		self._load_status = "uneventful"
return

branching_info = hgutil.branching_info(repo, ignored_revs)		branching_info = hgutil.branching_info(repo, ignored_revs)
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()		tags_by_name: Dict[bytes, HgNodeId] = repo.tags()

snapshot_branches: Dict[bytes, SnapshotBranch] = {}		snapshot_branches: Dict[bytes, SnapshotBranch] = {}

for tag_name, hg_nodeid in tags_by_name.items():		for tag_name, hg_nodeid in tags_by_name.items():
if tag_name == b"tip":		if tag_name == b"tip":
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	def store_data(self):
# If the repo is broken enough or if it has none of the "normal" default		# If the repo is broken enough or if it has none of the "normal" default
# mechanisms, we ignore `HEAD`.		# mechanisms, we ignore `HEAD`.
default_branch_alias = branching_info.default_branch_alias		default_branch_alias = branching_info.default_branch_alias
if default_branch_alias is not None:		if default_branch_alias is not None:
snapshot_branches[b"HEAD"] = SnapshotBranch(		snapshot_branches[b"HEAD"] = SnapshotBranch(
target=default_branch_alias, target_type=TargetType.ALIAS,		target=default_branch_alias, target_type=TargetType.ALIAS,
)		)
snapshot = Snapshot(branches=snapshot_branches)		snapshot = Snapshot(branches=snapshot_branches)
self.storage.snapshot_add([snapshot])		self.storage.snapshot_add([snapshot])

		olasdUnsubmitted Done Inline Actions This should happen regardless of whether there is a default branch alias or not. olasd: This should happen regardless of whether there is a default branch alias or not.
		ardumontAuthorUnsubmitted Done Inline Actions oh that got indented, the f*? ardumont: oh that got indented, the f*?
self.flush()		self.flush()
self.loaded_snapshot_id = snapshot.id		self.loaded_snapshot_id = snapshot.id

def load_status(self) -> Dict[str, str]:		def load_status(self) -> Dict[str, str]:
"""Detailed loading status.		"""Detailed loading status.

Defaults to logging an eventful load.		Defaults to logging an eventful load.

Show All 27 Lines	def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git:
# The parent was not loaded in this run, get it from storage		# The parent was not loaded in this run, get it from storage
from_storage = [		from_storage = [
extid		extid
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid])		for extid in self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid])
if extid.extid_version == EXTID_VERSION		if extid.extid_version == EXTID_VERSION
]		]

msg = "Expected 1 match from storage for hg node %r, got %d"		msg = "Expected 1 match from storage for hg node %r, got %d"
assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage))		assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage))
		ardumontAuthorUnsubmitted Done Inline Actions that's actually better in bytes to debug but i can revert it. ardumont: that's actually better in bytes to debug but i can revert it.
		olasdUnsubmitted Done Inline Actions Please revert it to keep it human-readable. One can use `bytes.fromhex()` to revert it. olasd: Please revert it to keep it human-readable. One can use `bytes.fromhex()` to revert it.
		ardumontAuthorUnsubmitted Done Inline Actions Yes, ok (i knew but that's annoying to do ;). Reverting. ardumont: Yes, ok (i knew but that's annoying to do ;). Reverting.
return from_storage[0].target.object_id		return from_storage[0].target.object_id

def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:		def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
"""Return the git sha1 of the parent revisions.		"""Return the git sha1 of the parent revisions.

Args:		Args:
hg_nodeid: the hg nodeid of the revision.		hg_nodeid: the hg nodeid of the revision.

▲ Show 20 Lines • Show All 279 Lines • Show Last 20 Lines