Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 211 Lines • ▼ Show 20 Lines | def prepare(self, *args, **kwargs) -> None: | ||||
for branch in latest_snapshot.branches.values() | for branch in latest_snapshot.branches.values() | ||||
if branch.target_type != TargetType.ALIAS | if branch.target_type != TargetType.ALIAS | ||||
] | ] | ||||
revisions = [ | revisions = [ | ||||
revision | revision | ||||
for revision in self.storage.revision_get(snapshot_branches) | for revision in self.storage.revision_get(snapshot_branches) | ||||
if revision | if revision | ||||
] | ] | ||||
self._latest_heads = [ | for revision in revisions: | ||||
hash_to_bytes(revision.metadata["node"]) | if not revision.metadata: | ||||
for revision in revisions | continue | ||||
if revision.metadata | hg_nodeid = hash_to_bytes(revision.metadata["node"]) | ||||
] | self._latest_heads.append(hg_nodeid) | ||||
self._revision_nodeid_to_swhid[hg_nodeid] = revision.id | |||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
"""Fetch the data from the source the loader is currently loading | """Fetch the data from the source the loader is currently loading | ||||
Returns: | Returns: | ||||
a value that is interpreted as a boolean. If True, fetch_data needs | a value that is interpreted as a boolean. If True, fetch_data needs | ||||
to be called again to complete loading. | to be called again to complete loading. | ||||
Show All 27 Lines | def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]: | ||||
try: | try: | ||||
rev = repo[hg_nodeid].rev() | rev = repo[hg_nodeid].rev() | ||||
existing_heads.append(rev) | existing_heads.append(rev) | ||||
except KeyError: # the node does not exists anymore | except KeyError: # the node does not exists anymore | ||||
pass | pass | ||||
# select revisions that are not ancestors of heads | # select revisions that are not ancestors of heads | ||||
# and not the heads themselves | # and not the heads themselves | ||||
new_revs = repo.revs("not ::(%ld)", existing_heads) | revs = repo.revs("not ::(%ld)", existing_heads) | ||||
self.log.info(f"News revisions: {len(revs)}") | |||||
# for now, reload all revisions if there are new commits | return revs | ||||
# otherwise the loader will crash on missing parents | |||||
# incremental loading will come in next commits | |||||
if new_revs: | |||||
return repo.revs("all()") | |||||
else: | |||||
return new_revs | |||||
else: | else: | ||||
return repo.revs("all()") | return repo.revs("all()") | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
if not revs: | if not revs: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | def load_status(self) -> Dict[str, str]: | ||||
Returns: a dictionary that is eventually passed back as the task's | Returns: a dictionary that is eventually passed back as the task's | ||||
result to the scheduler, allowing tuning of the task recurrence | result to the scheduler, allowing tuning of the task recurrence | ||||
mechanism. | mechanism. | ||||
""" | """ | ||||
return { | return { | ||||
"status": self._load_status, | "status": self._load_status, | ||||
} | } | ||||
def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Optional[Sha1Git]:
    """Return the SWH revision id of a revision given its hg nodeid.

    Only revisions stored during the current run, or pre-populated from the
    latest snapshot's heads in ``prepare``, are known to the in-memory
    mapping; any other nodeid resolves to ``None``.

    Args:
        hg_nodeid: the hg nodeid of the revision.

    Returns:
        the Sha1Git id of the revision, or None if the revision is not
        in the in-memory cache.
    """
    # TODO: load from storage once search by hg nodeid is available
    # Need: https://forge.softwareheritage.org/T2849
    return self._revision_nodeid_to_swhid.get(hg_nodeid)
def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
    """Return the SWH revision ids of the parents of a revision.

    Args:
        rev_ctx: the hg revision context whose parents are looked up.

    Returns:
        the Sha1Git ids of the parent revisions.

    Raises:
        ValueError: if a parent revision cannot be resolved, i.e. it was
            neither loaded earlier in the current run nor recovered from
            the latest snapshot (e.g. a revision that could not be loaded
            because of repository corruption).
    """
    parents = []
    for parent_ctx in rev_ctx.parents():
        parent_hg_nodeid = parent_ctx.node()
        # nullid is the value of a parent that does not exist
        if parent_hg_nodeid == hgutil.NULLID:
            continue

        sha1_git = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)
        if sha1_git is None:
            # parent not in storage from a previous run
            # and not previously loaded in the current run
            hg_nodeid = parent_hg_nodeid.hex()
            raise ValueError(f"Cannot load revision with hg nodeid: {hg_nodeid}")

        parents.append(sha1_git)

    return tuple(parents)
def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: | def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: | ||||
"""Store a revision given its hg nodeid. | """Store a revision given its hg nodeid. | ||||
Args: | Args: | ||||
rev_ctx: the he revision context. | rev_ctx: the he revision context. | ||||
▲ Show 20 Lines • Show All 227 Lines • Show Last 20 Lines |
I don't really understand why this is needed. I mean, isn't it just a matter of properly sorting the revisions to add, to ensure parent revs are always handled first? I.e. make sure the ingested revision graph is traversed "properly".
I mean, if this is needed, then there is recursion (store_revision -> get_revision_parents -> store_revision [...]) and there is no guarantee this recursion will stay shallow enough for Python to handle it (-> RecursionError).
So to prevent that, the revision graph must be traversed properly; then this special case (resulting in possible recursion) is not needed anymore.