Changeset View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 211 Lines • ▼ Show 20 Lines | def prepare(self, *args, **kwargs) -> None: | ||||
for branch in latest_snapshot.branches.values() | for branch in latest_snapshot.branches.values() | ||||
if branch.target_type != TargetType.ALIAS | if branch.target_type != TargetType.ALIAS | ||||
] | ] | ||||
revisions = [ | revisions = [ | ||||
revision | revision | ||||
for revision in self.storage.revision_get(snapshot_branches) | for revision in self.storage.revision_get(snapshot_branches) | ||||
if revision | if revision | ||||
] | ] | ||||
self._latest_heads = [ | for revision in revisions: | ||||
hash_to_bytes(revision.metadata["node"]) | if not revision.metadata: | ||||
for revision in revisions | continue | ||||
if revision.metadata | hg_nodeid = hash_to_bytes(revision.metadata["node"]) | ||||
] | self._latest_heads.append(hg_nodeid) | ||||
self._revision_nodeid_to_swhid[hg_nodeid] = revision.id | |||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
"""Fetch the data from the source the loader is currently loading | """Fetch the data from the source the loader is currently loading | ||||
Returns: | Returns: | ||||
a value that is interpreted as a boolean. If True, fetch_data needs | a value that is interpreted as a boolean. If True, fetch_data needs | ||||
to be called again to complete loading. | to be called again to complete loading. | ||||
Show All 27 Lines | def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]: | ||||
try: | try: | ||||
rev = repo[hg_nodeid].rev() | rev = repo[hg_nodeid].rev() | ||||
existing_heads.append(rev) | existing_heads.append(rev) | ||||
except KeyError: # the node does not exists anymore | except KeyError: # the node does not exists anymore | ||||
pass | pass | ||||
# select revisions that are not ancestors of heads | # select revisions that are not ancestors of heads | ||||
# and not the heads themselves | # and not the heads themselves | ||||
new_revs = repo.revs("not ::(%ld)", existing_heads) | revs = repo.revs("not ::(%ld)", existing_heads) | ||||
self.log.info(f"News revisions: {len(revs)}") | |||||
# for now, reload all revisions if there are new commits | return revs | ||||
# otherwise the loader will crash on missing parents | |||||
# incremental loading will come in next commits | |||||
if new_revs: | |||||
return repo.revs("all()") | |||||
else: | |||||
return new_revs | |||||
else: | else: | ||||
return repo.revs("all()") | return repo.revs("all()") | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
if not revs: | if not revs: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | def load_status(self) -> Dict[str, str]: | ||||
Returns: a dictionary that is eventually passed back as the task's | Returns: a dictionary that is eventually passed back as the task's | ||||
result to the scheduler, allowing tuning of the task recurrence | result to the scheduler, allowing tuning of the task recurrence | ||||
mechanism. | mechanism. | ||||
""" | """ | ||||
return { | return { | ||||
"status": self._load_status, | "status": self._load_status, | ||||
} | } | ||||
def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Optional[Sha1Git]:
    """Return the SWH revision id of a revision given its hg nodeid.

    Only revisions stored during the current run, or pre-populated from the
    latest snapshot's heads in ``prepare``, are known to the in-memory
    mapping; any other nodeid resolves to ``None``.

    Args:
        hg_nodeid: the hg nodeid of the revision.

    Returns:
        the Sha1Git id of the revision, or None if the revision is not
        in the in-memory cache.
    """
    # TODO: load from storage once search by hg nodeid is available
    # Need: https://forge.softwareheritage.org/T2849
    return self._revision_nodeid_to_swhid.get(hg_nodeid)
def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
    """Return the SWH revision ids of the parents of a revision.

    Args:
        rev_ctx: the hg revision context whose parents are looked up.

    Returns:
        the Sha1Git ids of the parent revisions.

    Raises:
        ValueError: if a parent revision cannot be resolved, i.e. it was
            neither loaded earlier in the current run nor recovered from
            the latest snapshot (e.g. a revision that could not be loaded
            because of repository corruption).
    """
    parents = []
    for parent_ctx in rev_ctx.parents():
        parent_hg_nodeid = parent_ctx.node()
        # nullid is the value of a parent that does not exist
        if parent_hg_nodeid == hgutil.NULLID:
            continue

        sha1_git = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)
        if sha1_git is None:
            # parent not in storage from a previous run
            # and not previously loaded in the current run
            hg_nodeid = parent_hg_nodeid.hex()
            raise ValueError(f"Cannot load revision with hg nodeid: {hg_nodeid}")

        parents.append(sha1_git)

    return tuple(parents)
def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: | def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: | ||||
"""Store a revision given its hg nodeid. | """Store a revision given its hg nodeid. | ||||
Args: | Args: | ||||
rev_ctx: the he revision context. | rev_ctx: the he revision context. | ||||
▲ Show 20 Lines • Show All 227 Lines • Show Last 20 Lines |
I don't really understand why this is needed. I mean, isn't it just a matter of properly sorting the revisions to add, to ensure parent revs are always handled first? I.e. make sure the ingested revision graph is traversed "properly".
I mean, if this is needed, then there is recursion (store_revision -> get_revision_parents -> store_revision [...]) and there is no guarantee this recursion will stay shallow enough for Python to handle it (-> RecursionError).
So to prevent that, the revision graph must be traversed properly; then this special case (resulting in possible recursion) is not needed anymore.