Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines | ): | ||||
# Cache the content hash across revisions to avoid recalculation. | # Cache the content hash across revisions to avoid recalculation. | ||||
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( | ||||
content_cache_size, | content_cache_size, | ||||
) | ) | ||||
# hg node id of the latest snapshot branch heads | # hg node id of the latest snapshot branch heads | ||||
# used to find what are the new revisions since last snapshot | # used to find what are the new revisions since last snapshot | ||||
self._latest_heads: List[bytes] = [] | self._latest_heads: List[HgNodeId] = [] | ||||
# hg node ids of all the tags recorded on previous runs | # hg node ids of all the tags recorded on previous runs | ||||
# Used to compute which tags need to be added, even across incremental runs | # Used to compute which tags need to be added, even across incremental runs | ||||
# that might separate a changeset introducing a tag from its target. | # that might separate a changeset introducing a tag from its target. | ||||
self._saved_tags: Set[bytes] = set() | self._saved_tags: Set[HgNodeId] = set() | ||||
self._load_status = "eventful" | self._load_status = "eventful" | ||||
# If set, will override the default value | # If set, will override the default value | ||||
self._visit_status = None | self._visit_status = None | ||||
def pre_cleanup(self) -> None: | def pre_cleanup(self) -> None: | ||||
"""As a first step, will try and check for dangling data to cleanup. | """As a first step, will try and check for dangling data to cleanup. | ||||
This should do its best to avoid raising issues. | This should do its best to avoid raising issues. | ||||
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines | def _set_recorded_state(self, latest_snapshot: Snapshot) -> None: | ||||
for branch in latest_snapshot.branches.values(): | for branch in latest_snapshot.branches.values(): | ||||
if branch.target_type == TargetType.REVISION: | if branch.target_type == TargetType.REVISION: | ||||
heads.append(branch.target) | heads.append(branch.target) | ||||
elif branch.target_type == TargetType.RELEASE: | elif branch.target_type == TargetType.RELEASE: | ||||
tags.append(branch.target) | tags.append(branch.target) | ||||
self._latest_heads.extend( | self._latest_heads.extend( | ||||
extid.extid for extid in self._get_extids_for_targets(heads) | HgNodeId(extid.extid) for extid in self._get_extids_for_targets(heads) | ||||
) | ) | ||||
self._saved_tags.update( | self._saved_tags.update( | ||||
extid.extid for extid in self._get_extids_for_targets(tags) | HgNodeId(extid.extid) for extid in self._get_extids_for_targets(tags) | ||||
) | ) | ||||
def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]: | def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]: | ||||
"""Get all Mercurial ExtIDs for the targets in the latest snapshot""" | """Get all Mercurial ExtIDs for the targets in the latest snapshot""" | ||||
extids = [ | extids = [] | ||||
extid | |||||
for extid in self.storage.extid_get_from_target( | for extid in self.storage.extid_get_from_target( | ||||
identifiers.ObjectType.REVISION, targets | identifiers.ObjectType.REVISION, targets | ||||
) | ): | ||||
if extid.extid_type == EXTID_TYPE and extid.extid_version == EXTID_VERSION | if extid.extid_type != EXTID_TYPE or extid.extid_version != EXTID_VERSION: | ||||
] | continue | ||||
extids.append(extid) | |||||
self._revision_nodeid_to_sha1git[ | |||||
HgNodeId(extid.extid) | |||||
] = extid.target.object_id | |||||
olasd: `targets` are swhids/revision ids, not `HgNodeId`s. | |||||
Done Inline ActionsRight, i fixed the signature, thanks. ardumont: Right, i fixed the signature, thanks. | |||||
if extids: | if extids: | ||||
# Filter out dangling extids, we need to load their target again | # Filter out dangling extids, we need to load their target again | ||||
revisions_missing = self.storage.revision_missing( | revisions_missing = self.storage.revision_missing( | ||||
[extid.target.object_id for extid in extids] | [extid.target.object_id for extid in extids] | ||||
) | ) | ||||
extids = [ | extids = [ | ||||
extid | extid | ||||
for extid in extids | for extid in extids | ||||
if extid.target.object_id not in revisions_missing | if extid.target.object_id not in revisions_missing | ||||
] | ] | ||||
return extids | return extids | ||||
def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]: | def _get_extids_for_hgnodes(self, hgnode_ids: List[HgNodeId]) -> List[ExtID]: | ||||
"""Get all Mercurial ExtIDs for the mercurial nodes in the list which point to | """Get all Mercurial ExtIDs for the mercurial nodes in the list which point to | ||||
a known revision. | a known revision. | ||||
""" | """ | ||||
extids = [] | extids = [] | ||||
for group_ids in grouper(hgnode_ids, n=1000): | for group_ids in grouper(hgnode_ids, n=1000): | ||||
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids): | for extid in self.storage.extid_get_from_extid(EXTID_TYPE, group_ids): | ||||
if extid.extid_version != EXTID_VERSION: | if extid.extid_version != EXTID_VERSION: | ||||
continue | continue | ||||
extids.append(extid) | extids.append(extid) | ||||
self._revision_nodeid_to_sha1git[ | |||||
HgNodeId(extid.extid) | |||||
] = extid.target.object_id | |||||
if extids: | if extids: | ||||
# Filter out dangling extids, we need to load their target again | # Filter out dangling extids, we need to load their target again | ||||
revisions_missing = self.storage.revision_missing( | revisions_missing = self.storage.revision_missing( | ||||
[extid.target.object_id for extid in extids] | [extid.target.object_id for extid in extids] | ||||
) | ) | ||||
extids = [ | extids = [ | ||||
extid | extid | ||||
Show All 25 Lines | def fetch_data(self) -> bool: | ||||
# Allow to load on disk repository without cloning | # Allow to load on disk repository without cloning | ||||
# for testing purpose. | # for testing purpose. | ||||
self._repo_directory = self.directory | self._repo_directory = self.directory | ||||
self._repo = hgutil.repository(self._repo_directory) | self._repo = hgutil.repository(self._repo_directory) | ||||
return False | return False | ||||
def _new_revs(self, heads: List[bytes]): | def _new_revs(self, heads: List[HgNodeId]): | ||||
"""Return unseen revisions. That is, filter out revisions that are not ancestors of | """Return unseen revisions. That is, filter out revisions that are not ancestors of | ||||
heads""" | heads""" | ||||
assert self._repo is not None | assert self._repo is not None | ||||
existing_heads = [] | existing_heads = [] | ||||
for hg_nodeid in heads: | for hg_nodeid in heads: | ||||
try: | try: | ||||
rev = self._repo[hg_nodeid].rev() | rev = self._repo[hg_nodeid].rev() | ||||
Show All 24 Lines | def get_hg_revs_to_load(self) -> Iterator[int]: | ||||
seen_revs.add(rev) | seen_revs.add(rev) | ||||
yield rev | yield rev | ||||
# 2. Then filter out remaining revisions through the overall extid mappings | # 2. Then filter out remaining revisions through the overall extid mappings | ||||
# across hg origins | # across hg origins | ||||
revs_left = repo.revs("all() - ::(%ld)", seen_revs) | revs_left = repo.revs("all() - ::(%ld)", seen_revs) | ||||
hg_nodeids = [repo[nodeid].node() for nodeid in revs_left] | hg_nodeids = [repo[nodeid].node() for nodeid in revs_left] | ||||
for rev in self._new_revs( | for rev in self._new_revs( | ||||
[extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)] | [ | ||||
HgNodeId(extid.extid) | |||||
for extid in self._get_extids_for_hgnodes(hg_nodeids) | |||||
] | |||||
): | ): | ||||
yield rev | yield rev | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
length_ingested_revs = 0 | length_ingested_revs = 0 | ||||
Show All 9 Lines | def store_data(self): | ||||
length_ingested_revs += 1 | length_ingested_revs += 1 | ||||
except CorruptedRevision as e: | except CorruptedRevision as e: | ||||
self._visit_status = "partial" | self._visit_status = "partial" | ||||
self.log.warning("Corrupted revision %s", e) | self.log.warning("Corrupted revision %s", e) | ||||
descendents = repo.revs("(%ld)::", [rev]) | descendents = repo.revs("(%ld)::", [rev]) | ||||
ignored_revs.update(descendents) | ignored_revs.update(descendents) | ||||
if length_ingested_revs == 0: | if length_ingested_revs == 0: | ||||
# The repository has nothing to ingest (either empty or broken repository) | # no new revision ingested, so uneventful | ||||
# still we'll make a snapshot, so we continue | |||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
return | |||||
branching_info = hgutil.branching_info(repo, ignored_revs) | branching_info = hgutil.branching_info(repo, ignored_revs) | ||||
tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | ||||
snapshot_branches: Dict[bytes, SnapshotBranch] = {} | snapshot_branches: Dict[bytes, SnapshotBranch] = {} | ||||
for tag_name, hg_nodeid in tags_by_name.items(): | for tag_name, hg_nodeid in tags_by_name.items(): | ||||
if tag_name == b"tip": | if tag_name == b"tip": | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | def store_data(self): | ||||
# If the repo is broken enough or if it has none of the "normal" default | # If the repo is broken enough or if it has none of the "normal" default | ||||
# mechanisms, we ignore `HEAD`. | # mechanisms, we ignore `HEAD`. | ||||
default_branch_alias = branching_info.default_branch_alias | default_branch_alias = branching_info.default_branch_alias | ||||
if default_branch_alias is not None: | if default_branch_alias is not None: | ||||
snapshot_branches[b"HEAD"] = SnapshotBranch( | snapshot_branches[b"HEAD"] = SnapshotBranch( | ||||
target=default_branch_alias, target_type=TargetType.ALIAS, | target=default_branch_alias, target_type=TargetType.ALIAS, | ||||
) | ) | ||||
snapshot = Snapshot(branches=snapshot_branches) | snapshot = Snapshot(branches=snapshot_branches) | ||||
self.storage.snapshot_add([snapshot]) | self.storage.snapshot_add([snapshot]) | ||||
Done Inline ActionsThis should happen regardless of whether there is a default branch alias or not. olasd: This should happen regardless of whether there is a default branch alias or not. | |||||
Done Inline Actionsoh that got indented, the f***? ardumont: oh that got indented, the f***? | |||||
self.flush() | self.flush() | ||||
self.loaded_snapshot_id = snapshot.id | self.loaded_snapshot_id = snapshot.id | ||||
def load_status(self) -> Dict[str, str]: | def load_status(self) -> Dict[str, str]: | ||||
"""Detailed loading status. | """Detailed loading status. | ||||
Defaults to logging an eventful load. | Defaults to logging an eventful load. | ||||
Show All 27 Lines | def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: | ||||
# The parent was not loaded in this run, get it from storage | # The parent was not loaded in this run, get it from storage | ||||
from_storage = [ | from_storage = [ | ||||
extid | extid | ||||
for extid in self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid]) | for extid in self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid]) | ||||
if extid.extid_version == EXTID_VERSION | if extid.extid_version == EXTID_VERSION | ||||
] | ] | ||||
msg = "Expected 1 match from storage for hg node %r, got %d" | msg = "Expected 1 match from storage for hg node %r, got %d" | ||||
assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage)) | assert len(from_storage) == 1, msg % (hg_nodeid.hex(), len(from_storage)) | ||||
Done Inline Actionsthat's actually better in bytes to debug but i can revert it. ardumont: that's actually better in bytes to debug but i can revert it. | |||||
Done Inline ActionsPlease revert it to keep it human-readable. One can use bytes.fromhex() to revert it. olasd: Please revert it to keep it human-readable. One can use `bytes.fromhex()` to revert it. | |||||
Done Inline ActionsYes, ok (i knew but that's annoying to do ;). ardumont: Yes, ok (i knew but that's annoying to do ;).
Reverting. | |||||
return from_storage[0].target.object_id | return from_storage[0].target.object_id | ||||
def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]: | def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]: | ||||
"""Return the git sha1 of the parent revisions. | """Return the git sha1 of the parent revisions. | ||||
Args: | Args: | ||||
hg_nodeid: the hg nodeid of the revision. | hg_nodeid: the hg nodeid of the revision. | ||||
▲ Show 20 Lines • Show All 279 Lines • Show Last 20 Lines |
targets are swhids/revision ids, not HgNodeIds.