Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/from_disk.py
Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | |||||
} # type: Dict[bytes, DentryPerms] | } # type: Dict[bytes, DentryPerms] | ||||
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" | ||||
T = TypeVar("T") | T = TypeVar("T") | ||||
class CorruptedRevision(ValueError): | |||||
"""Raised when a revision is corrupted.""" | |||||
def __init__(self, hg_nodeid: HgNodeId) -> None: | |||||
super().__init__(hg_nodeid.hex()) | |||||
self.hg_nodeid = hg_nodeid | |||||
class HgDirectory(Directory): | class HgDirectory(Directory): | ||||
"""A more practical directory. | """A more practical directory. | ||||
- creates missing parent directories | - creates missing parent directories | ||||
- removes empty directories | - removes empty directories | ||||
""" | """ | ||||
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: | ||||
▲ Show 20 Lines • Show All 197 Lines • ▼ Show 20 Lines | class HgLoaderFromDisk(BaseLoader): | ||||
def store_data(self): | def store_data(self): | ||||
"""Store fetched data in the database.""" | """Store fetched data in the database.""" | ||||
revs = self.get_hg_revs_to_load() | revs = self.get_hg_revs_to_load() | ||||
if not revs: | if not revs: | ||||
self._load_status = "uneventful" | self._load_status = "uneventful" | ||||
return | return | ||||
assert self._repo is not None | |||||
repo = self._repo | |||||
blacklisted_revs: List[int] = [] | |||||
for rev in revs: | for rev in revs: | ||||
self.store_revision(self._repo[rev]) | if rev in blacklisted_revs: | ||||
continue | |||||
try: | |||||
self.store_revision(repo[rev]) | |||||
except CorruptedRevision as e: | |||||
self.log.warning(f"Corrupted revision: {e}") | |||||
descendents = repo.revs("(%ld)::", [rev]) | |||||
blacklisted_revs.extend(descendents) | |||||
vlorentz: replace this line with `assert self._repo` (mypy understands assertions) | |||||
Not Done Inline Actions — vlorentz: `self.log.warning("Corrupted revision: %s", e)` | |||||
Not Done Inline Actions — ardumont: still relevant ^ | |||||
branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { | branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { | ||||
hg_nodeid: name for name, hg_nodeid in hgutil.branches(self._repo).items() | hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() | ||||
} | } | ||||
tags_by_name: Dict[bytes, HgNodeId] = self._repo.tags() | tags_by_name: Dict[bytes, HgNodeId] = repo.tags() | ||||
tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { | tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { | ||||
hg_nodeid: name for name, hg_nodeid in tags_by_name.items() | hg_nodeid: name for name, hg_nodeid in tags_by_name.items() | ||||
} | } | ||||
snapshot_branches: Dict[bytes, SnapshotBranch] = {} | snapshot_branches: Dict[bytes, SnapshotBranch] = {} | ||||
for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): | for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): | ||||
tag_name = tags_by_hg_nodeid.get(hg_nodeid) | tag_name = tags_by_hg_nodeid.get(hg_nodeid) | ||||
▲ Show 20 Lines • Show All 162 Lines • ▼ Show 20 Lines | def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content: | ||||
file_path: the hg path of the content. | file_path: the hg path of the content. | ||||
Returns: | Returns: | ||||
the swhid of the top level directory. | the swhid of the top level directory. | ||||
""" | """ | ||||
hg_nodeid = rev_ctx.node() | hg_nodeid = rev_ctx.node() | ||||
file_ctx = rev_ctx[file_path] | file_ctx = rev_ctx[file_path] | ||||
try: | |||||
file_nodeid = file_ctx.filenode() | file_nodeid = file_ctx.filenode() | ||||
except hgutil.LookupError: | |||||
# TODO | |||||
# Raising CorruptedRevision avoid crashing the whole loading | |||||
# but can lead to a lot of missing revisions. | |||||
# SkippedContent could be used but need actual content to calculate its id. | |||||
# Maybe the hg_nodeid can be used instead. | |||||
# Another option could be to just ignore the missing content. | |||||
Not Done Inline Actions — vlorentz: > SkippedContent could be used but need actual content to calculate its id.
You might be able to construct a SkippedContent with no id, but I'm not sure | |||||
Done Inline Actions — Alphare: IMO this is good enough for now to easily log errors and figure out the next steps for improving the robustness/efficiency of the loader. What do you think? | |||||
Not Done Inline Actions — ardumont: yes, plus there is already a todo about it ;) | |||||
# This point is left to future commits. | |||||
raise CorruptedRevision(hg_nodeid) | |||||
perms = FLAG_PERMS[file_ctx.flags()] | perms = FLAG_PERMS[file_ctx.flags()] | ||||
# Key is file_nodeid + perms because permissions does not participate | # Key is file_nodeid + perms because permissions does not participate | ||||
# in content hash in hg while it is the case in swh. | # in content hash in hg while it is the case in swh. | ||||
cache_key = (file_nodeid, perms) | cache_key = (file_nodeid, perms) | ||||
sha1_git = self._content_hash_cache.get(cache_key) | sha1_git = self._content_hash_cache.get(cache_key) | ||||
if sha1_git is not None: | if sha1_git is not None: | ||||
▲ Show 20 Lines • Show All 133 Lines • Show Last 20 Lines |
replace this line with `assert self._repo` (mypy understands assertions)