diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Union import dateutil +from mercurial.util import lrucachedict # type: ignore from swh.core.config import merge_configs from swh.loader.core.loader import BaseLoader @@ -116,6 +117,11 @@ self._repo: Optional[hgutil.Repository] = None self.revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + # Cache the content hash across revisions to avoid recalculation. + self.content_hash_cache: lrucachedict = lrucachedict( + self.config["content_cache_size"], + ) + @property def repo(self) -> hgutil.Repository: """ An filtered mercurial repository. @@ -336,7 +342,17 @@ rev_ctx = self.repo[hg_nodeid] file_ctx = rev_ctx[file_path] + file_nodeid = file_ctx._fileid perms = FLAG_PERMS[file_ctx.flags()] + + # Key is file_nodeid + perms because permissions do not participate + # in the content hash in hg, whereas they do in swh. + cache_key = (file_nodeid, perms) + + sha1_git = self.content_hash_cache.get(cache_key) + if sha1_git is not None: + return Content({"sha1_git": sha1_git, "perms": perms}) + data = file_ctx.data() content_data = MultiHash.from_data(data).digest() @@ -355,6 +371,8 @@ "produced {type(model)!r} instead of {ModelContent!r}" ) + self.content_hash_cache[cache_key] = content.hash + return content def store_directories(self, hg_nodeid: HgNodeId) -> Sha1Git: