diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -125,6 +125,11 @@ self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} self._repo_directory: Optional[str] = None + # Cache the content hash across revisions to avoid recalculation. + self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( + self.config["content_cache_size"], + ) + def pre_cleanup(self) -> None: """As a first step, will try and check for dangling data to cleanup. This should do its best to avoid raising issues. @@ -358,8 +363,18 @@ hg_nodeid = rev_ctx.node() file_ctx = rev_ctx[file_path] + file_nodeid = file_ctx.filenode() perms = FLAG_PERMS[file_ctx.flags()] - data = file_ctx.data() # caching is simple and will come in the next revision. + + # Key is file_nodeid + perms because permissions do not participate + # in the content hash in hg, whereas they do in swh. + cache_key = (file_nodeid, perms) + + sha1_git = self._content_hash_cache.get(cache_key) + if sha1_git is not None: + return Content({"sha1_git": sha1_git, "perms": perms}) + + data = file_ctx.data() content_data = MultiHash.from_data(data).digest() content_data["length"] = len(data) @@ -377,6 +392,8 @@ "produced {type(model)!r} instead of {ModelContent!r}" ) + self._content_hash_cache[cache_key] = content.hash + # Here we make sure to return only necessary data. return Content({"sha1_git": content.hash, "perms": perms}) diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py --- a/swh/loader/mercurial/hgutil.py +++ b/swh/loader/mercurial/hgutil.py @@ -5,12 +5,13 @@ # The internal Mercurial API is not guaranteed to be stable. 
import mercurial.ui # type: ignore -from mercurial import context, hg +from mercurial import context, hg, util NULLID = mercurial.node.nullid HgNodeId = NewType("HgNodeId", bytes) Repository = hg.localrepo BaseContext = context.basectx +LRUCacheDict = util.lrucachedict def repository(path: str) -> hg.localrepo: