diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -124,6 +124,15 @@ self._repo: Optional[hgutil.Repository] = None self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + # keeps the last processed hg nodeid + # it is used for differential tree update by store_directories + # NULLID is the parent of the first revision + self._last_hg_nodeid = hgutil.NULLID + + # keeps the last revision tree + # it is used for differential tree update by store_directories + self._last_root = HgDirectory() + # Cache the content hash across revisions to avoid recalculation. self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( self.config["content_cache_size"], @@ -417,12 +426,25 @@ Returns: the swhid of the top level directory. """ - root = HgDirectory() - for file_path in rev_ctx.manifest(): + prev_ctx = self.repo[self._last_hg_nodeid] + + # TODO maybe do diff on parents + status = prev_ctx.status(rev_ctx) + + for file_path in status.removed: + del self._last_root[file_path] + + for file_path in status.added: + content = self.store_content(rev_ctx, file_path) + self._last_root[file_path] = content + + for file_path in status.modified: content = self.store_content(rev_ctx, file_path) - root[file_path] = content + self._last_root[file_path] = content + + self._last_hg_nodeid = rev_ctx.node() - directories: Deque[Directory] = deque([root]) + directories: Deque[Directory] = deque([self._last_root]) while directories: directory = directories.pop() self.storage.directory_add([directory.to_model()]) @@ -430,7 +452,7 @@ [item for item in directory.values() if isinstance(item, Directory)] ) - return root.hash + return self._last_root.hash class HgArchiveLoaderFromDisk(HgLoaderFromDisk):