diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -71,7 +71,11 @@
 
 
 class HgDirectory(Directory):
-    """A directory that creates parent directories if missing."""
+    """A more practical directory.
+
+    - creates missing parent directories
+    - removes empty directories
+    """
 
     def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
         if b"/" in path:
@@ -86,6 +90,17 @@
         else:
             super().__setitem__(path, value)
 
+    def __delitem__(self, path: bytes) -> None:
+        """Remove ``path``, then prune the parent directories it leaves empty."""
+        super().__delitem__(path)
+
+        while b"/" in path:
+            path = path.rsplit(b"/", 1)[0]
+            if len(self[path]) > 0:
+                # a non-empty directory keeps all of its ancestors non-empty
+                break
+            super().__delitem__(path)
+
 
 class HgLoaderFromDisk(BaseLoader):
     """Load a mercurial repository from a local repository."""
@@ -124,6 +139,15 @@
         self._repo: Optional[hgutil.Repository] = None
         self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
 
+        # keeps the last processed hg nodeid
+        # it is used for differential tree update by store_directories
+        # NULLID is the parent of the first revision
+        self._last_hg_nodeid = hgutil.NULLID
+
+        # keeps the last revision tree
+        # it is used for differential tree update by store_directories
+        self._last_root = HgDirectory()
+
         # Cache the content hash across revisions to avoid recalculation.
         self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
             self.config["content_cache_size"],
@@ -417,12 +441,24 @@
         Returns:
             the swhid of the top level directory.
         """
-        root = HgDirectory()
-        for file_path in rev_ctx.manifest():
-            content = self.store_content(rev_ctx, file_path)
-            root[file_path] = content
+        prev_ctx = self.repo[self._last_hg_nodeid]
+
+        # TODO maybe do diff on parents
+        status = prev_ctx.status(rev_ctx)
+
+        for file_path in status.removed:
+            del self._last_root[file_path]
+
+        # added and modified files are stored the same way: the new content
+        # replaces whatever the tree currently holds at that path
+        for file_paths in (status.added, status.modified):
+            for file_path in file_paths:
+                content = self.store_content(rev_ctx, file_path)
+                self._last_root[file_path] = content
+
+        self._last_hg_nodeid = rev_ctx.node()
 
-        directories: Deque[Directory] = deque([root])
+        directories: Deque[Directory] = deque([self._last_root])
         while directories:
             directory = directories.pop()
             self.storage.directory_add([directory.to_model()])
@@ -430,7 +466,7 @@
                 [item for item in directory.values() if isinstance(item, Directory)]
             )
 
-        return root.hash
+        return self._last_root.hash
 
 
 class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -25,6 +25,24 @@
     directory[b"path/to/some/content"] = Content()
 
 
+def test_hg_directory_deletes_empty_directories():
+    directory = HgDirectory()
+    directory[b"path/to/content"] = Content()
+    directory[b"path/to/some/content"] = Content()
+
+    del directory[b"path/to/some/content"]
+
+    # "path/to/some" became empty and is pruned, its non-empty parent stays
+    assert len(directory) == 1
+    assert b"some" not in directory[b"path/to"]
+    assert b"content" in directory[b"path/to"]
+
+    del directory[b"path/to/content"]
+
+    # nothing is left, so the whole chain of parents is gone
+    assert len(directory) == 0
+
+
 # Those tests assert expectations on repository loading
 # by reading expected values from associated json files
 # produced by the `swh-hg-identify` command line utility.