diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
 from datetime import datetime, timezone
 from shutil import rmtree
 from tempfile import mkdtemp
-from typing import Any, Deque, Dict, Optional, Tuple, Union
+from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union
 
 import dateutil
 
@@ -49,6 +49,9 @@
 TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
 
 
+T = TypeVar("T")
+
+
 def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
     """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
 
@@ -71,14 +74,18 @@
 
 
 class HgDirectory(Directory):
-    """A directory that creates parent directories if missing."""
+    """A more practical directory.
+
+    - creates missing parent directories
+    - removes empty directories
+    """
 
     def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
         if b"/" in path:
             head, tail = path.split(b"/", 1)
 
             directory = self.get(head)
-            if directory is None:
+            if directory is None or isinstance(directory, Content):
                 directory = HgDirectory()
                 self[head] = directory
 
@@ -86,6 +93,27 @@
         else:
             super().__setitem__(path, value)
 
+    def __delitem__(self, path: bytes) -> None:
+        """Remove the entry at ``path``, then prune parent directories left empty."""
+        super().__delitem__(path)
+
+        while b"/" in path:  # remove empty parent directories
+            path = path.rsplit(b"/", 1)[0]
+            if len(self[path]) == 0:
+                super().__delitem__(path)
+            else:
+                break
+
+    def get(
+        self, path: bytes, default: Optional[T] = None
+    ) -> Optional[Union[Content, "HgDirectory", T]]:
+        """Return the entry at ``path``, or ``default`` on KeyError."""
+        # TODO move to swh.model.from_disk.Directory
+        try:
+            return self[path]
+        except KeyError:
+            return default
+
 
 class HgLoaderFromDisk(BaseLoader):
     """Load a mercurial repository from a local repository."""
@@ -125,6 +153,15 @@
         self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
         self._repo_directory: Optional[str] = None
 
+        # keeps the last processed hg nodeid
+        # it is used for differential tree update by store_directories
+        # NULLID is the parent of the first revision
+        self._last_hg_nodeid = hgutil.NULLID
+
+        # keeps the last revision tree
+        # it is used for differential tree update by store_directories
+        self._last_root = HgDirectory()
+
         # Cache the content hash across revisions to avoid recalculation.
         self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
             self.config["content_cache_size"],
@@ -409,12 +446,26 @@
         Returns:
             the swhid of the top level directory.
         """
-        root = HgDirectory()
-        for file_path in rev_ctx.manifest():
+        repo: hgutil.Repository = self._repo  # mypy can't infer that repo is not None
+        prev_ctx = repo[self._last_hg_nodeid]
+
+        # TODO maybe do diff on parents
+        status = prev_ctx.status(rev_ctx)
+
+        for file_path in status.removed:
+            del self._last_root[file_path]
+
+        for file_path in status.added:
             content = self.store_content(rev_ctx, file_path)
-            root[file_path] = content
+            self._last_root[file_path] = content
+
+        for file_path in status.modified:
+            content = self.store_content(rev_ctx, file_path)
+            self._last_root[file_path] = content
+
+        self._last_hg_nodeid = rev_ctx.node()
 
-        directories: Deque[Directory] = deque([root])
+        directories: Deque[Directory] = deque([self._last_root])
         while directories:
             directory = directories.pop()
             self.storage.directory_add([directory.to_model()])
@@ -422,7 +473,7 @@
                 [item for item in directory.values() if isinstance(item, Directory)]
             )
 
-        return root.hash
+        return self._last_root.hash
 
 
 class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -4,6 +4,8 @@
 # See top-level LICENSE file for more information
 
 import os
+from datetime import datetime
+from hashlib import sha1
 
 from swh.loader.tests import (
     assert_last_visit_matches,
@@ -11,7 +13,7 @@
     get_stats,
     prepare_repository_from_archive,
 )
-from swh.model.from_disk import Content
+from swh.model.from_disk import Content, DentryPerms
 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
 from swh.storage.algos.snapshot import snapshot_get_latest
@@ -20,9 +22,45 @@
 from .loader_checker import ExpectedSwhids, LoaderChecker
 
 
+def random_content() -> Content:
+    """Create minimal content object."""
+    data = str(datetime.now()).encode()
+    return Content({"sha1_git": sha1(data).digest(), "perms": DentryPerms.content})
+
+
 def test_hg_directory_creates_missing_directories():
     directory = HgDirectory()
-    directory[b"path/to/some/content"] = Content()
+    directory[b"path/to/some/content"] = random_content()
+
+
+def test_hg_directory_get():
+    content = random_content()
+    directory = HgDirectory()
+
+    assert directory.get(b"path/to/content") is None
+    assert directory.get(b"path/to/content", content) == content
+
+    directory[b"path/to/content"] = content
+    assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_deletes_empty_directories():
+    directory = HgDirectory()
+    content = random_content()
+    directory[b"path/to/content"] = content
+    directory[b"path/to/some/deep/content"] = random_content()
+
+    del directory[b"path/to/some/deep/content"]
+
+    assert directory.get(b"path/to/some/deep") is None
+    assert directory.get(b"path/to/some") is None
+    assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_when_directory_replaces_file():
+    directory = HgDirectory()
+    directory[b"path/to/some"] = random_content()
+    directory[b"path/to/some/content"] = random_content()
 
 
 # Those tests assert expectations on repository loading