Page Menu | Home | Software Heritage

D4540.id16466.diff
No One | Temporary

D4540.id16466.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
from datetime import datetime, timezone
from shutil import rmtree
from tempfile import mkdtemp
-from typing import Any, Deque, Dict, Optional, Tuple, Union
+from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union
import dateutil
@@ -49,6 +49,9 @@
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
+T = TypeVar("T")
+
+
def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
"""Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
@@ -71,14 +74,18 @@
class HgDirectory(Directory):
- """A directory that creates parent directories if missing."""
+ """A more practical directory.
+
+ - creates missing parent directories
+ - removes empty directories
+ """
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
if b"/" in path:
head, tail = path.split(b"/", 1)
directory = self.get(head)
- if directory is None:
+ if directory is None or isinstance(directory, Content):
directory = HgDirectory()
self[head] = directory
@@ -86,6 +93,25 @@
else:
super().__setitem__(path, value)
+ def __delitem__(self, path: bytes) -> None:
+ super().__delitem__(path)
+
+ while b"/" in path: # remove empty parent directories
+ path = path.rsplit(b"/", 1)[0]
+ if len(self[path]) == 0:
+ super().__delitem__(path)
+ else:
+ break
+
+ def get(
+ self, path: bytes, default: Optional[T] = None
+ ) -> Optional[Union[Content, "HgDirectory", T]]:
+ # TODO move to swh.model.from_disk.Directory
+ try:
+ return self[path]
+ except KeyError:
+ return default
+
class HgLoaderFromDisk(BaseLoader):
"""Load a mercurial repository from a local repository."""
@@ -125,6 +151,15 @@
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
self._repo_directory: Optional[str] = None
+ # keeps the last processed hg nodeid
+ # it is used for differential tree update by store_directories
+ # NULLID is the parent of the first revision
+ self._last_hg_nodeid = hgutil.NULLID
+
+ # keeps the last revision tree
+ # it is used for differential tree update by store_directories
+ self._last_root = HgDirectory()
+
# Cache the content hash across revisions to avoid recalculation.
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
self.config["content_cache_size"],
@@ -409,12 +444,26 @@
Returns:
the swhid of the top level directory.
"""
- root = HgDirectory()
- for file_path in rev_ctx.manifest():
+ repo: hgutil.Repository = self._repo # mypy can't infer that repo is not None
+ prev_ctx = repo[self._last_hg_nodeid]
+
+ # TODO maybe do diff on parents
+ status = prev_ctx.status(rev_ctx)
+
+ for file_path in status.removed:
+ del self._last_root[file_path]
+
+ for file_path in status.added:
content = self.store_content(rev_ctx, file_path)
- root[file_path] = content
+ self._last_root[file_path] = content
+
+ for file_path in status.modified:
+ content = self.store_content(rev_ctx, file_path)
+ self._last_root[file_path] = content
+
+ self._last_hg_nodeid = rev_ctx.node()
- directories: Deque[Directory] = deque([root])
+ directories: Deque[Directory] = deque([self._last_root])
while directories:
directory = directories.pop()
self.storage.directory_add([directory.to_model()])
@@ -422,7 +471,7 @@
[item for item in directory.values() if isinstance(item, Directory)]
)
- return root.hash
+ return self._last_root.hash
class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -4,6 +4,8 @@
# See top-level LICENSE file for more information
import os
+from datetime import datetime
+from hashlib import sha1
from swh.loader.tests import (
assert_last_visit_matches,
@@ -11,7 +13,7 @@
get_stats,
prepare_repository_from_archive,
)
-from swh.model.from_disk import Content
+from swh.model.from_disk import Content, DentryPerms
from swh.model.hashutil import hash_to_bytes
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.snapshot import snapshot_get_latest
@@ -20,9 +22,45 @@
from .loader_checker import ExpectedSwhids, LoaderChecker
+def random_content() -> Content:
+ """Create minimal content object."""
+ data = str(datetime.now()).encode()
+ return Content({"sha1_git": sha1(data).digest(), "perms": DentryPerms.content})
+
+
def test_hg_directory_creates_missing_directories():
directory = HgDirectory()
- directory[b"path/to/some/content"] = Content()
+ directory[b"path/to/some/content"] = random_content()
+
+
+def test_hg_directory_get():
+ content = random_content()
+ directory = HgDirectory()
+
+ assert directory.get(b"path/to/content") is None
+ assert directory.get(b"path/to/content", content) == content
+
+ directory[b"path/to/content"] = content
+ assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_deletes_empty_directories():
+ directory = HgDirectory()
+ content = random_content()
+ directory[b"path/to/content"] = content
+ directory[b"path/to/some/deep/content"] = random_content()
+
+ del directory[b"path/to/some/deep/content"]
+
+ assert directory.get(b"path/to/some/deep") is None
+ assert directory.get(b"path/to/some") is None
+ assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_when_directory_replaces_file():
+ directory = HgDirectory()
+ directory[b"path/to/some"] = random_content()
+ directory[b"path/to/some/content"] = random_content()
# Those tests assert expectations on repository loading

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 11:28 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221775

Event Timeline