Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9697335
D4540.id16466.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D4540.id16466.diff
View Options
diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
from datetime import datetime, timezone
from shutil import rmtree
from tempfile import mkdtemp
-from typing import Any, Deque, Dict, Optional, Tuple, Union
+from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union
import dateutil
@@ -49,6 +49,9 @@
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
+T = TypeVar("T")
+
+
def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
"""Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
@@ -71,14 +74,18 @@
class HgDirectory(Directory):
- """A directory that creates parent directories if missing."""
+ """A directory that maintains its parent directories automatically.
+
+ - creates missing parent directories
+ - removes empty directories
+ """
def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
if b"/" in path:
head, tail = path.split(b"/", 1)
directory = self.get(head)
- if directory is None:
+ if directory is None or isinstance(directory, Content):
directory = HgDirectory()
self[head] = directory
@@ -86,6 +93,25 @@
else:
super().__setitem__(path, value)
+ def __delitem__(self, path: bytes) -> None:
+ super().__delitem__(path)
+
+ while b"/" in path: # remove empty parent directories
+ path = path.rsplit(b"/", 1)[0]
+ if len(self[path]) == 0:
+ super().__delitem__(path)
+ else:
+ break
+
+ def get(
+ self, path: bytes, default: Optional[T] = None
+ ) -> Optional[Union[Content, "HgDirectory", T]]:
+ # TODO move to swh.model.from_disk.Directory
+ try:
+ return self[path]
+ except KeyError:
+ return default
+
class HgLoaderFromDisk(BaseLoader):
"""Load a mercurial repository from a local repository."""
@@ -125,6 +151,15 @@
self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {}
self._repo_directory: Optional[str] = None
+ # Keeps the nodeid of the last processed hg revision.
+ # store_directories uses it to update the tree differentially.
+ # NULLID is the parent of the first revision.
+ self._last_hg_nodeid = hgutil.NULLID
+
+ # Keeps the tree of the last processed revision.
+ # store_directories updates it in place from the status of each revision.
+ self._last_root = HgDirectory()
+
# Cache the content hash across revisions to avoid recalculation.
self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
self.config["content_cache_size"],
@@ -409,12 +444,26 @@
Returns:
the swhid of the top level directory.
"""
- root = HgDirectory()
- for file_path in rev_ctx.manifest():
+ repo: hgutil.Repository = self._repo # mypy can't infer that repo is not None
+ prev_ctx = repo[self._last_hg_nodeid]
+
+ # TODO: maybe diff against the revision's parents instead
+ status = prev_ctx.status(rev_ctx)
+
+ for file_path in status.removed:
+ del self._last_root[file_path]
+
+ for file_path in status.added:
content = self.store_content(rev_ctx, file_path)
- root[file_path] = content
+ self._last_root[file_path] = content
+
+ for file_path in status.modified:
+ content = self.store_content(rev_ctx, file_path)
+ self._last_root[file_path] = content
+
+ self._last_hg_nodeid = rev_ctx.node()
- directories: Deque[Directory] = deque([root])
+ directories: Deque[Directory] = deque([self._last_root])
while directories:
directory = directories.pop()
self.storage.directory_add([directory.to_model()])
@@ -422,7 +471,7 @@
[item for item in directory.values() if isinstance(item, Directory)]
)
- return root.hash
+ return self._last_root.hash
class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -4,6 +4,8 @@
# See top-level LICENSE file for more information
import os
+from datetime import datetime
+from hashlib import sha1
from swh.loader.tests import (
assert_last_visit_matches,
@@ -11,7 +13,7 @@
get_stats,
prepare_repository_from_archive,
)
-from swh.model.from_disk import Content
+from swh.model.from_disk import Content, DentryPerms
from swh.model.hashutil import hash_to_bytes
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.snapshot import snapshot_get_latest
@@ -20,9 +22,45 @@
from .loader_checker import ExpectedSwhids, LoaderChecker
+def random_content() -> Content:
+ """Create a minimal Content object hashed from the current time."""
+ data = str(datetime.now()).encode()
+ return Content({"sha1_git": sha1(data).digest(), "perms": DentryPerms.content})
+
+
def test_hg_directory_creates_missing_directories():
directory = HgDirectory()
- directory[b"path/to/some/content"] = Content()
+ directory[b"path/to/some/content"] = random_content()
+
+
+def test_hg_directory_get():
+ content = random_content()
+ directory = HgDirectory()
+
+ assert directory.get(b"path/to/content") is None
+ assert directory.get(b"path/to/content", content) == content
+
+ directory[b"path/to/content"] = content
+ assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_deletes_empty_directories():
+ directory = HgDirectory()
+ content = random_content()
+ directory[b"path/to/content"] = content
+ directory[b"path/to/some/deep/content"] = random_content()
+
+ del directory[b"path/to/some/deep/content"]
+
+ assert directory.get(b"path/to/some/deep") is None
+ assert directory.get(b"path/to/some") is None
+ assert directory.get(b"path/to/content") == content
+
+
+def test_hg_directory_when_directory_replaces_file():
+ directory = HgDirectory()
+ directory[b"path/to/some"] = random_content()
+ directory[b"path/to/some/content"] = random_content()
# Those tests assert expectations on repository loading
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 11:28 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221775
Attached To
D4540: Add tree diffing in HgLoaderFromDisk
Event Timeline
Log In to Comment