diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -25,6 +25,3 @@ [mypy-pyfuse3_asyncio.*] ignore_missing_imports = True - -[mypy-pympler.*] -ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiosqlite psutil pyfuse3 -pympler python-daemon diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py --- a/swh/fuse/cache.py +++ b/swh/fuse/cache.py @@ -15,7 +15,6 @@ import aiosqlite from psutil import virtual_memory -from pympler import asizeof from swh.fuse.fs.entry import FuseDirEntry, FuseEntry from swh.fuse.fs.mountpoint import ArchiveDir, MetaDir @@ -240,7 +239,10 @@ class LRU(OrderedDict): max_ram: int used_ram: int = field(init=False, default=0) - object_size: Dict[Any, int] = field(init=False, default_factory=dict) + + def sizeof(self, value: Any) -> int: + # Rough size estimate in bytes for a list of entries + return len(value) * 1000 def __getitem__(self, key: Any) -> Any: value = super().__getitem__(key) @@ -251,15 +253,13 @@ if key in self: self.move_to_end(key) else: - nb_bytes = asizeof.asizeof(value) - self.used_ram += nb_bytes - self.object_size[key] = nb_bytes + self.used_ram += self.sizeof(value) super().__setitem__(key, value) while self.used_ram > self.max_ram and self: oldest = next(iter(self)) - self.used_ram -= self.object_size[oldest] + self.used_ram -= self.sizeof(super().__getitem__(oldest)) del self[oldest] def __init__(self, conf: Dict[str, Any]): diff --git a/swh/fuse/cli.py b/swh/fuse/cli.py --- a/swh/fuse/cli.py +++ b/swh/fuse/cli.py @@ -37,6 +37,7 @@ "url": "https://archive.softwareheritage.org/api/1", "auth-token": None, }, + "sharding": {"depth": 1, "length": 2,}, } diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py --- a/swh/fuse/fs/artifact.py +++ b/swh/fuse/fs/artifact.py @@ -3,7 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from dataclasses import dataclass +from 
dataclasses import dataclass, field from pathlib import Path from typing import Any, AsyncIterator, List import urllib.parse @@ -208,13 +208,85 @@ async def compute_entries(self) -> AsyncIterator[FuseEntry]: history = await self.fuse.get_history(self.swhid) - root_path = self.get_relative_root_path() - for swhid in history: - yield self.create_child( - FuseSymlinkEntry, - name=str(swhid), - target=Path(root_path, f"archive/{swhid}"), - ) + by_hash = self.create_child( + RevisionHistoryShardByHash, + name="by-hash", + mode=int(EntryMode.RDONLY_DIR), + history_swhid=self.swhid, + ) + by_hash.fill_direntry_cache(history) + yield by_hash + + +@dataclass +class RevisionHistoryShardByHash(FuseDirEntry): + """ Revision virtual `history/by-hash` sharded directory """ + + history_swhid: SWHID + prefix: str = field(default="") + + def get_full_sharded_name(self, swhid: SWHID) -> str: + sharding_depth = self.fuse.conf["sharding"]["depth"] + sharding_length = self.fuse.conf["sharding"]["length"] + if sharding_depth <= 0: + return str(swhid) + else: + basename = swhid.object_id + parts = [ + basename[i * sharding_length : (i + 1) * sharding_length] + for i in range(sharding_depth) + ] + # Always keep the full SWHID as the path basename (otherwise we + # lose the SWHID object type information) + parts.append(str(swhid)) + path = Path(*parts) + return str(path) + + def fill_direntry_cache(self, swhids: List[SWHID]): + sharding_depth = self.fuse.conf["sharding"]["depth"] + sharding_length = self.fuse.conf["sharding"]["length"] + depth = self.prefix.count("/") + children = [] + if depth == sharding_depth: + root_path = self.get_relative_root_path() + for swhid in swhids: + children.append( + self.create_child( + FuseSymlinkEntry, + name=str(swhid), + target=Path(root_path, f"archive/{swhid}"), + ) + ) + else: + subdirs = {} + prefix_len = len(self.prefix) + for swhid in swhids: + name = self.get_full_sharded_name(swhid) + next_prefix = name[prefix_len : prefix_len + 
sharding_length] + subdirs.setdefault(next_prefix, []).append(swhid) + + # Recursive intermediate sharded directories + for subdir, subentries in subdirs.items(): + child_prefix = f"{self.prefix}{subdir}/" + child = self.create_child( + RevisionHistoryShardByHash, + name=subdir, + mode=int(EntryMode.RDONLY_DIR), + prefix=child_prefix, + history_swhid=self.history_swhid, + ) + children.append(child) + child.fill_direntry_cache(subentries) + self.fuse.cache.direntry.set(self, children) + return children + + async def compute_entries(self) -> AsyncIterator[FuseEntry]: + history = await self.fuse.get_history(self.history_swhid) + hash_prefix = self.prefix.replace("/", "") + swhids = [s for s in history if s.object_id.startswith(hash_prefix)] + + for entry in self.fill_direntry_cache(swhids): + yield entry @dataclass diff --git a/swh/fuse/fuse.py b/swh/fuse/fuse.py --- a/swh/fuse/fuse.py +++ b/swh/fuse/fuse.py @@ -35,6 +35,7 @@ self._inode2entry: Dict[int, FuseEntry] = {} self.root = Root(fuse=self) + self.conf = conf self.time_ns: int = time.time_ns() # start time, used as timestamp self.gid = os.getgid() diff --git a/swh/fuse/tests/test_revision.py b/swh/fuse/tests/test_revision.py --- a/swh/fuse/tests/test_revision.py +++ b/swh/fuse/tests/test_revision.py @@ -8,6 +8,7 @@ get_data_from_web_archive, ) from swh.fuse.tests.data.config import REV_SMALL_HISTORY, ROOT_DIR, ROOT_REV +from swh.model.identifiers import parse_swhid def test_access_meta(fuse_mntdir): @@ -38,13 +39,18 @@ def test_list_history(fuse_mntdir): - dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history" + dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history/by-hash" history_meta = get_data_from_graph_archive( REV_SMALL_HISTORY, GRAPH_API_REQUEST.HISTORY ) history = history_meta.strip() # Only keep second node in the edge because first node is redundant # information or the root node (hence not an ancestor) - expected = [edge.split(" ")[1] for edge in history.split("\n")] - actual 
= os.listdir(dir_path) - assert set(actual) == set(expected) + expected = set([edge.split(" ")[1] for edge in history.split("\n")]) + + for swhid in expected: + swhid = parse_swhid(swhid) + depth1 = swhid.object_id[:2] + depth2 = str(swhid) + assert (dir_path / depth1).exists() + assert depth2 in (os.listdir(dir_path / depth1))