diff --git a/swh/fuse/cli.py b/swh/fuse/cli.py
--- a/swh/fuse/cli.py
+++ b/swh/fuse/cli.py
@@ -37,6 +37,7 @@
         "url": "https://archive.softwareheritage.org/api/1",
         "auth-token": None,
     },
+    "sharding": {"depth": 1, "length": 2,},
 }
 
 
diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py
--- a/swh/fuse/fs/artifact.py
+++ b/swh/fuse/fs/artifact.py
@@ -11,6 +11,7 @@
 from swh.fuse.fs.entry import (
     EntryMode,
     FuseDirEntry,
+    FuseDirEntryShardByHash,
     FuseEntry,
     FuseFileEntry,
     FuseSymlinkEntry,
@@ -208,13 +209,12 @@
 
     async def compute_entries(self) -> AsyncIterator[FuseEntry]:
         history = await self.fuse.get_history(self.swhid)
-        root_path = self.get_relative_root_path()
-        for swhid in history:
-            yield self.create_child(
-                FuseSymlinkEntry,
-                name=str(swhid),
-                target=Path(root_path, f"archive/{swhid}"),
-            )
+        yield self.create_child(
+            FuseDirEntryShardByHash,
+            name="by-hash",
+            mode=int(EntryMode.RDONLY_DIR),
+            swhids=history,
+        )
 
 
 @dataclass
diff --git a/swh/fuse/fs/entry.py b/swh/fuse/fs/entry.py
--- a/swh/fuse/fs/entry.py
+++ b/swh/fuse/fs/entry.py
@@ -9,7 +9,9 @@
 from enum import IntEnum
 from pathlib import Path
 from stat import S_IFDIR, S_IFLNK, S_IFREG
-from typing import Any, AsyncIterator, Sequence, Union
+from typing import Any, AsyncIterator, List, Sequence, Union
+
+from swh.model.identifiers import SWHID
 
 # Avoid cycling import
 Fuse = "Fuse"
@@ -104,6 +106,69 @@
         return None
 
 
+@dataclass
+class FuseDirEntryShardByHash(FuseDirEntry):
+    """ FUSE virtual directory entry sharded by SWHID hash """
+
+    swhids: List[SWHID]
+    prefix: str = field(default="")
+
+    def get_full_sharded_name(self, swhid: SWHID) -> str:
+        """ Return the sharded relative path of `swhid`: one directory per
+        sharding level (named after successive slices of the object id)
+        followed by the full SWHID, or the bare SWHID if depth <= 0 """
+        sharding = self.fuse.conf["sharding"]
+        if sharding["depth"] <= 0:
+            return str(swhid)
+        else:
+            basename = swhid.object_id
+            name, i = "", 0
+            for _ in range(sharding["depth"]):
+                name += basename[i : i + sharding["length"]]
+                name += "/"
+                i += sharding["length"]
+            # Always keep the full SWHID as the path basename (otherwise we
+            # lose the SWHID object type information)
+            name += str(swhid)
+            return name
+
+    async def compute_entries(self) -> AsyncIterator[FuseEntry]:
+        """ Yield symlinks to the archived objects once the configured
+        sharding depth is reached, otherwise one sub-directory per distinct
+        next-level name prefix """
+        sharding_depth = self.prefix.count("/")
+        # ">=" rather than "==": a negative depth then means "no sharding", as
+        # in get_full_sharded_name(), instead of nesting directories forever
+        if sharding_depth >= self.fuse.conf["sharding"]["depth"]:
+            root_path = self.get_relative_root_path()
+            for swhid in self.swhids:
+                yield self.create_child(
+                    FuseSymlinkEntry,
+                    name=str(swhid),
+                    target=Path(root_path, f"archive/{swhid}"),
+                )
+        else:
+            subdirs = {}
+            for swhid in self.swhids:
+                name = self.get_full_sharded_name(swhid)
+                prefix_len = len(self.prefix)
+
+                next_prefix = name[
+                    prefix_len : prefix_len + self.fuse.conf["sharding"]["length"]
+                ]
+                subdirs.setdefault(next_prefix, []).append(swhid)
+
+            # Recursive intermediate sharded directories
+            for subdir, subentries in subdirs.items():
+                yield self.create_child(
+                    FuseDirEntryShardByHash,
+                    name=subdir,
+                    mode=int(EntryMode.RDONLY_DIR),
+                    prefix=f"{self.prefix}{subdir}/",
+                    swhids=subentries,
+                )
+
+
 @dataclass
 class FuseSymlinkEntry(FuseEntry):
     """ FUSE virtual symlink entry
diff --git a/swh/fuse/fuse.py b/swh/fuse/fuse.py
--- a/swh/fuse/fuse.py
+++ b/swh/fuse/fuse.py
@@ -35,6 +35,7 @@
         self._inode2entry: Dict[int, FuseEntry] = {}
 
         self.root = Root(fuse=self)
+        self.conf = conf
 
         self.time_ns: int = time.time_ns()  # start time, used as timestamp
         self.gid = os.getgid()
diff --git a/swh/fuse/tests/test_revision.py b/swh/fuse/tests/test_revision.py
--- a/swh/fuse/tests/test_revision.py
+++ b/swh/fuse/tests/test_revision.py
@@ -8,6 +8,7 @@
     get_data_from_web_archive,
 )
 from swh.fuse.tests.data.config import REV_SMALL_HISTORY, ROOT_DIR, ROOT_REV
+from swh.model.identifiers import parse_swhid
 
 
 def test_access_meta(fuse_mntdir):
@@ -38,13 +39,18 @@
 
 
 def test_list_history(fuse_mntdir):
-    dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history"
+    dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history/by-hash"
     history_meta = get_data_from_graph_archive(
         REV_SMALL_HISTORY, GRAPH_API_REQUEST.HISTORY
     )
     history = history_meta.strip()
     # Only keep second node in the edge because first node is redundant
     # information or the root node (hence not an ancestor)
-    expected = [edge.split(" ")[1] for edge in history.split("\n")]
-    actual = os.listdir(dir_path)
-    assert set(actual) == set(expected)
+    expected = {edge.split(" ")[1] for edge in history.split("\n")}
+
+    for swhid in expected:
+        swhid = parse_swhid(swhid)
+        depth1 = swhid.object_id[:2]
+        depth2 = str(swhid)
+        assert (dir_path / depth1).exists()
+        assert depth2 in os.listdir(dir_path / depth1)