def fill_direntry_cache(
    curdir: FuseDirEntry, swhids: List[SWHID], prefix: str = ""
):
    """Pre-populate the direntry cache with the sharded sub-directories.

    Groups ``swhids`` by the next ``sharding_length`` characters of their
    sharded name (as returned by ``curdir.get_full_sharded_name``), creates
    one ``RevisionHistoryShardByHash`` child of ``curdir`` per group,
    recurses into every child, and finally stores the children of
    ``curdir`` in ``self.fuse.cache.direntry``.

    This is a closure: ``self``, ``sharding_depth`` and ``sharding_length``
    are taken from the enclosing scope.

    Args:
        curdir: directory whose children are being computed; it must
            provide ``get_full_sharded_name`` and ``create_child``
        swhids: SWHIDs located (directly or not) below ``curdir``
        prefix: sharded path of ``curdir``, with one trailing ``/`` per
            sharding level already traversed (``""`` for the top level)
    """
    # Every traversed sharding level contributes exactly one "/".
    depth = prefix.count("/")
    # get_full_sharded_name() treats any configured depth <= 0 as "no
    # sharding". Comparing with `==` would never match a negative depth and
    # the recursion below would not terminate for a non-empty history, hence
    # `>=` against a depth clamped to 0.
    if depth >= max(sharding_depth, 0):
        # Nothing is cached for directories at the maximum depth.
        return

    # Slice of the sharded name that identifies the next-level directory.
    start = len(prefix)
    end = start + sharding_length

    # Plain dict: keeps the sub-directories in first-seen order.
    subdirs = {}
    for swhid in swhids:
        name = curdir.get_full_sharded_name(swhid)
        subdirs.setdefault(name[start:end], []).append(swhid)

    children = []
    for subdir, subentries in subdirs.items():
        child_prefix = f"{prefix}{subdir}/"
        child = curdir.create_child(
            RevisionHistoryShardByHash,
            name=subdir,
            mode=int(EntryMode.RDONLY_DIR),
            prefix=child_prefix,
            history_swhid=self.swhid,
        )
        children.append(child)
        fill_direntry_cache(child, subentries, child_prefix)

    self.fuse.cache.direntry.set(curdir, children)
@dataclass
class RevisionHistoryShardByHash(FuseDirEntry):
    """Revision virtual `history/by-hash` sharded directory.

    One instance represents either the `by-hash` directory itself
    (``prefix == ""``) or one of its intermediate shard directories. The
    number of shard levels and the number of hash characters per level are
    read from ``self.fuse.conf["sharding"]`` (keys ``"depth"`` and
    ``"length"``).
    """

    # SWHID whose history is listed (passed to ``self.fuse.get_history``).
    history_swhid: SWHID
    # Sharded path of this directory: the shard names traversed so far, each
    # followed by "/" (empty for the top-level directory).
    prefix: str = field(default="")

    def get_full_sharded_name(self, swhid: SWHID) -> str:
        """Return the sharded relative path of ``swhid``.

        The result is ``depth`` slices of ``swhid.object_id`` (``length``
        characters each) followed by ``str(swhid)``, all joined with ``/``.
        With a configured depth <= 0 no sharding is applied and the result
        is just ``str(swhid)``.
        """
        sharding = self.fuse.conf["sharding"]
        depth = sharding["depth"]
        length = sharding["length"]
        if depth <= 0:
            return str(swhid)

        object_id = swhid.object_id
        shards = [object_id[i * length : (i + 1) * length] for i in range(depth)]
        # Always keep the full SWHID as the path basename (otherwise we
        # lose the SWHID object type information)
        return "/".join([*shards, str(swhid)])

    async def compute_entries(self) -> AsyncIterator[FuseEntry]:
        """Yield the children of this directory.

        At the maximum sharding depth the children are symlinks, named after
        each matching SWHID and pointing to ``archive/<swhid>`` below the
        relative root path. Above that depth the children are the next-level
        ``RevisionHistoryShardByHash`` directories.
        """
        history = await self.fuse.get_history(self.history_swhid)
        # The prefix without separators is the hash prefix shared by every
        # SWHID stored below this directory.
        hash_prefix = self.prefix.replace("/", "")
        swhids = [s for s in history if s.object_id.startswith(hash_prefix)]

        sharding = self.fuse.conf["sharding"]
        current_sharding_depth = self.prefix.count("/")
        # `>=` against a depth clamped to 0 keeps this consistent with
        # get_full_sharded_name(), which treats any depth <= 0 as unsharded.
        if current_sharding_depth >= max(sharding["depth"], 0):
            root_path = self.get_relative_root_path()
            for swhid in swhids:
                yield self.create_child(
                    FuseSymlinkEntry,
                    name=str(swhid),
                    target=Path(root_path, f"archive/{swhid}"),
                )
            return

        # Slice of the sharded name that identifies the next-level directory.
        start = len(self.prefix)
        end = start + sharding["length"]
        # dict.fromkeys() de-duplicates while keeping first-seen order, so
        # the listing order is deterministic (a set gives arbitrary order).
        subdirs = dict.fromkeys(
            self.get_full_sharded_name(swhid)[start:end] for swhid in swhids
        )

        # Recursive intermediate sharded directories
        for subdir in subdirs:
            yield self.create_child(
                RevisionHistoryShardByHash,
                name=subdir,
                mode=int(EntryMode.RDONLY_DIR),
                prefix=f"{self.prefix}{subdir}/",
                history_swhid=self.history_swhid,
            )
def test_list_history(fuse_mntdir):
    """Check that `history/by-hash` lists exactly the expected ancestors.

    The expected SWHIDs are taken from the graph archive data for
    ``REV_SMALL_HISTORY``. The layout checked here is one shard level named
    after the first two characters of the object id.
    """
    dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history/by-hash"
    history_meta = get_data_from_graph_archive(
        REV_SMALL_HISTORY, GRAPH_API_REQUEST.HISTORY
    )
    history = history_meta.strip()
    # Only keep second node in the edge because first node is redundant
    # information or the root node (hence not an ancestor)
    expected = {edge.split(" ")[1] for edge in history.split("\n")}

    # Every expected SWHID must be reachable through its shard directory.
    for swhid_str in expected:
        swhid = parse_swhid(swhid_str)
        shard = swhid.object_id[:2]
        assert (dir_path / shard).exists()
        assert str(swhid) in os.listdir(dir_path / shard)

    # Reverse direction: the shard directories must not expose anything
    # besides the expected SWHIDs (inclusion alone would miss extra entries).
    actual = {
        entry
        for shard in os.listdir(dir_path)
        for entry in os.listdir(dir_path / shard)
    }
    assert actual == expected