@dataclass
class RevisionHistoryShardByPage(FuseDirEntry):
    """ Revision virtual `history/by-page` sharded directory

    The same class is used for two levels of the hierarchy:

    - the `by-page` root (`page_index < 0`): its children are page
      directories named with `PAGE_FMT` ("000", "001", ...);
    - one page (`page_index >= 0`): its children are symlinks, one per item
      of that page, each targeting `archive/<swhid>` below the relative root
      path of the page.

    A page holds at most `PAGE_SIZE` consecutive items of the history list.
    """

    # SWHID of the revision whose history is listed; forwarded to every page.
    history_swhid: SWHID
    # Index of the page this entry stands for. Negative (the default) means
    # the entry is the `by-page` root, so existing callers that do not pass
    # the field keep their behavior.
    page_index: int = -1

    PAGE_SIZE = 10_000
    PAGE_FMT = staticmethod(lambda page_number: f"{page_number:03d}")

    def _page_links(self, swhids: List[SWHID]) -> List[FuseSymlinkEntry]:
        """ Build the symlink entries of the page represented by `self` """

        root_path = self.get_relative_root_path()
        first = self.page_index * self.PAGE_SIZE
        return [
            self.create_child(
                FuseSymlinkEntry,
                name=str(swhid),
                target=Path(root_path, f"archive/{swhid}"),
            )
            for swhid in swhids[first : first + self.PAGE_SIZE]
        ]

    def fill_direntry_cache(self, swhids: List[SWHID]) -> List[FuseEntry]:
        """ Store the listing of this entry in the direntry cache

        Args:
            swhids: the complete history (not only the items of one page).

        Returns:
            The direct children of this entry: page directories when called
            on the `by-page` root (their own listings are cached as well),
            symlinks when called on a page. An empty history yields an empty
            list.
        """

        if self.page_index >= 0:
            # Called on a page: rebuild only that page. Without this branch a
            # page would be populated with nested page directories.
            entries: List[FuseEntry] = list(self._page_links(swhids))
        else:
            entries = []
            page_starts = range(0, len(swhids), self.PAGE_SIZE)
            for page_number, _ in enumerate(page_starts):
                # Extra keyword arguments are passed the same way
                # `history_swhid` already is.
                page = self.create_child(
                    RevisionHistoryShardByPage,
                    name=self.PAGE_FMT(page_number),
                    mode=int(EntryMode.RDONLY_DIR),
                    history_swhid=self.history_swhid,
                    page_index=page_number,
                )
                self.fuse.cache.direntry.set(page, page._page_links(swhids))
                entries.append(page)

        self.fuse.cache.direntry.set(self, entries)
        return entries

    async def compute_entries(self) -> AsyncIterator[FuseEntry]:
        """ Fetch the history and yield the direct children of this entry

        NOTE(review): presumably invoked by the base class when the listing
        of this entry is not available from the direntry cache -- confirm
        against FuseDirEntry.
        """

        history = await self.fuse.get_history(self.history_swhid)
        for entry in self.fill_direntry_cache(history):
            yield entry
def test_list_history(fuse_mntdir):
    """ Check both `history/by-hash` and `history/by-page` virtual directories

    Every ancestor found in the archived graph response for
    REV_SMALL_HISTORY must be reachable through both sharded views. The
    checks do not depend on directory listing order nor on the order in
    which the history is paginated.
    """

    dir_path = fuse_mntdir / "archive" / REV_SMALL_HISTORY / "history"
    # os.listdir() returns entries in arbitrary order: sort before comparing
    assert sorted(os.listdir(dir_path)) == ["by-hash", "by-page"]

    history_meta = get_data_from_graph_archive(
        REV_SMALL_HISTORY, GRAPH_API_REQUEST.HISTORY
    )
    history = history_meta.strip()
    # Only keep second node in the edge because first node is redundant
    # information or the root node (hence not an ancestor)
    expected = {parse_swhid(edge.split(" ")[1]) for edge in history.split("\n")}

    dir_by_hash = dir_path / "by-hash"
    for swhid in expected:
        depth1 = swhid.object_id[:2]
        depth2 = str(swhid)
        assert (dir_by_hash / depth1).exists()
        assert depth2 in (os.listdir(dir_by_hash / depth1))

    dir_by_page = dir_path / "by-page"
    # Page names are zero-padded numbers; sort numerically so that names
    # longer than the padding width still come last.
    pages = sorted(os.listdir(dir_by_page), key=int)
    assert pages == [
        RevisionHistoryShardByPage.PAGE_FMT(page_number)
        for page_number in range(len(pages))
    ]

    # A set has no meaningful order, so the page holding a given SWHID cannot
    # be derived from its iteration index: gather every page instead and
    # check that no expected SWHID is missing.
    listed = set()
    for page in pages:
        page_entries = os.listdir(dir_by_page / page)
        assert len(page_entries) <= RevisionHistoryShardByPage.PAGE_SIZE
        listed.update(page_entries)
    missing = {str(swhid) for swhid in expected} - listed
    assert not missing