diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py --- a/swh/fuse/cache.py +++ b/swh/fuse/cache.py @@ -4,6 +4,8 @@ # See top-level LICENSE file for more information from abc import ABC +from collections import OrderedDict +from dataclasses import dataclass import json import logging from pathlib import Path @@ -11,6 +13,8 @@ import aiosqlite +from swh.fuse.fs.entry import FuseDirEntry, FuseEntry +from swh.fuse.fs.mountpoint import ArchiveDir, MetaDir from swh.model.exceptions import ValidationError from swh.model.identifiers import SWHID, parse_swhid from swh.web.client.client import typify_json @@ -41,6 +45,7 @@ self.metadata = MetadataCache(self.cache_conf["metadata"]) self.blob = BlobCache(self.cache_conf["blob"]) self.history = HistoryCache(self.cache_conf["history"]) + self.direntry = DirEntryCache(self.cache_conf["direntry"]) await self.metadata.__aenter__() await self.blob.__aenter__() await self.history.__aenter__() @@ -211,3 +216,57 @@ "insert or ignore into history_graph values (?, ?)", edges ) await self.conn.commit() + + +class DirEntryCache: + """ The direntry cache maps inodes representing directories to the entries + they contain. Each entry comes with its name as well as file attributes + (i.e., all that is needed to perform a detailed directory listing). + + Additional attributes of each directory entry should be looked up on an entry + by entry basis, possibly hitting other caches. + + The direntry cache for a given dir is populated, at the latest, when the + content of the directory is listed. More aggressive prefetching might + happen. For instance, when first opening a dir a recursive listing of it can + be retrieved from the remote backend and used to recursively populate the + direntry cache for all (transitive) sub-directories. 
""" + + @dataclass + class LRU(OrderedDict): + maxsize: int + + def __getitem__(self, key: Any) -> Any: + value = super().__getitem__(key) + self.move_to_end(key) + return value + + def __setitem__(self, key: Any, value: Any) -> None: + # TODO: set the limit based on # dir entries instead of # dir? + # TODO: add a limit on the size of a cached direntry? + if key in self: + self.move_to_end(key) + super().__setitem__(key, value) + if len(self) > self.maxsize: + oldest = next(iter(self)) + del self[oldest] + + def __init__(self, conf: Dict[str, Any]): + self.lru_cache = self.LRU(conf["maxsize"]) + + async def get(self, direntry: FuseDirEntry) -> List[FuseEntry]: + try: + return self.lru_cache[direntry.inode] + except KeyError: + entries = [x async for x in direntry] + self.set(direntry, entries) + return entries + + def set(self, direntry: FuseDirEntry, entries: List[FuseEntry]) -> None: + if isinstance(direntry, ArchiveDir) or isinstance(direntry, MetaDir): + # The `archive/` and `meta/` dirs are populated on the fly so we should + # never cache them + pass + else: + # TODO: store entries as dict referenced by name (except for history/)? 
+ self.lru_cache[direntry.inode] = entries diff --git a/swh/fuse/cli.py b/swh/fuse/cli.py --- a/swh/fuse/cli.py +++ b/swh/fuse/cli.py @@ -31,6 +31,7 @@ "metadata": {"path": CACHE_HOME_DIR / "swh/fuse/metadata.sqlite"}, "blob": {"path": CACHE_HOME_DIR / "swh/fuse/blob.sqlite"}, "history": {"path": CACHE_HOME_DIR / "swh/fuse/history.sqlite"}, + "direntry": {"maxsize": 128}, }, "web-api": { "url": "https://archive.softwareheritage.org/api/1", diff --git a/swh/fuse/fs/entry.py b/swh/fuse/fs/entry.py --- a/swh/fuse/fs/entry.py +++ b/swh/fuse/fs/entry.py @@ -83,7 +83,8 @@ async def lookup(self, name: str) -> FuseEntry: """ Look up a FUSE entry by name """ - async for entry in self: + entries = await self.fuse.cache.direntry.get(self) + for entry in entries: if entry.name == name: return entry return None diff --git a/swh/fuse/fuse.py b/swh/fuse/fuse.py --- a/swh/fuse/fuse.py +++ b/swh/fuse/fuse.py @@ -5,6 +5,7 @@ import asyncio import errno +import itertools import logging import os from pathlib import Path @@ -169,18 +170,13 @@ # opendir() uses inode as directory handle inode = fh - - # TODO: add cache on direntry list? direntry = self.inode2entry(inode) assert isinstance(direntry, FuseDirEntry) + + entries = await self.cache.direntry.get(direntry) next_id = offset + 1 - i = 0 try: - async for entry in direntry: - if i < offset: - i += 1 - continue - + for entry in itertools.islice(entries, offset, None): name = os.fsencode(entry.name) attrs = await self.get_attrs(entry) if not pyfuse3.readdir_reply(token, name, attrs, next_id):