diff --git a/docs/design.md b/docs/design.md
--- a/docs/design.md
+++ b/docs/design.md
@@ -61,6 +61,14 @@
   using the percent-encoding mechanism described in
   [RFC 3986](https://tools.ietf.org/html/rfc3986.html).
 
+- `cache/`: on-disk representation of locally cached objects and metadata. Via
+  this directory you can browse cached data and selectively remove them from the
+  cache, freeing disk space. (See `swh fs clean` in the {ref}`CLI
+  <swh-fuse-cli>` to completely empty the cache). The directory is populated
+  with symlinks to: all artifacts, identified by their SWHIDs and sharded by the
+  first two characters of their object id, the metadata identified by a
+  `SWHID.json` entry, and the `origin/` directory.
+
 
 ## File system representation
 
diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py
--- a/swh/fuse/cache.py
+++ b/swh/fuse/cache.py
@@ -19,7 +19,7 @@
 
 from swh.fuse.fs.artifact import RevisionHistoryShardByDate
 from swh.fuse.fs.entry import FuseDirEntry, FuseEntry
-from swh.fuse.fs.mountpoint import ArchiveDir, OriginDir
+from swh.fuse.fs.mountpoint import CacheDir, OriginDir
 from swh.model.exceptions import ValidationError
 from swh.model.identifiers import REVISION, SWHID, parse_swhid
 from swh.web.client.client import ORIGIN_VISIT, typify_json
@@ -365,9 +365,9 @@
         return self.lru_cache.get(direntry.inode, None)
 
     def set(self, direntry: FuseDirEntry, entries: List[FuseEntry]) -> None:
-        if isinstance(direntry, (ArchiveDir, OriginDir)):
-            # The `archive/`, and `origin/` are populated on the fly so we
-            # should never cache them
+        if isinstance(direntry, (CacheDir, OriginDir)):
+            # The `cache/` and `origin/` directories are populated on the fly,
+            # so their entries should never be cached
             pass
         elif (
             isinstance(direntry, RevisionHistoryShardByDate)
diff --git a/swh/fuse/fs/mountpoint.py b/swh/fuse/fs/mountpoint.py
--- a/swh/fuse/fs/mountpoint.py
+++ b/swh/fuse/fs/mountpoint.py
@@ -5,14 +5,23 @@
 
 from dataclasses import dataclass, field
 import json
+from pathlib import Path
 import re
 from typing import AsyncIterator
 
 from swh.fuse.fs.artifact import OBJTYPE_GETTERS, SWHID_REGEXP, Origin
-from swh.fuse.fs.entry import EntryMode, FuseDirEntry, FuseEntry, FuseFileEntry
+from swh.fuse.fs.entry import (
+    EntryMode,
+    FuseDirEntry,
+    FuseEntry,
+    FuseFileEntry,
+    FuseSymlinkEntry,
+)
 from swh.model.exceptions import ValidationError
 from swh.model.identifiers import CONTENT, SWHID, parse_swhid
 
+JSON_SUFFIX = ".json"
+
 
 @dataclass
 class Root(FuseDirEntry):
@@ -25,6 +34,7 @@
     async def compute_entries(self) -> AsyncIterator[FuseEntry]:
         yield self.create_child(ArchiveDir)
         yield self.create_child(OriginDir)
+        yield self.create_child(CacheDir)
 
 
 @dataclass
@@ -40,7 +50,6 @@
     mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR))
 
     ENTRIES_REGEXP = re.compile(r"^(" + SWHID_REGEXP + ")(.json)?$")
-    JSON_SUFFIX = ".json"
 
     async def compute_entries(self) -> AsyncIterator[FuseEntry]:
         return
@@ -49,11 +58,11 @@
     async def lookup(self, name: str) -> FuseEntry:
         # On the fly mounting of a new artifact
         try:
-            if name.endswith(self.JSON_SUFFIX):
-                swhid = parse_swhid(name[: -len(self.JSON_SUFFIX)])
+            if name.endswith(JSON_SUFFIX):
+                swhid = parse_swhid(name[: -len(JSON_SUFFIX)])
                 return self.create_child(
                     MetaEntry,
-                    name=f"{swhid}{self.JSON_SUFFIX}",
+                    name=f"{swhid}{JSON_SUFFIX}",
                     mode=int(EntryMode.RDONLY_FILE),
                     swhid=swhid,
                 )
@@ -125,3 +134,70 @@
             return self.create_child(url_encoded)
         except ValueError:
             return None
+
+
+@dataclass
+class CacheDir(FuseDirEntry):
+    """ The cache/ directory is an on-disk representation of locally cached
+    objects and metadata. Via this directory you can browse cached data and
+    selectively remove them from the cache, freeing disk space. (See `swh fs
+    clean` in the {ref}`CLI <swh-fuse-cli>` to completely empty the cache). The
+    directory is populated with symlinks to: all artifacts, identified by their
+    SWHIDs and sharded by the first two characters of their object id, the
+    metadata identified by a `SWHID.json` entry, and the `origin/` directory.
+    """
+
+    name: str = field(init=False, default="cache")
+    mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR))
+
+    # The alternation is grouped so that both anchors apply to each branch:
+    # a two-character hexadecimal shard name, or the `origin` symlink
+    ENTRIES_REGEXP = re.compile(r"^(?:([a-f0-9]{2})|(" + OriginDir.name + "))$")
+
+    @dataclass
+    class ArtifactShardBySwhid(FuseDirEntry):
+        """ Shard directory listing the cached artifacts whose object id starts
+        with `prefix`, as symlinks into the `archive/` directory. """
+
+        # Entries are `SWHID` and `SWHID.json` symlinks (see compute_entries)
+        ENTRIES_REGEXP = re.compile(r"^(" + SWHID_REGEXP + r")(\.json)?$")
+
+        prefix: str = field(default="")
+
+        async def compute_entries(self) -> AsyncIterator[FuseEntry]:
+            root_path = self.get_relative_root_path()
+            async for swhid in self.fuse.cache.get_cached_swhids():
+                if not swhid.object_id.startswith(self.prefix):
+                    continue
+
+                yield self.create_child(
+                    FuseSymlinkEntry,
+                    name=str(swhid),
+                    target=Path(root_path, f"archive/{swhid}"),
+                )
+                yield self.create_child(
+                    FuseSymlinkEntry,
+                    name=f"{swhid}{JSON_SUFFIX}",
+                    target=Path(root_path, f"archive/{swhid}{JSON_SUFFIX}"),
+                )
+
+    async def compute_entries(self) -> AsyncIterator[FuseEntry]:
+        # One shard directory per distinct two-character object id prefix
+        prefixes = set()
+        async for swhid in self.fuse.cache.get_cached_swhids():
+            prefixes.add(swhid.object_id[:2])
+
+        # Sorted so that the directory listing order is deterministic
+        for prefix in sorted(prefixes):
+            yield self.create_child(
+                CacheDir.ArtifactShardBySwhid,
+                name=prefix,
+                mode=int(EntryMode.RDONLY_DIR),
+                prefix=prefix,
+            )
+
+        yield self.create_child(
+            FuseSymlinkEntry,
+            name=OriginDir.name,
+            target=Path(self.get_relative_root_path(), OriginDir.name),
+        )
diff --git a/swh/fuse/tests/test_mountpoint.py b/swh/fuse/tests/test_mountpoint.py
--- a/swh/fuse/tests/test_mountpoint.py
+++ b/swh/fuse/tests/test_mountpoint.py
@@ -6,10 +6,11 @@
 import os
 
 from swh.fuse.tests.data.config import ORIGIN_URL_ENCODED, REGULAR_FILE
+from swh.model.identifiers import parse_swhid
 
 
 def test_mountpoint(fuse_mntdir):
-    assert {"archive", "origin"} <= set(os.listdir(fuse_mntdir))
+    assert {"archive", "cache", "origin"} <= set(os.listdir(fuse_mntdir))
 
 
 def test_on_the_fly_mounting(fuse_mntdir):
@@ -19,3 +20,11 @@
 
     assert os.listdir(fuse_mntdir / "origin") == []
     assert (fuse_mntdir / "origin" / ORIGIN_URL_ENCODED).is_dir()
+
+    sharded_dir = parse_swhid(REGULAR_FILE).object_id[:2]
+    assert os.listdir(fuse_mntdir / "cache") == [sharded_dir, "origin"]
+    assert os.listdir(fuse_mntdir / "cache" / sharded_dir) == [
+        REGULAR_FILE,
+        REGULAR_FILE + ".json",
+    ]
+    assert os.listdir(fuse_mntdir / "cache/origin") == [ORIGIN_URL_ENCODED]