diff --git a/docs/design.md b/docs/design.md
--- a/docs/design.md
+++ b/docs/design.md
@@ -49,16 +49,12 @@
 
 The SwhFS mount point contain:
 
-- `archive/`: initially empty, this directory is lazily populated with one entry
-  per accessed SWHID, having actual SWHIDs as names (possibly sharded into
-  `xy/../SWHID` paths to avoid overcrowding `archive/`).
-
-- `meta/`: initially empty, this directory contains one `<SWHID>.json` file for
-  each `<SWHID>` entry under `archive/`. The JSON file contain all available
-  meta information about the given SWHID, as returned by the Software Heritage
-  Web API for that object. Note that, in case of pagination (e.g., snapshot
-  objects with many branches) the JSON file will contain a complete version with
-  all pages merged together.
+- `archive/`: virtual directory that allows mounting any artifact on the fly
+  using its SWHID as name. The associated metadata of the artifact from the
+  Software Heritage Web API can also be accessed through the `SWHID.json` file
+  (in case of pagination, the JSON file will contain a complete version with all
+  pages merged together). Note: the archive directory cannot be listed with
+  `ls`, but entries in it can be accessed (e.g., using `cat` or `cd`).
 
 - `origin/`: initially empty, this directory is lazily populated with one entry
   per accessed origin URL, having encoded URL as names. The URL encoding is done
@@ -113,7 +109,7 @@
   reverse topological order. The history can be listed through `by-date/`,
   `by-hash/` or `by-page/` with each its own sharding policy.
 - `meta.json`: metadata for the current node, as a symlink pointing to the
-  relevant `meta/<SWHID>.json` file
+  relevant `archive/<SWHID>.json` file
 
 ### `rel` nodes (releases)
 
@@ -127,7 +123,7 @@
   (transitively) resolves to a directory. When present it is a symlink pointing
   into `archive/` to the SWHID of the given directory
 - `meta.json`: metadata for the current node, as a symlink pointing to the
-  relevant `meta/<SWHID>.json` file
+  relevant `archive/<SWHID>.json` file
 
 ### `snp` nodes (snapshots)
 
@@ -180,7 +176,7 @@
 Artifact id → JSON metadata
 
 The metadata cache map each artifact to the complete metadata of the referenced
-object. This is analogous to what is available in `meta/<SWHID>.json` file (and
+object. This is analogous to what is available in `archive/<SWHID>.json` file (and
 generally used as data source for returning the content of those files).
 Artifacts are identified using their SWHIDs, or in the case of origin visits,
 using their URLs.
diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py
--- a/swh/fuse/cache.py
+++ b/swh/fuse/cache.py
@@ -19,7 +19,7 @@
 
 from swh.fuse.fs.artifact import RevisionHistoryShardByDate
 from swh.fuse.fs.entry import FuseDirEntry, FuseEntry
-from swh.fuse.fs.mountpoint import ArchiveDir, MetaDir, OriginDir
+from swh.fuse.fs.mountpoint import ArchiveDir, OriginDir
 from swh.model.exceptions import ValidationError
 from swh.model.identifiers import REVISION, SWHID, parse_swhid
 from swh.web.client.client import ORIGIN_VISIT, typify_json
@@ -108,7 +108,7 @@ class MetadataCache(AbstractCache):
     """ The metadata cache map each artifact to the complete metadata of the
     referenced object. This is analogous to what is available in
-    `meta/<SWHID>.json` file (and generally used as data source for returning
+    `archive/<SWHID>.json` file (and generally used as data source for returning
     the content of those files). Artifacts are identified using their SWHIDs,
     or in the case of origin visits, using their URLs.
""" @@ -365,9 +365,9 @@ return self.lru_cache.get(direntry.inode, None) def set(self, direntry: FuseDirEntry, entries: List[FuseEntry]) -> None: - if isinstance(direntry, (ArchiveDir, MetaDir, OriginDir)): - # The `archive/`, `meta/`, and `origin/` are populated on the fly so - # we should never cache them + if isinstance(direntry, (ArchiveDir, OriginDir)): + # The `archive/`, and `origin/` are populated on the fly so we + # should never cache them pass elif ( isinstance(direntry, RevisionHistoryShardByDate) diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py --- a/swh/fuse/fs/artifact.py +++ b/swh/fuse/fs/artifact.py @@ -146,7 +146,7 @@ in reverse topological order. The history can be listed through `by-date/`, `by-hash/` or `by-page/` with each its own sharding policy. - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/.json` file """ + relevant `archive/.json` file """ swhid: SWHID @@ -165,7 +165,7 @@ yield self.create_child( FuseSymlinkEntry, name="meta.json", - target=Path(root_path, f"meta/{self.swhid}.json"), + target=Path(root_path, f"archive/{self.swhid}.json"), ) yield self.create_child( RevisionParents, @@ -398,7 +398,7 @@ (transitively) resolves to a directory. When present it is a symlink pointing into `archive/` to the SWHID of the given directory - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/.json` file """ + relevant `archive/.json` file """ swhid: SWHID @@ -421,7 +421,7 @@ yield self.create_child( FuseSymlinkEntry, name="meta.json", - target=Path(root_path, f"meta/{self.swhid}.json"), + target=Path(root_path, f"archive/{self.swhid}.json"), ) target = metadata["target"] diff --git a/swh/fuse/fs/mountpoint.py b/swh/fuse/fs/mountpoint.py --- a/swh/fuse/fs/mountpoint.py +++ b/swh/fuse/fs/mountpoint.py @@ -15,7 +15,7 @@ @dataclass class Root(FuseDirEntry): - """ The FUSE mountpoint, consisting of the archive/ and meta/ directories """ + """ The FUSE mountpoint, consisting of the archive/ and origin/ directories """ name: str = field(init=False, default=None) mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) @@ -23,80 +23,66 @@ async def compute_entries(self) -> AsyncIterator[FuseEntry]: yield self.create_child(ArchiveDir) - yield self.create_child(MetaDir) yield self.create_child(OriginDir) @dataclass class ArchiveDir(FuseDirEntry): - """ The archive/ directory is lazily populated with one entry per accessed - SWHID, having actual SWHIDs as names """ + """ The `archive/` virtual directory allows to mount any artifact on the fly + using its SWHID as name. The associated metadata of the artifact from the + Software Heritage Web API can also be accessed through the `SWHID.json` file + (in case of pagination, the JSON file will contain a complete version with + all pages merged together). Note: the archive directory cannot be listed + with ls, but entries in it can be accessed (e.g., using cat or cd). 
""" name: str = field(init=False, default="archive") mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) - def create_child(self, swhid: SWHID) -> FuseEntry: - if swhid.object_type == CONTENT: - mode = EntryMode.RDONLY_FILE - else: - mode = EntryMode.RDONLY_DIR - return super().create_child( - OBJTYPE_GETTERS[swhid.object_type], - name=str(swhid), - mode=int(mode), - swhid=swhid, - ) + JSON_SUFFIX = ".json" async def compute_entries(self) -> AsyncIterator[FuseEntry]: - async for swhid in self.fuse.cache.get_cached_swhids(): - yield self.create_child(swhid) + return + yield async def lookup(self, name: str) -> FuseEntry: - entry = await super().lookup(name) - if entry: - return entry - # On the fly mounting of a new artifact try: - swhid = parse_swhid(name) - await self.fuse.get_metadata(swhid) - return self.create_child(swhid) + if name.endswith(self.JSON_SUFFIX): + swhid = parse_swhid(name[: -len(self.JSON_SUFFIX)]) + return self.create_child( + MetaEntry, + name=f"{swhid}{self.JSON_SUFFIX}", + mode=int(EntryMode.RDONLY_FILE), + swhid=swhid, + ) + else: + swhid = parse_swhid(name) + await self.fuse.get_metadata(swhid) + return self.create_child( + OBJTYPE_GETTERS[swhid.object_type], + name=str(swhid), + mode=int( + EntryMode.RDONLY_FILE + if swhid.object_type == CONTENT + else EntryMode.RDONLY_DIR + ), + swhid=swhid, + ) except ValidationError: return None -@dataclass -class MetaDir(FuseDirEntry): - """ The meta/ directory contains one SWHID.json file for each SWHID entry - under archive/. The JSON file contain all available meta information about - the given SWHID, as returned by the Software Heritage Web API for that - object. Note that, in case of pagination (e.g., snapshot objects with many - branches) the JSON file will contain a complete version with all pages - merged together. """ - - name: str = field(init=False, default="meta") - mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) - - async def compute_entries(self) -> AsyncIterator[FuseEntry]: - async for swhid in self.fuse.cache.get_cached_swhids(): - yield self.create_child( - MetaEntry, - name=f"{swhid}.json", - mode=int(EntryMode.RDONLY_FILE), - swhid=swhid, - ) - - @dataclass class MetaEntry(FuseFileEntry): - """ An entry from the meta/ directory, containing for each accessed SWHID a - corresponding SWHID.json file with all the metadata from the Software - Heritage archive. """ + """ An entry for a `archive/.json` file, containing all the SWHID's + metadata from the Software Heritage archive. 
""" swhid: SWHID async def get_content(self) -> bytes: - # Get raw JSON metadata from API (un-typified) + # Make sure the metadata is in cache + await self.fuse.get_metadata(self.swhid) + # Retrieve raw JSON metadata from cache (un-typified) metadata = await self.fuse.cache.metadata.get(self.swhid, typify=False) json_str = json.dumps(metadata, indent=self.fuse.conf["json-indent"]) return (json_str + "\n").encode() diff --git a/swh/fuse/tests/test_meta.py b/swh/fuse/tests/test_meta.py --- a/swh/fuse/tests/test_meta.py +++ b/swh/fuse/tests/test_meta.py @@ -7,10 +7,6 @@ def test_access_meta_file(fuse_mntdir): for swhid in ALL_ENTRIES: # On the fly mounting - file_path_archive = fuse_mntdir / "archive" / swhid - file_path_archive.exists() - - file_path_meta = fuse_mntdir / f"meta/{swhid}.json" - assert file_path_meta.exists() + file_path_meta = fuse_mntdir / f"archive/{swhid}.json" expected = json.dumps(get_data_from_web_archive(swhid)) assert file_path_meta.read_text().strip() == expected.strip() diff --git a/swh/fuse/tests/test_mountpoint.py b/swh/fuse/tests/test_mountpoint.py --- a/swh/fuse/tests/test_mountpoint.py +++ b/swh/fuse/tests/test_mountpoint.py @@ -9,14 +9,13 @@ def test_mountpoint(fuse_mntdir): - assert os.listdir(fuse_mntdir) == ["archive", "meta", "origin"] + assert {"archive", "origin"} <= set(os.listdir(fuse_mntdir)) def test_on_the_fly_mounting(fuse_mntdir): assert os.listdir(fuse_mntdir / "archive") == [] - assert os.listdir(fuse_mntdir / "meta") == [] assert (fuse_mntdir / "archive" / REGULAR_FILE).is_file() - assert (fuse_mntdir / "meta" / (REGULAR_FILE + ".json")).is_file() + assert (fuse_mntdir / "archive" / (REGULAR_FILE + ".json")).is_file() assert os.listdir(fuse_mntdir / "origin") == [] assert (fuse_mntdir / "origin" / ORIGIN_URL_ENCODED).is_dir()