diff --git a/docs/design.md b/docs/design.md --- a/docs/design.md +++ b/docs/design.md @@ -49,16 +49,12 @@ The SwhFS mount point contain: -- `archive/`: initially empty, this directory is lazily populated with one entry - per accessed SWHID, having actual SWHIDs as names (possibly sharded into - `xy/../SWHID` paths to avoid overcrowding `archive/`). - -- `meta/`: initially empty, this directory contains one `<SWHID>.json` file for - each `<SWHID>` entry under `archive/`. The JSON file contain all available - meta information about the given SWHID, as returned by the Software Heritage - Web API for that object. Note that, in case of pagination (e.g., snapshot - objects with many branches) the JSON file will contain a complete version with - all pages merged together. +- `archive/`: virtual directory always considered empty, allowing to mount any + artifact on the fly using its SWHID as name. The associated metadata of the + artifact from the Software Heritage Web API can also be accessed through the + `SWHID.json` file (in case of pagination, e.g.: snapshot objects with many + branches, the JSON file will contain a complete version with all pages merged + together). - `origin/`: initially empty, this directory is lazily populated with one entry per accessed origin URL, having encoded URL as names. The URL encoding is done @@ -113,7 +109,7 @@ reverse topological order. The history can be listed through `by-date/`, `by-hash/` or `by-page/` with each its own sharding policy. - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/<SWHID>.json` file + relevant `archive/<SWHID>.json` file ### `rel` nodes (releases) @@ -127,7 +123,7 @@ (transitively) resolves to a directory. 
When present it is a symlink pointing into `archive/` to the SWHID of the given directory - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/<SWHID>.json` file + relevant `archive/<SWHID>.json` file ### `snp` nodes (snapshots) @@ -180,7 +176,7 @@ Artifact id → JSON metadata The metadata cache map each artifact to the complete metadata of the referenced -object. This is analogous to what is available in `meta/<SWHID>.json` file (and +object. This is analogous to what is available in `archive/<SWHID>.json` file (and generally used as data source for returning the content of those files). Artifacts are identified using their SWHIDs, or in the case of origin visits, using their URLs. diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py --- a/swh/fuse/cache.py +++ b/swh/fuse/cache.py @@ -19,7 +19,7 @@ from swh.fuse.fs.artifact import RevisionHistoryShardByDate from swh.fuse.fs.entry import FuseDirEntry, FuseEntry -from swh.fuse.fs.mountpoint import ArchiveDir, MetaDir, OriginDir +from swh.fuse.fs.mountpoint import ArchiveDir, OriginDir from swh.model.exceptions import ValidationError from swh.model.identifiers import REVISION, SWHID, parse_swhid from swh.web.client.client import ORIGIN_VISIT, typify_json @@ -62,17 +62,6 @@ await self.blob.__aexit__() await self.history.__aexit__() - async def get_cached_swhids(self) -> AsyncGenerator[SWHID, None]: - """ Return a list of all previously cached SWHID """ - - # Use the metadata db since it should always contain all accessed SWHIDs - metadata_cursor = await self.metadata.conn.execute( - "select swhid from metadata_cache" - ) - swhids = await metadata_cursor.fetchall() - for raw_swhid in swhids: - yield parse_swhid(raw_swhid[0]) - async def get_cached_visits(self) -> AsyncGenerator[str, None]: """ Return a list of all previously cached visit URL """ @@ -108,7 +97,7 @@ class MetadataCache(AbstractCache): """ The metadata cache map each artifact to the complete metadata of the referenced object. 
This is analogous to what is available in - `meta/<SWHID>.json` file (and generally used as data source for returning + `archive/<SWHID>.json` file (and generally used as data source for returning the content of those files). Artifacts are identified using their SWHIDs, or in the case of origin visits, using their URLs. """ @@ -365,9 +354,9 @@ return self.lru_cache.get(direntry.inode, None) def set(self, direntry: FuseDirEntry, entries: List[FuseEntry]) -> None: - if isinstance(direntry, (ArchiveDir, MetaDir, OriginDir)): - # The `archive/`, `meta/`, and `origin/` are populated on the fly so - # we should never cache them + if isinstance(direntry, (ArchiveDir, OriginDir)): + # The `archive/` and `origin/` are populated on the fly so we + # should never cache them pass elif ( isinstance(direntry, RevisionHistoryShardByDate) diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py --- a/swh/fuse/fs/artifact.py +++ b/swh/fuse/fs/artifact.py @@ -146,7 +146,7 @@ in reverse topological order. The history can be listed through `by-date/`, `by-hash/` or `by-page/` with each its own sharding policy. - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/<SWHID>.json` file """ + relevant `archive/<SWHID>.json` file """ swhid: SWHID @@ -165,7 +165,7 @@ yield self.create_child( FuseSymlinkEntry, name="meta.json", - target=Path(root_path, f"meta/{self.swhid}.json"), + target=Path(root_path, f"archive/{self.swhid}.json"), ) yield self.create_child( RevisionParents, @@ -398,7 +398,7 @@ (transitively) resolves to a directory. 
When present it is a symlink pointing into `archive/` to the SWHID of the given directory - `meta.json`: metadata for the current node, as a symlink pointing to the - relevant `meta/<SWHID>.json` file """ + relevant `archive/<SWHID>.json` file """ swhid: SWHID @@ -421,7 +421,7 @@ yield self.create_child( FuseSymlinkEntry, name="meta.json", - target=Path(root_path, f"meta/{self.swhid}.json"), + target=Path(root_path, f"archive/{self.swhid}.json"), ) target = metadata["target"] diff --git a/swh/fuse/fs/mountpoint.py b/swh/fuse/fs/mountpoint.py --- a/swh/fuse/fs/mountpoint.py +++ b/swh/fuse/fs/mountpoint.py @@ -15,7 +15,7 @@ @dataclass class Root(FuseDirEntry): - """ The FUSE mountpoint, consisting of the archive/ and meta/ directories """ + """ The FUSE mountpoint, consisting of the archive/ and origin/ directories """ name: str = field(init=False, default=None) mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) @@ -23,80 +23,64 @@ async def compute_entries(self) -> AsyncIterator[FuseEntry]: yield self.create_child(ArchiveDir) - yield self.create_child(MetaDir) yield self.create_child(OriginDir) @dataclass class ArchiveDir(FuseDirEntry): - """ The archive/ directory is lazily populated with one entry per accessed - SWHID, having actual SWHIDs as names """ + """ The `archive/` virtual directory is always considered empty, and allows + to mount any artifact on the fly using its SWHID as name. The associated + metadata of the artifact from the Software Heritage Web API can also be + accessed through the `SWHID.json` file (in case of pagination, e.g.: + snapshot objects with many branches, the JSON file will contain a complete + version with all pages merged together). 
""" name: str = field(init=False, default="archive") mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) - def create_child(self, swhid: SWHID) -> FuseEntry: - if swhid.object_type == CONTENT: - mode = EntryMode.RDONLY_FILE - else: - mode = EntryMode.RDONLY_DIR - return super().create_child( - OBJTYPE_GETTERS[swhid.object_type], - name=str(swhid), - mode=int(mode), - swhid=swhid, - ) - async def compute_entries(self) -> AsyncIterator[FuseEntry]: - async for swhid in self.fuse.cache.get_cached_swhids(): - yield self.create_child(swhid) + return + yield async def lookup(self, name: str) -> FuseEntry: - entry = await super().lookup(name) - if entry: - return entry - # On the fly mounting of a new artifact try: - swhid = parse_swhid(name) - await self.fuse.get_metadata(swhid) - return self.create_child(swhid) + if name.endswith(".json"): + swhid = parse_swhid(name[: -len(".json")]) + return self.create_child( + MetaEntry, + name=f"{swhid}.json", + mode=int(EntryMode.RDONLY_FILE), + swhid=swhid, + ) + else: + swhid = parse_swhid(name) + await self.fuse.get_metadata(swhid) + return self.create_child( + OBJTYPE_GETTERS[swhid.object_type], + name=str(swhid), + mode=int( + EntryMode.RDONLY_FILE + if swhid.object_type == CONTENT + else EntryMode.RDONLY_DIR + ), + swhid=swhid, + ) except ValidationError: return None -@dataclass -class MetaDir(FuseDirEntry): - """ The meta/ directory contains one SWHID.json file for each SWHID entry - under archive/. The JSON file contain all available meta information about - the given SWHID, as returned by the Software Heritage Web API for that - object. Note that, in case of pagination (e.g., snapshot objects with many - branches) the JSON file will contain a complete version with all pages - merged together. 
""" - - name: str = field(init=False, default="meta") - mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) - - async def compute_entries(self) -> AsyncIterator[FuseEntry]: - async for swhid in self.fuse.cache.get_cached_swhids(): - yield self.create_child( - MetaEntry, - name=f"{swhid}.json", - mode=int(EntryMode.RDONLY_FILE), - swhid=swhid, - ) - - @dataclass class MetaEntry(FuseFileEntry): - """ An entry from the meta/ directory, containing for each accessed SWHID a - corresponding SWHID.json file with all the metadata from the Software - Heritage archive. """ + """ An entry for a `archive/SWHID.json` file, containing all the SWHID's + metadata from the Software Heritage archive. """ swhid: SWHID async def get_content(self) -> bytes: - # Get raw JSON metadata from API (un-typified) + # Make sure the metadata is in cache + await self.fuse.get_metadata(self.swhid) + # Retrieve raw JSON metadata from cache (un-typified) metadata = await self.fuse.cache.metadata.get(self.swhid, typify=False) json_str = json.dumps(metadata, indent=self.fuse.conf["json-indent"]) return (json_str + "\n").encode() diff --git a/swh/fuse/tests/test_meta.py b/swh/fuse/tests/test_meta.py --- a/swh/fuse/tests/test_meta.py +++ b/swh/fuse/tests/test_meta.py @@ -7,10 +7,6 @@ def test_access_meta_file(fuse_mntdir): for swhid in ALL_ENTRIES: # On the fly mounting - file_path_archive = fuse_mntdir / "archive" / swhid - file_path_archive.exists() - - file_path_meta = fuse_mntdir / f"meta/{swhid}.json" - assert file_path_meta.exists() + file_path_meta = fuse_mntdir / f"archive/{swhid}.json" expected = json.dumps(get_data_from_web_archive(swhid)) assert file_path_meta.read_text().strip() == expected.strip() diff --git a/swh/fuse/tests/test_mountpoint.py b/swh/fuse/tests/test_mountpoint.py --- a/swh/fuse/tests/test_mountpoint.py +++ b/swh/fuse/tests/test_mountpoint.py @@ -9,14 +9,13 @@ def test_mountpoint(fuse_mntdir): - assert os.listdir(fuse_mntdir) == ["archive", "meta", "origin"] + 
assert os.listdir(fuse_mntdir) == ["archive", "origin"] def test_on_the_fly_mounting(fuse_mntdir): assert os.listdir(fuse_mntdir / "archive") == [] - assert os.listdir(fuse_mntdir / "meta") == [] assert (fuse_mntdir / "archive" / REGULAR_FILE).is_file() - assert (fuse_mntdir / "meta" / (REGULAR_FILE + ".json")).is_file() + assert (fuse_mntdir / "archive" / (REGULAR_FILE + ".json")).is_file() assert os.listdir(fuse_mntdir / "origin") == [] assert (fuse_mntdir / "origin" / ORIGIN_URL_ENCODED).is_dir()