diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py index db0d289..590a224 100644 --- a/swh/fuse/fs/artifact.py +++ b/swh/fuse/fs/artifact.py @@ -1,49 +1,78 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, Iterator, List +from typing import Any, Iterator -from swh.fuse.fs.entry import ArtifactEntry, VirtualEntry +from swh.fuse.fs.entry import ArtifactEntry, EntryMode +from swh.model.identifiers import CONTENT, DIRECTORY, SWHID +# Avoid cycling import +Fuse = "Fuse" -class Content: + +def typify(name: str, mode: int, fuse: Fuse, swhid: SWHID, prefetch: Any = None) -> Any: + """ Create an artifact entry corresponding to the given artifact type """ + + constructor = {CONTENT: Content, DIRECTORY: Directory} + return constructor[swhid.object_type](name, mode, fuse, swhid, prefetch) + + +class Content(ArtifactEntry): """ Software Heritage content artifact. Content leaves (AKA blobs) are represented on disks as regular files, containing the corresponding bytes, as archived. Note that permissions are associated to blobs only in the context of directories. Hence, when accessing blobs from the top-level `archive/` directory, the permissions of the `archive/SWHID` file will be arbitrary and not meaningful (e.g., `0x644`). """ - def __init__(self, json: Dict[str, Any]): - self.json = json + def __str__(self) -> str: + return self.fuse.get_blob(self.swhid) + def __len__(self) -> int: + # When listing entries from a directory, the API already gave us information + if self.prefetch: + return self.prefetch["length"] + return len(str(self)) -class Directory: + def __iter__(self): + raise ValueError("Cannot iterate over a content type artifact") + + +class Directory(ArtifactEntry): """ Software Heritage directory artifact. Directory nodes are represented as directories on the file-system, containing one entry for each entry of the archived directory. Entry names and other metadata, including permissions, will correspond to the archived entry metadata. Note that the FUSE mount is read-only, no matter what the permissions say. So it is possible that, in the context of a directory, a file is presented as writable, whereas actually writing to it will fail with `EPERM`. """ - def __init__(self, json: List[Dict[str, Any]]): - self.json = json - - def __iter__(self) -> Iterator[VirtualEntry]: + def __iter__(self) -> Iterator[ArtifactEntry]: entries = [] - for entry in self.json: - name, swhid = entry["name"], entry["target"] - # The directory API has extra info we can use to set attributes - # without additional Software Heritage API call - prefetch = entry - entries.append(ArtifactEntry(name, swhid, prefetch)) + for entry in self.fuse.get_metadata(self.swhid): + entries.append( + typify( + name=entry["name"], + # Use default read-only permissions for directories, and + # archived permissions for contents + mode=( + entry["perms"] + if entry["target"].object_type == CONTENT + else int(EntryMode.RDONLY_DIR) + ), + fuse=self.fuse, + swhid=entry["target"], + # The directory API has extra info we can use to set attributes + # without additional Software Heritage API call + prefetch=entry, + ) + ) return iter(entries) diff --git a/swh/fuse/fs/entry.py b/swh/fuse/fs/entry.py index 4994927..5f153c8 100644 --- a/swh/fuse/fs/entry.py +++ b/swh/fuse/fs/entry.py @@ -1,55 +1,69 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import dataclass from enum import IntEnum from stat import S_IFDIR, S_IFREG -from typing import Any, Dict +from typing import Any from swh.model.identifiers import SWHID +# Avoid cycling import +Fuse = "Fuse" + class EntryMode(IntEnum): """ Default entry mode and permissions for the FUSE. The FUSE mount is always read-only, even if permissions contradict this statement (in a context of a directory, entries are listed with permissions taken from the archive). """ RDONLY_FILE = S_IFREG | 0o444 RDONLY_DIR = S_IFDIR | 0o555 -class VirtualEntry: +@dataclass +class FuseEntry: """ Main wrapper class to manipulate virtual FUSE entries Attributes: name: entry filename mode: entry permission mode + fuse: internal reference to the main FUSE class """ - def __init__(self, name: str, mode: EntryMode): - self.name = name - self.mode = mode + name: str + mode: int + fuse: Fuse + + def __len__(self) -> int: + return 0 + + # TODO: type hint? + def __iter__(self): + return None + # TODO: remove + def __hash__(self): + return hash((self.name, self.mode)) -class ArtifactEntry(VirtualEntry): + +@dataclass +class ArtifactEntry(FuseEntry): """ FUSE virtual entry for a Software Heritage Artifact Attributes: - name: entry filename swhid: Software Heritage persistent identifier prefetch: optional prefetched metadata used to set entry attributes """ - def __init__(self, name: str, swhid: SWHID, prefetch: Dict[str, Any] = None): - self.name = name - self.swhid = swhid - self.prefetch = prefetch - + swhid: SWHID + prefetch: Any = None -ROOT_DIRENTRY = VirtualEntry("root", EntryMode.RDONLY_DIR) -ARCHIVE_DIRENTRY = VirtualEntry("archive", EntryMode.RDONLY_DIR) -META_DIRENTRY = VirtualEntry("meta", EntryMode.RDONLY_DIR) + # TODO: remove + def __hash__(self): + return hash((self.name, self.mode, self.swhid)) diff --git a/swh/fuse/fs/mountpoint.py b/swh/fuse/fs/mountpoint.py index 1b66c3d..a072148 100644 --- a/swh/fuse/fs/mountpoint.py +++ b/swh/fuse/fs/mountpoint.py @@ -1,56 +1,85 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterator -from swh.fuse.cache import FuseCache -from swh.fuse.fs.entry import ( - ARCHIVE_DIRENTRY, - META_DIRENTRY, - ArtifactEntry, - EntryMode, - VirtualEntry, -) +from swh.fuse.fs.artifact import typify +from swh.fuse.fs.entry import EntryMode, FuseEntry +from swh.model.identifiers import CONTENT, SWHID +# Avoid cycling import +Fuse = "Fuse" -class Root: + +class Root(FuseEntry): """ The FUSE mountpoint, consisting of the archive/ and meta/ directories """ - def __iter__(self) -> Iterator[VirtualEntry]: - entries = [ARCHIVE_DIRENTRY, META_DIRENTRY] + def __init__(self, fuse: Fuse): + self.name = "root" + self.mode = int(EntryMode.RDONLY_DIR) + self.fuse = fuse + + def __iter__(self) -> Iterator[FuseEntry]: + entries = [ArchiveDir(self.fuse), MetaDir(self.fuse)] return iter(entries) -class Archive: +class ArchiveDir(FuseEntry): """ The archive/ directory is lazily populated with one entry per accessed SWHID, having actual SWHIDs as names """ - def __init__(self, cache: FuseCache): - self.cache = cache + def __init__(self, fuse: Fuse): + self.name = "archive" + self.mode = int(EntryMode.RDONLY_DIR) + self.fuse = fuse - def __iter__(self) -> Iterator[VirtualEntry]: + def __iter__(self) -> Iterator[FuseEntry]: entries = [] - for swhid in self.cache.get_cached_swhids(): - entries.append(ArtifactEntry(str(swhid), swhid)) + for swhid in self.fuse.cache.get_cached_swhids(): + if swhid.object_type == CONTENT: + mode = EntryMode.RDONLY_FILE + else: + mode = EntryMode.RDONLY_DIR + entries.append(typify(str(swhid), int(mode), self.fuse, swhid)) return iter(entries) -class Meta: +class MetaDir(FuseEntry): """ The meta/ directory contains one SWHID.json file for each SWHID entry under archive/. The JSON file contain all available meta information about the given SWHID, as returned by the Software Heritage Web API for that object. Note that, in case of pagination (e.g., snapshot objects with many branches) the JSON file will contain a complete version with all pages merged together. """ - def __init__(self, cache: FuseCache): - self.cache = cache + def __init__(self, fuse: Fuse): + self.name = "meta" + self.mode = int(EntryMode.RDONLY_DIR) + self.fuse = fuse - def __iter__(self) -> Iterator[VirtualEntry]: + def __iter__(self) -> Iterator[FuseEntry]: entries = [] - for swhid in self.cache.get_cached_swhids(): - filename = str(swhid) + ".json" - entries.append(VirtualEntry(filename, EntryMode.RDONLY_FILE)) + for swhid in self.fuse.cache.get_cached_swhids(): + entries.append(MetaEntry(swhid, self.fuse)) return iter(entries) + + +class MetaEntry(FuseEntry): + """ An entry from the meta/ directory, containing for each accessed SWHID a + corresponding SWHID.json file with all the metadata from the Software + Heritage archive. """ + + def __init__(self, swhid: SWHID, fuse: Fuse): + self.name = str(swhid) + ".json" + self.mode = int(EntryMode.RDONLY_FILE) + self.fuse = fuse + self.swhid = swhid + + def __str__(self) -> str: + metadata = self.fuse.get_metadata(self.swhid) + return str(metadata) + + def __len__(self) -> int: + return len(str(self)) diff --git a/swh/fuse/fuse.py b/swh/fuse/fuse.py index b9ec4e6..dfc1d4f 100644 --- a/swh/fuse/fuse.py +++ b/swh/fuse/fuse.py @@ -1,333 +1,250 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import errno import itertools import logging import os from pathlib import Path import time from typing import Any, Dict, List import pyfuse3 import pyfuse3_asyncio import requests from swh.fuse.cache import FuseCache -from swh.fuse.fs.artifact import Content, Directory -from swh.fuse.fs.entry import ( - ARCHIVE_DIRENTRY, - META_DIRENTRY, - ROOT_DIRENTRY, - ArtifactEntry, - EntryMode, - VirtualEntry, -) -from swh.fuse.fs.mountpoint import Archive, Meta, Root -from swh.model.identifiers import CONTENT, DIRECTORY, SWHID, parse_swhid +from swh.fuse.fs.entry import FuseEntry +from swh.fuse.fs.mountpoint import Root +from swh.model.identifiers import CONTENT, SWHID from swh.web.client.client import WebAPIClient class Fuse(pyfuse3.Operations): """ Software Heritage Filesystem in Userspace (FUSE). Locally mount parts of the archive and navigate it as a virtual file system. """ def __init__( - self, - swhids: List[SWHID], - root_path: Path, - cache: FuseCache, - conf: Dict[str, Any], + self, root_path: Path, cache: FuseCache, conf: Dict[str, Any], ): super(Fuse, self).__init__() + root_direntry = Root(fuse=self) + self._next_inode: int = pyfuse3.ROOT_INODE self._next_fd: int = 0 root_inode = self._next_inode self._next_inode += 1 - self._inode2entry: Dict[int, VirtualEntry] = {root_inode: ROOT_DIRENTRY} - self._entry2inode: Dict[VirtualEntry, int] = {ROOT_DIRENTRY: root_inode} - self._entry2fd: Dict[VirtualEntry, int] = {} - self._fd2entry: Dict[int, VirtualEntry] = {} + self._inode2entry: Dict[int, FuseEntry] = {root_inode: root_direntry} + self._entry2inode: Dict[FuseEntry, int] = {root_direntry: root_inode} + self._entry2fd: Dict[FuseEntry, int] = {} + self._fd2entry: Dict[int, FuseEntry] = {} self._inode2path: Dict[int, Path] = {root_inode: root_path} self.time_ns: int = time.time_ns() # start time, used as timestamp self.gid = os.getgid() self.uid = os.getuid() self.web_api = WebAPIClient( conf["web-api"]["url"], conf["web-api"]["auth-token"] ) self.cache = cache - # Initially populate the cache - for swhid in swhids: - self.get_metadata(swhid) - def shutdown(self) -> None: pass - def _alloc_inode(self, entry: VirtualEntry) -> int: + def _alloc_inode(self, entry: FuseEntry) -> int: """ Return a unique inode integer for a given entry """ try: return self._entry2inode[entry] except KeyError: inode = self._next_inode self._next_inode += 1 self._entry2inode[entry] = inode self._inode2entry[inode] = entry # TODO add inode recycling with invocation to invalidate_inode when # the dicts get too big return inode - def _alloc_fd(self, entry: VirtualEntry) -> int: + def _alloc_fd(self, entry: FuseEntry) -> int: """ Return a unique file descriptor integer for a given entry """ try: return self._entry2fd[entry] except KeyError: fd = self._next_fd self._next_fd += 1 self._entry2fd[entry] = fd self._fd2entry[fd] = entry return fd - def inode2entry(self, inode: int) -> VirtualEntry: + def inode2entry(self, inode: int) -> FuseEntry: """ Return the entry matching a given inode """ try: return self._inode2entry[inode] except KeyError: raise pyfuse3.FUSEError(errno.ENOENT) - def entry2inode(self, entry: VirtualEntry) -> int: - """ Return the inode matching a given entry """ - - try: - return self._entry2inode[entry] - except KeyError: - raise pyfuse3.FUSEError(errno.ENOENT) - def inode2path(self, inode: int) -> Path: """ Return the path matching a given inode """ try: return self._inode2path[inode] except KeyError: raise pyfuse3.FUSEError(errno.ENOENT) def get_metadata(self, swhid: SWHID) -> Any: """ Retrieve metadata for a given SWHID using Software Heritage API """ # TODO: swh-graph API cache = self.cache.metadata[swhid] if cache: return cache try: metadata = self.web_api.get(swhid) self.cache.metadata[swhid] = metadata return metadata except requests.HTTPError: logging.error(f"Unknown SWHID: '{swhid}'") def get_blob(self, swhid: SWHID) -> str: """ Retrieve the blob bytes for a given content SWHID using Software Heritage API """ if swhid.object_type != CONTENT: raise pyfuse3.FUSEError(errno.EINVAL) cache = self.cache.blob[swhid] if cache: return cache resp = list(self.web_api.content_raw(swhid)) blob = "".join(map(bytes.decode, resp)) self.cache.blob[swhid] = blob return blob - def get_direntries(self, entry: VirtualEntry) -> Any: - """ Return directory entries of a given entry """ - - if isinstance(entry, ArtifactEntry): - if entry.swhid.object_type == CONTENT: - raise pyfuse3.FUSEError(errno.ENOTDIR) - - metadata = self.get_metadata(entry.swhid) - if entry.swhid.object_type == CONTENT: - return Content(metadata) - if entry.swhid.object_type == DIRECTORY: - return Directory(metadata) - # TODO: add other objects - else: - if entry == ROOT_DIRENTRY: - return Root() - elif entry == ARCHIVE_DIRENTRY: - return Archive(self.cache) - elif entry == META_DIRENTRY: - return Meta(self.cache) - # TODO: error handling - - def get_attrs(self, entry: VirtualEntry) -> pyfuse3.EntryAttributes: + def get_attrs(self, entry: FuseEntry) -> pyfuse3.EntryAttributes: """ Return entry attributes """ attrs = pyfuse3.EntryAttributes() attrs.st_size = 0 attrs.st_atime_ns = self.time_ns attrs.st_ctime_ns = self.time_ns attrs.st_mtime_ns = self.time_ns attrs.st_gid = self.gid attrs.st_uid = self.uid attrs.st_ino = self._alloc_inode(entry) - - if isinstance(entry, ArtifactEntry): - metadata = entry.prefetch or self.get_metadata(entry.swhid) - if entry.swhid.object_type == CONTENT: - # Only in the context of a directory entry do we have archived - # permissions. Otherwise, fallback to default read-only. - attrs.st_mode = metadata.get("perms", int(EntryMode.RDONLY_FILE)) - attrs.st_size = metadata["length"] - else: - attrs.st_mode = int(EntryMode.RDONLY_DIR) - else: - attrs.st_mode = int(entry.mode) - # Meta JSON entries (under the root meta/ directory) - if entry.name.endswith(".json"): - swhid = parse_swhid(entry.name.replace(".json", "")) - metadata = self.get_metadata(swhid) - attrs.st_size = len(str(metadata)) - + attrs.st_mode = entry.mode + attrs.st_size = len(entry) return attrs async def getattr( self, inode: int, _ctx: pyfuse3.RequestContext ) -> pyfuse3.EntryAttributes: """ Get attributes for a given inode """ entry = self.inode2entry(inode) return self.get_attrs(entry) async def opendir(self, inode: int, _ctx: pyfuse3.RequestContext) -> int: """ Open a directory referred by a given inode """ # Re-use inode as directory handle return inode async def readdir( self, inode: int, offset: int, token: pyfuse3.ReaddirToken ) -> None: """ Read entries in an open directory """ direntry = self.inode2entry(inode) path = self.inode2path(inode) # TODO: add cache on direntry list? - entries = self.get_direntries(direntry) next_id = offset + 1 - for entry in itertools.islice(entries, offset, None): + for entry in itertools.islice(direntry, offset, None): name = os.fsencode(entry.name) attrs = self.get_attrs(entry) if not pyfuse3.readdir_reply(token, name, attrs, next_id): break next_id += 1 self._inode2entry[attrs.st_ino] = entry self._inode2path[attrs.st_ino] = Path(path, entry.name) async def open( self, inode: int, _flags: int, _ctx: pyfuse3.RequestContext ) -> pyfuse3.FileInfo: """ Open an inode and return a unique file descriptor """ entry = self.inode2entry(inode) fd = self._alloc_fd(entry) return pyfuse3.FileInfo(fh=fd, keep_cache=True) async def read(self, fd: int, _offset: int, _length: int) -> bytes: """ Read blob content pointed by the given `fd` (file descriptor). Both parameters `_offset` and `_length` are ignored. """ # TODO: use offset/length try: entry = self._fd2entry[fd] except KeyError: raise pyfuse3.FUSEError(errno.ENOENT) - if isinstance(entry, ArtifactEntry): - blob = self.get_blob(entry.swhid) - return blob.encode() - else: - # Meta JSON entries (under the root meta/ directory) - if entry.name.endswith(".json"): - swhid = parse_swhid(entry.name.replace(".json", "")) - metadata = self.get_metadata(swhid) - return str(metadata).encode() - else: - # TODO: error handling - raise pyfuse3.FUSEError(errno.ENOENT) + data = str(entry) + return data.encode() async def lookup( self, parent_inode: int, name: str, _ctx: pyfuse3.RequestContext ) -> pyfuse3.EntryAttributes: """ Look up a directory entry by name and get its attributes """ name = os.fsdecode(name) path = Path(self.inode2path(parent_inode), name) parent_entry = self.inode2entry(parent_inode) - attr = None - if isinstance(parent_entry, ArtifactEntry): - metadata = self.get_metadata(parent_entry.swhid) - for entry in metadata: - if entry["name"] == name: - swhid = entry["target"] - attr = self.get_attrs(ArtifactEntry(name, swhid)) - # TODO: this is fragile, maybe cache attrs? - else: - if parent_entry == ROOT_DIRENTRY: - if name == ARCHIVE_DIRENTRY.name: - attr = self.get_attrs(ARCHIVE_DIRENTRY) - elif name == META_DIRENTRY.name: - attr = self.get_attrs(META_DIRENTRY) - else: - swhid = parse_swhid(name) - attr = self.get_attrs(ArtifactEntry(name, swhid)) - - if attr: - self._inode2path[attr.st_ino] = path - return attr - else: - # TODO: error handling (name not found) - return pyfuse3.EntryAttributes() + for entry in parent_entry: + if name == entry.name: + attr = self.get_attrs(entry) + self._inode2path[attr.st_ino] = path + return attr + + # TODO: error handling (name not found) + return pyfuse3.EntryAttributes() def main(swhids: List[SWHID], root_path: Path, conf: Dict[str, Any]) -> None: """ swh-fuse CLI entry-point """ # Use pyfuse3 asyncio layer to match the rest of Software Heritage codebase pyfuse3_asyncio.enable() with FuseCache(conf["cache"]) as cache: - fs = Fuse(swhids, root_path, cache, conf) + fs = Fuse(root_path, cache, conf) + + # Initially populate the cache + for swhid in swhids: + fs.get_metadata(swhid) fuse_options = set(pyfuse3.default_options) fuse_options.add("fsname=swhfs") fuse_options.add("debug") pyfuse3.init(fs, root_path, fuse_options) loop = asyncio.get_event_loop() try: loop.run_until_complete(pyfuse3.main()) fs.shutdown() finally: pyfuse3.close(unmount=True) loop.close()