diff --git a/docs/design.md b/docs/design.md --- a/docs/design.md +++ b/docs/design.md @@ -60,6 +60,10 @@ many branches) the JSON file will contain a complete version with all pages merged together. +- `origin/`: initially empty, this directory is lazily populated with one +entry per accessed origin URL, having encoded URL as names. The URL encoding is +done using the percent-encoding mechanism described in RFC 3986. + ## File system representation @@ -138,6 +142,17 @@ points into `archive/` to the SWHID corresponding to the branch target. +### `ori` nodes (origins) + +Origin nodes are represented on the file-system as directories with one entry +for each origin visit. + +The visits directories are named after the visit date (`YYYY-MM-DD`, if multiple +visits occur the same day only the first one is kept). Each visit directory +contains a `meta.json` with associated metadata for the origin node, and +potentially a `snapshot` symlink pointing to the visit's snapshot node. + + ## Caching SwhFS retrieves both metadata and file contents from the Software Heritage diff --git a/swh/fuse/cache.py b/swh/fuse/cache.py --- a/swh/fuse/cache.py +++ b/swh/fuse/cache.py @@ -21,7 +21,7 @@ from swh.fuse.fs.mountpoint import ArchiveDir, MetaDir from swh.model.exceptions import ValidationError from swh.model.identifiers import SWHID, parse_swhid -from swh.web.client.client import typify_json +from swh.web.client.client import ORIGIN_VISIT, typify_json class FuseCache: @@ -71,6 +71,14 @@ for raw_swhid in swhids: yield parse_swhid(raw_swhid[0]) + async def get_cached_visits(self) -> AsyncGenerator[str, None]: + """ Return a list of all previously cached visit URL """ + + cursor = await self.metadata.conn.execute("select url from visits_cache") + urls = await cursor.fetchall() + for raw_url in urls: + yield raw_url[0] + class AbstractCache(ABC): """ Abstract cache implementation to share common behavior between cache @@ -104,6 +112,9 @@ await self.conn.execute( "create table if not exists metadata_cache (swhid, metadata)" ) + await self.conn.execute( + "create table if not exists visits_cache (url, metadata)" + ) await self.conn.commit() return self @@ -118,6 +129,21 @@ else: return None + async def get_visits( + self, url_encoded: str, typify: bool = True + ) -> Optional[List[Dict[str, Any]]]: + cursor = await self.conn.execute( + "select metadata from visits_cache where url=?", (url_encoded,) + ) + cache = await cursor.fetchone() + if cache: + visits = json.loads(cache[0]) + if typify: + visits = [typify_json(v, ORIGIN_VISIT) for v in visits] + return visits + else: + return None + async def set(self, swhid: SWHID, metadata: Any) -> None: await self.conn.execute( "insert into metadata_cache values (?, ?)", @@ -125,6 +151,12 @@ ) await self.conn.commit() + async def set_visits(self, url_encoded: str, visits: List[Dict[str, Any]]) -> None: + await self.conn.execute( + "insert into visits_cache values (?, ?)", (url_encoded, json.dumps(visits)), + ) + await self.conn.commit() + async def get_cached_subset(self, swhids: List[SWHID]) -> List[SWHID]: swhids_str = ",".join(f'"{x}"' for x in swhids) cursor = await self.conn.execute( diff --git a/swh/fuse/fs/artifact.py b/swh/fuse/fs/artifact.py --- a/swh/fuse/fs/artifact.py +++ b/swh/fuse/fs/artifact.py @@ -5,8 +5,10 @@ import asyncio from dataclasses import dataclass, field +import json +import logging from pathlib import Path -from typing import Any, AsyncIterator, List +from typing import Any, AsyncIterator, Dict, List import urllib.parse from swh.fuse.fs.entry import ( @@ -49,7 +51,7 @@ if self.prefetch: return self.prefetch["length"] else: - return len(await self.get_content()) + return await super().size() @dataclass @@ -261,9 +263,6 @@ fmt = f"Done: {self.done}/{self.todo}\n" return fmt.encode() - async def size(self) -> int: - return len(await self.get_content()) - async def compute_entries(self) -> AsyncIterator[FuseEntry]: history = await self.fuse.get_history(self.history_swhid) # Only check for cached revisions since fetching all of them with the @@ -456,9 +455,6 @@ async def get_content(self) -> bytes: return str.encode(self.target_type + "\n") - async def size(self) -> int: - return len(await self.get_content()) - @dataclass class Snapshot(FuseDirEntry): @@ -488,6 +484,72 @@ ) +@dataclass +class Origin(FuseDirEntry): + """ Software Heritage origin artifact. + + Origin nodes are represented on the file-system as directories with one + entry for each origin visit. + + The visits directories are named after the visit date (`YYYY-MM-DD`, if + multiple visits occur the same day only the first one is kept). Each visit + directory contains a `meta.json` with associated metadata for the origin + node, and potentially a `snapshot` symlink pointing to the visit's snapshot + node. """ + + DATE_FMT = "{year:04d}-{month:02d}-{day:02d}" + + async def compute_entries(self) -> AsyncIterator[FuseEntry]: + # The origin's name is always its URL (encoded to create a valid UNIX filename) + visits = await self.fuse.get_visits(self.name) + + seen_date = set() + for visit in visits: + date = visit["date"] + name = self.DATE_FMT.format(year=date.year, month=date.month, day=date.day) + + if name in seen_date: + logging.debug( + "Conflict date on origin: %s, %s", visit["origin"], str(name) + ) + else: + seen_date.add(name) + yield self.create_child( + OriginVisit, name=name, mode=int(EntryMode.RDONLY_DIR), meta=visit, + ) + + +@dataclass +class OriginVisit(FuseDirEntry): + """ Origin visit virtual directory """ + + meta: Dict[str, Any] + + @dataclass + class MetaFile(FuseFileEntry): + content: str + + async def get_content(self) -> bytes: + return str.encode(self.content + "\n") + + async def compute_entries(self) -> AsyncIterator[FuseEntry]: + snapshot_swhid = self.meta["snapshot"] + if snapshot_swhid: + root_path = self.get_relative_root_path() + yield self.create_child( + FuseSymlinkEntry, + name="snapshot", + target=Path(root_path, f"archive/{snapshot_swhid}"), + ) + + yield self.create_child( + OriginVisit.MetaFile, + name="meta.json", + mode=int(EntryMode.RDONLY_FILE), + content=json.dumps(self.meta, default=lambda x: str(x)), + ) + + OBJTYPE_GETTERS = { CONTENT: Content, DIRECTORY: Directory, diff --git a/swh/fuse/fs/entry.py b/swh/fuse/fs/entry.py --- a/swh/fuse/fs/entry.py +++ b/swh/fuse/fs/entry.py @@ -68,6 +68,9 @@ raise NotImplementedError + async def size(self) -> int: + return len(await self.get_content()) + class FuseDirEntry(FuseEntry): """ FUSE virtual directory entry """ diff --git a/swh/fuse/fs/mountpoint.py b/swh/fuse/fs/mountpoint.py --- a/swh/fuse/fs/mountpoint.py +++ b/swh/fuse/fs/mountpoint.py @@ -7,7 +7,7 @@ import json from typing import AsyncIterator -from swh.fuse.fs.artifact import OBJTYPE_GETTERS +from swh.fuse.fs.artifact import OBJTYPE_GETTERS, Origin from swh.fuse.fs.entry import EntryMode, FuseDirEntry, FuseEntry, FuseFileEntry from swh.model.exceptions import ValidationError from swh.model.identifiers import CONTENT, SWHID, parse_swhid @@ -24,6 +24,7 @@ async def compute_entries(self) -> AsyncIterator[FuseEntry]: yield self.create_child(ArchiveDir) yield self.create_child(MetaDir) + yield self.create_child(OriginDir) @dataclass @@ -101,3 +102,35 @@ async def size(self) -> int: return len(await self.get_content()) + + +@dataclass +class OriginDir(FuseDirEntry): + """ The origin/ directory is lazily populated with one entry per accessed + origin URL (mangled to create a valid UNIX filename). The URL encoding is + done using the percent-encoding mechanism described in RFC 3986. """ + + name: str = field(init=False, default="origin") + mode: int = field(init=False, default=int(EntryMode.RDONLY_DIR)) + + def create_child(self, url_encoded: str) -> FuseEntry: + return super().create_child( + Origin, name=url_encoded, mode=int(EntryMode.RDONLY_DIR), + ) + + async def compute_entries(self) -> AsyncIterator[FuseEntry]: + async for url in self.fuse.cache.get_cached_visits(): + yield self.create_child(url) + + async def lookup(self, name: str) -> FuseEntry: + entry = await super().lookup(name) + if entry: + return entry + + # On the fly mounting of new origin url + try: + url_encoded = name + await self.fuse.get_visits(url_encoded) + return self.create_child(url_encoded) + except ValidationError: + return None diff --git a/swh/fuse/fuse.py b/swh/fuse/fuse.py --- a/swh/fuse/fuse.py +++ b/swh/fuse/fuse.py @@ -5,11 +5,13 @@ import asyncio import errno +import functools import logging import os from pathlib import Path import time from typing import Any, Dict, List +import urllib.parse import pyfuse3 import pyfuse3_asyncio @@ -136,6 +138,29 @@ # an empty list. return [] + async def get_visits(self, url_encoded: str) -> List[Dict[str, Any]]: + """ Retrieve origin visits given an encoded-URL using Software Heritage API """ + + cache = await self.cache.metadata.get_visits(url_encoded) + if cache: + return cache + + try: + typify = False # Get the raw JSON from the API + loop = asyncio.get_event_loop() + # Web API only takes non-encoded URL + url = urllib.parse.unquote_plus(url_encoded) + visits_it = await loop.run_in_executor( + None, functools.partial(self.web_api.visits, url, typify=typify) + ) + visits = list(visits_it) + await self.cache.metadata.set_visits(url_encoded, visits) + # Retrieve it from cache so it is correctly typed + return await self.cache.metadata.get_visits(url_encoded) + except requests.HTTPError as err: + logging.error("Cannot fetch visits for object %s: %s", url_encoded, err) + raise + async def get_attrs(self, entry: FuseEntry) -> pyfuse3.EntryAttributes: """ Return entry attributes """ diff --git a/swh/fuse/tests/common.py b/swh/fuse/tests/common.py --- a/swh/fuse/tests/common.py +++ b/swh/fuse/tests/common.py @@ -25,6 +25,10 @@ return MOCK_ARCHIVE[url] +def get_origin_data_from_web_archive(url: str) -> Any: + return MOCK_ARCHIVE[f"origin/{url}/visits/"] + + def get_data_from_graph_archive(swhid: str, request_type: GRAPH_API_REQUEST) -> Any: url = swhid_to_graph_url(swhid, request_type) return MOCK_ARCHIVE[url] diff --git a/swh/fuse/tests/data/api_data.py b/swh/fuse/tests/data/api_data.py --- a/swh/fuse/tests/data/api_data.py +++ b/swh/fuse/tests/data/api_data.py @@ -2817,4 +2817,116 @@ "directory_url": "https://archive.softwareheritage.org/api/1/directory/1ac29db0e7280af41064676569a96d1f88ccfa96/", }, "graph/visit/edges/swh:1:rev:430a9fd4c797c50cea26157141b2408073b2ed91": "", + "origin/https://github.com/rust-lang/rust/visits/": [ + { + "origin": "https://github.com/rust-lang/rust", + "visit": 101, + "date": "2020-04-03T23:29:46.270146+00:00", + "status": "full", + "snapshot": "85cd4712d7438226762a6abb2ec91f99fbb6197e", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/101/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/85cd4712d7438226762a6abb2ec91f99fbb6197e/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 100, + "date": "2019-01-15T19:57:22.613094+00:00", + "status": "full", + "snapshot": "9b8c3f037d81b230f9cb2264deb73de8b04fe87d", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/100/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/9b8c3f037d81b230f9cb2264deb73de8b04fe87d/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 99, + "date": "2018-12-26T08:27:01.995475+00:00", + "status": "partial", + "snapshot": None, + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/99/", + "snapshot_url": None, + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 98, + "date": "2018-10-22T12:58:15.736562+00:00", + "status": "partial", + "snapshot": None, + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/98/", + "snapshot_url": None, + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 97, + "date": "2018-10-22T10:41:47.681929+00:00", + "status": "full", + "snapshot": "0d8fda805e5b2b4d193f5c9a82161027ca78aef4", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/97/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/0d8fda805e5b2b4d193f5c9a82161027ca78aef4/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 96, + "date": "2018-10-21T16:50:14.479079+00:00", + "status": "full", + "snapshot": "fa3d76d8a5e90f6cc5570f9dc473e8a784ef1979", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/96/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/fa3d76d8a5e90f6cc5570f9dc473e8a784ef1979/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 95, + "date": "2018-10-21T10:12:18.364044+00:00", + "status": "full", + "snapshot": "18e29a509811e19cdf35ad832db55a8a6063a45d", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/95/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/18e29a509811e19cdf35ad832db55a8a6063a45d/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 94, + "date": "2018-10-21T02:00:53.875615+00:00", + "status": "partial", + "snapshot": None, + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/94/", + "snapshot_url": None, + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 93, + "date": "2018-10-19T17:05:02.828519+00:00", + "status": "full", + "snapshot": "0be400b6358c26c4f42670e78848f5ab26434b4a", + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/93/", + "snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/0be400b6358c26c4f42670e78848f5ab26434b4a/", + }, + { + "origin": "https://github.com/rust-lang/rust", + "visit": 92, + "date": "2018-10-17T04:16:53.330449+00:00", + "status": "partial", + "snapshot": None, + "metadata": {}, + "type": "git", + "origin_visit_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/rust-lang/rust/visit/92/", + "snapshot_url": None, + }, + ], } diff --git a/swh/fuse/tests/data/config.py b/swh/fuse/tests/data/config.py --- a/swh/fuse/tests/data/config.py +++ b/swh/fuse/tests/data/config.py @@ -32,6 +32,9 @@ # problem, only the mock offline one. ROOT_SNP = "swh:1:snp:02db117fef22434f1658b833a756775ca6effed0" ROOT_SNP_MASTER_BRANCH = "swh:1:rev:430a9fd4c797c50cea26157141b2408073b2ed91" +# Origin +ORIGIN_URL = "https://github.com/rust-lang/rust" +ORIGIN_URL_ENCODED = "https%3A%2F%2Fgithub.com%2Frust-lang%2Frust" # Special corner cases (not from Rust compiler) REL_TARGET_CNT = "swh:1:rel:da5f9898d6248ab26277116f54aca855338401d2" diff --git a/swh/fuse/tests/data/gen-api-data.py b/swh/fuse/tests/data/gen-api-data.py --- a/swh/fuse/tests/data/gen-api-data.py +++ b/swh/fuse/tests/data/gen-api-data.py @@ -16,7 +16,7 @@ swhid_to_graph_url, swhid_to_web_url, ) -from swh.fuse.tests.data.config import ALL_ENTRIES, REV_SMALL_HISTORY +from swh.fuse.tests.data.config import ALL_ENTRIES, ORIGIN_URL, REV_SMALL_HISTORY from swh.model.identifiers import ( CONTENT, DIRECTORY, @@ -138,11 +138,21 @@ generate_archive_web_api(swhid, recursive=False) +def generate_origin_archive_web_api(url: str): + url = f"origin/{url}/visits/" + data = requests.get(f"{API_URL_real}/{url}").text + data = json.loads(data) + MOCK_ARCHIVE[url] = data + + for entry in ALL_ENTRIES: swhid = parse_swhid(entry) generate_archive_web_api(swhid, recursive=True) generate_archive_graph_api(swhid) +# Origin artifacts are not identified by SWHID but using an URL +generate_origin_archive_web_api(ORIGIN_URL) + print("# GENERATED FILE, DO NOT EDIT.") print("# Run './gen-api-data.py > api_data.py' instead.") print("# flake8: noqa") diff --git a/swh/fuse/tests/test_origin.py b/swh/fuse/tests/test_origin.py new file mode 100644 --- /dev/null +++ b/swh/fuse/tests/test_origin.py @@ -0,0 +1,22 @@ +import os +from pathlib import Path + +import dateutil.parser + +from swh.fuse.fs.artifact import Origin +from swh.fuse.tests.common import get_origin_data_from_web_archive +from swh.fuse.tests.data.config import ORIGIN_URL, ORIGIN_URL_ENCODED + + +def test_list_visits(fuse_mntdir): + visits_dir = fuse_mntdir / "origin" / ORIGIN_URL_ENCODED + visits_meta = get_origin_data_from_web_archive(ORIGIN_URL) + for visit in visits_meta: + date = dateutil.parser.parse(visit["date"]) + name = Origin.DATE_FMT.format(year=date.year, month=date.month, day=date.day) + assert name in os.listdir(visits_dir) + + dirname = visits_dir / name + assert Path(dirname / "meta.json").is_file() + if "snapshot" in os.listdir(dirname): + assert Path(dirname / "snapshot").is_symlink()