diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -76,13 +76,22 @@ from .postgresql.provenancedb_base import ProvenanceDBBase conn = BaseDb.connect(**kwargs["db"]).conn - if ProvenanceDBBase(conn).flavor == "with-path": + raise_on_commit = kwargs.get("raise_on_commit", False) + if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path": from .postgresql.provenancedb_with_path import ProvenanceWithPathDB - return ProvenanceWithPathDB(conn) + return ProvenanceWithPathDB(conn, raise_on_commit) else: from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB - return ProvenanceWithoutPathDB(conn) + return ProvenanceWithoutPathDB(conn, raise_on_commit) + + elif cls == "remote": + from .api.client import RemoteProvenanceStorage + + storage = RemoteProvenanceStorage(**kwargs) + assert isinstance(storage, ProvenanceStorageInterface) + return storage + else: raise ValueError diff --git a/swh/provenance/api/__init__.py b/swh/provenance/api/__init__.py new file mode 100644 diff --git a/swh/provenance/api/client.py b/swh/provenance/api/client.py new file mode 100644 --- /dev/null +++ b/swh/provenance/api/client.py @@ -0,0 +1,17 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.core.api import RPCClient + +from ..interface import ProvenanceStorageInterface +from .serializers import DECODERS, ENCODERS + + +class RemoteProvenanceStorage(RPCClient): + """Proxy to a remote provenance storage API""" + + backend_class = ProvenanceStorageInterface + extra_type_decoders = DECODERS + extra_type_encoders = ENCODERS diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py new file mode 100644 --- /dev/null +++ b/swh/provenance/api/serializers.py 
@@ -0,0 +1,48 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict +from typing import Callable, Dict, List, Tuple + +from .. import interface + + +def _encode_dataclass(obj): + return { + **asdict(obj), + "__type__": type(obj).__name__, + } + + +def _decode_dataclass(d): + return getattr(interface, d.pop("__type__"))(**d) + + +def _encode_enum(obj): + return { + "value": obj.value, + "__type__": type(obj).__name__, + } + + +def _decode_enum(d): + return getattr(interface, d.pop("__type__"))(d["value"]) + + +ENCODERS: List[Tuple[type, str, Callable]] = [ + (interface.ProvenanceResult, "dataclass", _encode_dataclass), + (interface.RelationData, "dataclass", _encode_dataclass), + (interface.RevisionData, "dataclass", _encode_dataclass), + (interface.EntityType, "enum", _encode_enum), + (interface.RelationType, "enum", _encode_enum), + (set, "set", list), +] + + +DECODERS: Dict[str, Callable] = { + "dataclass": _decode_dataclass, + "enum": _decode_enum, + "set": set, +} diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py new file mode 100644 --- /dev/null +++ b/swh/provenance/api/server.py @@ -0,0 +1,148 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +import os + +from swh.core import config +from swh.core.api import JSONFormatter, MsgpackFormatter, RPCServerApp +from swh.core.api import negotiate +from swh.provenance import get_provenance_storage +from swh.provenance.interface import ProvenanceStorageInterface + +from .serializers import DECODERS, ENCODERS + + +storage = None + + +def 
get_global_provenance_storage(): + global storage + if not storage: + storage = get_provenance_storage(**app.config["provenance"]["storage"]) + return storage + + +class ProvenanceStorageServerApp(RPCServerApp): + extra_type_decoders = DECODERS + extra_type_encoders = ENCODERS + + +app = ProvenanceStorageServerApp( + __name__, + backend_class=ProvenanceStorageInterface, + backend_factory=get_global_provenance_storage, +) + + +def has_no_empty_params(rule): + return len(rule.defaults or ()) >= len(rule.arguments or ()) + + +@app.route("/") +def index(): + return """ +Software Heritage provenance storage RPC server + +

You have reached the +Software Heritage +provenance storage RPC server.
+See its +documentation +and API for more information

+ +""" + + +@app.route("/site-map") +@negotiate(MsgpackFormatter) +@negotiate(JSONFormatter) +def site_map(): + links = [] + for rule in app.url_map.iter_rules(): + if has_no_empty_params(rule) and hasattr( + ProvenanceStorageInterface, rule.endpoint + ): + links.append( + dict( + rule=rule.rule, + description=getattr( + ProvenanceStorageInterface, rule.endpoint + ).__doc__, + ) + ) + # links is now a list of dicts with "rule" and "description" keys + return links + + +def load_and_check_config(config_path, type="local"): + """Check that the minimal configuration needed to run the API is set, or + raise an explanatory error. + + Args: + config_path (str): Path to the configuration file to load + type (str): configuration type. For 'local' type, more + checks are done. + + Raises: + Error if the setup is not as expected + + Returns: + configuration as a dict + + """ + if not config_path: + raise EnvironmentError("Configuration file must be defined") + + if not os.path.exists(config_path): + raise FileNotFoundError(f"Configuration file {config_path} does not exist") + + cfg = config.read(config_path) + + pcfg = cfg.get("provenance") + if not pcfg: + raise KeyError("Missing 'provenance' configuration") + + scfg = pcfg.get("storage") + if not scfg: + raise KeyError("Missing 'provenance.storage' configuration") + + if type == "local": + cls = scfg.get("cls") + if cls != "local": + raise ValueError( + "The provenance backend can only be started with a 'local' " + "configuration" + ) + + db = scfg.get("db") + if not db: + raise KeyError("Invalid configuration; missing 'db' config entry") + + return cfg + + +api_cfg = None + + +def make_app_from_configfile(): + """Run the WSGI app from the webserver, loading the configuration from + a configuration file. + + SWH_CONFIG_FILENAME environment variable defines the + configuration path to load. 
+ + """ + global api_cfg + if not api_cfg: + config_path = os.environ.get("SWH_CONFIG_FILENAME") + api_cfg = load_and_check_config(config_path) + app.config.update(api_cfg) + import pprint + + pprint.pprint(app.config) + handler = logging.StreamHandler() + app.logger.addHandler(handler) + return app diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py --- a/swh/provenance/interface.py +++ b/swh/provenance/interface.py @@ -10,6 +10,7 @@ from typing_extensions import Protocol, runtime_checkable +from swh.core.api import remote_api_endpoint from swh.model.model import Sha1Git from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry @@ -65,30 +66,33 @@ @runtime_checkable class ProvenanceStorageInterface(Protocol): - raise_on_commit: bool = False - + @remote_api_endpoint("content_find_first") def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: """Retrieve the first occurrence of the blob identified by `id`.""" ... + @remote_api_endpoint("content_find_all") def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: """Retrieve all the occurrences of the blob identified by `id`.""" ... + @remote_api_endpoint("content_set_date") def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: """Associate dates to blobs identified by sha1 ids, as paired in `dates`. Return a boolean stating whether the information was successfully stored. """ ... + @remote_api_endpoint("content_get") def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: """Retrieve the associated date for each blob sha1 in `ids`. If some blob has no associated date, it is not present in the resulting dictionary. """ ... + @remote_api_endpoint("directory_set_date") def directory_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: """Associate dates to directories identified by sha1 ids, as paired in `dates`. 
Return a boolean stating whether the information was successfully @@ -96,40 +100,47 @@ """ ... + @remote_api_endpoint("directory_get") def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: """Retrieve the associated date for each directory sha1 in `ids`. If some directory has no associated date, it is not present in the resulting dictionary. """ ... + @remote_api_endpoint("entity_get_all") def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: """Retrieve all sha1 ids for entities of type `entity` present in the provenance model. """ ... + @remote_api_endpoint("location_get") def location_get(self) -> Set[bytes]: """Retrieve all paths present in the provenance model.""" ... + @remote_api_endpoint("origin_set_url") def origin_set_url(self, urls: Dict[Sha1Git, str]) -> bool: """Associate urls to origins identified by sha1 ids, as paired in `urls`. Return a boolean stating whether the information was successfully stored. """ ... + @remote_api_endpoint("origin_get") def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: """Retrieve the associated url for each origin sha1 in `ids`. If some origin has no associated date, it is not present in the resulting dictionary. """ ... + @remote_api_endpoint("revision_set_date") def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: """Associate dates to revisions identified by sha1 ids, as paired in `dates`. Return a boolean stating whether the information was successfully stored. """ ... + @remote_api_endpoint("revision_set_origin") def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool: """Associate origins to revisions identified by sha1 ids, as paired in `origins` (revision ids are keys and origin ids, values). Return a boolean @@ -137,6 +148,7 @@ """ ... + @remote_api_endpoint("revision_get") def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: """Retrieve the associated date and origin for each revision sha1 in `ids`. 
If some revision has no associated date nor origin, it is not present in the @@ -144,12 +156,14 @@ """ ... + @remote_api_endpoint("relation_add") def relation_add( self, relation: RelationType, data: Iterable[RelationData] ) -> bool: """Add entries in the selected `relation`.""" ... + @remote_api_endpoint("relation_get") def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Set[RelationData]: @@ -159,12 +173,17 @@ """ ... + @remote_api_endpoint("relation_get_all") def relation_get_all(self, relation: RelationType) -> Set[RelationData]: """Retrieve all entries in the selected `relation` that are present in the provenance model. """ ... + @remote_api_endpoint("with_path") + def with_path(self) -> bool: + ... + @runtime_checkable class ProvenanceInterface(Protocol): diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -25,9 +25,9 @@ class ProvenanceDBBase: - raise_on_commit: bool = False - - def __init__(self, conn: psycopg2.extensions.connection): + def __init__( + self, conn: psycopg2.extensions.connection, raise_on_commit: bool = False + ): BaseDb.adapt_conn(conn) conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) conn.set_session(autocommit=True) @@ -37,6 +37,7 @@ sql = "SET timezone TO 'UTC'" self.cursor.execute(sql) self._flavor: Optional[str] = None + self.raise_on_commit = raise_on_commit @property def flavor(self) -> str: @@ -47,7 +48,6 @@ assert self._flavor is not None return self._flavor - @property def with_path(self) -> bool: return self.flavor == "with-path" diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -12,45 +12,69 @@ import pytest from typing_extensions import TypedDict -from swh.core.db import BaseDb from 
swh.journal.serializers import msgpack_ext_hook from swh.model.hashutil import hash_to_bytes from swh.model.model import Sha1Git from swh.model.tests.swh_model_data import TEST_OBJECTS -from swh.provenance import get_provenance +from swh.provenance import get_provenance, get_provenance_storage +from swh.provenance.api.client import RemoteProvenanceStorage +import swh.provenance.api.server as server from swh.provenance.archive import ArchiveInterface -from swh.provenance.interface import ProvenanceInterface +from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.postgresql.archive import ArchivePostgreSQL -from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase from swh.provenance.storage.archive import ArchiveStorage from swh.storage.postgresql.storage import Storage from swh.storage.replay import process_replay_objects @pytest.fixture(params=["with-path", "without-path"]) -def provenance( +def populated_db( request, # TODO: add proper type annotation postgresql: psycopg2.extensions.connection, -) -> ProvenanceInterface: - """return a working and initialized provenance db""" +) -> Dict[str, str]: from swh.core.cli.db import populate_database_for_package - flavor = request.param + flavor = "with-path" if request.param == "client-server" else request.param populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor) - - BaseDb.adapt_conn(postgresql) - - args: Dict[str, str] = { + return { item.split("=")[0]: item.split("=")[1] for item in postgresql.dsn.split() if item.split("=")[0] != "options" } - prov = get_provenance(cls="local", db=args) - assert isinstance(prov.storage, ProvenanceDBBase) - assert prov.storage.flavor == flavor - # in test sessions, we DO want to raise any exception occurring at commit time - prov.storage.raise_on_commit = True - return prov + + +# the Flask app used as server in these tests +@pytest.fixture +def app(populated_db: Dict[str, str]): + assert 
hasattr(server, "storage") + server.storage = get_provenance_storage(cls="local", db=populated_db) + yield server.app + + +# the RPCClient class used as client in these tests +@pytest.fixture +def swh_rpc_client_class(): + return RemoteProvenanceStorage + + +@pytest.fixture(params=["local", "remote"]) +def provenance( + request, # TODO: add proper type annotation + populated_db: Dict[str, str], + swh_rpc_client: RemoteProvenanceStorage, +) -> ProvenanceInterface: + """return a working and initialized provenance db""" + + if request.param == "remote": + from swh.provenance.provenance import Provenance + + assert isinstance(swh_rpc_client, ProvenanceStorageInterface) + return Provenance(swh_rpc_client) + + else: + # in test sessions, we DO want to raise any exception occurring at commit time + prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True) + return prov @pytest.fixture diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -41,11 +41,11 @@ def test_provenance_flavor(provenance: ProvenanceInterface) -> None: - assert isinstance(provenance.storage, ProvenanceDBBase) - assert provenance.storage.flavor in ("with-path", "without-path") - backend_class: Type[ProvenanceStorageInterface] - if provenance.storage.flavor == "with-path": - backend_class = ProvenanceWithPathDB - else: - backend_class = ProvenanceWithoutPathDB - assert isinstance(provenance.storage, backend_class) + if isinstance(provenance.storage, ProvenanceDBBase): + assert provenance.storage.flavor in ("with-path", "without-path") + backend_class: Type[ProvenanceStorageInterface] + if provenance.storage.flavor == "with-path": + backend_class = ProvenanceWithPathDB + else: + backend_class = ProvenanceWithoutPathDB + assert isinstance(provenance.storage, backend_class) diff --git 
a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -11,7 +11,6 @@ from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType from swh.provenance.model import RevisionEntry -from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import ( fill_storage, @@ -61,8 +60,7 @@ } def maybe_path(path: str) -> Optional[bytes]: - assert isinstance(provenance.storage, ProvenanceDBBase) - if provenance.storage.with_path: + if provenance.storage.with_path(): return path.encode("utf-8") return None @@ -155,8 +153,7 @@ == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp() ), synth_rev["msg"] - assert isinstance(provenance.storage, ProvenanceDBBase) - if provenance.storage.with_path: + if provenance.storage.with_path(): # check for location entries rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) @@ -199,8 +196,7 @@ ] def maybe_path(path: str) -> str: - assert isinstance(provenance.storage, ProvenanceDBBase) - if provenance.storage.with_path: + if provenance.storage.with_path(): return path return "" @@ -230,7 +226,6 @@ (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) - assert isinstance(provenance.storage, ProvenanceDBBase) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ @@ -243,7 +238,7 @@ ) for occur in provenance.content_find_all(hash_to_bytes(content_id)) ] - if provenance.storage.with_path: + if provenance.storage.with_path(): # this is not true if the db stores no path, because a same content # that appears several times in a given revision 
may be reported # only once by content_find_all() @@ -319,7 +314,6 @@ assert sha1 in expected_first # nothing to do there, this content cannot be a "first seen file" - assert isinstance(provenance.storage, ProvenanceDBBase) for content_id, (rev_id, ts, paths) in expected_first.items(): occur = provenance.content_find_first(hash_to_bytes(content_id)) assert occur is not None @@ -327,5 +321,5 @@ assert occur.revision.hex() == rev_id assert occur.date.timestamp() == ts assert occur.origin is None - if provenance.storage.with_path: + if provenance.storage.with_path(): assert occur.path.decode() in paths diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/test_provenance_storage.py @@ -0,0 +1,46 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import inspect + +from ..interface import ProvenanceInterface, ProvenanceStorageInterface + + +def test_types(provenance: ProvenanceInterface): + """Checks all methods of ProvenanceStorageInterface are implemented by this + backend, and that they have the same signature.""" + # Create an instance of the protocol (which cannot be instantiated + # directly, so this creates a subclass, then instantiates it) + interface = type("_", (ProvenanceStorageInterface,), {})() + storage = provenance.storage + + assert "content_find_first" in dir(interface) + + missing_methods = [] + + for meth_name in dir(interface): + if meth_name.startswith("_"): + continue + interface_meth = getattr(interface, meth_name) + try: + concrete_meth = getattr(storage, meth_name) + except AttributeError: + if not getattr(interface_meth, "deprecated_endpoint", False): + # The backend is missing a (non-deprecated) endpoint + 
missing_methods.append(meth_name) + continue + + expected_signature = inspect.signature(interface_meth) + actual_signature = inspect.signature(concrete_meth) + + assert expected_signature == actual_signature, meth_name + + assert missing_methods == [] + + # If all the assertions above succeed, then this one should too. + # But there's no harm in double-checking. + # And we could replace the assertions above by this one, but unlike + # the assertions above, it doesn't explain what is missing. + assert isinstance(storage, ProvenanceStorageInterface)