diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -76,13 +76,22 @@
from .postgresql.provenancedb_base import ProvenanceDBBase
conn = BaseDb.connect(**kwargs["db"]).conn
- if ProvenanceDBBase(conn).flavor == "with-path":
+ raise_on_commit = kwargs.get("raise_on_commit", False)
+ if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path":
from .postgresql.provenancedb_with_path import ProvenanceWithPathDB
- return ProvenanceWithPathDB(conn)
+ return ProvenanceWithPathDB(conn, raise_on_commit)
else:
from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB
- return ProvenanceWithoutPathDB(conn)
+ return ProvenanceWithoutPathDB(conn, raise_on_commit)
+
+ elif cls == "remote":
+ from .api.client import RemoteProvenanceStorage
+
+ storage = RemoteProvenanceStorage(**kwargs)
+ assert isinstance(storage, ProvenanceStorageInterface)
+ return storage
+
else:
raise ValueError
diff --git a/swh/provenance/api/__init__.py b/swh/provenance/api/__init__.py
new file mode 100644
diff --git a/swh/provenance/api/client.py b/swh/provenance/api/client.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/client.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.core.api import RPCClient
+
+from ..interface import ProvenanceStorageInterface
+from .serializers import DECODERS, ENCODERS
+
+
+class RemoteProvenanceStorage(RPCClient):
+ """Proxy to a remote provenance storage API"""
+
+ backend_class = ProvenanceStorageInterface
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/serializers.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import asdict
+from typing import Callable, Dict, List, Tuple
+
+from .. import interface
+
+
+def _encode_dataclass(obj):
+ return {
+ **asdict(obj),
+ "__type__": type(obj).__name__,
+ }
+
+
+def _decode_dataclass(d):
+ return getattr(interface, d.pop("__type__"))(**d)
+
+
+def _encode_enum(obj):
+ return {
+ "value": obj.value,
+ "__type__": type(obj).__name__,
+ }
+
+
+def _decode_enum(d):
+ return getattr(interface, d.pop("__type__"))(d["value"])
+
+
+ENCODERS: List[Tuple[type, str, Callable]] = [
+ (interface.ProvenanceResult, "dataclass", _encode_dataclass),
+ (interface.RelationData, "dataclass", _encode_dataclass),
+ (interface.RevisionData, "dataclass", _encode_dataclass),
+ (interface.EntityType, "enum", _encode_enum),
+ (interface.RelationType, "enum", _encode_enum),
+ (set, "set", list),
+]
+
+
+DECODERS: Dict[str, Callable] = {
+ "dataclass": _decode_dataclass,
+ "enum": _decode_enum,
+ "set": set,
+}
diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/server.py
@@ -0,0 +1,148 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import os
+
+from swh.core import config
+from swh.core.api import JSONFormatter, MsgpackFormatter, RPCServerApp
+from swh.core.api import negotiate
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .serializers import DECODERS, ENCODERS
+
+
+storage = None
+
+
+def get_global_provenance_storage():
+ global storage
+ if not storage:
+ storage = get_provenance_storage(**app.config["provenance"]["storage"])
+ return storage
+
+
+class ProvenanceStorageServerApp(RPCServerApp):
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
+
+app = ProvenanceStorageServerApp(
+ __name__,
+ backend_class=ProvenanceStorageInterface,
+ backend_factory=get_global_provenance_storage,
+)
+
+
+def has_no_empty_params(rule):
+ return len(rule.defaults or ()) >= len(rule.arguments or ())
+
+
+@app.route("/")
+def index():
+ return """
+
Software Heritage provenance storage RPC server
+
+You have reached the
+Software Heritage
+provenance storage RPC server.
+See its
+documentation
+and API for more information
+
+"""
+
+
+@app.route("/site-map")
+@negotiate(MsgpackFormatter)
+@negotiate(JSONFormatter)
+def site_map():
+ links = []
+ for rule in app.url_map.iter_rules():
+ if has_no_empty_params(rule) and hasattr(
+ ProvenanceStorageInterface, rule.endpoint
+ ):
+ links.append(
+ dict(
+ rule=rule.rule,
+ description=getattr(
+ ProvenanceStorageInterface, rule.endpoint
+ ).__doc__,
+ )
+ )
+ # links is now a list of dicts with "rule" and "description" keys
+ return links
+
+
+def load_and_check_config(config_path, type="local"):
+ """Check the minimal configuration is set to run the api or raise an
+ error explanation.
+
+ Args:
+ config_path (str): Path to the configuration file to load
+ type (str): configuration type. For 'local' type, more
+ checks are done.
+
+ Raises:
+ Error if the setup is not as expected
+
+ Returns:
+ configuration as a dict
+
+ """
+ if not config_path:
+ raise EnvironmentError("Configuration file must be defined")
+
+ if not os.path.exists(config_path):
+ raise FileNotFoundError(f"Configuration file {config_path} does not exist")
+
+ cfg = config.read(config_path)
+
+ pcfg = cfg.get("provenance")
+ if not pcfg:
+ raise KeyError("Missing 'provenance' configuration")
+
+ scfg = pcfg.get("storage")
+ if not scfg:
+ raise KeyError("Missing 'provenance.storage' configuration")
+
+ if type == "local":
+ cls = scfg.get("cls")
+ if cls != "local":
+ raise ValueError(
+ "The provenance backend can only be started with a 'local' "
+ "configuration"
+ )
+
+ db = scfg.get("db")
+ if not db:
+ raise KeyError("Invalid configuration; missing 'db' config entry")
+
+ return cfg
+
+
+api_cfg = None
+
+
+def make_app_from_configfile():
+ """Run the WSGI app from the webserver, loading the configuration from
+ a configuration file.
+
+ SWH_CONFIG_FILENAME environment variable defines the
+ configuration path to load.
+
+ """
+ global api_cfg
+ if not api_cfg:
+ config_path = os.environ.get("SWH_CONFIG_FILENAME")
+ api_cfg = load_and_check_config(config_path)
+ app.config.update(api_cfg)
+ import pprint
+
+ pprint.pprint(app.config)
+ handler = logging.StreamHandler()
+ app.logger.addHandler(handler)
+ return app
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -10,6 +10,7 @@
from typing_extensions import Protocol, runtime_checkable
+from swh.core.api import remote_api_endpoint
from swh.model.model import Sha1Git
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
@@ -65,30 +66,33 @@
@runtime_checkable
class ProvenanceStorageInterface(Protocol):
- raise_on_commit: bool = False
-
+ @remote_api_endpoint("content_find_first")
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
"""Retrieve the first occurrence of the blob identified by `id`."""
...
+ @remote_api_endpoint("content_find_all")
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
"""Retrieve all the occurrences of the blob identified by `id`."""
...
+ @remote_api_endpoint("content_set_date")
def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to blobs identified by sha1 ids, as paired in `dates`. Return
a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("content_get")
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
"""Retrieve the associated date for each blob sha1 in `ids`. If some blob has
no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("directory_set_date")
def directory_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to directories identified by sha1 ids, as paired in
`dates`. Return a boolean stating whether the information was successfully
@@ -96,40 +100,47 @@
"""
...
+ @remote_api_endpoint("directory_get")
def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
"""Retrieve the associated date for each directory sha1 in `ids`. If some
directory has no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("entity_get_all")
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
"""Retrieve all sha1 ids for entities of type `entity` present in the provenance
model.
"""
...
+ @remote_api_endpoint("location_get")
def location_get(self) -> Set[bytes]:
"""Retrieve all paths present in the provenance model."""
...
+ @remote_api_endpoint("origin_set_url")
def origin_set_url(self, urls: Dict[Sha1Git, str]) -> bool:
"""Associate urls to origins identified by sha1 ids, as paired in `urls`. Return
a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("origin_get")
def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
"""Retrieve the associated url for each origin sha1 in `ids`. If some origin has
no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("revision_set_date")
def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to revisions identified by sha1 ids, as paired in `dates`.
Return a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("revision_set_origin")
def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool:
"""Associate origins to revisions identified by sha1 ids, as paired in
`origins` (revision ids are keys and origin ids, values). Return a boolean
@@ -137,6 +148,7 @@
"""
...
+ @remote_api_endpoint("revision_get")
def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]:
"""Retrieve the associated date and origin for each revision sha1 in `ids`. If
some revision has no associated date nor origin, it is not present in the
@@ -144,12 +156,14 @@
"""
...
+ @remote_api_endpoint("relation_add")
def relation_add(
self, relation: RelationType, data: Iterable[RelationData]
) -> bool:
"""Add entries in the selected `relation`."""
...
+ @remote_api_endpoint("relation_get")
def relation_get(
self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False
) -> Set[RelationData]:
@@ -159,12 +173,17 @@
"""
...
+ @remote_api_endpoint("relation_get_all")
def relation_get_all(self, relation: RelationType) -> Set[RelationData]:
"""Retrieve all entries in the selected `relation` that are present in the
provenance model.
"""
...
+ @remote_api_endpoint("with_path")
+ def with_path(self) -> bool:
+ ...
+
@runtime_checkable
class ProvenanceInterface(Protocol):
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -25,9 +25,9 @@
class ProvenanceDBBase:
- raise_on_commit: bool = False
-
- def __init__(self, conn: psycopg2.extensions.connection):
+ def __init__(
+ self, conn: psycopg2.extensions.connection, raise_on_commit: bool = False
+ ):
BaseDb.adapt_conn(conn)
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
conn.set_session(autocommit=True)
@@ -37,6 +37,7 @@
sql = "SET timezone TO 'UTC'"
self.cursor.execute(sql)
self._flavor: Optional[str] = None
+ self.raise_on_commit = raise_on_commit
@property
def flavor(self) -> str:
@@ -47,7 +48,6 @@
assert self._flavor is not None
return self._flavor
- @property
def with_path(self) -> bool:
return self.flavor == "with-path"
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -12,45 +12,69 @@
import pytest
from typing_extensions import TypedDict
-from swh.core.db import BaseDb
from swh.journal.serializers import msgpack_ext_hook
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
from swh.model.tests.swh_model_data import TEST_OBJECTS
-from swh.provenance import get_provenance
+from swh.provenance import get_provenance, get_provenance_storage
+from swh.provenance.api.client import RemoteProvenanceStorage
+import swh.provenance.api.server as server
from swh.provenance.archive import ArchiveInterface
-from swh.provenance.interface import ProvenanceInterface
+from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface
from swh.provenance.postgresql.archive import ArchivePostgreSQL
-from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.storage.archive import ArchiveStorage
from swh.storage.postgresql.storage import Storage
from swh.storage.replay import process_replay_objects
@pytest.fixture(params=["with-path", "without-path"])
-def provenance(
+def populated_db(
request, # TODO: add proper type annotation
postgresql: psycopg2.extensions.connection,
-) -> ProvenanceInterface:
- """return a working and initialized provenance db"""
+) -> Dict[str, str]:
from swh.core.cli.db import populate_database_for_package
- flavor = request.param
+ flavor = "with-path" if request.param == "client-server" else request.param
populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)
-
- BaseDb.adapt_conn(postgresql)
-
- args: Dict[str, str] = {
+ return {
item.split("=")[0]: item.split("=")[1]
for item in postgresql.dsn.split()
if item.split("=")[0] != "options"
}
- prov = get_provenance(cls="local", db=args)
- assert isinstance(prov.storage, ProvenanceDBBase)
- assert prov.storage.flavor == flavor
- # in test sessions, we DO want to raise any exception occurring at commit time
- prov.storage.raise_on_commit = True
- return prov
+
+
+# the Flask app used as server in these tests
+@pytest.fixture
+def app(populated_db: Dict[str, str]):
+ assert hasattr(server, "storage")
+ server.storage = get_provenance_storage(cls="local", db=populated_db)
+ yield server.app
+
+
+# the RPCClient class used as client in these tests
+@pytest.fixture
+def swh_rpc_client_class():
+ return RemoteProvenanceStorage
+
+
+@pytest.fixture(params=["local", "remote"])
+def provenance(
+ request, # TODO: add proper type annotation
+ populated_db: Dict[str, str],
+ swh_rpc_client: RemoteProvenanceStorage,
+) -> ProvenanceInterface:
+ """return a working and initialized provenance db"""
+
+ if request.param == "remote":
+ from swh.provenance.provenance import Provenance
+
+ assert isinstance(swh_rpc_client, ProvenanceStorageInterface)
+ return Provenance(swh_rpc_client)
+
+ else:
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True)
+ return prov
@pytest.fixture
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -41,11 +41,11 @@
def test_provenance_flavor(provenance: ProvenanceInterface) -> None:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- assert provenance.storage.flavor in ("with-path", "without-path")
- backend_class: Type[ProvenanceStorageInterface]
- if provenance.storage.flavor == "with-path":
- backend_class = ProvenanceWithPathDB
- else:
- backend_class = ProvenanceWithoutPathDB
- assert isinstance(provenance.storage, backend_class)
+ if isinstance(provenance.storage, ProvenanceDBBase):
+ assert provenance.storage.flavor in ("with-path", "without-path")
+ backend_class: Type[ProvenanceStorageInterface]
+ if provenance.storage.flavor == "with-path":
+ backend_class = ProvenanceWithPathDB
+ else:
+ backend_class = ProvenanceWithoutPathDB
+ assert isinstance(provenance.storage, backend_class)
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -11,7 +11,6 @@
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType
from swh.provenance.model import RevisionEntry
-from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
fill_storage,
@@ -61,8 +60,7 @@
}
def maybe_path(path: str) -> Optional[bytes]:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
return path.encode("utf-8")
return None
@@ -155,8 +153,7 @@
== provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
), synth_rev["msg"]
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
# check for location entries
rows["location"] |= set(x["path"] for x in synth_rev["R_C"])
rows["location"] |= set(x["path"] for x in synth_rev["D_C"])
@@ -199,8 +196,7 @@
]
def maybe_path(path: str) -> str:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
return path
return ""
@@ -230,7 +226,6 @@
(rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
)
- assert isinstance(provenance.storage, ProvenanceDBBase)
for content_id, results in expected_occurrences.items():
expected = [(content_id, *result) for result in results]
db_occurrences = [
@@ -243,7 +238,7 @@
)
for occur in provenance.content_find_all(hash_to_bytes(content_id))
]
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
# this is not true if the db stores no path, because a same content
# that appears several times in a given revision may be reported
# only once by content_find_all()
@@ -319,7 +314,6 @@
assert sha1 in expected_first
# nothing to do there, this content cannot be a "first seen file"
- assert isinstance(provenance.storage, ProvenanceDBBase)
for content_id, (rev_id, ts, paths) in expected_first.items():
occur = provenance.content_find_first(hash_to_bytes(content_id))
assert occur is not None
@@ -327,5 +321,5 @@
assert occur.revision.hex() == rev_id
assert occur.date.timestamp() == ts
assert occur.origin is None
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
assert occur.path.decode() in paths
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import inspect
+
+from ..interface import ProvenanceInterface, ProvenanceStorageInterface
+
+
+def test_types(provenance: ProvenanceInterface):
+ """Checks all methods of ProvenanceStorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type("_", (ProvenanceStorageInterface,), {})()
+ storage = provenance.storage
+
+ assert "content_find_first" in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith("_"):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(storage, meth_name)
+ except AttributeError:
+ if not getattr(interface_meth, "deprecated_endpoint", False):
+ # The backend is missing a (non-deprecated) endpoint
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
+ # If all the assertions above succeed, then this one should too.
+ # But there's no harm in double-checking.
+ # And we could replace the assertions above by this one, but unlike
+ # the assertions above, it doesn't explain what is missing.
+ assert isinstance(storage, ProvenanceStorageInterface)