Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345632
D5986.id21586.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
D5986.id21586.diff
View Options
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -76,13 +76,22 @@
from .postgresql.provenancedb_base import ProvenanceDBBase
conn = BaseDb.connect(**kwargs["db"]).conn
- if ProvenanceDBBase(conn).flavor == "with-path":
+ raise_on_commit = kwargs.get("raise_on_commit", False)
+ if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path":
from .postgresql.provenancedb_with_path import ProvenanceWithPathDB
- return ProvenanceWithPathDB(conn)
+ return ProvenanceWithPathDB(conn, raise_on_commit)
else:
from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB
- return ProvenanceWithoutPathDB(conn)
+ return ProvenanceWithoutPathDB(conn, raise_on_commit)
+
+ elif cls == "remote":
+ from .api.client import RemoteProvenanceStorage
+
+ storage = RemoteProvenanceStorage(**kwargs)
+ assert isinstance(storage, ProvenanceStorageInterface)
+ return storage
+
else:
raise ValueError
diff --git a/swh/provenance/api/__init__.py b/swh/provenance/api/__init__.py
new file mode 100644
diff --git a/swh/provenance/api/client.py b/swh/provenance/api/client.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/client.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.core.api import RPCClient
+
+from ..interface import ProvenanceStorageInterface
+from .serializers import DECODERS, ENCODERS
+
+
+class RemoteProvenanceStorage(RPCClient):
+ """Proxy to a remote provenance storage API"""
+
+ backend_class = ProvenanceStorageInterface
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/serializers.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import asdict
+from typing import Callable, Dict, List, Tuple
+
+from .. import interface
+
+
+def _encode_dataclass(obj):
+ return {
+ **asdict(obj),
+ "__type__": type(obj).__name__,
+ }
+
+
+def _decode_dataclass(d):
+ return getattr(interface, d.pop("__type__"))(**d)
+
+
+def _encode_enum(obj):
+ return {
+ "value": obj.value,
+ "__type__": type(obj).__name__,
+ }
+
+
+def _decode_enum(d):
+ return getattr(interface, d.pop("__type__"))(d["value"])
+
+
+ENCODERS: List[Tuple[type, str, Callable]] = [
+ (interface.ProvenanceResult, "dataclass", _encode_dataclass),
+ (interface.RelationData, "dataclass", _encode_dataclass),
+ (interface.RevisionData, "dataclass", _encode_dataclass),
+ (interface.EntityType, "enum", _encode_enum),
+ (interface.RelationType, "enum", _encode_enum),
+ (set, "set", list),
+]
+
+
+DECODERS: Dict[str, Callable] = {
+ "dataclass": _decode_dataclass,
+ "enum": _decode_enum,
+ "set": set,
+}
diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/api/server.py
@@ -0,0 +1,148 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import os
+
+from swh.core import config
+from swh.core.api import JSONFormatter, MsgpackFormatter, RPCServerApp
+from swh.core.api import negotiate
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .serializers import DECODERS, ENCODERS
+
+
+storage = None
+
+
+def get_global_provenance_storage():
+ global storage
+ if not storage:
+ storage = get_provenance_storage(**app.config["provenance"]["storage"])
+ return storage
+
+
+class ProvenanceStorageServerApp(RPCServerApp):
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
+
+app = ProvenanceStorageServerApp(
+ __name__,
+ backend_class=ProvenanceStorageInterface,
+ backend_factory=get_global_provenance_storage,
+)
+
+
+def has_no_empty_params(rule):
+ return len(rule.defaults or ()) >= len(rule.arguments or ())
+
+
+@app.route("/")
+def index():
+ return """<html>
+<head><title>Software Heritage provenance storage RPC server</title></head>
+<body>
+<p>You have reached the
+<a href="https://www.softwareheritage.org/">Software Heritage</a>
+provenance storage RPC server.<br />
+See its
+<a href="https://docs.softwareheritage.org/devel/swh-provenance/">documentation
+and API</a> for more information</p>
+</body>
+</html>"""
+
+
+@app.route("/site-map")
+@negotiate(MsgpackFormatter)
+@negotiate(JSONFormatter)
+def site_map():
+ links = []
+ for rule in app.url_map.iter_rules():
+ if has_no_empty_params(rule) and hasattr(
+ ProvenanceStorageInterface, rule.endpoint
+ ):
+ links.append(
+ dict(
+ rule=rule.rule,
+ description=getattr(
+ ProvenanceStorageInterface, rule.endpoint
+ ).__doc__,
+ )
+ )
+ # links is now a list of dicts with "rule" and "description" keys
+ return links
+
+
+def load_and_check_config(config_path, type="local"):
+ """Check the minimal configuration is set to run the api or raise an
+ explanatory error.
+
+ Args:
+ config_path (str): Path to the configuration file to load
+ type (str): configuration type. For 'local' type, more
+ checks are done.
+
+ Raises:
+ Error if the setup is not as expected
+
+ Returns:
+ configuration as a dict
+
+ """
+ if not config_path:
+ raise EnvironmentError("Configuration file must be defined")
+
+ if not os.path.exists(config_path):
+ raise FileNotFoundError(f"Configuration file {config_path} does not exist")
+
+ cfg = config.read(config_path)
+
+ pcfg = cfg.get("provenance")
+ if not pcfg:
+ raise KeyError("Missing 'provenance' configuration")
+
+ scfg = pcfg.get("storage")
+ if not scfg:
+ raise KeyError("Missing 'provenance.storage' configuration")
+
+ if type == "local":
+ cls = scfg.get("cls")
+ if cls != "local":
+ raise ValueError(
+ "The provenance backend can only be started with a 'local' "
+ "configuration"
+ )
+
+ db = scfg.get("db")
+ if not db:
+ raise KeyError("Invalid configuration; missing 'db' config entry")
+
+ return cfg
+
+
+api_cfg = None
+
+
+def make_app_from_configfile():
+ """Run the WSGI app from the webserver, loading the configuration from
+ a configuration file.
+
+ SWH_CONFIG_FILENAME environment variable defines the
+ configuration path to load.
+
+ """
+ global api_cfg
+ if not api_cfg:
+ config_path = os.environ.get("SWH_CONFIG_FILENAME")
+ api_cfg = load_and_check_config(config_path)
+ app.config.update(api_cfg)
+ import pprint
+
+ pprint.pprint(app.config)
+ handler = logging.StreamHandler()
+ app.logger.addHandler(handler)
+ return app
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -10,6 +10,7 @@
from typing_extensions import Protocol, runtime_checkable
+from swh.core.api import remote_api_endpoint
from swh.model.model import Sha1Git
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
@@ -65,30 +66,33 @@
@runtime_checkable
class ProvenanceStorageInterface(Protocol):
- raise_on_commit: bool = False
-
+ @remote_api_endpoint("content_find_first")
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
"""Retrieve the first occurrence of the blob identified by `id`."""
...
+ @remote_api_endpoint("content_find_all")
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
"""Retrieve all the occurrences of the blob identified by `id`."""
...
+ @remote_api_endpoint("content_set_date")
def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to blobs identified by sha1 ids, as paired in `dates`. Return
a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("content_get")
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
"""Retrieve the associated date for each blob sha1 in `ids`. If some blob has
no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("directory_set_date")
def directory_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to directories identified by sha1 ids, as paired in
`dates`. Return a boolean stating whether the information was successfully
@@ -96,40 +100,47 @@
"""
...
+ @remote_api_endpoint("directory_get")
def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
"""Retrieve the associated date for each directory sha1 in `ids`. If some
directory has no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("entity_get_all")
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
"""Retrieve all sha1 ids for entities of type `entity` present in the provenance
model.
"""
...
+ @remote_api_endpoint("location_get")
def location_get(self) -> Set[bytes]:
"""Retrieve all paths present in the provenance model."""
...
+ @remote_api_endpoint("origin_set_url")
def origin_set_url(self, urls: Dict[Sha1Git, str]) -> bool:
"""Associate urls to origins identified by sha1 ids, as paired in `urls`. Return
a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("origin_get")
def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
"""Retrieve the associated url for each origin sha1 in `ids`. If some origin has
no associated date, it is not present in the resulting dictionary.
"""
...
+ @remote_api_endpoint("revision_set_date")
def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
"""Associate dates to revisions identified by sha1 ids, as paired in `dates`.
Return a boolean stating whether the information was successfully stored.
"""
...
+ @remote_api_endpoint("revision_set_origin")
def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool:
"""Associate origins to revisions identified by sha1 ids, as paired in
`origins` (revision ids are keys and origin ids, values). Return a boolean
@@ -137,6 +148,7 @@
"""
...
+ @remote_api_endpoint("revision_get")
def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]:
"""Retrieve the associated date and origin for each revision sha1 in `ids`. If
some revision has no associated date nor origin, it is not present in the
@@ -144,12 +156,14 @@
"""
...
+ @remote_api_endpoint("relation_add")
def relation_add(
self, relation: RelationType, data: Iterable[RelationData]
) -> bool:
"""Add entries in the selected `relation`."""
...
+ @remote_api_endpoint("relation_get")
def relation_get(
self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False
) -> Set[RelationData]:
@@ -159,12 +173,17 @@
"""
...
+ @remote_api_endpoint("relation_get_all")
def relation_get_all(self, relation: RelationType) -> Set[RelationData]:
"""Retrieve all entries in the selected `relation` that are present in the
provenance model.
"""
...
+ @remote_api_endpoint("with_path")
+ def with_path(self) -> bool:
+ ...
+
@runtime_checkable
class ProvenanceInterface(Protocol):
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -25,9 +25,9 @@
class ProvenanceDBBase:
- raise_on_commit: bool = False
-
- def __init__(self, conn: psycopg2.extensions.connection):
+ def __init__(
+ self, conn: psycopg2.extensions.connection, raise_on_commit: bool = False
+ ):
BaseDb.adapt_conn(conn)
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
conn.set_session(autocommit=True)
@@ -37,6 +37,7 @@
sql = "SET timezone TO 'UTC'"
self.cursor.execute(sql)
self._flavor: Optional[str] = None
+ self.raise_on_commit = raise_on_commit
@property
def flavor(self) -> str:
@@ -47,7 +48,6 @@
assert self._flavor is not None
return self._flavor
- @property
def with_path(self) -> bool:
return self.flavor == "with-path"
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -12,45 +12,69 @@
import pytest
from typing_extensions import TypedDict
-from swh.core.db import BaseDb
from swh.journal.serializers import msgpack_ext_hook
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
from swh.model.tests.swh_model_data import TEST_OBJECTS
-from swh.provenance import get_provenance
+from swh.provenance import get_provenance, get_provenance_storage
+from swh.provenance.api.client import RemoteProvenanceStorage
+import swh.provenance.api.server as server
from swh.provenance.archive import ArchiveInterface
-from swh.provenance.interface import ProvenanceInterface
+from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface
from swh.provenance.postgresql.archive import ArchivePostgreSQL
-from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.storage.archive import ArchiveStorage
from swh.storage.postgresql.storage import Storage
from swh.storage.replay import process_replay_objects
@pytest.fixture(params=["with-path", "without-path"])
-def provenance(
+def populated_db(
request, # TODO: add proper type annotation
postgresql: psycopg2.extensions.connection,
-) -> ProvenanceInterface:
- """return a working and initialized provenance db"""
+) -> Dict[str, str]:
from swh.core.cli.db import populate_database_for_package
- flavor = request.param
+ flavor = "with-path" if request.param == "client-server" else request.param
populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)
-
- BaseDb.adapt_conn(postgresql)
-
- args: Dict[str, str] = {
+ return {
item.split("=")[0]: item.split("=")[1]
for item in postgresql.dsn.split()
if item.split("=")[0] != "options"
}
- prov = get_provenance(cls="local", db=args)
- assert isinstance(prov.storage, ProvenanceDBBase)
- assert prov.storage.flavor == flavor
- # in test sessions, we DO want to raise any exception occurring at commit time
- prov.storage.raise_on_commit = True
- return prov
+
+
+# the Flask app used as server in these tests
+@pytest.fixture
+def app(populated_db: Dict[str, str]):
+ assert hasattr(server, "storage")
+ server.storage = get_provenance_storage(cls="local", db=populated_db)
+ yield server.app
+
+
+# the RPCClient class used as client in these tests
+@pytest.fixture
+def swh_rpc_client_class():
+ return RemoteProvenanceStorage
+
+
+@pytest.fixture(params=["local", "remote"])
+def provenance(
+ request, # TODO: add proper type annotation
+ populated_db: Dict[str, str],
+ swh_rpc_client: RemoteProvenanceStorage,
+) -> ProvenanceInterface:
+ """return a working and initialized provenance db"""
+
+ if request.param == "remote":
+ from swh.provenance.provenance import Provenance
+
+ assert isinstance(swh_rpc_client, ProvenanceStorageInterface)
+ return Provenance(swh_rpc_client)
+
+ else:
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True)
+ return prov
@pytest.fixture
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -41,11 +41,11 @@
def test_provenance_flavor(provenance: ProvenanceInterface) -> None:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- assert provenance.storage.flavor in ("with-path", "without-path")
- backend_class: Type[ProvenanceStorageInterface]
- if provenance.storage.flavor == "with-path":
- backend_class = ProvenanceWithPathDB
- else:
- backend_class = ProvenanceWithoutPathDB
- assert isinstance(provenance.storage, backend_class)
+ if isinstance(provenance.storage, ProvenanceDBBase):
+ assert provenance.storage.flavor in ("with-path", "without-path")
+ backend_class: Type[ProvenanceStorageInterface]
+ if provenance.storage.flavor == "with-path":
+ backend_class = ProvenanceWithPathDB
+ else:
+ backend_class = ProvenanceWithoutPathDB
+ assert isinstance(provenance.storage, backend_class)
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -11,7 +11,6 @@
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType
from swh.provenance.model import RevisionEntry
-from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
fill_storage,
@@ -61,8 +60,7 @@
}
def maybe_path(path: str) -> Optional[bytes]:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
return path.encode("utf-8")
return None
@@ -155,8 +153,7 @@
== provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
), synth_rev["msg"]
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
# check for location entries
rows["location"] |= set(x["path"] for x in synth_rev["R_C"])
rows["location"] |= set(x["path"] for x in synth_rev["D_C"])
@@ -199,8 +196,7 @@
]
def maybe_path(path: str) -> str:
- assert isinstance(provenance.storage, ProvenanceDBBase)
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
return path
return ""
@@ -230,7 +226,6 @@
(rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
)
- assert isinstance(provenance.storage, ProvenanceDBBase)
for content_id, results in expected_occurrences.items():
expected = [(content_id, *result) for result in results]
db_occurrences = [
@@ -243,7 +238,7 @@
)
for occur in provenance.content_find_all(hash_to_bytes(content_id))
]
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
# this is not true if the db stores no path, because a same content
# that appears several times in a given revision may be reported
# only once by content_find_all()
@@ -319,7 +314,6 @@
assert sha1 in expected_first
# nothing to do there, this content cannot be a "first seen file"
- assert isinstance(provenance.storage, ProvenanceDBBase)
for content_id, (rev_id, ts, paths) in expected_first.items():
occur = provenance.content_find_first(hash_to_bytes(content_id))
assert occur is not None
@@ -327,5 +321,5 @@
assert occur.revision.hex() == rev_id
assert occur.date.timestamp() == ts
assert occur.origin is None
- if provenance.storage.with_path:
+ if provenance.storage.with_path():
assert occur.path.decode() in paths
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import inspect
+
+from ..interface import ProvenanceInterface, ProvenanceStorageInterface
+
+
+def test_types(provenance: ProvenanceInterface):
+ """Checks all methods of ProvenanceStorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type("_", (ProvenanceStorageInterface,), {})()
+ storage = provenance.storage
+
+ assert "content_find_first" in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith("_"):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(storage, meth_name)
+ except AttributeError:
+ if not getattr(interface_meth, "deprecated_endpoint", False):
+ # The backend is missing a (non-deprecated) endpoint
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
+ # If all the assertions above succeed, then this one should too.
+ # But there's no harm in double-checking.
+ # And we could replace the assertions above by this one, but unlike
+ # the assertions above, it doesn't explain what is missing.
+ assert isinstance(storage, ProvenanceStorageInterface)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:27 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230811
Attached To
D5986: Add provenance storage server/client classes
Event Timeline
Log In to Comment