D8591.id31021.diff
Adapt postgresql backend to swh.core.db >= 2.0
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[db,http] >= 0.14
+swh.core[db,http] >= 2
swh.model >= 2.6.1
swh.storage
swh.graph >= 2.0.0
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -119,3 +119,6 @@
return rmq_storage
raise ValueError
+
+
+get_datastore = get_provenance_storage
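
Note: `get_datastore` is the module-level factory that swh.core.db >= 2 expects each backend package to expose, so that generic database tooling can locate the datastore and read its schema version; aliasing it to `get_provenance_storage` satisfies that convention. A hypothetical sketch of the discovery side (not swh.core's actual code):

    from importlib import import_module

    def datastore_version(modname: str, **cfg) -> int:
        # Import swh.<modname> and call its get_datastore() factory;
        # for swh.provenance this resolves to get_provenance_storage.
        module = import_module(f"swh.{modname}")
        datastore = module.get_datastore(**cfg)
        return datastore.current_version  # declared on the datastore class
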
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -51,6 +51,8 @@
class ProvenanceStoragePostgreSql:
+ current_version = 3
+
def __init__(
self, page_size: Optional[int] = None, raise_on_commit: bool = False, **kwargs
) -> None:
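
Note: with the bump to swh.core.db >= 2, the expected schema version moves out of the database (the `dbversion` table removed below) and into the `current_version` class attribute, where shared tooling such as `initialize_database_for_module` can read it. A minimal sketch of the convention, with a hypothetical guard for illustration:

    class ExampleDatastore:
        current_version = 3  # schema version this code is written against

    def assert_compatible(datastore: ExampleDatastore, db_version: int) -> None:
        # Hypothetical check, for illustration only: refuse to run against
        # a database whose schema lags behind (or leads) the code.
        if db_version != datastore.current_version:
            raise RuntimeError(
                f"database schema v{db_version} does not match "
                f"code schema v{datastore.current_version}"
            )
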
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -2,22 +2,6 @@
select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset
-create table dbversion
-(
- version int primary key,
- release timestamptz,
- description text
-);
-
-comment on table dbversion is 'Details of current db version';
-comment on column dbversion.version is 'SQL schema version';
-comment on column dbversion.release is 'Version deployment timestamp';
-comment on column dbversion.description is 'Release description';
-
--- latest schema version
-insert into dbversion(version, release, description)
- values(3, now(), 'Work In Progress');
-
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
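
Note: dropping the table is safe because swh.core.db >= 2 records the schema version through its own bookkeeping when `initialize_database_for_module` runs. A short sketch, assuming the `swh_db_version` helper from `swh.core.db.db_utils`:

    from swh.core.db.db_utils import swh_db_version

    def provenance_schema_version(conninfo: str) -> int:
        # conninfo is a libpq DSN, e.g. "dbname=provenance"
        return swh_db_version(conninfo)
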
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -5,6 +5,7 @@
from contextlib import contextmanager
from datetime import datetime
+from functools import partial
import multiprocessing
from os import path
from pathlib import Path
@@ -15,87 +16,56 @@
import msgpack
import psycopg2.extensions
import pytest
-from pytest_postgresql.factories import postgresql
+from pytest_postgresql import factories
+from swh.core.db.db_utils import initialize_database_for_module
from swh.graph.http_rpc_server import make_app
from swh.journal.serializers import msgpack_ext_hook
from swh.model.model import BaseModel, TimestampWithTimezone
from swh.provenance import get_provenance, get_provenance_storage
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
from swh.provenance.storage.archive import ArchiveStorage
from swh.storage.interface import StorageInterface
from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects
-
-@pytest.fixture(
- params=[
- "with-path",
- "without-path",
- "with-path-denormalized",
- "without-path-denormalized",
- ]
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
)
-def provenance_postgresqldb(
- request: SubRequest,
- postgresql: psycopg2.extensions.connection,
-) -> Dict[str, str]:
- """return a working and initialized provenance db"""
- from swh.core.db.db_utils import (
- init_admin_extensions,
- populate_database_for_package,
- )
- init_admin_extensions("swh.provenance", postgresql.dsn)
- populate_database_for_package(
- "swh.provenance", postgresql.dsn, flavor=request.param
- )
- return postgresql.get_dsn_parameters()
+postgres_provenance = factories.postgresql("provenance_postgresql_proc")
-@pytest.fixture(params=["postgresql", "rabbitmq"])
+@pytest.fixture()
+def provenance_postgresqldb(request, postgres_provenance):
+ return postgres_provenance.get_dsn_parameters()
+
+
+@pytest.fixture()
def provenance_storage(
request: SubRequest,
provenance_postgresqldb: Dict[str, str],
) -> Generator[ProvenanceStorageInterface, None, None]:
"""Return a working and initialized ProvenanceStorageInterface object"""
- if request.param == "rabbitmq":
- from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
-
- rabbitmq = request.getfixturevalue("rabbitmq")
- host = rabbitmq.args["host"]
- port = rabbitmq.args["port"]
- rabbitmq_params: Dict[str, Any] = {
- "url": f"amqp://guest:guest@{host}:{port}/%2f",
- "storage_config": {
- "cls": "postgresql",
- "db": provenance_postgresqldb,
- "raise_on_commit": True,
- },
- }
- server = ProvenanceStorageRabbitMQServer(
- url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
- )
- server.start()
- with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage:
- yield storage
- server.stop()
-
- else:
- # in test sessions, we DO want to raise any exception occurring at commit time
- with get_provenance_storage(
- cls=request.param, db=provenance_postgresqldb, raise_on_commit=True
- ) as storage:
- yield storage
-
-
-provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests")
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ with get_provenance_storage(
+ cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True
+ ) as storage:
+ yield storage
@pytest.fixture
def provenance(
- provenance_postgresql: psycopg2.extensions.connection,
+ postgres_provenance: psycopg2.extensions.connection,
) -> Generator[ProvenanceInterface, None, None]:
"""Return a working and initialized ProvenanceInterface object"""
@@ -104,14 +74,14 @@
populate_database_for_package,
)
- init_admin_extensions("swh.provenance", provenance_postgresql.dsn)
+ init_admin_extensions("swh.provenance", postgres_provenance.dsn)
populate_database_for_package(
- "swh.provenance", provenance_postgresql.dsn, flavor="with-path"
+ "swh.provenance", postgres_provenance.dsn, flavor="with-path"
)
# in test sessions, we DO want to raise any exception occurring at commit time
with get_provenance(
cls="postgresql",
- db=provenance_postgresql.get_dsn_parameters(),
+ db=postgres_provenance.get_dsn_parameters(),
raise_on_commit=True,
) as provenance:
yield provenance
@@ -169,9 +139,15 @@
def run_grpc_server(queue, dataset_path):
try:
- config = {"graph": {"path": dataset_path}}
+ config = {
+ "graph": {
+ "cls": "local",
+ "grpc_server": {"path": dataset_path},
+ "http_rpc_server": {"debug": True},
+ }
+ }
with loop_context() as loop:
- app = make_app(config=config, debug=True, spawn_rpc_port=None)
+ app = make_app(config=config)
client = TestClient(TestServer(app), loop=loop)
loop.run_until_complete(client.start_server())
url = client.make_url("/graph/")
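
Note: the rewritten conftest.py delegates database setup to pytest_postgresql's factories: the `load` callbacks run once, against the template database, so every test receives an already-initialized copy instead of re-running `populate_database_for_package` inside the fixture body. The same pattern, sketched for a hypothetical module:

    from functools import partial

    import pytest
    from pytest_postgresql import factories

    from swh.core.db.db_utils import initialize_database_for_module

    # One process fixture per flavor; its load hooks run a single time.
    example_postgresql_proc = factories.postgresql_proc(
        load=[
            partial(
                initialize_database_for_module,
                modname="example",  # hypothetical swh module name
                version=1,          # hypothetical schema version
            )
        ],
    )
    example_db = factories.postgresql("example_postgresql_proc")

    @pytest.fixture()
    def example_dsn(example_db) -> dict:
        return example_db.get_dsn_parameters()
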
diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py
--- a/swh/provenance/tests/test_journal_client.py
+++ b/swh/provenance/tests/test_journal_client.py
@@ -35,7 +35,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test origin journal client cli"""
@@ -63,7 +63,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
@@ -89,7 +89,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test revision journal client cli"""
@@ -116,7 +116,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -28,130 +28,343 @@
from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
-def test_provenance_storage_content(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests content methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Add all content present in the current repo to the storage, just assigning their
- # creation dates. Then check that the returned results when querying are the same.
- cnt_dates = {
- cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
- }
- assert provenance_storage.content_add(cnt_dates)
- assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
- assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
- cnt_dates.keys()
- )
+class TestProvenanceStorage:
+ def test_provenance_storage_content(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests content methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all content present in the current repo to the storage, just assigning their
+ # creation dates. Then check that the returned results when querying are the same.
+ cnt_dates = {
+ cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
+ }
+ assert provenance_storage.content_add(cnt_dates)
+ assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
+ assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
+ cnt_dates.keys()
+ )
+ def test_provenance_storage_directory(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Of all directories present in the current repo, only assign a date to those
+ # containing blobs (picking the max date among the available ones). Then check that
+ # the returned results when querying are the same.
+ def getmaxdate(
+ directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
+ ) -> Optional[datetime]:
+ dates = [
+ content["ctime"]
+ for entry in directory["entries"]
+ for content in contents
+ if entry["type"] == "file" and entry["target"] == content["sha1_git"]
+ ]
+ return max(dates) if dates else None
+
+ flat_values = (False, True)
+ dir_dates = {}
+ for idx, dir in enumerate(data["directory"]):
+ date = getmaxdate(dir, data["content"])
+ if date is not None:
+ dir_dates[dir["id"]] = DirectoryData(
+ date=date, flat=flat_values[idx % 2]
+ )
+ assert provenance_storage.directory_add(dir_dates)
+ assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
+ assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
+ dir_dates.keys()
+ )
-def test_provenance_storage_directory(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Of all directories present in the current repo, only assign a date to those
- # containing blobs (picking the max date among the available ones). Then check that
- # the returned results when querying are the same.
- def getmaxdate(
- directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
- ) -> Optional[datetime]:
- dates = [
- content["ctime"]
- for entry in directory["entries"]
- for content in contents
- if entry["type"] == "file" and entry["target"] == content["sha1_git"]
- ]
- return max(dates) if dates else None
-
- flat_values = (False, True)
- dir_dates = {}
- for idx, dir in enumerate(data["directory"]):
- date = getmaxdate(dir, data["content"])
- if date is not None:
- dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
- assert provenance_storage.directory_add(dir_dates)
- assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
- assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
- dir_dates.keys()
- )
+ def test_provenance_storage_location(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all names of entries present in the directories of the current repo as paths
+ # to the storage. Then check that the returned results when querying are the same.
+ paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
+ assert provenance_storage.location_add(paths)
+
+ if provenance_storage.with_path():
+ assert provenance_storage.location_get_all() == paths
+ else:
+ assert provenance_storage.location_get_all() == set()
+
+ def test_provenance_storage_origin(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test origin methods.
+ # Add all origins present in the current repo to the storage. Then check that the
+ # returned results when querying are the same.
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert orgs
+ assert provenance_storage.origin_add(orgs)
+ assert provenance_storage.origin_get(set(orgs.keys())) == orgs
+ assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+
+ def test_provenance_storage_revision(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test revision methods.
+ # Add all revisions present in the current repo to the storage, assigning their
+ # dates and an arbitrary origin to each one. Then check that the returned results
+ # when querying are the same.
+ origin = Origin(url=next(iter(data["origin"]))["url"])
+ # Origin must be inserted in advance.
+ assert provenance_storage.origin_add({origin.id: origin.url})
+
+ revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
+ rev_data = {
+ rev["id"]: RevisionData(
+ date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
+ origin=origin.id if idx % 3 != 0 else None,
+ )
+ for idx, rev in enumerate(data["revision"])
+ if idx % 6 != 0
+ }
+ assert revs
+ assert provenance_storage.revision_add(revs)
+ assert provenance_storage.revision_add(rev_data)
+ assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
+ assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
+ rev_data.keys()
+ )
+ def test_provenance_storage_relation(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-def test_provenance_storage_location(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ # Test content-in-revision relation.
+ # Create flat models of every root directory for the revisions in the dataset.
+ cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ root = next(
+ subdir
+ for subdir in data["directory"]
+ if subdir["id"] == rev["directory"]
+ )
+ for cnt, rel in dircontent(data, rev["id"], root):
+ cnt_in_rev.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
+ )
- # Add all names of entries present in the directories of the current repo as paths
- # to the storage. Then check that the returned results when querying are the same.
- paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
- assert provenance_storage.location_add(paths)
+ # Test content-in-directory relation.
+ # Create flat models for every directory in the dataset.
+ cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
+ for dir in data["directory"]:
+ for cnt, rel in dircontent(data, dir["id"], dir):
+ cnt_in_dir.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
+ )
- if provenance_storage.with_path():
- assert provenance_storage.location_get_all() == paths
- else:
- assert provenance_storage.location_get_all() == set()
+    # Test directory-in-revision relation.
+    # Add root directories to their corresponding revision in the dataset.
+ dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ dir_in_rev.setdefault(rev["directory"], set()).add(
+ RelationData(dst=rev["id"], path=b".")
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
+ )
+ # Test revision-in-origin relation.
+ # Origins must be inserted in advance (cannot be done by `entity_add` inside
+ # `relation_add_and_compare_result`).
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert provenance_storage.origin_add(orgs)
+ # Add all revisions that are head of some snapshot branch to the corresponding
+ # origin.
+ rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
+ for status in data["origin_visit_status"]:
+ if status["snapshot"] is not None:
+ for snapshot in data["snapshot"]:
+ if snapshot["id"] == status["snapshot"]:
+ for branch in snapshot["branches"].values():
+ if branch["target_type"] == "revision":
+ rev_in_org.setdefault(branch["target"], set()).add(
+ RelationData(
+ dst=Origin(url=status["origin"]).id,
+ path=None,
+ )
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_IN_ORG, rev_in_org
+ )
-def test_provenance_storage_origin(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+ # Test revision-before-revision relation.
+ # For each revision in the data set add an entry for each parent to the relation.
+ rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ for parent in rev["parents"]:
+ rev_before_rev.setdefault(parent, set()).add(
+ RelationData(dst=rev["id"], path=None)
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
+ )
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ def test_provenance_storage_find(
+ self,
+ provenance: ProvenanceInterface,
+ provenance_storage: ProvenanceStorageInterface,
+ archive: ArchiveInterface,
+ ) -> None:
+ """Tests `content_find_first` and `content_find_all` methods for every
+ `ProvenanceStorageInterface` implementation.
+ """
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+ fill_storage(archive.storage, data)
+
+ # Test content_find_first and content_find_all, first only executing the
+ # revision-content algorithm, then adding the origin-revision layer.
+ def adapt_result(
+ result: Optional[ProvenanceResult], with_path: bool
+ ) -> Optional[ProvenanceResult]:
+ if result is not None:
+ return ProvenanceResult(
+ result.content,
+ result.revision,
+ result.date,
+ result.origin,
+ result.path if with_path else b"",
+ )
+ return result
+
+ # Execute the revision-content algorithm on both storages.
+ revisions = [
+ RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
+ for rev in data["revision"]
+ ]
+ revision_add(provenance, archive, revisions)
+ revision_add(Provenance(provenance_storage), archive, revisions)
- # Test origin methods.
- # Add all origins present in the current repo to the storage. Then check that the
- # returned results when querying are the same.
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert orgs
- assert provenance_storage.origin_add(orgs)
- assert provenance_storage.origin_get(set(orgs.keys())) == orgs
- assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin=None,
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
+ )
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ # Execute the origin-revision algorithm on both storages.
+ origins = [
+ OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
+ for sta in data["origin_visit_status"]
+ if sta["snapshot"] is not None
+ ]
+ origin_add(provenance, archive, origins)
+ origin_add(Provenance(provenance_storage), archive, origins)
-def test_provenance_storage_revision(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test revision methods.
- # Add all revisions present in the current repo to the storage, assigning their
- # dates and an arbitrary origin to each one. Then check that the returned results
- # when querying are the same.
- origin = Origin(url=next(iter(data["origin"]))["url"])
- # Origin must be inserted in advance.
- assert provenance_storage.origin_add({origin.id: origin.url})
-
- revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
- rev_data = {
- rev["id"]: RevisionData(
- date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
- origin=origin.id if idx % 3 != 0 else None,
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin="https://cmdbts2",
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
)
- for idx, rev in enumerate(data["revision"])
- if idx % 6 != 0
- }
- assert revs
- assert provenance_storage.revision_add(revs)
- assert provenance_storage.revision_add(rev_data)
- assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
- assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
- rev_data.keys()
- )
+
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
+ """Checks all methods of ProvenanceStorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type("_", (ProvenanceStorageInterface,), {})()
+
+ assert "content_find_first" in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith("_"):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(provenance_storage, meth_name)
+ except AttributeError:
+ if not getattr(interface_meth, "deprecated_endpoint", False):
+ # The backend is missing a (non-deprecated) endpoint
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
+ # If all the assertions above succeed, then this one should too.
+ # But there's no harm in double-checking.
+ # And we could replace the assertions above by this one, but unlike
+ # the assertions above, it doesn't explain what is missing.
+ assert isinstance(provenance_storage, ProvenanceStorageInterface)
def dircontent(
@@ -255,209 +468,3 @@
}
for src_sha1, rels in expected.items()
} == computed
-
-
-def test_provenance_storage_relation(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test content-in-revision relation.
- # Create flat models of every root directory for the revisions in the dataset.
- cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- root = next(
- subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
- )
- for cnt, rel in dircontent(data, rev["id"], root):
- cnt_in_rev.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
- )
-
- # Test content-in-directory relation.
- # Create flat models for every directory in the dataset.
- cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
- for dir in data["directory"]:
- for cnt, rel in dircontent(data, dir["id"], dir):
- cnt_in_dir.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
- )
-
- # Test content-in-directory relation.
- # Add root directories to their correspondent revision in the dataset.
- dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- dir_in_rev.setdefault(rev["directory"], set()).add(
- RelationData(dst=rev["id"], path=b".")
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
- )
-
- # Test revision-in-origin relation.
- # Origins must be inserted in advance (cannot be done by `entity_add` inside
- # `relation_add_and_compare_result`).
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert provenance_storage.origin_add(orgs)
- # Add all revisions that are head of some snapshot branch to the corresponding
- # origin.
- rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
- for status in data["origin_visit_status"]:
- if status["snapshot"] is not None:
- for snapshot in data["snapshot"]:
- if snapshot["id"] == status["snapshot"]:
- for branch in snapshot["branches"].values():
- if branch["target_type"] == "revision":
- rev_in_org.setdefault(branch["target"], set()).add(
- RelationData(
- dst=Origin(url=status["origin"]).id,
- path=None,
- )
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_IN_ORG, rev_in_org
- )
-
- # Test revision-before-revision relation.
- # For each revision in the data set add an entry for each parent to the relation.
- rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- for parent in rev["parents"]:
- rev_before_rev.setdefault(parent, set()).add(
- RelationData(dst=rev["id"], path=None)
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
- )
-
-
-def test_provenance_storage_find(
- provenance: ProvenanceInterface,
- provenance_storage: ProvenanceStorageInterface,
- archive: ArchiveInterface,
-) -> None:
- """Tests `content_find_first` and `content_find_all` methods for every
- `ProvenanceStorageInterface` implementation.
- """
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
- fill_storage(archive.storage, data)
-
- # Test content_find_first and content_find_all, first only executing the
- # revision-content algorithm, then adding the origin-revision layer.
- def adapt_result(
- result: Optional[ProvenanceResult], with_path: bool
- ) -> Optional[ProvenanceResult]:
- if result is not None:
- return ProvenanceResult(
- result.content,
- result.revision,
- result.date,
- result.origin,
- result.path if with_path else b"",
- )
- return result
-
- # Execute the revision-content algorithm on both storages.
- revisions = [
- RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
- for rev in data["revision"]
- ]
- revision_add(provenance, archive, revisions)
- revision_add(Provenance(provenance_storage), archive, revisions)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin=None,
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
- # Execute the origin-revision algorithm on both storages.
- origins = [
- OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
- for sta in data["origin_visit_status"]
- if sta["snapshot"] is not None
- ]
- origin_add(provenance, archive, origins)
- origin_add(Provenance(provenance_storage), archive, origins)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin="https://cmdbts2",
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
-
-def test_types(provenance_storage: ProvenanceStorageInterface) -> None:
- """Checks all methods of ProvenanceStorageInterface are implemented by this
- backend, and that they have the same signature."""
- # Create an instance of the protocol (which cannot be instantiated
- # directly, so this creates a subclass, then instantiates it)
- interface = type("_", (ProvenanceStorageInterface,), {})()
-
- assert "content_find_first" in dir(interface)
-
- missing_methods = []
-
- for meth_name in dir(interface):
- if meth_name.startswith("_"):
- continue
- interface_meth = getattr(interface, meth_name)
- try:
- concrete_meth = getattr(provenance_storage, meth_name)
- except AttributeError:
- if not getattr(interface_meth, "deprecated_endpoint", False):
- # The backend is missing a (non-deprecated) endpoint
- missing_methods.append(meth_name)
- continue
-
- expected_signature = inspect.signature(interface_meth)
- actual_signature = inspect.signature(concrete_meth)
-
- assert expected_signature == actual_signature, meth_name
-
- assert missing_methods == []
-
- # If all the assertions above succeed, then this one should too.
- # But there's no harm in double-checking.
- # And we could replace the assertions above by this one, but unlike
- # the assertions above, it doesn't explain what is missing.
- assert isinstance(provenance_storage, ProvenanceStorageInterface)
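
Note: moving these functions into a `TestProvenanceStorage` class is what lets the new per-flavor and RabbitMQ modules below reuse them: pytest collects any `Test*` name it finds in a module's namespace, including one bound by an import, and runs it against that module's fixtures. A minimal illustration with hypothetical names:

    import pytest

    class _SharedStorageTests:
        # Would normally live in a shared module and be imported.
        def test_backend(self, backend):
            assert backend == "module-local backend"

    @pytest.fixture()
    def backend():
        return "module-local backend"

    # Binding the class under a Test* name makes pytest collect its
    # tests here, against this module's fixture definitions.
    TestStorage = _SharedStorageTests
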
diff --git a/swh/provenance/tests/test_provenance_storage_rabbitmq.py b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Generator
+
+import pytest
+
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+
+@pytest.fixture()
+def provenance_storage(
+ provenance_postgresqldb: Dict[str, str],
+ rabbitmq,
+) -> Generator[ProvenanceStorageInterface, None, None]:
+ """Return a working and initialized ProvenanceStorageInterface object"""
+
+ from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
+
+ host = rabbitmq.args["host"]
+ port = rabbitmq.args["port"]
+ rabbitmq_params: Dict[str, Any] = {
+ "url": f"amqp://guest:guest@{host}:{port}/%2f",
+ "storage_config": {
+ "cls": "postgresql",
+ "db": provenance_postgresqldb,
+ "raise_on_commit": True,
+ },
+ }
+ server = ProvenanceStorageRabbitMQServer(
+ url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
+ )
+ server.start()
+ try:
+ with get_provenance_storage(cls="rabbitmq", **rabbitmq_params) as storage:
+ yield storage
+ finally:
+ server.stop()
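
Note: unlike the conftest.py fixture it replaces, this one wraps the broker session in try/finally, so `server.stop()` runs even when a test fails mid-session. The lifecycle pattern in isolation, with a stand-in server:

    import pytest

    class DummyServer:
        # Stand-in for ProvenanceStorageRabbitMQServer, for illustration.
        def start(self) -> None:
            self.running = True

        def stop(self) -> None:
            self.running = False

    @pytest.fixture()
    def service():
        server = DummyServer()
        server.start()
        try:
            yield server
        finally:
            server.stop()  # guaranteed shutdown, pass or fail
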
diff --git a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path.py b/swh/provenance/tests/test_provenance_storage_without_path.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
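
Note: taken together, the three new modules replace the old four-way fixture parametrization: the default "with-path" flavor is exercised by test_provenance_storage.py through conftest.py, and each remaining flavor gets a module that overrides `provenance_postgresql_proc`. The resulting layout:

    FLAVOR_MODULES = {
        "with-path": "test_provenance_storage.py (via conftest.py)",
        "with-path-denormalized": "test_provenance_storage_with_path_denormalized.py",
        "without-path": "test_provenance_storage_without_path.py",
        "without-path-denormalized": "test_provenance_storage_without_path_denormalized.py",
    }

    for flavor, module in FLAVOR_MODULES.items():
        print(f"{flavor}: {module}")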