D8591.id31021.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[db,http] >= 0.14
+swh.core[db,http] >= 2
swh.model >= 2.6.1
swh.storage
swh.graph >= 2.0.0
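
The bump to swh.core >= 2 is what provides `initialize_database_for_module` and the class-attribute schema versioning used throughout the rest of this diff. A purely illustrative environment check (stdlib `importlib.metadata` plus `packaging`, neither of which this diff itself adds as a dependency):

    from importlib.metadata import version
    from packaging.version import Version

    # Confirm the installed swh.core satisfies the new lower bound.
    assert Version(version("swh.core")) >= Version("2")
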
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -119,3 +119,6 @@
return rmq_storage
raise ValueError
+
+
+get_datastore = get_provenance_storage
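
The `get_datastore` alias lets swh.core's generic `swh db` tooling obtain a storage instance for this module. A minimal sketch of that lookup, assuming the tooling calls the alias with the same keyword arguments `get_provenance_storage` already accepts (the DSN dict below is illustrative):

    from swh.provenance import get_datastore

    storage = get_datastore(cls="postgresql", db={"dbname": "provenance"})
    # The instance carries the schema version the code expects (see below).
    print(storage.current_version)
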
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -51,6 +51,8 @@
class ProvenanceStoragePostgreSql:
+ current_version = 3
+
def __init__(
self, page_size: Optional[int] = None, raise_on_commit: bool = False, **kwargs
) -> None:
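
With swh.core >= 2 the expected schema version is carried by the storage class itself rather than by a row in a `dbversion` table, and the test fixtures below pass it to `initialize_database_for_module` explicitly. A minimal sketch of what the attribute exposes:

    from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

    # Replaces the "latest schema version" row formerly inserted by
    # 30-schema.sql (see the next file in this diff).
    assert ProvenanceStoragePostgreSql.current_version == 3
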
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -2,22 +2,6 @@
select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset
-create table dbversion
-(
- version int primary key,
- release timestamptz,
- description text
-);
-
-comment on table dbversion is 'Details of current db version';
-comment on column dbversion.version is 'SQL schema version';
-comment on column dbversion.release is 'Version deployment timestamp';
-comment on column dbversion.description is 'Release description';
-
--- latest schema version
-insert into dbversion(version, release, description)
- values(3, now(), 'Work In Progress');
-
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
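
Version bookkeeping thus moves out of the module's own schema: 30-schema.sql no longer creates `dbversion`, and swh.core records the version when it initializes the database. A hedged sketch of reading that recorded version back, assuming swh.core's `swh_db_version` helper and an illustrative DSN:

    from swh.core.db.db_utils import swh_db_version
    from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

    # "dbname=provenance" is a placeholder connection string.
    assert swh_db_version("dbname=provenance") == ProvenanceStoragePostgreSql.current_version
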
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -5,6 +5,7 @@
from contextlib import contextmanager
from datetime import datetime
+from functools import partial
import multiprocessing
from os import path
from pathlib import Path
@@ -15,87 +16,56 @@
import msgpack
import psycopg2.extensions
import pytest
-from pytest_postgresql.factories import postgresql
+from pytest_postgresql import factories
+from swh.core.db.db_utils import initialize_database_for_module
from swh.graph.http_rpc_server import make_app
from swh.journal.serializers import msgpack_ext_hook
from swh.model.model import BaseModel, TimestampWithTimezone
from swh.provenance import get_provenance, get_provenance_storage
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
from swh.provenance.storage.archive import ArchiveStorage
from swh.storage.interface import StorageInterface
from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects
-
-@pytest.fixture(
- params=[
- "with-path",
- "without-path",
- "with-path-denormalized",
- "without-path-denormalized",
- ]
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
)
-def provenance_postgresqldb(
- request: SubRequest,
- postgresql: psycopg2.extensions.connection,
-) -> Dict[str, str]:
- """return a working and initialized provenance db"""
- from swh.core.db.db_utils import (
- init_admin_extensions,
- populate_database_for_package,
- )
- init_admin_extensions("swh.provenance", postgresql.dsn)
- populate_database_for_package(
- "swh.provenance", postgresql.dsn, flavor=request.param
- )
- return postgresql.get_dsn_parameters()
+postgres_provenance = factories.postgresql("provenance_postgresql_proc")
-@pytest.fixture(params=["postgresql", "rabbitmq"])
+@pytest.fixture()
+def provenance_postgresqldb(request, postgres_provenance):
+ return postgres_provenance.get_dsn_parameters()
+
+
+@pytest.fixture()
def provenance_storage(
request: SubRequest,
provenance_postgresqldb: Dict[str, str],
) -> Generator[ProvenanceStorageInterface, None, None]:
"""Return a working and initialized ProvenanceStorageInterface object"""
- if request.param == "rabbitmq":
- from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
-
- rabbitmq = request.getfixturevalue("rabbitmq")
- host = rabbitmq.args["host"]
- port = rabbitmq.args["port"]
- rabbitmq_params: Dict[str, Any] = {
- "url": f"amqp://guest:guest@{host}:{port}/%2f",
- "storage_config": {
- "cls": "postgresql",
- "db": provenance_postgresqldb,
- "raise_on_commit": True,
- },
- }
- server = ProvenanceStorageRabbitMQServer(
- url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
- )
- server.start()
- with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage:
- yield storage
- server.stop()
-
- else:
- # in test sessions, we DO want to raise any exception occurring at commit time
- with get_provenance_storage(
- cls=request.param, db=provenance_postgresqldb, raise_on_commit=True
- ) as storage:
- yield storage
-
-
-provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests")
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ with get_provenance_storage(
+ cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True
+ ) as storage:
+ yield storage
@pytest.fixture
def provenance(
- provenance_postgresql: psycopg2.extensions.connection,
+ postgres_provenance: psycopg2.extensions.connection,
) -> Generator[ProvenanceInterface, None, None]:
"""Return a working and initialized ProvenanceInterface object"""
@@ -104,14 +74,14 @@
populate_database_for_package,
)
- init_admin_extensions("swh.provenance", provenance_postgresql.dsn)
+ init_admin_extensions("swh.provenance", postgres_provenance.dsn)
populate_database_for_package(
- "swh.provenance", provenance_postgresql.dsn, flavor="with-path"
+ "swh.provenance", postgres_provenance.dsn, flavor="with-path"
)
# in test sessions, we DO want to raise any exception occurring at commit time
with get_provenance(
cls="postgresql",
- db=provenance_postgresql.get_dsn_parameters(),
+ db=postgres_provenance.get_dsn_parameters(),
raise_on_commit=True,
) as provenance:
yield provenance
@@ -169,9 +139,15 @@
def run_grpc_server(queue, dataset_path):
try:
- config = {"graph": {"path": dataset_path}}
+ config = {
+ "graph": {
+ "cls": "local",
+ "grpc_server": {"path": dataset_path},
+ "http_rpc_server": {"debug": True},
+ }
+ }
with loop_context() as loop:
- app = make_app(config=config, debug=True, spawn_rpc_port=None)
+ app = make_app(config=config)
client = TestClient(TestServer(app), loop=loop)
loop.run_until_complete(client.start_server())
url = client.make_url("/graph/")
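
After this rework, the PostgreSQL test database is created and migrated by the `postgresql_proc` factory itself, so a test only has to request the fixture. A minimal hypothetical test (assuming `EntityType` is importable from swh.provenance.interface, as in the test modules below):

    from swh.provenance.interface import EntityType

    def test_starts_empty(provenance_storage):
        # A freshly initialized provenance database holds no content entities.
        assert provenance_storage.entity_get_all(EntityType.CONTENT) == set()
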
diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py
--- a/swh/provenance/tests/test_journal_client.py
+++ b/swh/provenance/tests/test_journal_client.py
@@ -35,7 +35,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test origin journal client cli"""
@@ -63,7 +63,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
@@ -89,7 +89,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test revision journal client cli"""
@@ -116,7 +116,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -28,130 +28,343 @@
from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
-def test_provenance_storage_content(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests content methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Add all content present in the current repo to the storage, just assigning their
- # creation dates. Then check that the returned results when querying are the same.
- cnt_dates = {
- cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
- }
- assert provenance_storage.content_add(cnt_dates)
- assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
- assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
- cnt_dates.keys()
- )
+class TestProvenanceStorage:
+ def test_provenance_storage_content(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests content methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all content present in the current repo to the storage, just assigning their
+ # creation dates. Then check that the returned results when querying are the same.
+ cnt_dates = {
+ cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
+ }
+ assert provenance_storage.content_add(cnt_dates)
+ assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
+ assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
+ cnt_dates.keys()
+ )

+ def test_provenance_storage_directory(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Of all directories present in the current repo, only assign a date to those
+ # containing blobs (picking the max date among the available ones). Then check that
+ # the returned results when querying are the same.
+ def getmaxdate(
+ directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
+ ) -> Optional[datetime]:
+ dates = [
+ content["ctime"]
+ for entry in directory["entries"]
+ for content in contents
+ if entry["type"] == "file" and entry["target"] == content["sha1_git"]
+ ]
+ return max(dates) if dates else None
+
+ flat_values = (False, True)
+ dir_dates = {}
+ for idx, dir in enumerate(data["directory"]):
+ date = getmaxdate(dir, data["content"])
+ if date is not None:
+ dir_dates[dir["id"]] = DirectoryData(
+ date=date, flat=flat_values[idx % 2]
+ )
+ assert provenance_storage.directory_add(dir_dates)
+ assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
+ assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
+ dir_dates.keys()
+ )
-def test_provenance_storage_directory(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Of all directories present in the current repo, only assign a date to those
- # containing blobs (picking the max date among the available ones). Then check that
- # the returned results when querying are the same.
- def getmaxdate(
- directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
- ) -> Optional[datetime]:
- dates = [
- content["ctime"]
- for entry in directory["entries"]
- for content in contents
- if entry["type"] == "file" and entry["target"] == content["sha1_git"]
- ]
- return max(dates) if dates else None
-
- flat_values = (False, True)
- dir_dates = {}
- for idx, dir in enumerate(data["directory"]):
- date = getmaxdate(dir, data["content"])
- if date is not None:
- dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
- assert provenance_storage.directory_add(dir_dates)
- assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
- assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
- dir_dates.keys()
- )
+ def test_provenance_storage_location(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all names of entries present in the directories of the current repo as paths
+ # to the storage. Then check that the returned results when querying are the same.
+ paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
+ assert provenance_storage.location_add(paths)
+
+ if provenance_storage.with_path():
+ assert provenance_storage.location_get_all() == paths
+ else:
+ assert provenance_storage.location_get_all() == set()
+
+ def test_provenance_storage_origin(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test origin methods.
+ # Add all origins present in the current repo to the storage. Then check that the
+ # returned results when querying are the same.
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert orgs
+ assert provenance_storage.origin_add(orgs)
+ assert provenance_storage.origin_get(set(orgs.keys())) == orgs
+ assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+
+ def test_provenance_storage_revision(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test revision methods.
+ # Add all revisions present in the current repo to the storage, assigning their
+ # dates and an arbitrary origin to each one. Then check that the returned results
+ # when querying are the same.
+ origin = Origin(url=next(iter(data["origin"]))["url"])
+ # Origin must be inserted in advance.
+ assert provenance_storage.origin_add({origin.id: origin.url})
+
+ revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
+ rev_data = {
+ rev["id"]: RevisionData(
+ date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
+ origin=origin.id if idx % 3 != 0 else None,
+ )
+ for idx, rev in enumerate(data["revision"])
+ if idx % 6 != 0
+ }
+ assert revs
+ assert provenance_storage.revision_add(revs)
+ assert provenance_storage.revision_add(rev_data)
+ assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
+ assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
+ rev_data.keys()
+ )

+ def test_provenance_storage_relation(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-def test_provenance_storage_location(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ # Test content-in-revision relation.
+ # Create flat models of every root directory for the revisions in the dataset.
+ cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ root = next(
+ subdir
+ for subdir in data["directory"]
+ if subdir["id"] == rev["directory"]
+ )
+ for cnt, rel in dircontent(data, rev["id"], root):
+ cnt_in_rev.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
+ )
- # Add all names of entries present in the directories of the current repo as paths
- # to the storage. Then check that the returned results when querying are the same.
- paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
- assert provenance_storage.location_add(paths)
+ # Test content-in-directory relation.
+ # Create flat models for every directory in the dataset.
+ cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
+ for dir in data["directory"]:
+ for cnt, rel in dircontent(data, dir["id"], dir):
+ cnt_in_dir.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
+ )
- if provenance_storage.with_path():
- assert provenance_storage.location_get_all() == paths
- else:
- assert provenance_storage.location_get_all() == set()
+ # Test directory-in-revision relation.
+ # Add root directories to their corresponding revision in the dataset.
+ dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ dir_in_rev.setdefault(rev["directory"], set()).add(
+ RelationData(dst=rev["id"], path=b".")
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
+ )

+ # Test revision-in-origin relation.
+ # Origins must be inserted in advance (cannot be done by `entity_add` inside
+ # `relation_add_and_compare_result`).
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert provenance_storage.origin_add(orgs)
+ # Add all revisions that are head of some snapshot branch to the corresponding
+ # origin.
+ rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
+ for status in data["origin_visit_status"]:
+ if status["snapshot"] is not None:
+ for snapshot in data["snapshot"]:
+ if snapshot["id"] == status["snapshot"]:
+ for branch in snapshot["branches"].values():
+ if branch["target_type"] == "revision":
+ rev_in_org.setdefault(branch["target"], set()).add(
+ RelationData(
+ dst=Origin(url=status["origin"]).id,
+ path=None,
+ )
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_IN_ORG, rev_in_org
+ )
-def test_provenance_storage_origin(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+ # Test revision-before-revision relation.
+ # For each revision in the dataset, add an entry for each parent to the relation.
+ rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ for parent in rev["parents"]:
+ rev_before_rev.setdefault(parent, set()).add(
+ RelationData(dst=rev["id"], path=None)
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
+ )
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ def test_provenance_storage_find(
+ self,
+ provenance: ProvenanceInterface,
+ provenance_storage: ProvenanceStorageInterface,
+ archive: ArchiveInterface,
+ ) -> None:
+ """Tests `content_find_first` and `content_find_all` methods for every
+ `ProvenanceStorageInterface` implementation.
+ """
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+ fill_storage(archive.storage, data)
+
+ # Test content_find_first and content_find_all, first only executing the
+ # revision-content algorithm, then adding the origin-revision layer.
+ def adapt_result(
+ result: Optional[ProvenanceResult], with_path: bool
+ ) -> Optional[ProvenanceResult]:
+ if result is not None:
+ return ProvenanceResult(
+ result.content,
+ result.revision,
+ result.date,
+ result.origin,
+ result.path if with_path else b"",
+ )
+ return result
+
+ # Execute the revision-content algorithm on both storages.
+ revisions = [
+ RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
+ for rev in data["revision"]
+ ]
+ revision_add(provenance, archive, revisions)
+ revision_add(Provenance(provenance_storage), archive, revisions)
- # Test origin methods.
- # Add all origins present in the current repo to the storage. Then check that the
- # returned results when querying are the same.
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert orgs
- assert provenance_storage.origin_add(orgs)
- assert provenance_storage.origin_get(set(orgs.keys())) == orgs
- assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin=None,
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
+ )
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ # Execute the origin-revision algorithm on both storages.
+ origins = [
+ OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
+ for sta in data["origin_visit_status"]
+ if sta["snapshot"] is not None
+ ]
+ origin_add(provenance, archive, origins)
+ origin_add(Provenance(provenance_storage), archive, origins)
-def test_provenance_storage_revision(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test revision methods.
- # Add all revisions present in the current repo to the storage, assigning their
- # dates and an arbitrary origin to each one. Then check that the returned results
- # when querying are the same.
- origin = Origin(url=next(iter(data["origin"]))["url"])
- # Origin must be inserted in advance.
- assert provenance_storage.origin_add({origin.id: origin.url})
-
- revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
- rev_data = {
- rev["id"]: RevisionData(
- date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
- origin=origin.id if idx % 3 != 0 else None,
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin="https://cmdbts2",
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
)
- for idx, rev in enumerate(data["revision"])
- if idx % 6 != 0
- }
- assert revs
- assert provenance_storage.revision_add(revs)
- assert provenance_storage.revision_add(rev_data)
- assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
- assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
- rev_data.keys()
- )
+
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
+ """Checks all methods of ProvenanceStorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type("_", (ProvenanceStorageInterface,), {})()
+
+ assert "content_find_first" in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith("_"):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(provenance_storage, meth_name)
+ except AttributeError:
+ if not getattr(interface_meth, "deprecated_endpoint", False):
+ # The backend is missing a (non-deprecated) endpoint
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
+ # If all the assertions above succeed, then this one should too.
+ # But there's no harm in double-checking.
+ # And we could replace the assertions above by this one, but unlike
+ # the assertions above, it doesn't explain what is missing.
+ assert isinstance(provenance_storage, ProvenanceStorageInterface)
def dircontent(
@@ -255,209 +468,3 @@
}
for src_sha1, rels in expected.items()
} == computed
-
-
-def test_provenance_storage_relation(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test content-in-revision relation.
- # Create flat models of every root directory for the revisions in the dataset.
- cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- root = next(
- subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
- )
- for cnt, rel in dircontent(data, rev["id"], root):
- cnt_in_rev.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
- )
-
- # Test content-in-directory relation.
- # Create flat models for every directory in the dataset.
- cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
- for dir in data["directory"]:
- for cnt, rel in dircontent(data, dir["id"], dir):
- cnt_in_dir.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
- )
-
- # Test content-in-directory relation.
- # Add root directories to their correspondent revision in the dataset.
- dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- dir_in_rev.setdefault(rev["directory"], set()).add(
- RelationData(dst=rev["id"], path=b".")
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
- )
-
- # Test revision-in-origin relation.
- # Origins must be inserted in advance (cannot be done by `entity_add` inside
- # `relation_add_and_compare_result`).
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert provenance_storage.origin_add(orgs)
- # Add all revisions that are head of some snapshot branch to the corresponding
- # origin.
- rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
- for status in data["origin_visit_status"]:
- if status["snapshot"] is not None:
- for snapshot in data["snapshot"]:
- if snapshot["id"] == status["snapshot"]:
- for branch in snapshot["branches"].values():
- if branch["target_type"] == "revision":
- rev_in_org.setdefault(branch["target"], set()).add(
- RelationData(
- dst=Origin(url=status["origin"]).id,
- path=None,
- )
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_IN_ORG, rev_in_org
- )
-
- # Test revision-before-revision relation.
- # For each revision in the data set add an entry for each parent to the relation.
- rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- for parent in rev["parents"]:
- rev_before_rev.setdefault(parent, set()).add(
- RelationData(dst=rev["id"], path=None)
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
- )
-
-
-def test_provenance_storage_find(
- provenance: ProvenanceInterface,
- provenance_storage: ProvenanceStorageInterface,
- archive: ArchiveInterface,
-) -> None:
- """Tests `content_find_first` and `content_find_all` methods for every
- `ProvenanceStorageInterface` implementation.
- """
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
- fill_storage(archive.storage, data)
-
- # Test content_find_first and content_find_all, first only executing the
- # revision-content algorithm, then adding the origin-revision layer.
- def adapt_result(
- result: Optional[ProvenanceResult], with_path: bool
- ) -> Optional[ProvenanceResult]:
- if result is not None:
- return ProvenanceResult(
- result.content,
- result.revision,
- result.date,
- result.origin,
- result.path if with_path else b"",
- )
- return result
-
- # Execute the revision-content algorithm on both storages.
- revisions = [
- RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
- for rev in data["revision"]
- ]
- revision_add(provenance, archive, revisions)
- revision_add(Provenance(provenance_storage), archive, revisions)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin=None,
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
- # Execute the origin-revision algorithm on both storages.
- origins = [
- OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
- for sta in data["origin_visit_status"]
- if sta["snapshot"] is not None
- ]
- origin_add(provenance, archive, origins)
- origin_add(Provenance(provenance_storage), archive, origins)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin="https://cmdbts2",
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
-
-def test_types(provenance_storage: ProvenanceStorageInterface) -> None:
- """Checks all methods of ProvenanceStorageInterface are implemented by this
- backend, and that they have the same signature."""
- # Create an instance of the protocol (which cannot be instantiated
- # directly, so this creates a subclass, then instantiates it)
- interface = type("_", (ProvenanceStorageInterface,), {})()
-
- assert "content_find_first" in dir(interface)
-
- missing_methods = []
-
- for meth_name in dir(interface):
- if meth_name.startswith("_"):
- continue
- interface_meth = getattr(interface, meth_name)
- try:
- concrete_meth = getattr(provenance_storage, meth_name)
- except AttributeError:
- if not getattr(interface_meth, "deprecated_endpoint", False):
- # The backend is missing a (non-deprecated) endpoint
- missing_methods.append(meth_name)
- continue
-
- expected_signature = inspect.signature(interface_meth)
- actual_signature = inspect.signature(concrete_meth)
-
- assert expected_signature == actual_signature, meth_name
-
- assert missing_methods == []
-
- # If all the assertions above succeed, then this one should too.
- # But there's no harm in double-checking.
- # And we could replace the assertions above by this one, but unlike
- # the assertions above, it doesn't explain what is missing.
- assert isinstance(provenance_storage, ProvenanceStorageInterface)
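
Moving the tests into a class is what enables the new per-backend modules that follow: each module re-imports `TestProvenanceStorage` (so pytest collects all of its tests again) and only swaps the fixture or database flavor underneath. The pattern, sketched for a hypothetical extra backend module:

    import pytest

    from swh.provenance import get_provenance_storage

    from .test_provenance_storage import TestProvenanceStorage  # noqa: F401

    @pytest.fixture()
    def provenance_storage(provenance_postgresqldb):
        # Override the shared fixture; any ProvenanceStorageInterface
        # implementation can be substituted here.
        with get_provenance_storage(
            cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True
        ) as storage:
            yield storage
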
diff --git a/swh/provenance/tests/test_provenance_storage_rabbitmq.py b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Generator
+
+import pytest
+
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+
+@pytest.fixture()
+def provenance_storage(
+ provenance_postgresqldb: Dict[str, str],
+ rabbitmq,
+) -> Generator[ProvenanceStorageInterface, None, None]:
+ """Return a working and initialized ProvenanceStorageInterface object"""
+
+ from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
+
+ host = rabbitmq.args["host"]
+ port = rabbitmq.args["port"]
+ rabbitmq_params: Dict[str, Any] = {
+ "url": f"amqp://guest:guest@{host}:{port}/%2f",
+ "storage_config": {
+ "cls": "postgresql",
+ "db": provenance_postgresqldb,
+ "raise_on_commit": True,
+ },
+ }
+ server = ProvenanceStorageRabbitMQServer(
+ url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
+ )
+ server.start()
+ try:
+ with get_provenance_storage(cls="rabbitmq", **rabbitmq_params) as storage:
+ yield storage
+ finally:
+ server.stop()
diff --git a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path.py b/swh/provenance/tests/test_provenance_storage_without_path.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
