D8591.id31021.diff
Adapt postgresql backend to swh.core.db >= 2.0
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[db,http] >= 0.14
+swh.core[db,http] >= 2
swh.model >= 2.6.1
swh.storage
swh.graph >= 2.0.0
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -119,3 +119,6 @@
return rmq_storage
raise ValueError
+
+
+get_datastore = get_provenance_storage
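
Note: `get_datastore` is the module-level factory that swh.core.db >= 2 expects each backend package to expose, so that generic database tooling can locate the datastore and read its schema version; aliasing it to `get_provenance_storage` satisfies that convention. A hypothetical sketch of the discovery side (not swh.core's actual code):

    from importlib import import_module

    def datastore_version(modname: str, **cfg) -> int:
        # Import swh.<modname> and call its get_datastore() factory;
        # for swh.provenance this resolves to get_provenance_storage.
        module = import_module(f"swh.{modname}")
        datastore = module.get_datastore(**cfg)
        return datastore.current_version  # declared on the datastore class
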
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -51,6 +51,8 @@
class ProvenanceStoragePostgreSql:
+ current_version = 3
+
def __init__(
self, page_size: Optional[int] = None, raise_on_commit: bool = False, **kwargs
) -> None:
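
Note: with the bump to swh.core.db >= 2, the expected schema version moves out of the database (the `dbversion` table removed below) and into the `current_version` class attribute, where shared tooling such as `initialize_database_for_module` can read it. A minimal sketch of the convention, with a hypothetical guard for illustration:

    class ExampleDatastore:
        current_version = 3  # schema version this code is written against

    def assert_compatible(datastore: ExampleDatastore, db_version: int) -> None:
        # Hypothetical check, for illustration only: refuse to run against
        # a database whose schema lags behind (or leads) the code.
        if db_version != datastore.current_version:
            raise RuntimeError(
                f"database schema v{db_version} does not match "
                f"code schema v{datastore.current_version}"
            )
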
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -2,22 +2,6 @@
select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset
-create table dbversion
-(
- version int primary key,
- release timestamptz,
- description text
-);
-
-comment on table dbversion is 'Details of current db version';
-comment on column dbversion.version is 'SQL schema version';
-comment on column dbversion.release is 'Version deployment timestamp';
-comment on column dbversion.description is 'Release description';
-
--- latest schema version
-insert into dbversion(version, release, description)
- values(3, now(), 'Work In Progress');
-
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
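
Note: dropping the table is safe because swh.core.db >= 2 records the schema version through its own bookkeeping when `initialize_database_for_module` runs. A short sketch, assuming the `swh_db_version` helper from `swh.core.db.db_utils`:

    from swh.core.db.db_utils import swh_db_version

    def provenance_schema_version(conninfo: str) -> int:
        # conninfo is a libpq DSN, e.g. "dbname=provenance"
        return swh_db_version(conninfo)
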
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -5,6 +5,7 @@
from contextlib import contextmanager
from datetime import datetime
+from functools import partial
import multiprocessing
from os import path
from pathlib import Path
@@ -15,87 +16,56 @@
import msgpack
import psycopg2.extensions
import pytest
-from pytest_postgresql.factories import postgresql
+from pytest_postgresql import factories
+from swh.core.db.db_utils import initialize_database_for_module
from swh.graph.http_rpc_server import make_app
from swh.journal.serializers import msgpack_ext_hook
from swh.model.model import BaseModel, TimestampWithTimezone
from swh.provenance import get_provenance, get_provenance_storage
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
from swh.provenance.storage.archive import ArchiveStorage
from swh.storage.interface import StorageInterface
from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects
-
-@pytest.fixture(
- params=[
- "with-path",
- "without-path",
- "with-path-denormalized",
- "without-path-denormalized",
- ]
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
)
-def provenance_postgresqldb(
- request: SubRequest,
- postgresql: psycopg2.extensions.connection,
-) -> Dict[str, str]:
- """return a working and initialized provenance db"""
- from swh.core.db.db_utils import (
- init_admin_extensions,
- populate_database_for_package,
- )
- init_admin_extensions("swh.provenance", postgresql.dsn)
- populate_database_for_package(
- "swh.provenance", postgresql.dsn, flavor=request.param
- )
- return postgresql.get_dsn_parameters()
+postgres_provenance = factories.postgresql("provenance_postgresql_proc")
-@pytest.fixture(params=["postgresql", "rabbitmq"])
+@pytest.fixture()
+def provenance_postgresqldb(request, postgres_provenance):
+ return postgres_provenance.get_dsn_parameters()
+
+
+@pytest.fixture()
def provenance_storage(
request: SubRequest,
provenance_postgresqldb: Dict[str, str],
) -> Generator[ProvenanceStorageInterface, None, None]:
"""Return a working and initialized ProvenanceStorageInterface object"""
- if request.param == "rabbitmq":
- from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
-
- rabbitmq = request.getfixturevalue("rabbitmq")
- host = rabbitmq.args["host"]
- port = rabbitmq.args["port"]
- rabbitmq_params: Dict[str, Any] = {
- "url": f"amqp://guest:guest@{host}:{port}/%2f",
- "storage_config": {
- "cls": "postgresql",
- "db": provenance_postgresqldb,
- "raise_on_commit": True,
- },
- }
- server = ProvenanceStorageRabbitMQServer(
- url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
- )
- server.start()
- with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage:
- yield storage
- server.stop()
-
- else:
- # in test sessions, we DO want to raise any exception occurring at commit time
- with get_provenance_storage(
- cls=request.param, db=provenance_postgresqldb, raise_on_commit=True
- ) as storage:
- yield storage
-
-
-provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests")
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ with get_provenance_storage(
+ cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True
+ ) as storage:
+ yield storage
@pytest.fixture
def provenance(
- provenance_postgresql: psycopg2.extensions.connection,
+ postgres_provenance: psycopg2.extensions.connection,
) -> Generator[ProvenanceInterface, None, None]:
"""Return a working and initialized ProvenanceInterface object"""
@@ -104,14 +74,14 @@
populate_database_for_package,
)
- init_admin_extensions("swh.provenance", provenance_postgresql.dsn)
+ init_admin_extensions("swh.provenance", postgres_provenance.dsn)
populate_database_for_package(
- "swh.provenance", provenance_postgresql.dsn, flavor="with-path"
+ "swh.provenance", postgres_provenance.dsn, flavor="with-path"
)
# in test sessions, we DO want to raise any exception occurring at commit time
with get_provenance(
cls="postgresql",
- db=provenance_postgresql.get_dsn_parameters(),
+ db=postgres_provenance.get_dsn_parameters(),
raise_on_commit=True,
) as provenance:
yield provenance
@@ -169,9 +139,15 @@
def run_grpc_server(queue, dataset_path):
try:
- config = {"graph": {"path": dataset_path}}
+ config = {
+ "graph": {
+ "cls": "local",
+ "grpc_server": {"path": dataset_path},
+ "http_rpc_server": {"debug": True},
+ }
+ }
with loop_context() as loop:
- app = make_app(config=config, debug=True, spawn_rpc_port=None)
+ app = make_app(config=config)
client = TestClient(TestServer(app), loop=loop)
loop.run_until_complete(client.start_server())
url = client.make_url("/graph/")
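
Note: the rewritten conftest.py delegates database setup to pytest_postgresql's factories: the `load` callbacks run once, against the template database, so every test receives an already-initialized copy instead of re-running `populate_database_for_package` inside the fixture body. The same pattern, sketched for a hypothetical module:

    from functools import partial

    import pytest
    from pytest_postgresql import factories

    from swh.core.db.db_utils import initialize_database_for_module

    # One process fixture per flavor; its load hooks run a single time.
    example_postgresql_proc = factories.postgresql_proc(
        load=[
            partial(
                initialize_database_for_module,
                modname="example",  # hypothetical swh module name
                version=1,          # hypothetical schema version
            )
        ],
    )
    example_db = factories.postgresql("example_postgresql_proc")

    @pytest.fixture()
    def example_dsn(example_db) -> dict:
        return example_db.get_dsn_parameters()
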
diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py
--- a/swh/provenance/tests/test_journal_client.py
+++ b/swh/provenance/tests/test_journal_client.py
@@ -35,7 +35,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test origin journal client cli"""
@@ -63,7 +63,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
@@ -89,7 +89,7 @@
consumer: Consumer,
tmp_path: str,
provenance,
- provenance_postgresql,
+ postgres_provenance,
) -> None:
"""Test revision journal client cli"""
@@ -116,7 +116,7 @@
},
"storage": {
"cls": "postgresql",
- "db": provenance_postgresql.get_dsn_parameters(),
+ "db": postgres_provenance.get_dsn_parameters(),
},
},
}
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -28,130 +28,343 @@
from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
-def test_provenance_storage_content(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests content methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Add all content present in the current repo to the storage, just assigning their
- # creation dates. Then check that the returned results when querying are the same.
- cnt_dates = {
- cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
- }
- assert provenance_storage.content_add(cnt_dates)
- assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
- assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
- cnt_dates.keys()
- )
+class TestProvenanceStorage:
+ def test_provenance_storage_content(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests content methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all content present in the current repo to the storage, just assigning their
+ # creation dates. Then check that the returned results when querying are the same.
+ cnt_dates = {
+ cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
+ }
+ assert provenance_storage.content_add(cnt_dates)
+ assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
+ assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
+ cnt_dates.keys()
+ )
+ def test_provenance_storage_directory(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Of all directories present in the current repo, only assign a date to those
+ # containing blobs (picking the max date among the available ones). Then check that
+ # the returned results when querying are the same.
+ def getmaxdate(
+ directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
+ ) -> Optional[datetime]:
+ dates = [
+ content["ctime"]
+ for entry in directory["entries"]
+ for content in contents
+ if entry["type"] == "file" and entry["target"] == content["sha1_git"]
+ ]
+ return max(dates) if dates else None
+
+ flat_values = (False, True)
+ dir_dates = {}
+ for idx, dir in enumerate(data["directory"]):
+ date = getmaxdate(dir, data["content"])
+ if date is not None:
+ dir_dates[dir["id"]] = DirectoryData(
+ date=date, flat=flat_values[idx % 2]
+ )
+ assert provenance_storage.directory_add(dir_dates)
+ assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
+ assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
+ dir_dates.keys()
+ )
-def test_provenance_storage_directory(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Of all directories present in the current repo, only assign a date to those
- # containing blobs (picking the max date among the available ones). Then check that
- # the returned results when querying are the same.
- def getmaxdate(
- directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
- ) -> Optional[datetime]:
- dates = [
- content["ctime"]
- for entry in directory["entries"]
- for content in contents
- if entry["type"] == "file" and entry["target"] == content["sha1_git"]
- ]
- return max(dates) if dates else None
-
- flat_values = (False, True)
- dir_dates = {}
- for idx, dir in enumerate(data["directory"]):
- date = getmaxdate(dir, data["content"])
- if date is not None:
- dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
- assert provenance_storage.directory_add(dir_dates)
- assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
- assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
- dir_dates.keys()
- )
+ def test_provenance_storage_location(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Add all names of entries present in the directories of the current repo as paths
+ # to the storage. Then check that the returned results when querying are the same.
+ paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
+ assert provenance_storage.location_add(paths)
+
+ if provenance_storage.with_path():
+ assert provenance_storage.location_get_all() == paths
+ else:
+ assert provenance_storage.location_get_all() == set()
+
+ def test_provenance_storage_origin(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test origin methods.
+ # Add all origins present in the current repo to the storage. Then check that the
+ # returned results when querying are the same.
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert orgs
+ assert provenance_storage.origin_add(orgs)
+ assert provenance_storage.origin_get(set(orgs.keys())) == orgs
+ assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+
+ def test_provenance_storage_revision(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+
+ # Test revision methods.
+ # Add all revisions present in the current repo to the storage, assigning their
+ # dates and an arbitrary origin to each one. Then check that the returned results
+ # when querying are the same.
+ origin = Origin(url=next(iter(data["origin"]))["url"])
+ # Origin must be inserted in advance.
+ assert provenance_storage.origin_add({origin.id: origin.url})
+
+ revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
+ rev_data = {
+ rev["id"]: RevisionData(
+ date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
+ origin=origin.id if idx % 3 != 0 else None,
+ )
+ for idx, rev in enumerate(data["revision"])
+ if idx % 6 != 0
+ }
+ assert revs
+ assert provenance_storage.revision_add(revs)
+ assert provenance_storage.revision_add(rev_data)
+ assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
+ assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
+ rev_data.keys()
+ )
+ def test_provenance_storage_relation(
+ self,
+ provenance_storage: ProvenanceStorageInterface,
+ ) -> None:
+ """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-def test_provenance_storage_location(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ # Test content-in-revision relation.
+ # Create flat models of every root directory for the revisions in the dataset.
+ cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ root = next(
+ subdir
+ for subdir in data["directory"]
+ if subdir["id"] == rev["directory"]
+ )
+ for cnt, rel in dircontent(data, rev["id"], root):
+ cnt_in_rev.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
+ )
- # Add all names of entries present in the directories of the current repo as paths
- # to the storage. Then check that the returned results when querying are the same.
- paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
- assert provenance_storage.location_add(paths)
+ # Test content-in-directory relation.
+ # Create flat models for every directory in the dataset.
+ cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
+ for dir in data["directory"]:
+ for cnt, rel in dircontent(data, dir["id"], dir):
+ cnt_in_dir.setdefault(cnt, set()).add(rel)
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
+ )
- if provenance_storage.with_path():
- assert provenance_storage.location_get_all() == paths
- else:
- assert provenance_storage.location_get_all() == set()
+    # Test directory-in-revision relation.
+    # Add root directories to their corresponding revision in the dataset.
+ dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ dir_in_rev.setdefault(rev["directory"], set()).add(
+ RelationData(dst=rev["id"], path=b".")
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
+ )
+ # Test revision-in-origin relation.
+ # Origins must be inserted in advance (cannot be done by `entity_add` inside
+ # `relation_add_and_compare_result`).
+ orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+ assert provenance_storage.origin_add(orgs)
+ # Add all revisions that are head of some snapshot branch to the corresponding
+ # origin.
+ rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
+ for status in data["origin_visit_status"]:
+ if status["snapshot"] is not None:
+ for snapshot in data["snapshot"]:
+ if snapshot["id"] == status["snapshot"]:
+ for branch in snapshot["branches"].values():
+ if branch["target_type"] == "revision":
+ rev_in_org.setdefault(branch["target"], set()).add(
+ RelationData(
+ dst=Origin(url=status["origin"]).id,
+ path=None,
+ )
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_IN_ORG, rev_in_org
+ )
-def test_provenance_storage_origin(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+ # Test revision-before-revision relation.
+ # For each revision in the data set add an entry for each parent to the relation.
+ rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
+ for rev in data["revision"]:
+ for parent in rev["parents"]:
+ rev_before_rev.setdefault(parent, set()).add(
+ RelationData(dst=rev["id"], path=None)
+ )
+ relation_add_and_compare_result(
+ provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
+ )
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
+ def test_provenance_storage_find(
+ self,
+ provenance: ProvenanceInterface,
+ provenance_storage: ProvenanceStorageInterface,
+ archive: ArchiveInterface,
+ ) -> None:
+ """Tests `content_find_first` and `content_find_all` methods for every
+ `ProvenanceStorageInterface` implementation.
+ """
+
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data("cmdbts2")
+ fill_storage(archive.storage, data)
+
+ # Test content_find_first and content_find_all, first only executing the
+ # revision-content algorithm, then adding the origin-revision layer.
+ def adapt_result(
+ result: Optional[ProvenanceResult], with_path: bool
+ ) -> Optional[ProvenanceResult]:
+ if result is not None:
+ return ProvenanceResult(
+ result.content,
+ result.revision,
+ result.date,
+ result.origin,
+ result.path if with_path else b"",
+ )
+ return result
+
+ # Execute the revision-content algorithm on both storages.
+ revisions = [
+ RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
+ for rev in data["revision"]
+ ]
+ revision_add(provenance, archive, revisions)
+ revision_add(Provenance(provenance_storage), archive, revisions)
- # Test origin methods.
- # Add all origins present in the current repo to the storage. Then check that the
- # returned results when querying are the same.
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert orgs
- assert provenance_storage.origin_add(orgs)
- assert provenance_storage.origin_get(set(orgs.keys())) == orgs
- assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin=None,
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
+ )
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ # Execute the origin-revision algorithm on both storages.
+ origins = [
+ OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
+ for sta in data["origin_visit_status"]
+ if sta["snapshot"] is not None
+ ]
+ origin_add(provenance, archive, origins)
+ origin_add(Provenance(provenance_storage), archive, origins)
-def test_provenance_storage_revision(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test revision methods.
- # Add all revisions present in the current repo to the storage, assigning their
- # dates and an arbitrary origin to each one. Then check that the returned results
- # when querying are the same.
- origin = Origin(url=next(iter(data["origin"]))["url"])
- # Origin must be inserted in advance.
- assert provenance_storage.origin_add({origin.id: origin.url})
-
- revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
- rev_data = {
- rev["id"]: RevisionData(
- date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
- origin=origin.id if idx % 3 != 0 else None,
+ assert adapt_result(
+ ProvenanceResult(
+ content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+ origin="https://cmdbts2",
+ path=b"A/B/C/a",
+ ),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
)
- for idx, rev in enumerate(data["revision"])
- if idx % 6 != 0
- }
- assert revs
- assert provenance_storage.revision_add(revs)
- assert provenance_storage.revision_add(rev_data)
- assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
- assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
- rev_data.keys()
- )
+
+ for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt),
+ provenance_storage.with_path(),
+ ) == provenance_storage.content_find_first(cnt)
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
+
+ def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
+ """Checks all methods of ProvenanceStorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type("_", (ProvenanceStorageInterface,), {})()
+
+ assert "content_find_first" in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith("_"):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(provenance_storage, meth_name)
+ except AttributeError:
+ if not getattr(interface_meth, "deprecated_endpoint", False):
+ # The backend is missing a (non-deprecated) endpoint
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
+ # If all the assertions above succeed, then this one should too.
+ # But there's no harm in double-checking.
+ # And we could replace the assertions above by this one, but unlike
+ # the assertions above, it doesn't explain what is missing.
+ assert isinstance(provenance_storage, ProvenanceStorageInterface)
def dircontent(
@@ -255,209 +468,3 @@
}
for src_sha1, rels in expected.items()
} == computed
-
-
-def test_provenance_storage_relation(
- provenance_storage: ProvenanceStorageInterface,
-) -> None:
- """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
-
- # Test content-in-revision relation.
- # Create flat models of every root directory for the revisions in the dataset.
- cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- root = next(
- subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
- )
- for cnt, rel in dircontent(data, rev["id"], root):
- cnt_in_rev.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
- )
-
- # Test content-in-directory relation.
- # Create flat models for every directory in the dataset.
- cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
- for dir in data["directory"]:
- for cnt, rel in dircontent(data, dir["id"], dir):
- cnt_in_dir.setdefault(cnt, set()).add(rel)
- relation_add_and_compare_result(
- provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
- )
-
- # Test content-in-directory relation.
- # Add root directories to their correspondent revision in the dataset.
- dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- dir_in_rev.setdefault(rev["directory"], set()).add(
- RelationData(dst=rev["id"], path=b".")
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
- )
-
- # Test revision-in-origin relation.
- # Origins must be inserted in advance (cannot be done by `entity_add` inside
- # `relation_add_and_compare_result`).
- orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
- assert provenance_storage.origin_add(orgs)
- # Add all revisions that are head of some snapshot branch to the corresponding
- # origin.
- rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
- for status in data["origin_visit_status"]:
- if status["snapshot"] is not None:
- for snapshot in data["snapshot"]:
- if snapshot["id"] == status["snapshot"]:
- for branch in snapshot["branches"].values():
- if branch["target_type"] == "revision":
- rev_in_org.setdefault(branch["target"], set()).add(
- RelationData(
- dst=Origin(url=status["origin"]).id,
- path=None,
- )
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_IN_ORG, rev_in_org
- )
-
- # Test revision-before-revision relation.
- # For each revision in the data set add an entry for each parent to the relation.
- rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
- for rev in data["revision"]:
- for parent in rev["parents"]:
- rev_before_rev.setdefault(parent, set()).add(
- RelationData(dst=rev["id"], path=None)
- )
- relation_add_and_compare_result(
- provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
- )
-
-
-def test_provenance_storage_find(
- provenance: ProvenanceInterface,
- provenance_storage: ProvenanceStorageInterface,
- archive: ArchiveInterface,
-) -> None:
- """Tests `content_find_first` and `content_find_all` methods for every
- `ProvenanceStorageInterface` implementation.
- """
-
- # Read data/README.md for more details on how these datasets are generated.
- data = load_repo_data("cmdbts2")
- fill_storage(archive.storage, data)
-
- # Test content_find_first and content_find_all, first only executing the
- # revision-content algorithm, then adding the origin-revision layer.
- def adapt_result(
- result: Optional[ProvenanceResult], with_path: bool
- ) -> Optional[ProvenanceResult]:
- if result is not None:
- return ProvenanceResult(
- result.content,
- result.revision,
- result.date,
- result.origin,
- result.path if with_path else b"",
- )
- return result
-
- # Execute the revision-content algorithm on both storages.
- revisions = [
- RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
- for rev in data["revision"]
- ]
- revision_add(provenance, archive, revisions)
- revision_add(Provenance(provenance_storage), archive, revisions)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin=None,
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
- # Execute the origin-revision algorithm on both storages.
- origins = [
- OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
- for sta in data["origin_visit_status"]
- if sta["snapshot"] is not None
- ]
- origin_add(provenance, archive, origins)
- origin_add(Provenance(provenance_storage), archive, origins)
-
- assert adapt_result(
- ProvenanceResult(
- content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
- date=datetime.fromtimestamp(1000000000.0, timezone.utc),
- origin="https://cmdbts2",
- path=b"A/B/C/a",
- ),
- provenance_storage.with_path(),
- ) == provenance_storage.content_find_first(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
- )
-
- for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
- assert adapt_result(
- provenance.storage.content_find_first(cnt), provenance_storage.with_path()
- ) == provenance_storage.content_find_first(cnt)
- assert {
- adapt_result(occur, provenance_storage.with_path())
- for occur in provenance.storage.content_find_all(cnt)
- } == set(provenance_storage.content_find_all(cnt))
-
-
-def test_types(provenance_storage: ProvenanceStorageInterface) -> None:
- """Checks all methods of ProvenanceStorageInterface are implemented by this
- backend, and that they have the same signature."""
- # Create an instance of the protocol (which cannot be instantiated
- # directly, so this creates a subclass, then instantiates it)
- interface = type("_", (ProvenanceStorageInterface,), {})()
-
- assert "content_find_first" in dir(interface)
-
- missing_methods = []
-
- for meth_name in dir(interface):
- if meth_name.startswith("_"):
- continue
- interface_meth = getattr(interface, meth_name)
- try:
- concrete_meth = getattr(provenance_storage, meth_name)
- except AttributeError:
- if not getattr(interface_meth, "deprecated_endpoint", False):
- # The backend is missing a (non-deprecated) endpoint
- missing_methods.append(meth_name)
- continue
-
- expected_signature = inspect.signature(interface_meth)
- actual_signature = inspect.signature(concrete_meth)
-
- assert expected_signature == actual_signature, meth_name
-
- assert missing_methods == []
-
- # If all the assertions above succeed, then this one should too.
- # But there's no harm in double-checking.
- # And we could replace the assertions above by this one, but unlike
- # the assertions above, it doesn't explain what is missing.
- assert isinstance(provenance_storage, ProvenanceStorageInterface)
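
Note: moving these functions into a `TestProvenanceStorage` class is what lets the new per-flavor and RabbitMQ modules below reuse them: pytest collects any `Test*` name it finds in a module's namespace, including one bound by an import, and runs it against that module's fixtures. A minimal illustration with hypothetical names:

    import pytest

    class _SharedStorageTests:
        # Would normally live in a shared module and be imported.
        def test_backend(self, backend):
            assert backend == "module-local backend"

    @pytest.fixture()
    def backend():
        return "module-local backend"

    # Binding the class under a Test* name makes pytest collect its
    # tests here, against this module's fixture definitions.
    TestStorage = _SharedStorageTests
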
diff --git a/swh/provenance/tests/test_provenance_storage_rabbitmq.py b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Generator
+
+import pytest
+
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+
+@pytest.fixture()
+def provenance_storage(
+ provenance_postgresqldb: Dict[str, str],
+ rabbitmq,
+) -> Generator[ProvenanceStorageInterface, None, None]:
+ """Return a working and initialized ProvenanceStorageInterface object"""
+
+ from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
+
+ host = rabbitmq.args["host"]
+ port = rabbitmq.args["port"]
+ rabbitmq_params: Dict[str, Any] = {
+ "url": f"amqp://guest:guest@{host}:{port}/%2f",
+ "storage_config": {
+ "cls": "postgresql",
+ "db": provenance_postgresqldb,
+ "raise_on_commit": True,
+ },
+ }
+ server = ProvenanceStorageRabbitMQServer(
+ url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
+ )
+ server.start()
+ try:
+ with get_provenance_storage(cls="rabbitmq", **rabbitmq_params) as storage:
+ yield storage
+ finally:
+ server.stop()
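
Note: unlike the conftest.py fixture it replaces, this one wraps the broker session in try/finally, so `server.stop()` runs even when a test fails mid-session. The lifecycle pattern in isolation, with a stand-in server:

    import pytest

    class DummyServer:
        # Stand-in for ProvenanceStorageRabbitMQServer, for illustration.
        def start(self) -> None:
            self.running = True

        def stop(self) -> None:
            self.running = False

    @pytest.fixture()
    def service():
        server = DummyServer()
        server.start()
        try:
            yield server
        finally:
            server.stop()  # guaranteed shutdown, pass or fail
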
diff --git a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="with-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path.py b/swh/provenance/tests/test_provenance_storage_without_path.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+ load=[
+ partial(
+ initialize_database_for_module,
+ modname="provenance",
+ flavor="without-path-denormalized",
+ version=ProvenanceStoragePostgreSql.current_version,
+ )
+ ],
+)
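
Note: taken together, the three new modules replace the old four-way fixture parametrization: the default "with-path" flavor is exercised by test_provenance_storage.py through conftest.py, and each remaining flavor gets a module that overrides `provenance_postgresql_proc`. The resulting layout:

    FLAVOR_MODULES = {
        "with-path": "test_provenance_storage.py (via conftest.py)",
        "with-path-denormalized": "test_provenance_storage_with_path_denormalized.py",
        "without-path": "test_provenance_storage_without_path.py",
        "without-path-denormalized": "test_provenance_storage_without_path_denormalized.py",
    }

    for flavor, module in FLAVOR_MODULES.items():
        print(f"{flavor}: {module}")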