diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 # Add here internal Software Heritage dependencies, one per line.
-swh.core[db,http] >= 0.14
+swh.core[db,http] >= 2
 swh.model >= 2.6.1
 swh.storage
 swh.graph >= 2.0.0
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -119,3 +119,6 @@
         return rmq_storage
 
     raise ValueError
+
+
+get_datastore = get_provenance_storage
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -51,6 +51,8 @@
 
 
 class ProvenanceStoragePostgreSql:
+    current_version = 3
+
     def __init__(
         self, page_size: Optional[int] = None, raise_on_commit: bool = False, **kwargs
     ) -> None:
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -2,22 +2,6 @@
 select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
 select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset
 
-create table dbversion
-(
-  version int primary key,
-  release timestamptz,
-  description text
-);
-
-comment on table dbversion is 'Details of current db version';
-comment on column dbversion.version is 'SQL schema version';
-comment on column dbversion.release is 'Version deployment timestamp';
-comment on column dbversion.description is 'Release description';
-
--- latest schema version
-insert into dbversion(version, release, description)
-    values(3, now(), 'Work In Progress');
-
 -- a Git object ID, i.e., a Git-style salted SHA1 checksum
 create domain sha1_git as bytea check (length(value) = 20);
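Note: with swh.core >= 2 the schema version no longer lives in a hand-maintained dbversion table. The PostgreSQL backend now declares it as the class attribute current_version, and the module exposes the get_datastore alias, which is the hook swh.core's generic database tooling uses to locate that declaration. A minimal sketch of what this enables (the DSN is illustrative, and a reachable, initialized provenance database is assumed):

    from swh.provenance import get_datastore

    # Resolve the backend the way generic tooling would, then read the
    # declared schema version from the class instead of querying a table.
    storage = get_datastore(cls="postgresql", db={"dbname": "provenance"})
    assert storage.current_version == 3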
"""return a working and initialized provenance db""" - from swh.core.db.db_utils import ( - init_admin_extensions, - populate_database_for_package, - ) - init_admin_extensions("swh.provenance", postgresql.dsn) - populate_database_for_package( - "swh.provenance", postgresql.dsn, flavor=request.param - ) - return postgresql.get_dsn_parameters() +postgres_provenance = factories.postgresql("provenance_postgresql_proc") -@pytest.fixture(params=["postgresql", "rabbitmq"]) +@pytest.fixture() +def provenance_postgresqldb(request, postgres_provenance): + return postgres_provenance.get_dsn_parameters() + + +@pytest.fixture() def provenance_storage( request: SubRequest, provenance_postgresqldb: Dict[str, str], ) -> Generator[ProvenanceStorageInterface, None, None]: """Return a working and initialized ProvenanceStorageInterface object""" - if request.param == "rabbitmq": - from swh.provenance.api.server import ProvenanceStorageRabbitMQServer - - rabbitmq = request.getfixturevalue("rabbitmq") - host = rabbitmq.args["host"] - port = rabbitmq.args["port"] - rabbitmq_params: Dict[str, Any] = { - "url": f"amqp://guest:guest@{host}:{port}/%2f", - "storage_config": { - "cls": "postgresql", - "db": provenance_postgresqldb, - "raise_on_commit": True, - }, - } - server = ProvenanceStorageRabbitMQServer( - url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"] - ) - server.start() - with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage: - yield storage - server.stop() - - else: - # in test sessions, we DO want to raise any exception occurring at commit time - with get_provenance_storage( - cls=request.param, db=provenance_postgresqldb, raise_on_commit=True - ) as storage: - yield storage - - -provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests") + # in test sessions, we DO want to raise any exception occurring at commit time + with get_provenance_storage( + cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True + ) as storage: + yield storage @pytest.fixture def provenance( - provenance_postgresql: psycopg2.extensions.connection, + postgres_provenance: psycopg2.extensions.connection, ) -> Generator[ProvenanceInterface, None, None]: """Return a working and initialized ProvenanceInterface object""" @@ -104,14 +74,14 @@ populate_database_for_package, ) - init_admin_extensions("swh.provenance", provenance_postgresql.dsn) + init_admin_extensions("swh.provenance", postgres_provenance.dsn) populate_database_for_package( - "swh.provenance", provenance_postgresql.dsn, flavor="with-path" + "swh.provenance", postgres_provenance.dsn, flavor="with-path" ) # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance( cls="postgresql", - db=provenance_postgresql.get_dsn_parameters(), + db=postgres_provenance.get_dsn_parameters(), raise_on_commit=True, ) as provenance: yield provenance diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py --- a/swh/provenance/tests/test_journal_client.py +++ b/swh/provenance/tests/test_journal_client.py @@ -35,7 +35,7 @@ consumer: Consumer, tmp_path: str, provenance, - provenance_postgresql, + postgres_provenance, ) -> None: """Test origin journal client cli""" @@ -63,7 +63,7 @@ }, "storage": { "cls": "postgresql", - "db": provenance_postgresql.get_dsn_parameters(), + "db": postgres_provenance.get_dsn_parameters(), }, }, } @@ -89,7 +89,7 @@ consumer: Consumer, tmp_path: str, provenance, - provenance_postgresql, + 
diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py
--- a/swh/provenance/tests/test_journal_client.py
+++ b/swh/provenance/tests/test_journal_client.py
@@ -35,7 +35,7 @@
     consumer: Consumer,
     tmp_path: str,
     provenance,
-    provenance_postgresql,
+    postgres_provenance,
 ) -> None:
     """Test origin journal client cli"""
 
@@ -63,7 +63,7 @@
             },
             "storage": {
                 "cls": "postgresql",
-                "db": provenance_postgresql.get_dsn_parameters(),
+                "db": postgres_provenance.get_dsn_parameters(),
             },
         },
     }
@@ -89,7 +89,7 @@
     consumer: Consumer,
     tmp_path: str,
     provenance,
-    provenance_postgresql,
+    postgres_provenance,
 ) -> None:
     """Test revision journal client cli"""
 
@@ -116,7 +116,7 @@
             },
             "storage": {
                 "cls": "postgresql",
-                "db": provenance_postgresql.get_dsn_parameters(),
+                "db": postgres_provenance.get_dsn_parameters(),
             },
         },
     }
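Aside: the journal-client tests only need the fixture rename (provenance_postgresql becomes postgres_provenance); the configuration they assemble keeps its shape. Roughly, the storage block boils down to the following, where the DSN values are illustrative stand-ins for what postgres_provenance.get_dsn_parameters() returns:

    cfg = {
        "provenance": {
            "storage": {
                "cls": "postgresql",
                "db": {"dbname": "tests", "host": "127.0.0.1", "port": "5432"},
            },
        },
    }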
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -28,130 +28,343 @@
 from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
 
 
-def test_provenance_storage_content(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests content methods for every `ProvenanceStorageInterface` implementation."""
-
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
-
-    # Add all content present in the current repo to the storage, just assigning their
-    # creation dates. Then check that the returned results when querying are the same.
-    cnt_dates = {
-        cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
-    }
-    assert provenance_storage.content_add(cnt_dates)
-    assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
-    assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
-        cnt_dates.keys()
-    )
+class TestProvenanceStorage:
+    def test_provenance_storage_content(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests content methods for every `ProvenanceStorageInterface` implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
+        # Add all content present in the current repo to the storage, just assigning their
+        # creation dates. Then check that the returned results when querying are the same.
+        cnt_dates = {
+            cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
+        }
+        assert provenance_storage.content_add(cnt_dates)
+        assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
+        assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
+            cnt_dates.keys()
+        )
 
+    def test_provenance_storage_directory(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
+        # Of all directories present in the current repo, only assign a date to those
+        # containing blobs (picking the max date among the available ones). Then check that
+        # the returned results when querying are the same.
+        def getmaxdate(
+            directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
+        ) -> Optional[datetime]:
+            dates = [
+                content["ctime"]
+                for entry in directory["entries"]
+                for content in contents
+                if entry["type"] == "file" and entry["target"] == content["sha1_git"]
+            ]
+            return max(dates) if dates else None
+
+        flat_values = (False, True)
+        dir_dates = {}
+        for idx, dir in enumerate(data["directory"]):
+            date = getmaxdate(dir, data["content"])
+            if date is not None:
+                dir_dates[dir["id"]] = DirectoryData(
+                    date=date, flat=flat_values[idx % 2]
+                )
+        assert provenance_storage.directory_add(dir_dates)
+        assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
+        assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
+            dir_dates.keys()
+        )
 
-def test_provenance_storage_directory(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
-
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
-
-    # Of all directories present in the current repo, only assign a date to those
-    # containing blobs (picking the max date among the available ones). Then check that
-    # the returned results when querying are the same.
-    def getmaxdate(
-        directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
-    ) -> Optional[datetime]:
-        dates = [
-            content["ctime"]
-            for entry in directory["entries"]
-            for content in contents
-            if entry["type"] == "file" and entry["target"] == content["sha1_git"]
-        ]
-        return max(dates) if dates else None
-
-    flat_values = (False, True)
-    dir_dates = {}
-    for idx, dir in enumerate(data["directory"]):
-        date = getmaxdate(dir, data["content"])
-        if date is not None:
-            dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
-    assert provenance_storage.directory_add(dir_dates)
-    assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
-    assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
-        dir_dates.keys()
-    )
+    def test_provenance_storage_location(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
+        # Add all names of entries present in the directories of the current repo as paths
+        # to the storage. Then check that the returned results when querying are the same.
+        paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
+        assert provenance_storage.location_add(paths)
+
+        if provenance_storage.with_path():
+            assert provenance_storage.location_get_all() == paths
+        else:
+            assert provenance_storage.location_get_all() == set()
+
+    def test_provenance_storage_origin(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
+        # Test origin methods.
+        # Add all origins present in the current repo to the storage. Then check that the
+        # returned results when querying are the same.
+        orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+        assert orgs
+        assert provenance_storage.origin_add(orgs)
+        assert provenance_storage.origin_get(set(orgs.keys())) == orgs
+        assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
 
+    def test_provenance_storage_revision(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
+        # Test revision methods.
+        # Add all revisions present in the current repo to the storage, assigning their
+        # dates and an arbitrary origin to each one. Then check that the returned results
+        # when querying are the same.
+        origin = Origin(url=next(iter(data["origin"]))["url"])
+        # Origin must be inserted in advance.
+        assert provenance_storage.origin_add({origin.id: origin.url})
+
+        revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
+        rev_data = {
+            rev["id"]: RevisionData(
+                date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
+                origin=origin.id if idx % 3 != 0 else None,
+            )
+            for idx, rev in enumerate(data["revision"])
+            if idx % 6 != 0
+        }
+        assert revs
+        assert provenance_storage.revision_add(revs)
+        assert provenance_storage.revision_add(rev_data)
+        assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
+        assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
+            rev_data.keys()
+        )
 
+    def test_provenance_storage_relation(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
 
-def test_provenance_storage_location(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
 
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
+        # Test content-in-revision relation.
+        # Create flat models of every root directory for the revisions in the dataset.
+        cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+        for rev in data["revision"]:
+            root = next(
+                subdir
+                for subdir in data["directory"]
+                if subdir["id"] == rev["directory"]
+            )
+            for cnt, rel in dircontent(data, rev["id"], root):
+                cnt_in_rev.setdefault(cnt, set()).add(rel)
+        relation_add_and_compare_result(
+            provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
+        )
 
-    # Add all names of entries present in the directories of the current repo as paths
-    # to the storage. Then check that the returned results when querying are the same.
-    paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
-    assert provenance_storage.location_add(paths)
+        # Test content-in-directory relation.
+        # Create flat models for every directory in the dataset.
+        cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
+        for dir in data["directory"]:
+            for cnt, rel in dircontent(data, dir["id"], dir):
+                cnt_in_dir.setdefault(cnt, set()).add(rel)
+        relation_add_and_compare_result(
+            provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
+        )
 
-    if provenance_storage.with_path():
-        assert provenance_storage.location_get_all() == paths
-    else:
-        assert provenance_storage.location_get_all() == set()
+        # Test directory-in-revision relation.
+        # Add root directories to their corresponding revision in the dataset.
+        dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
+        for rev in data["revision"]:
+            dir_in_rev.setdefault(rev["directory"], set()).add(
+                RelationData(dst=rev["id"], path=b".")
+            )
+        relation_add_and_compare_result(
+            provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
+        )
 
+        # Test revision-in-origin relation.
+        # Origins must be inserted in advance (cannot be done by `entity_add` inside
+        # `relation_add_and_compare_result`).
+        orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
+        assert provenance_storage.origin_add(orgs)
+        # Add all revisions that are head of some snapshot branch to the corresponding
+        # origin.
+        rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
+        for status in data["origin_visit_status"]:
+            if status["snapshot"] is not None:
+                for snapshot in data["snapshot"]:
+                    if snapshot["id"] == status["snapshot"]:
+                        for branch in snapshot["branches"].values():
+                            if branch["target_type"] == "revision":
+                                rev_in_org.setdefault(branch["target"], set()).add(
+                                    RelationData(
+                                        dst=Origin(url=status["origin"]).id,
+                                        path=None,
+                                    )
+                                )
+        relation_add_and_compare_result(
+            provenance_storage, RelationType.REV_IN_ORG, rev_in_org
+        )
 
-def test_provenance_storage_origin(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+        # Test revision-before-revision relation.
+        # For each revision in the data set add an entry for each parent to the relation.
+        rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
+        for rev in data["revision"]:
+            for parent in rev["parents"]:
+                rev_before_rev.setdefault(parent, set()).add(
+                    RelationData(dst=rev["id"], path=None)
+                )
+        relation_add_and_compare_result(
+            provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
+        )
 
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
+    def test_provenance_storage_find(
+        self,
+        provenance: ProvenanceInterface,
+        provenance_storage: ProvenanceStorageInterface,
+        archive: ArchiveInterface,
+    ) -> None:
+        """Tests `content_find_first` and `content_find_all` methods for every
+        `ProvenanceStorageInterface` implementation.
+        """
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+        fill_storage(archive.storage, data)
+
+        # Test content_find_first and content_find_all, first only executing the
+        # revision-content algorithm, then adding the origin-revision layer.
+        def adapt_result(
+            result: Optional[ProvenanceResult], with_path: bool
+        ) -> Optional[ProvenanceResult]:
+            if result is not None:
+                return ProvenanceResult(
+                    result.content,
+                    result.revision,
+                    result.date,
+                    result.origin,
+                    result.path if with_path else b"",
+                )
+            return result
+
+        # Execute the revision-content algorithm on both storages.
+        revisions = [
+            RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
+            for rev in data["revision"]
+        ]
+        revision_add(provenance, archive, revisions)
+        revision_add(Provenance(provenance_storage), archive, revisions)
 
-    # Test origin methods.
-    # Add all origins present in the current repo to the storage. Then check that the
-    # returned results when querying are the same.
-    orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
-    assert orgs
-    assert provenance_storage.origin_add(orgs)
-    assert provenance_storage.origin_get(set(orgs.keys())) == orgs
-    assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
+        assert adapt_result(
+            ProvenanceResult(
+                content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+                revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+                date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+                origin=None,
+                path=b"A/B/C/a",
+            ),
+            provenance_storage.with_path(),
+        ) == provenance_storage.content_find_first(
+            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
+        )
 
+        for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+            assert adapt_result(
+                provenance.storage.content_find_first(cnt),
+                provenance_storage.with_path(),
+            ) == provenance_storage.content_find_first(cnt)
+            assert {
+                adapt_result(occur, provenance_storage.with_path())
+                for occur in provenance.storage.content_find_all(cnt)
+            } == set(provenance_storage.content_find_all(cnt))
+
+        # Execute the origin-revision algorithm on both storages.
+        origins = [
+            OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
+            for sta in data["origin_visit_status"]
+            if sta["snapshot"] is not None
+        ]
+        origin_add(provenance, archive, origins)
+        origin_add(Provenance(provenance_storage), archive, origins)
 
-def test_provenance_storage_revision(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
-
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
-
-    # Test revision methods.
-    # Add all revisions present in the current repo to the storage, assigning their
-    # dates and an arbitrary origin to each one. Then check that the returned results
-    # when querying are the same.
-    origin = Origin(url=next(iter(data["origin"]))["url"])
-    # Origin must be inserted in advance.
-    assert provenance_storage.origin_add({origin.id: origin.url})
-
-    revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
-    rev_data = {
-        rev["id"]: RevisionData(
-            date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
-            origin=origin.id if idx % 3 != 0 else None,
+        assert adapt_result(
+            ProvenanceResult(
+                content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+                revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+                date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+                origin="https://cmdbts2",
+                path=b"A/B/C/a",
+            ),
+            provenance_storage.with_path(),
+        ) == provenance_storage.content_find_first(
+            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
         )
-        for idx, rev in enumerate(data["revision"])
-        if idx % 6 != 0
-    }
-    assert revs
-    assert provenance_storage.revision_add(revs)
-    assert provenance_storage.revision_add(rev_data)
-    assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
-    assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
-        rev_data.keys()
-    )
+
+        for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
+            assert adapt_result(
+                provenance.storage.content_find_first(cnt),
+                provenance_storage.with_path(),
+            ) == provenance_storage.content_find_first(cnt)
+            assert {
+                adapt_result(occur, provenance_storage.with_path())
+                for occur in provenance.storage.content_find_all(cnt)
+            } == set(provenance_storage.content_find_all(cnt))
+
+    def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
+        """Checks all methods of ProvenanceStorageInterface are implemented by this
+        backend, and that they have the same signature."""
+        # Create an instance of the protocol (which cannot be instantiated
+        # directly, so this creates a subclass, then instantiates it)
+        interface = type("_", (ProvenanceStorageInterface,), {})()
+
+        assert "content_find_first" in dir(interface)
+
+        missing_methods = []
+
+        for meth_name in dir(interface):
+            if meth_name.startswith("_"):
+                continue
+            interface_meth = getattr(interface, meth_name)
+            try:
+                concrete_meth = getattr(provenance_storage, meth_name)
+            except AttributeError:
+                if not getattr(interface_meth, "deprecated_endpoint", False):
+                    # The backend is missing a (non-deprecated) endpoint
+                    missing_methods.append(meth_name)
+                continue
+
+            expected_signature = inspect.signature(interface_meth)
+            actual_signature = inspect.signature(concrete_meth)
+
+            assert expected_signature == actual_signature, meth_name
+
+        assert missing_methods == []
+
+        # If all the assertions above succeed, then this one should too.
+        # But there's no harm in double-checking.
+        # And we could replace the assertions above by this one, but unlike
+        # the assertions above, it doesn't explain what is missing.
+        assert isinstance(provenance_storage, ProvenanceStorageInterface)
 
 
 def dircontent(
@@ -255,209 +468,3 @@
         }
         for src_sha1, rels in expected.items()
     } == computed
-
-
-def test_provenance_storage_relation(
-    provenance_storage: ProvenanceStorageInterface,
-) -> None:
-    """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
-
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
-
-    # Test content-in-revision relation.
-    # Create flat models of every root directory for the revisions in the dataset.
-    cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
-    for rev in data["revision"]:
-        root = next(
-            subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
-        )
-        for cnt, rel in dircontent(data, rev["id"], root):
-            cnt_in_rev.setdefault(cnt, set()).add(rel)
-    relation_add_and_compare_result(
-        provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
-    )
-
-    # Test content-in-directory relation.
-    # Create flat models for every directory in the dataset.
-    cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
-    for dir in data["directory"]:
-        for cnt, rel in dircontent(data, dir["id"], dir):
-            cnt_in_dir.setdefault(cnt, set()).add(rel)
-    relation_add_and_compare_result(
-        provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
-    )
-
-    # Test content-in-directory relation.
-    # Add root directories to their correspondent revision in the dataset.
-    dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
-    for rev in data["revision"]:
-        dir_in_rev.setdefault(rev["directory"], set()).add(
-            RelationData(dst=rev["id"], path=b".")
-        )
-    relation_add_and_compare_result(
-        provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
-    )
-
-    # Test revision-in-origin relation.
-    # Origins must be inserted in advance (cannot be done by `entity_add` inside
-    # `relation_add_and_compare_result`).
-    orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
-    assert provenance_storage.origin_add(orgs)
-    # Add all revisions that are head of some snapshot branch to the corresponding
-    # origin.
-    rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
-    for status in data["origin_visit_status"]:
-        if status["snapshot"] is not None:
-            for snapshot in data["snapshot"]:
-                if snapshot["id"] == status["snapshot"]:
-                    for branch in snapshot["branches"].values():
-                        if branch["target_type"] == "revision":
-                            rev_in_org.setdefault(branch["target"], set()).add(
-                                RelationData(
-                                    dst=Origin(url=status["origin"]).id,
-                                    path=None,
-                                )
-                            )
-    relation_add_and_compare_result(
-        provenance_storage, RelationType.REV_IN_ORG, rev_in_org
-    )
-
-    # Test revision-before-revision relation.
-    # For each revision in the data set add an entry for each parent to the relation.
-    rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
-    for rev in data["revision"]:
-        for parent in rev["parents"]:
-            rev_before_rev.setdefault(parent, set()).add(
-                RelationData(dst=rev["id"], path=None)
-            )
-    relation_add_and_compare_result(
-        provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
-    )
-
-
-def test_provenance_storage_find(
-    provenance: ProvenanceInterface,
-    provenance_storage: ProvenanceStorageInterface,
-    archive: ArchiveInterface,
-) -> None:
-    """Tests `content_find_first` and `content_find_all` methods for every
-    `ProvenanceStorageInterface` implementation.
-    """
-
-    # Read data/README.md for more details on how these datasets are generated.
-    data = load_repo_data("cmdbts2")
-    fill_storage(archive.storage, data)
-
-    # Test content_find_first and content_find_all, first only executing the
-    # revision-content algorithm, then adding the origin-revision layer.
-    def adapt_result(
-        result: Optional[ProvenanceResult], with_path: bool
-    ) -> Optional[ProvenanceResult]:
-        if result is not None:
-            return ProvenanceResult(
-                result.content,
-                result.revision,
-                result.date,
-                result.origin,
-                result.path if with_path else b"",
-            )
-        return result
-
-    # Execute the revision-content algorithm on both storages.
-    revisions = [
-        RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
-        for rev in data["revision"]
-    ]
-    revision_add(provenance, archive, revisions)
-    revision_add(Provenance(provenance_storage), archive, revisions)
-
-    assert adapt_result(
-        ProvenanceResult(
-            content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-            revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
-            date=datetime.fromtimestamp(1000000000.0, timezone.utc),
-            origin=None,
-            path=b"A/B/C/a",
-        ),
-        provenance_storage.with_path(),
-    ) == provenance_storage.content_find_first(
-        hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
-    )
-
-    for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
-        assert adapt_result(
-            provenance.storage.content_find_first(cnt), provenance_storage.with_path()
-        ) == provenance_storage.content_find_first(cnt)
-        assert {
-            adapt_result(occur, provenance_storage.with_path())
-            for occur in provenance.storage.content_find_all(cnt)
-        } == set(provenance_storage.content_find_all(cnt))
-
-    # Execute the origin-revision algorithm on both storages.
-    origins = [
-        OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
-        for sta in data["origin_visit_status"]
-        if sta["snapshot"] is not None
-    ]
-    origin_add(provenance, archive, origins)
-    origin_add(Provenance(provenance_storage), archive, origins)
-
-    assert adapt_result(
-        ProvenanceResult(
-            content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-            revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
-            date=datetime.fromtimestamp(1000000000.0, timezone.utc),
-            origin="https://cmdbts2",
-            path=b"A/B/C/a",
-        ),
-        provenance_storage.with_path(),
-    ) == provenance_storage.content_find_first(
-        hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
-    )
-
-    for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
-        assert adapt_result(
-            provenance.storage.content_find_first(cnt), provenance_storage.with_path()
-        ) == provenance_storage.content_find_first(cnt)
-        assert {
-            adapt_result(occur, provenance_storage.with_path())
-            for occur in provenance.storage.content_find_all(cnt)
-        } == set(provenance_storage.content_find_all(cnt))
-
-
-def test_types(provenance_storage: ProvenanceStorageInterface) -> None:
-    """Checks all methods of ProvenanceStorageInterface are implemented by this
-    backend, and that they have the same signature."""
-    # Create an instance of the protocol (which cannot be instantiated
-    # directly, so this creates a subclass, then instantiates it)
-    interface = type("_", (ProvenanceStorageInterface,), {})()
-
-    assert "content_find_first" in dir(interface)
-
-    missing_methods = []
-
-    for meth_name in dir(interface):
-        if meth_name.startswith("_"):
-            continue
-        interface_meth = getattr(interface, meth_name)
-        try:
-            concrete_meth = getattr(provenance_storage, meth_name)
-        except AttributeError:
-            if not getattr(interface_meth, "deprecated_endpoint", False):
-                # The backend is missing a (non-deprecated) endpoint
-                missing_methods.append(meth_name)
-            continue
-
-        expected_signature = inspect.signature(interface_meth)
-        actual_signature = inspect.signature(concrete_meth)
-
-        assert expected_signature == actual_signature, meth_name
-
-    assert missing_methods == []
-
-    # If all the assertions above succeed, then this one should too.
-    # But there's no harm in double-checking.
-    # And we could replace the assertions above by this one, but unlike
-    # the assertions above, it doesn't explain what is missing.
-    assert isinstance(provenance_storage, ProvenanceStorageInterface)
diff --git a/swh/provenance/tests/test_provenance_storage_rabbitmq.py b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
new file
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Generator
+
+import pytest
+
+from swh.provenance import get_provenance_storage
+from swh.provenance.interface import ProvenanceStorageInterface
+
+from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
+
+
+@pytest.fixture()
+def provenance_storage(
+    provenance_postgresqldb: Dict[str, str],
+    rabbitmq,
+) -> Generator[ProvenanceStorageInterface, None, None]:
+    """Return a working and initialized ProvenanceStorageInterface object"""
+
+    from swh.provenance.api.server import ProvenanceStorageRabbitMQServer
+
+    host = rabbitmq.args["host"]
+    port = rabbitmq.args["port"]
+    rabbitmq_params: Dict[str, Any] = {
+        "url": f"amqp://guest:guest@{host}:{port}/%2f",
+        "storage_config": {
+            "cls": "postgresql",
+            "db": provenance_postgresqldb,
+            "raise_on_commit": True,
+        },
+    }
+    server = ProvenanceStorageRabbitMQServer(
+        url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
+    )
+    server.start()
+    try:
+        with get_provenance_storage(cls="rabbitmq", **rabbitmq_params) as storage:
+            yield storage
+    finally:
+        server.stop()
diff --git a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
new file
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+    load=[
+        partial(
+            initialize_database_for_module,
+            modname="provenance",
+            flavor="with-path-denormalized",
+            version=ProvenanceStoragePostgreSql.current_version,
+        )
+    ],
+)
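Note: these one-file-per-backend modules lean on two pytest behaviors: importing TestProvenanceStorage makes pytest collect the whole suite again in the importing module, and a module-level fixture shadows a same-named fixture from conftest.py for every test collected in that module (here, provenance_storage above and provenance_postgresql_proc below). A toy sketch of the mechanism, with made-up names:

    # test_suite.py
    import pytest

    @pytest.fixture()
    def flavor():
        return "with-path"

    class TestFlavor:
        def test_flavor_is_known(self, flavor):
            assert flavor in ("with-path", "without-path")

    # test_suite_without_path.py: re-import TestFlavor and define a local
    # flavor() fixture returning "without-path"; pytest uses the local
    # fixture for the tests collected in that module.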
diff --git a/swh/provenance/tests/test_provenance_storage_without_path.py b/swh/provenance/tests/test_provenance_storage_without_path.py
new file
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+    load=[
+        partial(
+            initialize_database_for_module,
+            modname="provenance",
+            flavor="without-path",
+            version=ProvenanceStoragePostgreSql.current_version,
+        )
+    ],
+)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
new file
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
@@ -0,0 +1,19 @@
+from functools import partial
+
+from pytest_postgresql import factories
+
+from swh.core.db.db_utils import initialize_database_for_module
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
+
+from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
+
+provenance_postgresql_proc = factories.postgresql_proc(
+    load=[
+        partial(
+            initialize_database_for_module,
+            modname="provenance",
+            flavor="without-path-denormalized",
+            version=ProvenanceStoragePostgreSql.current_version,
+        )
+    ],
+)
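In short, the old four-way parametrized fixture becomes four modules, one per database flavor, each pairing a flavor-specific provenance_postgresql_proc with the shared TestProvenanceStorage suite; each flavor therefore gets its own PostgreSQL process and template database instead of re-populating a shared one for every test. Adding a hypothetical fifth flavor would be the same handful of lines again (the flavor name below is made up for illustration):

    from functools import partial

    from pytest_postgresql import factories

    from swh.core.db.db_utils import initialize_database_for_module
    from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

    from .test_provenance_storage import TestProvenanceStorage  # noqa: F401

    provenance_postgresql_proc = factories.postgresql_proc(
        load=[
            partial(
                initialize_database_for_module,
                modname="provenance",
                flavor="some-new-flavor",  # hypothetical flavor name
                version=ProvenanceStoragePostgreSql.current_version,
            )
        ],
    )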