Page MenuHomeSoftware Heritage

D6054.id21945.diff
No OneTemporary

D6054.id21945.diff

diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -20,5 +20,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
+[mypy-pytest_postgresql.*]
+ignore_missing_imports = True
+
[mypy-psycopg2.*]
ignore_missing_imports = True
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -10,6 +10,7 @@
import msgpack
import psycopg2.extensions
import pytest
+from pytest_postgresql.factories import postgresql
from swh.journal.serializers import msgpack_ext_hook
from swh.provenance import get_provenance, get_provenance_storage
@@ -49,23 +50,43 @@
@pytest.fixture(params=["local", "remote"])
-def provenance(
+def provenance_storage(
request: SubRequest,
populated_db: Dict[str, str],
swh_rpc_client: RemoteProvenanceStorage,
-) -> ProvenanceInterface:
- """Return a working and initialized ProvenanceInterface object"""
+) -> ProvenanceStorageInterface:
+ """Return a working and initialized ProvenanceStorageInterface object"""
if request.param == "remote":
- from swh.provenance.provenance import Provenance
-
assert isinstance(swh_rpc_client, ProvenanceStorageInterface)
- return Provenance(swh_rpc_client)
+ return swh_rpc_client
else:
# in test sessions, we DO want to raise any exception occurring at commit time
- prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True)
- return prov
+ storage = get_provenance_storage(
+ cls=request.param, db=populated_db, raise_on_commit=True
+ )
+ return storage
+
+
+provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests")
+
+
+@pytest.fixture
+def provenance(
+ provenance_postgresql: psycopg2.extensions.connection,
+) -> ProvenanceInterface:
+ """Return a working and initialized ProvenanceInterface object"""
+
+ from swh.core.cli.db import populate_database_for_package
+
+ populate_database_for_package(
+ "swh.provenance", provenance_postgresql.dsn, flavor="with-path"
+ )
+ # in test sessions, we DO want to raise any exception occurring at commit time
+ return get_provenance(
+ cls="local", db=provenance_postgresql.get_dsn_parameters(), raise_on_commit=True
+ )
@pytest.fixture
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -3,9 +3,315 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime
import inspect
+import os
+from typing import Any, Dict, Iterable, Optional, Set
-from ..interface import ProvenanceInterface, ProvenanceStorageInterface
+import pytest
+
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import origin_identifier
+from swh.model.model import Sha1Git
+from swh.provenance.interface import (
+ EntityType,
+ ProvenanceInterface,
+ ProvenanceResult,
+ ProvenanceStorageInterface,
+ RelationData,
+ RelationType,
+)
+from swh.provenance.tests.conftest import load_repo_data
+from swh.provenance.tests.test_provenance_db import ts2dt
+
+
+def relation_add_and_compare_result(
+ relation: RelationType,
+ data: Set[RelationData],
+ refstorage: ProvenanceStorageInterface,
+ storage: ProvenanceStorageInterface,
+ with_path: bool = True,
+) -> None:
+ assert data
+ assert refstorage.relation_add(relation, data) == storage.relation_add(
+ relation, data
+ )
+
+ assert relation_compare_result(
+ refstorage.relation_get(relation, (reldata.src for reldata in data)),
+ storage.relation_get(relation, (reldata.src for reldata in data)),
+ with_path,
+ )
+ assert relation_compare_result(
+ refstorage.relation_get(
+ relation,
+ (reldata.dst for reldata in data),
+ reverse=True,
+ ),
+ storage.relation_get(
+ relation,
+ (reldata.dst for reldata in data),
+ reverse=True,
+ ),
+ with_path,
+ )
+ assert relation_compare_result(
+ refstorage.relation_get_all(relation),
+ storage.relation_get_all(relation),
+ with_path,
+ )
+
+
+def relation_compare_result(
+ expected: Set[RelationData], computed: Set[RelationData], with_path: bool
+) -> bool:
+ return {
+ RelationData(reldata.src, reldata.dst, reldata.path if with_path else None)
+ for reldata in expected
+ } == computed
+
+
+def dircontent(
+ data: Dict[str, Any],
+ ref: Sha1Git,
+ dir: Dict[str, Any],
+ prefix: bytes = b"",
+) -> Iterable[RelationData]:
+ content = {
+ RelationData(entry["target"], ref, os.path.join(prefix, entry["name"]))
+ for entry in dir["entries"]
+ if entry["type"] == "file"
+ }
+ for entry in dir["entries"]:
+ if entry["type"] == "dir":
+ child = next(
+ subdir
+ for subdir in data["directory"]
+ if subdir["id"] == entry["target"]
+ )
+ content.update(
+ dircontent(data, ref, child, os.path.join(prefix, entry["name"]))
+ )
+ return content
+
+
+@pytest.mark.parametrize(
+ "repo",
+ ("cmdbts2", "out-of-order", "with-merges"),
+)
+def test_provenance_storage(
+ provenance: ProvenanceInterface,
+ provenance_storage: ProvenanceStorageInterface,
+ repo: str,
+) -> None:
+ """Tests every ProvenanceStorageInterface implementation against the one provided
+ for provenance.storage."""
+ # Read data/README.md for more details on how these datasets are generated.
+ data = load_repo_data(repo)
+
+ # Assuming provenance.storage has the 'with-path' flavor.
+ assert provenance.storage.with_path()
+
+ # Test content methods.
+ # Add all content present in the current repo to both storages, just assigning their
+ # creation dates. Then check that the inserted content is the same in both cases.
+ cnt_dates = {cnt["sha1_git"]: cnt["ctime"] for cnt in data["content"]}
+ assert cnt_dates
+ assert provenance.storage.content_set_date(
+ cnt_dates
+ ) == provenance_storage.content_set_date(cnt_dates)
+
+ assert provenance.storage.content_get(cnt_dates) == provenance_storage.content_get(
+ cnt_dates
+ )
+ assert provenance.storage.entity_get_all(
+ EntityType.CONTENT
+ ) == provenance_storage.entity_get_all(EntityType.CONTENT)
+
+ # Test directory methods.
+ # Of all directories present in the current repo, only assign a date to those
+ # containing blobs (picking the max date among the available ones). Then check that
+ # the inserted data is the same in both storages.
+ def getmaxdate(
+ dir: Dict[str, Any], cnt_dates: Dict[Sha1Git, datetime]
+ ) -> Optional[datetime]:
+ dates = [
+ cnt_dates[entry["target"]]
+ for entry in dir["entries"]
+ if entry["type"] == "file"
+ ]
+ return max(dates) if dates else None
+
+ dir_dates = {dir["id"]: getmaxdate(dir, cnt_dates) for dir in data["directory"]}
+ assert dir_dates
+ assert provenance.storage.directory_set_date(
+ {sha1: date for sha1, date in dir_dates.items() if date is not None}
+ ) == provenance_storage.directory_set_date(
+ {sha1: date for sha1, date in dir_dates.items() if date is not None}
+ )
+
+ assert provenance.storage.directory_get(
+ dir_dates
+ ) == provenance_storage.directory_get(dir_dates)
+ assert provenance.storage.entity_get_all(
+ EntityType.DIRECTORY
+ ) == provenance_storage.entity_get_all(EntityType.DIRECTORY)
+
+ # Test origin methods.
+ # Add all origins present in the current repo to both storages. Then check that the
+ # inserted data is the same in both cases.
+ org_urls = {
+ hash_to_bytes(origin_identifier(org)): org["url"] for org in data["origin"]
+ }
+ assert org_urls
+ assert provenance.storage.origin_set_url(
+ org_urls
+ ) == provenance_storage.origin_set_url(org_urls)
+
+ assert provenance.storage.origin_get(org_urls) == provenance_storage.origin_get(
+ org_urls
+ )
+ assert provenance.storage.entity_get_all(
+ EntityType.ORIGIN
+ ) == provenance_storage.entity_get_all(EntityType.ORIGIN)
+
+ # Test revision methods.
+ # Add all revisions present in the current repo to both storages, assigning their
+ # dates and an arbitrary origin to each one. Then check that the inserted data is
+ # the same in both cases.
+ rev_dates = {rev["id"]: ts2dt(rev["date"]) for rev in data["revision"]}
+ assert rev_dates
+ assert provenance.storage.revision_set_date(
+ rev_dates
+ ) == provenance_storage.revision_set_date(rev_dates)
+
+ rev_origins = {
+ rev["id"]: next(iter(org_urls)) # any arbitrary origin will do
+ for rev in data["revision"]
+ }
+ assert rev_origins
+ assert provenance.storage.revision_set_origin(
+ rev_origins
+ ) == provenance_storage.revision_set_origin(rev_origins)
+
+ assert provenance.storage.revision_get(
+ rev_dates
+ ) == provenance_storage.revision_get(rev_dates)
+ assert provenance.storage.entity_get_all(
+ EntityType.REVISION
+ ) == provenance_storage.entity_get_all(EntityType.REVISION)
+
+ # Test content-in-revision relation.
+ # Create flat models of every root directory for the revisions in the dataset.
+ cnt_in_rev: Set[RelationData] = set()
+ for rev in data["revision"]:
+ root = next(
+ subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
+ )
+ cnt_in_rev.update(dircontent(data, rev["id"], root))
+
+ relation_add_and_compare_result(
+ RelationType.CNT_EARLY_IN_REV,
+ cnt_in_rev,
+ provenance.storage,
+ provenance_storage,
+ provenance_storage.with_path(),
+ )
+
+ # Test content-in-directory relation.
+ # Create flat models for every directory in the dataset.
+ cnt_in_dir: Set[RelationData] = set()
+ for dir in data["directory"]:
+ cnt_in_dir.update(dircontent(data, dir["id"], dir))
+
+ relation_add_and_compare_result(
+ RelationType.CNT_IN_DIR,
+ cnt_in_dir,
+ provenance.storage,
+ provenance_storage,
+ provenance_storage.with_path(),
+ )
+
+ # Test directory-in-revision relation.
+ # Add root directories to their corresponding revision in the dataset.
+ dir_in_rev = {
+ RelationData(rev["directory"], rev["id"], b".") for rev in data["revision"]
+ }
+
+ relation_add_and_compare_result(
+ RelationType.DIR_IN_REV,
+ dir_in_rev,
+ provenance.storage,
+ provenance_storage,
+ provenance_storage.with_path(),
+ )
+
+ # Test revision-in-origin relation.
+ # Add all revisions that are head of some snapshot branch to the corresponding
+ # origin.
+ rev_in_org = {
+ RelationData(
+ branch["target"],
+ hash_to_bytes(origin_identifier({"url": status["origin"]})),
+ None,
+ )
+ for status in data["origin_visit_status"]
+ if status["snapshot"] is not None
+ for snapshot in data["snapshot"]
+ if snapshot["id"] == status["snapshot"]
+ for _, branch in snapshot["branches"].items()
+ if branch["target_type"] == "revision"
+ }
+
+ relation_add_and_compare_result(
+ RelationType.REV_IN_ORG,
+ rev_in_org,
+ provenance.storage,
+ provenance_storage,
+ )
+
+ # Test revision-before-revision relation.
+ # For each revision in the data set add an entry for each parent to the relation.
+ rev_before_rev = {
+ RelationData(parent, rev["id"], None)
+ for rev in data["revision"]
+ for parent in rev["parents"]
+ }
+
+ relation_add_and_compare_result(
+ RelationType.REV_BEFORE_REV,
+ rev_before_rev,
+ provenance.storage,
+ provenance_storage,
+ )
+
+ # Test location_get.
+ if provenance_storage.with_path():
+ assert provenance.storage.location_get() == provenance_storage.location_get()
+
+ # Test content_find_first and content_find_all.
+ def adapt_result(
+ result: Optional[ProvenanceResult], with_path: bool
+ ) -> Optional[ProvenanceResult]:
+ if result is not None:
+ return ProvenanceResult(
+ result.content,
+ result.revision,
+ result.date,
+ result.origin,
+ result.path if with_path else b"",
+ )
+ return result
+
+ for cnt in cnt_dates:
+ assert adapt_result(
+ provenance.storage.content_find_first(cnt), provenance_storage.with_path()
+ ) == provenance_storage.content_find_first(cnt)
+
+ assert {
+ adapt_result(occur, provenance_storage.with_path())
+ for occur in provenance.storage.content_find_all(cnt)
+ } == set(provenance_storage.content_find_all(cnt))
def test_types(provenance: ProvenanceInterface) -> None:
@@ -14,7 +320,6 @@
# Create an instance of the protocol (which cannot be instantiated
# directly, so this creates a subclass, then instantiates it)
interface = type("_", (ProvenanceStorageInterface,), {})()
- storage = provenance.storage
assert "content_find_first" in dir(interface)
@@ -25,7 +330,7 @@
continue
interface_meth = getattr(interface, meth_name)
try:
- concrete_meth = getattr(storage, meth_name)
+ concrete_meth = getattr(provenance.storage, meth_name)
except AttributeError:
if not getattr(interface_meth, "deprecated_endpoint", False):
# The backend is missing a (non-deprecated) endpoint
@@ -43,4 +348,4 @@
# But there's no harm in double-checking.
# And we could replace the assertions above by this one, but unlike
# the assertions above, it doesn't explain what is missing.
- assert isinstance(storage, ProvenanceStorageInterface)
+ assert isinstance(provenance.storage, ProvenanceStorageInterface)

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:26 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234244

Event Timeline