D6231.id22543.diff
D6231: Rework `ProvenanceStorageInterface` to have a single add method per entity
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -23,6 +23,11 @@
     ORIGIN = "origin"
 
 
+class UnsupportedEntityError(Exception):
+    def __init__(self, entity: EntityType) -> None:
+        super().__init__(f"Unsupported entity: {entity.value}")
+
+
 class RelationType(enum.Enum):
     CNT_EARLY_IN_REV = "content_in_revision"
     CNT_IN_DIR = "content_in_directory"
@@ -107,6 +112,15 @@
         """
         ...
 
+    @remote_api_endpoint("entity_add")
+    def entity_add(self, entity: EntityType, ids: Iterable[Sha1Git]) -> bool:
+        """Add entries to the selected `entity` with `None` values in all optional
+        fields. `EntityType.ORIGIN` is not supported by this method (it raises an
+        `UnsupportedEntityError`) since origins have non-optional associated fields
+        (i.e. `url`). See `origin_set_url` for adding origin entries to the storage.
+        """
+        ...
+
     @remote_api_endpoint("entity_get_all")
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         """Retrieve all sha1 ids for entities of type `entity` present in the provenance
@@ -114,6 +128,11 @@
         """
         ...
 
+    @remote_api_endpoint("location_add")
+    def location_add(self, paths: Iterable[bytes]) -> bool:
+        """Register the given `paths` in the storage."""
+        ...
+
     @remote_api_endpoint("location_get")
     def location_get(self) -> Set[bytes]:
         """Retrieve all paths present in the provenance model."""
@@ -144,7 +163,8 @@
     def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool:
         """Associate origins to revisions identified by sha1 ids, as paired in
         `origins` (revision ids are keys and origin ids, values). Return a boolean
-        stating whether the information was successfully stored.
+        stating whether the information was successfully stored. This method assumes
+        all origins are already registered in the storage. See `origin_set_url`.
         """
         ...
 
@@ -160,7 +180,10 @@
     def relation_add(
         self, relation: RelationType, data: Iterable[RelationData]
     ) -> bool:
-        """Add entries in the selected `relation`."""
+        """Add entries in the selected `relation`. This method assumes all entities
+        being related are already registered in the storage. See `entity_add` and
+        `origin_set_url`.
+        """
         ...
 
     @remote_api_endpoint("relation_get")
diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py
--- a/swh/provenance/mongo/backend.py
+++ b/swh/provenance/mongo/backend.py
@@ -18,6 +18,7 @@
     RelationData,
     RelationType,
    RevisionData,
+    UnsupportedEntityError,
 )
 
@@ -171,6 +172,36 @@
             )
         }
 
+    def entity_add(self, entity: EntityType, ids: Iterable[Sha1Git]) -> bool:
+        if entity == EntityType.ORIGIN:
+            raise UnsupportedEntityError(entity)
+
+        sha1s = list(set(ids))
+        if sha1s:
+            obj: Dict[str, Any] = {"ts": None}
+            if entity.value == "content":
+                obj["revision"] = {}
+                obj["directory"] = {}
+            if entity.value == "directory":
+                obj["revision"] = {}
+            if entity.value == "revision":
+                obj["preferred"] = None
+                obj["origin"] = []
+                obj["revision"] = []
+
+            existing = {
+                x["sha1"]
+                for x in self.db.get_collection(entity.value).find(
+                    {"sha1": {"$in": sha1s}}, {"_id": 0, "sha1": 1}
+                )
+            }
+            for sha1 in sha1s:
+                if sha1 not in existing:
+                    self.db.get_collection(entity.value).insert_one(
+                        dict(obj, **{"sha1": sha1})
+                    )
+        return True
+
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         return {
             x["sha1"]
@@ -179,6 +210,10 @@
             )
         }
 
+    def location_add(self, paths: Iterable[bytes]) -> bool:
+        # TODO: implement this method if paths are to be stored in a separate collection
+        return True
+
     def location_get(self) -> Set[bytes]:
         contents = self.db.content.find({}, {"revision": 1, "_id": 0, "directory": 1})
         paths: List[Iterable[bytes]] = []
@@ -283,40 +318,10 @@
         src_relation, *_, dst_relation = relation.value.split("_")
 
         set_data = set(data)
-        dst_sha1s = {x.dst for x in set_data}
-        if dst_relation in ["content", "directory", "revision"]:
-            dst_obj: Dict[str, Any] = {"ts": None}
-            if dst_relation == "content":
-                dst_obj["revision"] = {}
-                dst_obj["directory"] = {}
-            if dst_relation == "directory":
-                dst_obj["revision"] = {}
-            if dst_relation == "revision":
-                dst_obj["preferred"] = None
-                dst_obj["origin"] = []
-                dst_obj["revision"] = []
-
-            existing = {
-                x["sha1"]
-                for x in self.db.get_collection(dst_relation).find(
-                    {"sha1": {"$in": list(dst_sha1s)}}, {"_id": 0, "sha1": 1}
-                )
-            }
-
-            for sha1 in dst_sha1s:
-                if sha1 not in existing:
-                    self.db.get_collection(dst_relation).insert_one(
-                        dict(dst_obj, **{"sha1": sha1})
-                    )
-        elif dst_relation == "origin":
-            # TODO, check origins are already in the DB
-            # if not, algo has something wrong (algo inserts it initially)
-            pass
-
         dst_objs = {
             x["sha1"]: x["_id"]
             for x in self.db.get_collection(dst_relation).find(
-                {"sha1": {"$in": list(dst_sha1s)}}, {"_id": 1, "sha1": 1}
+                {"sha1": {"$in": [x.dst for x in set_data]}}, {"_id": 1, "sha1": 1}
             )
         }
 
@@ -337,42 +342,24 @@
         }
 
         for sha1, dsts in denorm.items():
-            if sha1 in src_objs:
-                # update
-                if src_relation != "revision":
-                    k = {
-                        obj_id: list(set(paths + dsts.get(obj_id, [])))
-                        for obj_id, paths in src_objs[sha1][dst_relation].items()
-                    }
-                    self.db.get_collection(src_relation).update_one(
-                        {"_id": src_objs[sha1]["_id"]},
-                        {"$set": {dst_relation: dict(dsts, **k)}},
-                    )
-                else:
-                    self.db.get_collection(src_relation).update_one(
-                        {"_id": src_objs[sha1]["_id"]},
-                        {
-                            "$set": {
-                                dst_relation: list(
-                                    set(src_objs[sha1][dst_relation] + dsts)
-                                )
-                            }
-                        },
-                    )
+            # update
+            if src_relation != "revision":
+                k = {
+                    obj_id: list(set(paths + dsts.get(obj_id, [])))
+                    for obj_id, paths in src_objs[sha1][dst_relation].items()
+                }
+                self.db.get_collection(src_relation).update_one(
+                    {"_id": src_objs[sha1]["_id"]},
+                    {"$set": {dst_relation: dict(dsts, **k)}},
+                )
             else:
-                # add new rev
-                src_obj: Dict[str, Any] = {"ts": None}
-                if src_relation == "content":
-                    src_obj["revision"] = {}
-                    src_obj["directory"] = {}
-                if src_relation == "directory":
-                    src_obj["revision"] = {}
-                if src_relation == "revision":
-                    src_obj["preferred"] = None
-                    src_obj["origin"] = []
-                    src_obj["revision"] = []
-                self.db.get_collection(src_relation).insert_one(
-                    dict(src_obj, **{"sha1": sha1, dst_relation: dsts})
+                self.db.get_collection(src_relation).update_one(
+                    {"_id": src_objs[sha1]["_id"]},
+                    {
+                        "$set": {
+                            dst_relation: list(set(src_objs[sha1][dst_relation] + dsts))
+                        }
+                    },
                 )
         return True
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -22,6 +22,7 @@
     RelationData,
     RelationType,
     RevisionData,
+    UnsupportedEntityError,
 )
 
 LOGGER = logging.getLogger(__name__)
@@ -87,11 +88,52 @@
     def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
         return self._entity_get_date("directory", ids)
 
+    def entity_add(self, entity: EntityType, ids: Iterable[Sha1Git]) -> bool:
+        if entity == EntityType.ORIGIN:
+            raise UnsupportedEntityError(entity)
+
+        try:
+            sha1s = [(sha1,) for sha1 in ids]
+            if sha1s:
+                sql = f"""
+                    INSERT INTO {entity.value}(sha1) VALUES %s
+                    ON CONFLICT DO NOTHING
+                    """
+                with self.transaction() as cursor:
+                    psycopg2.extras.execute_values(cursor, sql, argslist=sha1s)
+            return True
+        except:  # noqa: E722
+            # Unexpected error occurred, rollback all changes and log message
+            LOGGER.exception("Unexpected error")
+            if self.raise_on_commit:
+                raise
+        return False
+
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         with self.transaction(readonly=True) as cursor:
             cursor.execute(f"SELECT sha1 FROM {entity.value}")
             return {row["sha1"] for row in cursor}
 
+    def location_add(self, paths: Iterable[bytes]) -> bool:
+        if not self.with_path():
+            return True
+        try:
+            values = [(path,) for path in paths]
+            if values:
+                sql = """
+                    INSERT INTO location(path) VALUES %s
+                    ON CONFLICT DO NOTHING
+                    """
+                with self.transaction() as cursor:
+                    psycopg2.extras.execute_values(cursor, sql, argslist=values)
+            return True
+        except:  # noqa: E722
+            # Unexpected error occurred, rollback all changes and log message
+            LOGGER.exception("Unexpected error")
+            if self.raise_on_commit:
+                raise
+        return False
+
     def location_get(self) -> Set[bytes]:
         with self.transaction(readonly=True) as cursor:
             cursor.execute("SELECT location.path AS path FROM location")
@@ -187,32 +229,6 @@
                 rel_table = relation.value
                 src_table, *_, dst_table = rel_table.split("_")
 
-                if src_table != "origin":
-                    # Origin entries should be inserted previously as they require extra
-                    # non-null information
-                    srcs = tuple(set((sha1,) for (sha1, _, _) in rows))
-                    sql = f"""
-                        INSERT INTO {src_table}(sha1) VALUES %s
-                        ON CONFLICT DO NOTHING
-                        """
-                    with self.transaction() as cursor:
-                        psycopg2.extras.execute_values(
-                            cur=cursor, sql=sql, argslist=srcs
-                        )
-
-                if dst_table != "origin":
-                    # Origin entries should be inserted previously as they require extra
-                    # non-null information
-                    dsts = tuple(set((sha1,) for (_, sha1, _) in rows))
-                    sql = f"""
-                        INSERT INTO {dst_table}(sha1) VALUES %s
-                        ON CONFLICT DO NOTHING
-                        """
-                    with self.transaction() as cursor:
-                        psycopg2.extras.execute_values(
-                            cur=cursor, sql=sql, argslist=dsts
-                        )
-
                 # Put the next three queries in a manual single transaction:
                 # they use the same temp table
                 with self.transaction() as cursor:
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -13,6 +13,7 @@
 from swh.model.model import Sha1Git
 
 from .interface import (
+    EntityType,
     ProvenanceResult,
     ProvenanceStorageInterface,
     RelationData,
@@ -79,6 +80,49 @@
     def flush(self) -> None:
         # Revision-content layer insertions ############################################
 
+        # Entities and locations need to be registered in advance, since
+        # `relation_add` assumes all entities being related already exist.
+        sha1s = {
+            src
+            for src, _, _ in self.cache["content_in_revision"]
+            | self.cache["content_in_directory"]
+        }
+        if sha1s:
+            while not self.storage.entity_add(EntityType.CONTENT, sha1s):
+                LOGGER.warning(
+                    "Unable to write content entities to the storage. Retrying..."
+                )
+
+        sha1s = {dst for _, dst, _ in self.cache["content_in_directory"]}
+        if sha1s:
+            while not self.storage.entity_add(EntityType.DIRECTORY, sha1s):
+                LOGGER.warning(
+                    "Unable to write directory entities to the storage. Retrying..."
+                )
+
+        sha1s = {
+            dst
+            for _, dst, _ in self.cache["content_in_revision"]
+            | self.cache["directory_in_revision"]
+        }
+        if sha1s:
+            while not self.storage.entity_add(EntityType.REVISION, sha1s):
+                LOGGER.warning(
+                    "Unable to write revision entities to the storage. Retrying..."
+                )
+
+        paths = {
+            path
+            for _, _, path in self.cache["content_in_revision"]
+            | self.cache["content_in_directory"]
+            | self.cache["directory_in_revision"]
+        }
+        if paths:
+            while not self.storage.location_add(paths):
+                LOGGER.warning(
+                    "Unable to write locations to the storage. Retrying..."
+                )
+
         # For this layer, relations need to be inserted first so that, in case of
         # failure, reprocessing the input does not generated an inconsistent database.
         if self.cache["content_in_revision"]:
@@ -170,6 +214,17 @@
                     "Unable to write origins urls to the storage. Retrying..."
                 )
 
+        sha1s = (
+            {src for src in self.cache["revision_origin"]["added"]}
+            # Destinations in this relation should match origins in the previous one
+            | {src for src in self.cache["revision_before_revision"]}
+        )
+        if sha1s:
+            while not self.storage.entity_add(EntityType.REVISION, sha1s):
+                LOGGER.warning(
+                    "Unable to write revision entities to the storage. Retrying..."
+                )
+
         # Second, flat models for revisions' histories (ie. revision-before-revision).
         data: Iterable[RelationData] = sum(
             [
diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql
--- a/swh/provenance/sql/40-funcs.sql
+++ b/swh/provenance/sql/40-funcs.sql
@@ -99,11 +99,6 @@
     join_location text;
 begin
     if src_table in ('content'::regclass, 'directory'::regclass) then
-        insert into location(path)
-        select V.path
-        from tmp_relation_add as V
-        on conflict (path) do nothing;
-
         select_fields := 'D.id, L.id';
         join_location := 'inner join location as L on (L.path = V.path)';
     else
@@ -419,11 +414,6 @@
     on_conflict text;
 begin
     if src_table in ('content'::regclass, 'directory'::regclass) then
-        insert into location(path)
-        select V.path
-        from tmp_relation_add as V
-        on conflict (path) do nothing;
-
         select_fields := 'array_agg((D.id, L.id)::rel_dst)';
         join_location := 'inner join location as L on (L.path = V.path)';
         group_entries := 'group by S.id';
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -3,7 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from datetime import datetime
+from datetime import datetime, timezone
 import inspect
 import os
 from typing import Any, Dict, Iterable, Optional, Set
@@ -13,6 +13,7 @@
 from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import origin_identifier
 from swh.model.model import Sha1Git
+from swh.provenance.archive import ArchiveInterface
 from swh.provenance.interface import (
     EntityType,
     ProvenanceInterface,
@@ -20,54 +21,210 @@
     ProvenanceStorageInterface,
     RelationData,
     RelationType,
+    RevisionData,
+    UnsupportedEntityError,
 )
-from swh.provenance.tests.conftest import load_repo_data, ts2dt
+from swh.provenance.model import OriginEntry, RevisionEntry
+from swh.provenance.mongo.backend import ProvenanceStorageMongoDb
+from swh.provenance.origin import origin_add
+from swh.provenance.provenance import Provenance
+from swh.provenance.revision import revision_add
+from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
 
 
-def relation_add_and_compare_result(
-    relation: RelationType,
-    data: Set[RelationData],
-    refstorage: ProvenanceStorageInterface,
-    storage: ProvenanceStorageInterface,
-    with_path: bool = True,
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_content(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
 ) -> None:
-    assert data
-    assert refstorage.relation_add(relation, data) == storage.relation_add(
-        relation, data
-    )
+    """Tests content methods for every `ProvenanceStorageInterface` implementation."""
 
-    assert relation_compare_result(
-        refstorage.relation_get(relation, (reldata.src for reldata in data)),
-        storage.relation_get(relation, (reldata.src for reldata in data)),
-        with_path,
-    )
-    assert relation_compare_result(
-        refstorage.relation_get(
-            relation,
-            (reldata.dst for reldata in data),
-            reverse=True,
-        ),
-        storage.relation_get(
-            relation,
-            (reldata.dst for reldata in data),
-            reverse=True,
-        ),
-        with_path,
-    )
-    assert relation_compare_result(
-        refstorage.relation_get_all(relation),
-        storage.relation_get_all(relation),
-        with_path,
-    )
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Add all content present in the current repo to the storage, just assigning their
+    # creation dates. Then check that the returned results when querying are the same.
+    dates = {cnt["sha1_git"]: cnt["ctime"] for cnt in data["content"]}
+    assert dates
+    assert provenance_storage.content_set_date(dates)
+    assert provenance_storage.content_get(set(dates.keys())) == dates
+    assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(dates.keys())
 
 
-def relation_compare_result(
-    expected: Set[RelationData], computed: Set[RelationData], with_path: bool
-) -> bool:
-    return {
-        RelationData(reldata.src, reldata.dst, reldata.path if with_path else None)
-        for reldata in expected
-    } == computed
+
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_directory(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests directory methods for every `ProvenanceStorageInterface` implementation."""
+
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Assign a date to each directory in the current repo: the max date of the blobs it
+    # contains, or the current date if it contains none. Then check that the returned
+    # results when querying are the same.
+    def getmaxdate(
+        directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
+    ) -> datetime:
+        dates = [
+            content["ctime"]
+            for entry in directory["entries"]
+            for content in contents
+            if entry["type"] == "file" and entry["target"] == content["sha1_git"]
+        ]
+        return max(dates) if dates else datetime.now(tz=timezone.utc)
+
+    dates = {dir["id"]: getmaxdate(dir, data["content"]) for dir in data["directory"]}
+    assert dates
+    assert provenance_storage.directory_set_date(dates)
+    assert provenance_storage.directory_get(set(dates.keys())) == dates
+    assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(dates.keys())
+
+
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_entity(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests entity methods for every `ProvenanceStorageInterface` implementation."""
+
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Test EntityType.CONTENT
+    # Add all contents present in the current repo to the storage. Then check that the
+    # returned results when querying are the same.
+    sha1s = {cnt["sha1_git"] for cnt in data["content"]}
+    assert sha1s
+    assert provenance_storage.entity_add(EntityType.CONTENT, sha1s)
+    assert provenance_storage.entity_get_all(EntityType.CONTENT) == sha1s
+
+    # Test EntityType.DIRECTORY
+    # Add all directories present in the current repo to the storage. Then check that
+    # the returned directories when querying are the same.
+    sha1s = {dir["id"] for dir in data["directory"]}
+    assert sha1s
+    assert provenance_storage.entity_add(EntityType.DIRECTORY, sha1s)
+    assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == sha1s
+
+    # Test EntityType.REVISION
+    # Add all revisions present in the current repo to the storage. Then check that the
+    # returned revisions when querying are the same.
+    sha1s = {rev["id"] for rev in data["revision"]}
+    assert sha1s
+    assert provenance_storage.entity_add(EntityType.REVISION, sha1s)
+    assert provenance_storage.entity_get_all(EntityType.REVISION) == sha1s
+
+    # Test EntityType.ORIGIN
+    # Add all origins present in the current repo. It should fail with an
+    # `UnsupportedEntityError`. Then check that indeed nothing was inserted.
+    sha1s = {hash_to_bytes(origin_identifier(org)) for org in data["origin"]}
+    assert sha1s
+    with pytest.raises(UnsupportedEntityError) as error:
+        provenance_storage.entity_add(EntityType.ORIGIN, sha1s)
+    assert "Unsupported entity: origin" in str(error.value)
+    assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set()
+
+
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_location(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests location methods for every `ProvenanceStorageInterface` implementation."""
+
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Add all names of entries present in the directories of the current repo as paths
+    # to the storage. Then check that the returned results when querying are the same.
+    paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
+    assert provenance_storage.location_add(paths)
+
+    if isinstance(provenance_storage, ProvenanceStorageMongoDb):
+        # TODO: remove this when `location_add` is properly implemented for MongoDb.
+        return
+
+    if provenance_storage.with_path():
+        assert provenance_storage.location_get() == paths
+    else:
+        assert provenance_storage.location_get() == set()
+
+
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_origin(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests origin methods for every `ProvenanceStorageInterface` implementation."""
+
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Test origin methods.
+    # Add all origins present in the current repo to the storage. Then check that the
+    # returned results when querying are the same.
+    urls = {hash_to_bytes(origin_identifier(org)): org["url"] for org in data["origin"]}
+    assert urls
+    assert provenance_storage.origin_set_url(urls)
+    assert provenance_storage.origin_get(set(urls.keys())) == urls
+    assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(urls.keys())
+
+
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_revision(
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests revision methods for every `ProvenanceStorageInterface` implementation."""
+
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+
+    # Test revision methods.
+    # Add all revisions present in the current repo to the storage, assigning their
+    # dates and an arbitrary origin to each one. Then check that the returned results
+    # when querying are the same.
+    origin = next(iter(data["origin"]))
+    org_sha1 = hash_to_bytes(origin_identifier(origin))
+    # Origin must be inserted in advance.
+    assert provenance_storage.origin_set_url({org_sha1: origin["url"]})
+
+    dates = {rev["id"]: ts2dt(rev["date"]) for rev in data["revision"]}
+    orgs = {rev["id"]: org_sha1 for rev in data["revision"]}
+    assert set(dates.keys()) == set(orgs.keys())
+    revs = {
+        # The assert above guarantees `dates` and `orgs` share the same keys, so
+        # each revision can be paired with its date and origin directly.
+        sha1: RevisionData(date, orgs[sha1])
+        for sha1, date in dates.items()
+    }
+
+    assert dates
+    assert orgs
+    assert provenance_storage.revision_set_date(dates)
+    assert provenance_storage.revision_set_origin(orgs)
+    assert provenance_storage.revision_get(set(revs.keys())) == revs
+    assert provenance_storage.entity_get_all(EntityType.REVISION) == set(revs.keys())
 
 
 def dircontent(
@@ -94,41 +251,66 @@
     return content
 
 
+def relation_add_and_compare_result(
+    relation: RelationType, data: Set[RelationData], storage: ProvenanceStorageInterface
+) -> None:
+    # Source, destination and location entries must be added in advance.
+    src, *_, dst = relation.value.split("_")
+    if src != "origin":
+        assert storage.entity_add(EntityType(src), {entry.src for entry in data})
+    if dst != "origin":
+        assert storage.entity_add(EntityType(dst), {entry.dst for entry in data})
+    if storage.with_path():
+        assert storage.location_add(
+            {entry.path for entry in data if entry.path is not None}
+        )
+
+    assert data
+    assert storage.relation_add(relation, data)
+
+    for row in data:
+        assert relation_compare_result(
+            storage.relation_get(relation, [row.src]),
+            {entry for entry in data if entry.src == row.src},
+            storage.with_path(),
+        )
+        assert relation_compare_result(
+            storage.relation_get(
+                relation,
+                [row.dst],
+                reverse=True,
+            ),
+            {entry for entry in data if entry.dst == row.dst},
+            storage.with_path(),
+        )
+
+    assert relation_compare_result(
+        storage.relation_get_all(relation), data, storage.with_path()
+    )
+
+
+def relation_compare_result(
+    computed: Set[RelationData], expected: Set[RelationData], with_path: bool
+) -> bool:
+    return {
+        RelationData(row.src, row.dst, row.path if with_path else None)
+        for row in expected
+    } == computed
+
+
 @pytest.mark.parametrize(
     "repo",
-    ("cmdbts2", "out-of-order", "with-merges"),
+    ("cmdbts2",),
 )
-def test_provenance_storage(
-    provenance: ProvenanceInterface,
+def test_provenance_storage_relation(
     provenance_storage: ProvenanceStorageInterface,
     repo: str,
 ) -> None:
-    """Tests every ProvenanceStorageInterface implementation against the one provided
-    for provenance.storage."""
+    """Tests relation methods for every `ProvenanceStorageInterface` implementation."""
+
     # Read data/README.md for more details on how these datasets are generated.
     data = load_repo_data(repo)
 
-    # Assuming provenance.storage has the 'with-path' flavor.
-    assert provenance.storage.with_path()
-
-    # Test origin methods.
-    # Add all origins present in the current repo to both storages. Then check that the
-    # inserted data is the same in both cases.
-    org_urls = {
-        hash_to_bytes(origin_identifier(org)): org["url"] for org in data["origin"]
-    }
-    assert org_urls
-    assert provenance.storage.origin_set_url(
-        org_urls
-    ) == provenance_storage.origin_set_url(org_urls)
-
-    assert provenance.storage.origin_get(org_urls) == provenance_storage.origin_get(
-        org_urls
-    )
-    assert provenance.storage.entity_get_all(
-        EntityType.ORIGIN
-    ) == provenance_storage.entity_get_all(EntityType.ORIGIN)
-
     # Test content-in-revision relation.
     # Create flat models of every root directory for the revisions in the dataset.
     cnt_in_rev: Set[RelationData] = set()
@@ -137,13 +319,8 @@
             subdir for subdir in data["directory"] if subdir["id"] == rev["directory"]
         )
         cnt_in_rev.update(dircontent(data, rev["id"], root))
-
     relation_add_and_compare_result(
-        RelationType.CNT_EARLY_IN_REV,
-        cnt_in_rev,
-        provenance.storage,
-        provenance_storage,
-        provenance_storage.with_path(),
+        RelationType.CNT_EARLY_IN_REV, cnt_in_rev, provenance_storage
     )
 
     # Test content-in-directory relation.
@@ -151,13 +328,8 @@
     cnt_in_dir: Set[RelationData] = set()
     for dir in data["directory"]:
         cnt_in_dir.update(dircontent(data, dir["id"], dir))
-
     relation_add_and_compare_result(
-        RelationType.CNT_IN_DIR,
-        cnt_in_dir,
-        provenance.storage,
-        provenance_storage,
-        provenance_storage.with_path(),
+        RelationType.CNT_IN_DIR, cnt_in_dir, provenance_storage
    )
 
     # Test content-in-directory relation.
@@ -165,13 +337,8 @@
     dir_in_rev = {
         RelationData(rev["directory"], rev["id"], b".") for rev in data["revision"]
     }
-
     relation_add_and_compare_result(
-        RelationType.DIR_IN_REV,
-        dir_in_rev,
-        provenance.storage,
-        provenance_storage,
-        provenance_storage.with_path(),
+        RelationType.DIR_IN_REV, dir_in_rev, provenance_storage
     )
 
     # Test revision-in-origin relation.
@@ -190,12 +357,16 @@
         for _, branch in snapshot["branches"].items()
         if branch["target_type"] == "revision"
     }
+    # Origins must be inserted in advance (cannot be done by `entity_add` inside
+    # `relation_add_and_compare_result`).
+    urls = {
+        hash_to_bytes(origin_identifier(origin)): origin["url"]
+        for origin in data["origin"]
+    }
+    assert provenance_storage.origin_set_url(urls)
 
     relation_add_and_compare_result(
-        RelationType.REV_IN_ORG,
-        rev_in_org,
-        provenance.storage,
-        provenance_storage,
+        RelationType.REV_IN_ORG, rev_in_org, provenance_storage
     )
 
     # Test revision-before-revision relation.
@@ -205,87 +376,45 @@
         for rev in data["revision"]
        for parent in rev["parents"]
     }
-
     relation_add_and_compare_result(
-        RelationType.REV_BEFORE_REV,
-        rev_before_rev,
-        provenance.storage,
-        provenance_storage,
+        RelationType.REV_BEFORE_REV, rev_before_rev, provenance_storage
     )
 
-    # Test content methods.
-    # Add all content present in the current repo to both storages, just assigning their
-    # creation dates. Then check that the inserted content is the same in both cases.
-    cnt_dates = {cnt["sha1_git"]: cnt["ctime"] for cnt in data["content"]}
-    assert cnt_dates
-    assert provenance.storage.content_set_date(
-        cnt_dates
-    ) == provenance_storage.content_set_date(cnt_dates)
-
-    assert provenance.storage.content_get(cnt_dates) == provenance_storage.content_get(
-        cnt_dates
-    )
-    assert provenance.storage.entity_get_all(
-        EntityType.CONTENT
-    ) == provenance_storage.entity_get_all(EntityType.CONTENT)
-
-    # Test directory methods.
-    # Of all directories present in the current repo, only assign a date to those
-    # containing blobs (picking the max date among the available ones). Then check that
-    # the inserted data is the same in both storages.
-    def getmaxdate(
-        dir: Dict[str, Any], cnt_dates: Dict[Sha1Git, datetime]
-    ) -> Optional[datetime]:
-        dates = [
-            cnt_dates[entry["target"]]
-            for entry in dir["entries"]
-            if entry["type"] == "file"
-        ]
-        return max(dates) if dates else None
-
-    dir_dates = {dir["id"]: getmaxdate(dir, cnt_dates) for dir in data["directory"]}
-    assert dir_dates
-    assert provenance.storage.directory_set_date(
-        {sha1: date for sha1, date in dir_dates.items() if date is not None}
-    ) == provenance_storage.directory_set_date(
-        {sha1: date for sha1, date in dir_dates.items() if date is not None}
-    )
-    assert provenance.storage.directory_get(
-        dir_dates
-    ) == provenance_storage.directory_get(dir_dates)
-    assert provenance.storage.entity_get_all(
-        EntityType.DIRECTORY
-    ) == provenance_storage.entity_get_all(EntityType.DIRECTORY)
 
+@pytest.mark.parametrize(
+    "repo",
+    ("cmdbts2",),
+)
+def test_provenance_storage_find(
+    archive: ArchiveInterface,
+    provenance: ProvenanceInterface,
+    provenance_storage: ProvenanceStorageInterface,
+    repo: str,
+) -> None:
+    """Tests `content_find_first` and `content_find_all` methods for every
+    `ProvenanceStorageInterface` implementation.
+    """
 
-    # Test revision methods.
-    # Add all revisions present in the current repo to both storages, assigning their
-    # dataes and an arbitrary origin to each one. Then check that the inserted data is
-    # the same in both cases.
-    rev_dates = {rev["id"]: ts2dt(rev["date"]) for rev in data["revision"]}
-    assert rev_dates
-    assert provenance.storage.revision_set_date(
-        rev_dates
-    ) == provenance_storage.revision_set_date(rev_dates)
-
-    rev_origins = {
-        rev["id"]: next(iter(org_urls))  # any arbitrary origin will do
+    # Read data/README.md for more details on how these datasets are generated.
+    data = load_repo_data(repo)
+    fill_storage(archive.storage, data)
+
+    # Execute the origin-revision algorithm on both storages.
+    origins = [
+        OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
+        for sta in data["origin_visit_status"]
+        if sta["snapshot"] is not None
+    ]
+    origin_add(provenance, archive, origins)
+    origin_add(Provenance(provenance_storage), archive, origins)
+
+    # Execute the revision-content algorithm on both storages.
+    revisions = [
+        RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
         for rev in data["revision"]
-    }
-    assert rev_origins
-    assert provenance.storage.revision_set_origin(
-        rev_origins
-    ) == provenance_storage.revision_set_origin(rev_origins)
-
-    assert provenance.storage.revision_get(
-        rev_dates
-    ) == provenance_storage.revision_get(rev_dates)
-    assert provenance.storage.entity_get_all(
-        EntityType.REVISION
-    ) == provenance_storage.entity_get_all(EntityType.REVISION)
-
-    # Test location_get.
-    if provenance_storage.with_path():
-        assert provenance.storage.location_get() == provenance_storage.location_get()
+    ]
+    revision_add(provenance, archive, revisions)
+    revision_add(Provenance(provenance_storage), archive, revisions)
 
     # Test content_find_first and content_find_all.
     def adapt_result(
@@ -301,7 +430,7 @@
         )
         return result
 
-    for cnt in cnt_dates:
+    for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
         assert adapt_result(
             provenance.storage.content_find_first(cnt), provenance_storage.with_path()
         ) == provenance_storage.content_find_first(cnt)
@@ -312,7 +441,7 @@
         } == set(provenance_storage.content_find_all(cnt))
 
 
-def test_types(provenance: ProvenanceInterface) -> None:
+def test_types(provenance_storage: ProvenanceStorageInterface) -> None:
     """Checks all methods of ProvenanceStorageInterface are implemented by this
     backend, and that they have the same signature."""
     # Create an instance of the protocol (which cannot be instantiated
@@ -328,7 +457,7 @@
             continue
 
         interface_meth = getattr(interface, meth_name)
        try:
-            concrete_meth = getattr(provenance.storage, meth_name)
+            concrete_meth = getattr(provenance_storage, meth_name)
         except AttributeError:
             if not getattr(interface_meth, "deprecated_endpoint", False):
                 # The backend is missing a (non-deprecated) endpoint
@@ -346,4 +475,4 @@
     # But there's no harm in double-checking.
     # And we could replace the assertions above by this one, but unlike
     # the assertions above, it doesn't explain what is missing.
-    assert isinstance(provenance.storage, ProvenanceStorageInterface)
+    assert isinstance(provenance_storage, ProvenanceStorageInterface)
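
Editor's note: for readers trying out the reworked API, here is a minimal sketch of the calling convention the new docstrings prescribe: entities (or origin URLs, for origins) and locations are registered first, and `relation_add` is only called once everything it references exists. The `get_provenance_storage` factory call, its configuration, and the hash values and path below are illustrative assumptions, not part of this diff; only the interface methods themselves come from the patch.

```python
from swh.provenance import get_provenance_storage
from swh.provenance.interface import EntityType, RelationData, RelationType

# Hypothetical sha1_git ids and path, for illustration only.
content = bytes.fromhex("aa" * 20)
revision = bytes.fromhex("bb" * 20)
path = b"src/main.c"

# Backend configuration is environment-specific; adjust to your setup.
storage = get_provenance_storage(cls="postgresql", db={"dbname": "provenance"})

# 1. Register the entities being related. Origins are the exception: they have a
#    mandatory `url` field, so they must go through `origin_set_url` instead, and
#    entity_add(EntityType.ORIGIN, ...) raises UnsupportedEntityError.
assert storage.entity_add(EntityType.CONTENT, [content])
assert storage.entity_add(EntityType.REVISION, [revision])

# 2. Register the locations (paths) that the relation entries will reference.
assert storage.location_add([path])

# 3. Only now add the relation; `relation_add` assumes steps 1 and 2 already happened.
assert storage.relation_add(
    RelationType.CNT_EARLY_IN_REV, [RelationData(content, revision, path)]
)
```

The reworked `flush` in `swh/provenance/provenance.py` follows the same order, retrying each `entity_add`/`location_add` call until it succeeds before flushing the relations, so a partial failure never leaves relations pointing at unregistered entities.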