D6094: Provenance Mongo backend
D6094.id22200.diff (33 KB)
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -5,6 +5,9 @@
# 3rd party libraries without stubs (yet)
+[mypy-bson.*]
+ignore_missing_imports = True
+
[mypy-iso8601.*]
ignore_missing_imports = True
@@ -17,6 +20,9 @@
[mypy-pkg_resources.*]
ignore_missing_imports = True
+[mypy-pymongo.*]
+ignore_missing_imports = True
+
[mypy-pytest.*]
ignore_missing_imports = True
diff --git a/pytest.ini b/pytest.ini
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,6 @@
[pytest]
norecursedirs = docs .*
+
+mongodb_fixture_dir = swh/provenance/tests/data/mongo
+mongodb_engine = mongomock
+mongodb_dbname = test
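
The three new keys are read by pytest-mongodb: `mongodb_fixture_dir` points at the (for now empty) fixture data directory, `mongodb_engine = mongomock` keeps tests in-process with no running MongoDB server, and `mongodb_dbname` names the test database. A minimal sketch of a test using the plugin's `mongodb` fixture (the collection name and document below are invented for illustration):

```python
# Minimal sketch: pytest-mongodb injects a `mongodb` database fixture,
# backed by mongomock per the pytest.ini above, so no server is needed.
# The collection and document here are illustrative, not from the patch.
def test_mongodb_fixture_roundtrip(mongodb):
    mongodb.content.insert_one({"sha1": b"\x00" * 20, "ts": None})
    assert mongodb.content.find_one({"sha1": b"\x00" * 20})["ts"] is None
```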
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,5 @@
pytest
+pytest-mongodb
swh.loader.git >= 0.8
swh.journal >= 0.8
types-Werkzeug
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,8 @@
click
iso8601
methodtools
+pymongo
PyYAML
types-click
types-PyYAML
+types-Werkzeug
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -87,6 +87,15 @@
raise_on_commit = kwargs.get("raise_on_commit", False)
return ProvenanceStoragePostgreSql(conn, raise_on_commit)
+ elif cls == "mongodb":
+ from pymongo import MongoClient
+
+ from .mongo.backend import ProvenanceStorageMongoDb
+
+ dbname = kwargs["db"].pop("dbname")
+ db = MongoClient(**kwargs["db"]).get_database(dbname)
+ return ProvenanceStorageMongoDb(db)
+
elif cls == "remote":
from .api.client import RemoteProvenanceStorage
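
The new branch pops `dbname` out of the `db` mapping and forwards the remaining keys verbatim to `MongoClient`. A hedged sketch of what a caller would pass (host and port are placeholder values, not from this patch):

```python
# Sketch of driving the new factory branch; host/port are placeholders.
from swh.provenance import get_provenance_storage

storage = get_provenance_storage(
    cls="mongodb",
    db={"host": "localhost", "port": 27017, "dbname": "provenance"},
)
# which the branch above resolves to roughly:
#   MongoClient(host="localhost", port=27017).get_database("provenance")
```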
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -26,11 +26,13 @@
DEFAULT_CONFIG: Dict[str, Any] = {
"provenance": {
"archive": {
+ # Storage API based Archive object
# "cls": "api",
# "storage": {
# "cls": "remote",
# "url": "http://uffizi.internal.softwareheritage.org:5002",
# }
+ # Direct access Archive object
"cls": "direct",
"db": {
"host": "db.internal.softwareheritage.org",
@@ -39,8 +41,22 @@
},
},
"storage": {
+ # Local PostgreSQL Storage
"cls": "postgresql",
- "db": {"host": "localhost", "dbname": "provenance"},
+ "db": {
+ "host": "localhost",
+ "user": "postgres",
+ "password": "postgres",
+ "dbname": "provenance",
+ },
+ # Local MongoDB Storage
+ # "cls": "mongodb",
+ # "db": {
+ # "dbname": "provenance",
+ # },
+ # Remote REST-API/PostgreSQL
+ # "cls": "remote",
+ # "url": "http://localhost:8080/%2f",
},
}
}
diff --git a/swh/provenance/mongo/README.md b/swh/provenance/mongo/README.md
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/README.md
@@ -0,0 +1,44 @@
+mongo backend
+=============
+
+Provenance storage implementation using MongoDB
+
+initial data-model
+------------------
+
+```json
+content
+{
+ id: sha1
+ ts: int //optional
+ revision: {<ref revision str>: [<ref path>]}
+ directory: {<ref directory str>: [<ref path>]}
+}
+
+directory
+{
+ id: sha1
+ ts: int //optional
+ revision: {<ref revision str>: [<ref path>]}
+}
+
+revision
+{
+ id: sha1
+ ts: int // optional
+ preferred: <ref origin> // optional
+ origin: [<ref origin>]
+ revision: [<ref revision>]
+}
+
+origin
+{
+ id: sha1
+ url: str
+}
+
+path
+{
+ path: str
+}
+```
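
To make the reference notation concrete, a hypothetical `content` document under this model might look as follows; all values are invented, and note that the implementation in backend.py stores the id field as `sha1` (with Mongo's automatic `_id` alongside), while the `revision`/`directory` keys are stringified ObjectIds of the referenced documents:

```python
# Hypothetical `content` document for the model above; all values invented.
content_doc = {
    "sha1": b"\x8a" * 20,  # content id (sha1_git)
    "ts": 1629381600,      # optional earliest-known timestamp
    # stringified ObjectId of a revision doc -> paths of this blob in it
    "revision": {"61274f7e2f3a5c0a8b1d2e3f": [b"src/main.c"]},
    # stringified ObjectId of a directory doc -> paths below that directory
    "directory": {"61274f7e2f3a5c0a8b1d2e40": [b"main.c"]},
}
```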
diff --git a/swh/provenance/mongo/__init__.py b/swh/provenance/mongo/__init__.py
new file mode 100644
diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/mongo/backend.py
@@ -0,0 +1,488 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timezone
+import os
+from typing import Any, Dict, Generator, Iterable, List, Optional, Set
+
+from bson import ObjectId
+import pymongo.database
+
+from swh.model.model import Sha1Git
+
+from ..interface import (
+ EntityType,
+ ProvenanceResult,
+ RelationData,
+ RelationType,
+ RevisionData,
+)
+
+
+class ProvenanceStorageMongoDb:
+ def __init__(self, db: pymongo.database.Database):
+ self.db = db
+
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
+ # get all the revisions
+ # iterate and find the earliest
+ content = self.db.content.find_one({"sha1": id})
+ if not content:
+ return None
+
+ occurs = []
+ for revision in self.db.revision.find(
+ {"_id": {"$in": [ObjectId(obj_id) for obj_id in content["revision"]]}}
+ ):
+ origin = self.db.origin.find_one({"sha1": revision["preferred"]})
+ assert origin is not None
+
+ for path in content["revision"][str(revision["_id"])]:
+ occurs.append(
+ ProvenanceResult(
+ content=id,
+ revision=revision["sha1"],
+ date=datetime.fromtimestamp(revision["ts"], timezone.utc),
+ origin=origin["url"],
+ path=path,
+ )
+ )
+ return min(occurs, key=lambda x: (x.date, x.revision, x.origin, x.path), default=None)
+
+ def content_find_all(
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[ProvenanceResult, None, None]:
+ content = self.db.content.find_one({"sha1": id})
+ if not content:
+ return None
+
+ occurs = []
+ for revision in self.db.revision.find(
+ {"_id": {"$in": [ObjectId(obj_id) for obj_id in content["revision"]]}}
+ ):
+ origin = self.db.origin.find_one({"sha1": revision["preferred"]})
+ assert origin is not None
+
+ for path in content["revision"][str(revision["_id"])]:
+ occurs.append(
+ ProvenanceResult(
+ content=id,
+ revision=revision["sha1"],
+ date=datetime.fromtimestamp(revision["ts"], timezone.utc),
+ origin=origin["url"],
+ path=path,
+ )
+ )
+ for directory in self.db.directory.find(
+ {"_id": {"$in": [ObjectId(obj_id) for obj_id in content["directory"]]}}
+ ):
+ for revision in self.db.revision.find(
+ {"_id": {"$in": [ObjectId(obj_id) for obj_id in directory["revision"]]}}
+ ):
+ origin = self.db.origin.find_one({"sha1": revision["preferred"]})
+ assert origin is not None
+
+ for suffix in content["directory"][str(directory["_id"])]:
+ for prefix in directory["revision"][str(revision["_id"])]:
+ path = (
+ os.path.join(prefix, suffix)
+ if prefix not in [b".", b""]
+ else suffix
+ )
+ occurs.append(
+ ProvenanceResult(
+ content=id,
+ revision=revision["sha1"],
+ date=datetime.fromtimestamp(
+ revision["ts"], timezone.utc
+ ),
+ origin=origin["url"],
+ path=path,
+ )
+ )
+ yield from sorted(occurs, key=lambda x: (x.date, x.revision, x.origin, x.path))
+
+ def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+ return {
+ x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
+ for x in self.db.content.find(
+ {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
+ {"sha1": 1, "ts": 1, "_id": 0},
+ )
+ }
+
+ def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
+ # get all the documents with the given ids; update dates, add missing records
+ cnts = {
+ x["sha1"]: x
+ for x in self.db.content.find(
+ {"sha1": {"$in": list(dates)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ )
+ }
+
+ for sha1, date in dates.items():
+ ts = datetime.timestamp(date)
+ if sha1 in cnts:
+ # update
+ if cnts[sha1]["ts"] is None or ts < cnts[sha1]["ts"]:
+ self.db.content.update_one(
+ {"_id": cnts[sha1]["_id"]}, {"$set": {"ts": ts}}
+ )
+ else:
+ # add new content
+ self.db.content.insert_one(
+ {
+ "sha1": sha1,
+ "ts": ts,
+ "revision": {},
+ "directory": {},
+ }
+ )
+ return True
+
+ def directory_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
+ dirs = {
+ x["sha1"]: x
+ for x in self.db.directory.find(
+ {"sha1": {"$in": list(dates)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ )
+ }
+ for sha1, date in dates.items():
+ ts = datetime.timestamp(date)
+ if sha1 in dirs:
+ # update
+ if dirs[sha1]["ts"] is None or ts < dirs[sha1]["ts"]:
+ self.db.directory.update_one(
+ {"_id": dirs[sha1]["_id"]}, {"$set": {"ts": ts}}
+ )
+ else:
+ # add new dir
+ self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}})
+ return True
+
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+ return {
+ x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
+ for x in self.db.directory.find(
+ {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
+ {"sha1": 1, "ts": 1, "_id": 0},
+ )
+ }
+
+ def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
+ return {
+ x["sha1"]
+ for x in self.db.get_collection(entity.value).find(
+ {}, {"sha1": 1, "_id": 0}
+ )
+ }
+
+ def location_get(self) -> Set[bytes]:
+ contents = self.db.content.find({}, {"revision": 1, "_id": 0, "directory": 1})
+ paths: List[Iterable[bytes]] = []
+ for content in contents:
+ paths.extend(content["revision"].values())
+ paths.extend(content["directory"].values())
+
+ dirs = self.db.directory.find({}, {"revision": 1, "_id": 0})
+ for each_dir in dirs:
+ paths.extend(each_dir["revision"].values())
+ return set(sum(paths, []))
+
+ def origin_set_url(self, urls: Dict[Sha1Git, str]) -> bool:
+ origins = {
+ x["sha1"]: x
+ for x in self.db.origin.find(
+ {"sha1": {"$in": list(urls)}}, {"sha1": 1, "url": 1, "_id": 1}
+ )
+ }
+ for sha1, url in urls.items():
+ if sha1 not in origins:
+ # add new origin
+ self.db.origin.insert_one({"sha1": sha1, "url": url})
+ return True
+
+ def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]:
+ return {
+ x["sha1"]: x["url"]
+ for x in self.db.origin.find(
+ {"sha1": {"$in": list(ids)}}, {"sha1": 1, "url": 1, "_id": 0}
+ )
+ }
+
+ def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool:
+ revs = {
+ x["sha1"]: x
+ for x in self.db.revision.find(
+ {"sha1": {"$in": list(dates)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ )
+ }
+ for sha1, date in dates.items():
+ ts = datetime.timestamp(date)
+ if sha1 in revs:
+ # update
+ if revs[sha1]["ts"] is None or ts < revs[sha1]["ts"]:
+ self.db.revision.update_one(
+ {"_id": revs[sha1]["_id"]}, {"$set": {"ts": ts}}
+ )
+ else:
+ # add new rev
+ self.db.revision.insert_one(
+ {
+ "sha1": sha1,
+ "preferred": None,
+ "origin": [],
+ "revision": [],
+ "ts": ts,
+ }
+ )
+ return True
+
+ def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool:
+ revs = {
+ x["sha1"]: x
+ for x in self.db.revision.find(
+ {"sha1": {"$in": list(origins)}}, {"sha1": 1, "preferred": 1, "_id": 1}
+ )
+ }
+ for sha1, origin in origins.items():
+ if sha1 in revs:
+ self.db.revision.update_one(
+ {"_id": revs[sha1]["_id"]}, {"$set": {"preferred": origin}}
+ )
+ else:
+ # add new rev
+ self.db.revision.insert_one(
+ {
+ "sha1": sha1,
+ "preferred": origin,
+ "origin": [],
+ "revision": [],
+ "ts": None,
+ }
+ )
+ return True
+
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]:
+ return {
+ x["sha1"]: RevisionData(
+ date=datetime.fromtimestamp(x["ts"], timezone.utc) if x["ts"] is not None else None,
+ origin=x["preferred"],
+ )
+ for x in self.db.revision.find(
+ {"sha1": {"$in": list(ids)}},
+ {"sha1": 1, "preferred": 1, "ts": 1, "_id": 0},
+ )
+ }
+
+ def relation_add(
+ self, relation: RelationType, data: Iterable[RelationData]
+ ) -> bool:
+ src_relation, *_, dst_relation = relation.value.split("_")
+ set_data = set(data)
+
+ dst_sha1s = {x.dst for x in set_data}  # `data` may be a one-shot iterable
+ if dst_relation in ["content", "directory", "revision"]:
+ dst_obj: Dict[str, Any] = {"ts": None}
+ if dst_relation == "content":
+ dst_obj["revision"] = {}
+ dst_obj["directory"] = {}
+ if dst_relation == "directory":
+ dst_obj["revision"] = {}
+ if dst_relation == "revision":
+ dst_obj["preferred"] = None
+ dst_obj["origin"] = []
+ dst_obj["revision"] = []
+
+ existing = {
+ x["sha1"]
+ for x in self.db.get_collection(dst_relation).find(
+ {"sha1": {"$in": list(dst_sha1s)}}, {"_id": 0, "sha1": 1}
+ )
+ }
+
+ for sha1 in dst_sha1s:
+ if sha1 not in existing:
+ self.db.get_collection(dst_relation).insert_one(
+ dict(dst_obj, **{"sha1": sha1})
+ )
+ elif dst_relation == "origin":
+ # TODO: check that the origins are already in the DB; if not, the
+ # algorithm is at fault, since it inserts origins first
+ pass
+
+ dst_objs = {
+ x["sha1"]: x["_id"]
+ for x in self.db.get_collection(dst_relation).find(
+ {"sha1": {"$in": list(dst_sha1s)}}, {"_id": 1, "sha1": 1}
+ )
+ }
+
+ denorm: Dict[Sha1Git, Any] = {}
+ for each in set_data:
+ if src_relation != "revision":
+ denorm.setdefault(each.src, {}).setdefault(
+ str(dst_objs[each.dst]), []
+ ).append(each.path)
+ else:
+ denorm.setdefault(each.src, []).append(dst_objs[each.dst])
+
+ src_objs = {
+ x["sha1"]: x
+ for x in self.db.get_collection(src_relation).find(
+ {"sha1": {"$in": list(denorm)}}
+ )
+ }
+
+ for sha1 in denorm:
+ if sha1 in src_objs:
+ # update
+ if src_relation != "revision":
+ k = {
+ obj_id: list(set(paths + denorm[sha1].get(obj_id, [])))
+ for obj_id, paths in src_objs[sha1][dst_relation].items()
+ }
+ self.db.get_collection(src_relation).update_one(
+ {"_id": src_objs[sha1]["_id"]},
+ {"$set": {dst_relation: dict(denorm[sha1], **k)}},
+ )
+ else:
+ self.db.get_collection(src_relation).update_one(
+ {"_id": src_objs[sha1]["_id"]},
+ {
+ "$set": {
+ dst_relation: list(
+ set(src_objs[sha1][dst_relation] + denorm[sha1])
+ )
+ }
+ },
+ )
+ else:
+ # add new rev
+ src_obj: Dict[str, Any] = {"ts": None}
+ if src_relation == "content":
+ src_obj["revision"] = {}
+ src_obj["directory"] = {}
+ if src_relation == "directory":
+ src_obj["revision"] = {}
+ if src_relation == "revision":
+ src_obj["preferred"] = None
+ src_obj["origin"] = []
+ src_obj["revision"] = []
+ self.db.get_collection(src_relation).insert_one(
+ dict(src_obj, **{"sha1": sha1, dst_relation: denorm[sha1]})
+ )
+ return True
+
+ def relation_get(
+ self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False
+ ) -> Set[RelationData]:
+ src, *_, dst = relation.value.split("_")
+ sha1s = set(ids)
+ if not reverse:
+ src_objs = {
+ x["sha1"]: x[dst]
+ for x in self.db.get_collection(src).find(
+ {"sha1": {"$in": list(sha1s)}}, {"_id": 0, "sha1": 1, dst: 1}
+ )
+ }
+ dst_ids = list(
+ {ObjectId(obj_id) for _, value in src_objs.items() for obj_id in value}
+ )
+ dst_objs = {
+ x["sha1"]: x["_id"]
+ for x in self.db.get_collection(dst).find(
+ {"_id": {"$in": dst_ids}}, {"_id": 1, "sha1": 1}
+ )
+ }
+ if src != "revision":
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=path)
+ for src_sha1, denorm in src_objs.items()
+ for dst_sha1, dst_obj_id in dst_objs.items()
+ for dst_obj_str, paths in denorm.items()
+ for path in paths
+ if dst_obj_id == ObjectId(dst_obj_str)
+ }
+ else:
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=None)
+ for src_sha1, denorm in src_objs.items()
+ for dst_sha1, dst_obj_id in dst_objs.items()
+ for dst_obj_ref in denorm
+ if dst_obj_id == dst_obj_ref
+ }
+ else:
+ dst_objs = {
+ x["sha1"]: x["_id"]
+ for x in self.db.get_collection(dst).find(
+ {"sha1": {"$in": list(sha1s)}}, {"_id": 1, "sha1": 1}
+ )
+ }
+ src_objs = {
+ x["sha1"]: x[dst]
+ for x in self.db.get_collection(src).find(
+ {}, {"_id": 0, "sha1": 1, dst: 1}
+ )
+ }
+ if src != "revision":
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=path)
+ for src_sha1, denorm in src_objs.items()
+ for dst_sha1, dst_obj_id in dst_objs.items()
+ for dst_obj_str, paths in denorm.items()
+ for path in paths
+ if dst_obj_id == ObjectId(dst_obj_str)
+ }
+ else:
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=None)
+ for src_sha1, denorm in src_objs.items()
+ for dst_sha1, dst_obj_id in dst_objs.items()
+ for dst_obj_ref in denorm
+ if dst_obj_id == dst_obj_ref
+ }
+
+ def relation_get_all(self, relation: RelationType) -> Set[RelationData]:
+ src, *_, dst = relation.value.split("_")
+ src_objs = {
+ x["sha1"]: x[dst]
+ for x in self.db.get_collection(src).find({}, {"_id": 0, "sha1": 1, dst: 1})
+ }
+ dst_ids = list(
+ {ObjectId(obj_id) for _, value in src_objs.items() for obj_id in value}
+ )
+ if src != "revision":
+ dst_objs = {
+ x["_id"]: x["sha1"]
+ for x in self.db.get_collection(dst).find(
+ {"_id": {"$in": dst_ids}}, {"_id": 1, "sha1": 1}
+ )
+ }
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=path)
+ for src_sha1, denorm in src_objs.items()
+ for dst_obj_id, dst_sha1 in dst_objs.items()
+ for dst_obj_str, paths in denorm.items()
+ for path in paths
+ if dst_obj_id == ObjectId(dst_obj_str)
+ }
+ else:
+ dst_objs = {
+ x["_id"]: x["sha1"]
+ for x in self.db.get_collection(dst).find(
+ {"_id": {"$in": dst_ids}}, {"_id": 1, "sha1": 1}
+ )
+ }
+ return {
+ RelationData(src=src_sha1, dst=dst_sha1, path=None)
+ for src_sha1, denorm in src_objs.items()
+ for dst_obj_id, dst_sha1 in dst_objs.items()
+ for dst_obj_ref in denorm
+ if dst_obj_id == dst_obj_ref
+ }
+
+ def with_path(self) -> bool:
+ return True
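
A short usage sketch of the backend against a local server (connection details are assumptions; the test suite uses the mongomock-backed `mongodb` fixture instead):

```python
# Sketch: exercising ProvenanceStorageMongoDb directly; connection
# parameters are placeholders, not part of this patch.
from datetime import datetime, timezone

from pymongo import MongoClient

from swh.provenance.mongo.backend import ProvenanceStorageMongoDb

db = MongoClient("localhost", 27017).get_database("provenance")
storage = ProvenanceStorageMongoDb(db)

sha1 = b"\x8a" * 20  # invented content id
date = datetime(2021, 8, 1, tzinfo=timezone.utc)
storage.content_set_date({sha1: date})
assert storage.content_get([sha1])[sha1] == date
```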
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -268,6 +268,7 @@
SELECT sha1, date
FROM {entity}
WHERE sha1 IN ({values})
+ AND date IS NOT NULL
"""
self.cursor.execute(sql, sha1s)
dates.update((row["sha1"], row["date"]) for row in self.cursor.fetchall())
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -155,8 +155,8 @@
# Origins urls should be inserted first so that internal ids' resolution works
# properly.
urls = {
- sha1: date
- for sha1, date in self.cache["origin"]["data"].items()
+ sha1: url
+ for sha1, url in self.cache["origin"]["data"].items()
if sha1 in self.cache["origin"]["added"]
}
while not self.storage.origin_set_url(urls):
@@ -280,14 +280,13 @@
updated = {
id: rev.date
for id, rev in self.storage.revision_get(missing_ids).items()
- if rev.date is not None
}
else:
updated = getattr(self.storage, f"{entity}_get")(missing_ids)
cache["data"].update(updated)
dates: Dict[Sha1Git, datetime] = {}
for sha1 in ids:
- date = cache["data"].get(sha1)
+ date = cache["data"].setdefault(sha1, None)
if date is not None:
dates[sha1] = date
return dates
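
The two hunks above work together: `revision_get` results are now cached even when the date is `None`, and switching the cache read from `get` to `setdefault` records outright misses, so ids known to have no date are not re-fetched from storage on the next call. A tiny sketch of the difference (the cache shape is illustrative):

```python
# Illustrative only: why setdefault memoizes negative lookups.
cache: dict = {}

cache.get(b"id")               # miss; cache unchanged, so the next
                               # flush would query storage again
cache.setdefault(b"id", None)  # miss recorded as an explicit None...
assert b"id" in cache          # ...so the next lookup is a cache hit
```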
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -10,6 +10,7 @@
from _pytest.fixtures import SubRequest
import msgpack
import psycopg2.extensions
+import pymongo.database
import pytest
from pytest_postgresql.factories import postgresql
@@ -32,7 +33,7 @@
"without-path-denormalized",
]
)
-def populated_db(
+def provenance_postgresqldb(
request: SubRequest,
postgresql: psycopg2.extensions.connection,
) -> Dict[str, str]:
@@ -47,9 +48,13 @@
# the Flask app used as server in these tests
@pytest.fixture
-def app(populated_db: Dict[str, str]) -> Iterator[server.ProvenanceStorageServerApp]:
+def app(
+ provenance_postgresqldb: Dict[str, str]
+) -> Iterator[server.ProvenanceStorageServerApp]:
assert hasattr(server, "storage")
- server.storage = get_provenance_storage(cls="postgresql", db=populated_db)
+ server.storage = get_provenance_storage(
+ cls="postgresql", db=provenance_postgresqldb
+ )
yield server.app
@@ -59,10 +64,11 @@
return RemoteProvenanceStorage
-@pytest.fixture(params=["postgresql", "remote"])
+@pytest.fixture(params=["mongodb"])
def provenance_storage(
request: SubRequest,
- populated_db: Dict[str, str],
+ provenance_postgresqldb: Dict[str, str],
+ mongodb: pymongo.database.Database,
swh_rpc_client: RemoteProvenanceStorage,
) -> ProvenanceStorageInterface:
"""Return a working and initialized ProvenanceStorageInterface object"""
@@ -71,10 +77,15 @@
assert isinstance(swh_rpc_client, ProvenanceStorageInterface)
return swh_rpc_client
+ elif request.param == "mongodb":
+ from swh.provenance.mongo.backend import ProvenanceStorageMongoDb
+
+ return ProvenanceStorageMongoDb(mongodb)
+
else:
# in test sessions, we DO want to raise any exception occurring at commit time
return get_provenance_storage(
- cls=request.param, db=populated_db, raise_on_commit=True
+ cls=request.param, db=provenance_postgresqldb, raise_on_commit=True
)
diff --git a/swh/provenance/tests/data/mongo/.gitkeep b/swh/provenance/tests/data/mongo/.gitkeep
new file mode 100644
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -111,51 +111,6 @@
# Assuming provenance.storage has the 'with-path' flavor.
assert provenance.storage.with_path()
- # Test content methods.
- # Add all content present in the current repo to both storages, just assigning their
- # creation dates. Then check that the inserted content is the same in both cases.
- cnt_dates = {cnt["sha1_git"]: cnt["ctime"] for cnt in data["content"]}
- assert cnt_dates
- assert provenance.storage.content_set_date(
- cnt_dates
- ) == provenance_storage.content_set_date(cnt_dates)
-
- assert provenance.storage.content_get(cnt_dates) == provenance_storage.content_get(
- cnt_dates
- )
- assert provenance.storage.entity_get_all(
- EntityType.CONTENT
- ) == provenance_storage.entity_get_all(EntityType.CONTENT)
-
- # Test directory methods.
- # Of all directories present in the current repo, only assign a date to those
- # containing blobs (picking the max date among the available ones). Then check that
- # the inserted data is the same in both storages.
- def getmaxdate(
- dir: Dict[str, Any], cnt_dates: Dict[Sha1Git, datetime]
- ) -> Optional[datetime]:
- dates = [
- cnt_dates[entry["target"]]
- for entry in dir["entries"]
- if entry["type"] == "file"
- ]
- return max(dates) if dates else None
-
- dir_dates = {dir["id"]: getmaxdate(dir, cnt_dates) for dir in data["directory"]}
- assert dir_dates
- assert provenance.storage.directory_set_date(
- {sha1: date for sha1, date in dir_dates.items() if date is not None}
- ) == provenance_storage.directory_set_date(
- {sha1: date for sha1, date in dir_dates.items() if date is not None}
- )
-
- assert provenance.storage.directory_get(
- dir_dates
- ) == provenance_storage.directory_get(dir_dates)
- assert provenance.storage.entity_get_all(
- EntityType.DIRECTORY
- ) == provenance_storage.entity_get_all(EntityType.DIRECTORY)
-
# Test origin methods.
# Add all origins present in the current repo to both storages. Then check that the
# inserted data is the same in both cases.
@@ -174,32 +129,6 @@
EntityType.ORIGIN
) == provenance_storage.entity_get_all(EntityType.ORIGIN)
- # Test revision methods.
- # Add all revisions present in the current repo to both storages, assigning their
- # dataes and an arbitrary origin to each one. Then check that the inserted data is
- # the same in both cases.
- rev_dates = {rev["id"]: ts2dt(rev["date"]) for rev in data["revision"]}
- assert rev_dates
- assert provenance.storage.revision_set_date(
- rev_dates
- ) == provenance_storage.revision_set_date(rev_dates)
-
- rev_origins = {
- rev["id"]: next(iter(org_urls)) # any arbitrary origin will do
- for rev in data["revision"]
- }
- assert rev_origins
- assert provenance.storage.revision_set_origin(
- rev_origins
- ) == provenance_storage.revision_set_origin(rev_origins)
-
- assert provenance.storage.revision_get(
- rev_dates
- ) == provenance_storage.revision_get(rev_dates)
- assert provenance.storage.entity_get_all(
- EntityType.REVISION
- ) == provenance_storage.entity_get_all(EntityType.REVISION)
-
# Test content-in-revision relation.
# Create flat models of every root directory for the revisions in the dataset.
cnt_in_rev: Set[RelationData] = set()
@@ -284,6 +213,76 @@
provenance_storage,
)
+ # Test content methods.
+ # Add all content present in the current repo to both storages, just assigning their
+ # creation dates. Then check that the inserted content is the same in both cases.
+ cnt_dates = {cnt["sha1_git"]: cnt["ctime"] for cnt in data["content"]}
+ assert cnt_dates
+ assert provenance.storage.content_set_date(
+ cnt_dates
+ ) == provenance_storage.content_set_date(cnt_dates)
+
+ assert provenance.storage.content_get(cnt_dates) == provenance_storage.content_get(
+ cnt_dates
+ )
+ assert provenance.storage.entity_get_all(
+ EntityType.CONTENT
+ ) == provenance_storage.entity_get_all(EntityType.CONTENT)
+
+ # Test directory methods.
+ # Of all directories present in the current repo, only assign a date to those
+ # containing blobs (picking the max date among the available ones). Then check that
+ # the inserted data is the same in both storages.
+ def getmaxdate(
+ dir: Dict[str, Any], cnt_dates: Dict[Sha1Git, datetime]
+ ) -> Optional[datetime]:
+ dates = [
+ cnt_dates[entry["target"]]
+ for entry in dir["entries"]
+ if entry["type"] == "file"
+ ]
+ return max(dates) if dates else None
+
+ dir_dates = {dir["id"]: getmaxdate(dir, cnt_dates) for dir in data["directory"]}
+ assert dir_dates
+ assert provenance.storage.directory_set_date(
+ {sha1: date for sha1, date in dir_dates.items() if date is not None}
+ ) == provenance_storage.directory_set_date(
+ {sha1: date for sha1, date in dir_dates.items() if date is not None}
+ )
+ assert provenance.storage.directory_get(
+ dir_dates
+ ) == provenance_storage.directory_get(dir_dates)
+ assert provenance.storage.entity_get_all(
+ EntityType.DIRECTORY
+ ) == provenance_storage.entity_get_all(EntityType.DIRECTORY)
+
+ # Test revision methods.
+ # Add all revisions present in the current repo to both storages, assigning their
+ # dates and an arbitrary origin to each one. Then check that the inserted data is
+ # the same in both cases.
+ rev_dates = {rev["id"]: ts2dt(rev["date"]) for rev in data["revision"]}
+ assert rev_dates
+ assert provenance.storage.revision_set_date(
+ rev_dates
+ ) == provenance_storage.revision_set_date(rev_dates)
+
+ rev_origins = {
+ rev["id"]: next(iter(org_urls)) # any arbitrary origin will do
+ for rev in data["revision"]
+ }
+ assert rev_origins
+ assert provenance.storage.revision_set_origin(
+ rev_origins
+ ) == provenance_storage.revision_set_origin(rev_origins)
+
+ assert provenance.storage.revision_get(
+ rev_dates
+ ) == provenance_storage.revision_get(rev_dates)
+ assert provenance.storage.entity_get_all(
+ EntityType.REVISION
+ ) == provenance_storage.entity_get_all(EntityType.REVISION)
+
# Test location_get.
if provenance_storage.with_path():
assert provenance.storage.location_get() == provenance_storage.location_get()