diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -1,4 +1,5 @@ from typing import TYPE_CHECKING +import warnings from .postgresql.db_utils import connect @@ -24,9 +25,23 @@ def get_provenance(cls: str, **kwargs) -> "ProvenanceInterface": if cls == "local": conn = connect(kwargs["db"]) - with_path = kwargs.get("with_path", True) + if "with_path" in kwargs: + warnings.warn( + "Usage of the 'with-path' config option is deprecated. " + "The db flavor is now used instead.", + DeprecationWarning, + ) + + with_path = kwargs.get("with_path") from swh.provenance.provenance import ProvenanceBackend - return ProvenanceBackend(conn, with_path=with_path) + prov = ProvenanceBackend(conn) + if with_path is not None: + flavor = "with-path" if with_path else "without-path" + if prov.storage.flavor != flavor: + raise ValueError( + "The given flavor does not match the flavor stored in the backend." + ) + return prov else: raise NotImplementedError diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -15,6 +15,19 @@ self.cursor = self.conn.cursor() # XXX: not sure this is the best place to do it! self.cursor.execute("SET timezone TO 'UTC'") + self._flavor: Optional[str] = None + + @property + def flavor(self) -> str: + if self._flavor is None: + self.cursor.execute("select swh_get_dbflavor()") + self._flavor = self.cursor.fetchone()[0] + assert self._flavor is not None + return self._flavor + + @property + def with_path(self) -> bool: + return self.flavor == "with-path" def commit(self, data: Dict[str, Any], raise_on_commit: bool = False) -> bool: try: diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -139,12 +139,13 @@ class ProvenanceBackend: raise_on_commit: bool = False - def __init__(self, conn: psycopg2.extensions.connection, with_path: bool = True): + def __init__(self, conn: psycopg2.extensions.connection): from .postgresql.provenancedb_base import ProvenanceDBBase # TODO: this class should not know what the actual used DB is. self.storage: ProvenanceDBBase - if with_path: + flavor = ProvenanceDBBase(conn).flavor + if flavor == "with-path": from .postgresql.provenancedb_with_path import ProvenanceWithPathDB self.storage = ProvenanceWithPathDB(conn) @@ -152,7 +153,6 @@ from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB self.storage = ProvenanceWithoutPathDB(conn) - self.cache: ProvenanceCache = new_cache() def clear_caches(self): diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import glob from os import path import re from typing import Iterable, Iterator, List, Optional @@ -13,33 +12,25 @@ from swh.core.api.serializers import msgpack_loads from swh.core.db import BaseDb -from swh.core.db.pytest_plugin import postgresql_fact -from swh.core.utils import numfile_sortkey as sortkey from swh.model.model import Content, Directory, DirectoryEntry, Revision from swh.model.tests.swh_model_data import TEST_OBJECTS -import swh.provenance from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage -SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql") -SQL_FILES = [ - sqlfile - for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey) - if "-without-path-" not in sqlfile -] -provenance_db = postgresql_fact( - "postgresql_proc", dbname="provenance", dump_files=SQL_FILES -) +@pytest.fixture(params=["with-path", "without-path"]) +def provenance(request, postgresql): + """return a working and initialized provenance db""" + from swh.core.cli.db import populate_database_for_package + flavor = request.param + populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor) -@pytest.fixture -def provenance(provenance_db): - """return a working and initialized provenance db""" from swh.provenance.provenance import ProvenanceBackend - BaseDb.adapt_conn(provenance_db) - prov = ProvenanceBackend(provenance_db) + BaseDb.adapt_conn(postgresql) + prov = ProvenanceBackend(postgresql) + assert prov.storage.flavor == flavor # in test sessions, we DO want to raise any exception occurring at commit time prov.raise_on_commit = True return prov diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -86,12 +86,12 @@ assert tables == dbtables -def test_cli_init_db_default_flavor(provenance_db): +def test_cli_init_db_default_flavor(postgresql): "Test that 'swh db init provenance' defaults to a with-path flavored DB" - dbname = provenance_db.dsn + dbname = postgresql.dsn result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output - with provenance_db.cursor() as cur: + with postgresql.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == ("with-path",) diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -8,6 +8,8 @@ from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance.model import OriginEntry from swh.provenance.origin import origin_add +from swh.provenance.postgresql.provenancedb_with_path import ProvenanceWithPathDB +from swh.provenance.postgresql.provenancedb_without_path import ProvenanceWithoutPathDB from swh.provenance.storage.archive import ArchiveStorage @@ -29,3 +31,12 @@ ) origin_add(provenance, archive, [entry]) # TODO: check some facts here + + +def test_provenance_flavor(provenance): + assert provenance.storage.flavor in ("with-path", "without-path") + if provenance.storage.flavor == "with-path": + backend_class = ProvenanceWithPathDB + else: + backend_class = ProvenanceWithoutPathDB + assert isinstance(provenance.storage, backend_class) diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -45,18 +45,34 @@ 'cur' is a cursor to the provenance index DB. """ relation = f"{src}_in_{dst}" + cur.execute("select swh_get_dbflavor()") + with_path = cur.fetchone()[0] == "with-path" + # note that the columns have the same name as the relations they refer to, # so we can write things like "rel.{dst}=src.id" in the query below - cur.execute( - f"SELECT encode(src.sha1::bytea, 'hex')," - f" encode(dst.sha1::bytea, 'hex')," - f" encode(location.path::bytea, 'escape') " - f"FROM {relation} as rel, " - f" {src} as src, {dst} as dst, location " - f"WHERE rel.{src}=src.id " - f" AND rel.{dst}=dst.id " - f" AND rel.location=location.id" - ) + if with_path: + cur.execute( + f""" + SELECT encode(src.sha1::bytea, 'hex'), + encode(dst.sha1::bytea, 'hex'), + encode(location.path::bytea, 'escape') + FROM {relation} as relation + INNER JOIN {src} AS src ON (relation.{src} = src.id) + INNER JOIN {dst} AS dst ON (relation.{dst} = dst.id) + INNER JOIN location ON (relation.location = location.id) + """ + ) + else: + cur.execute( + f""" + SELECT encode(src.sha1::bytea, 'hex'), + encode(dst.sha1::bytea, 'hex'), + '' + FROM {relation} as relation + INNER JOIN {src} AS src ON (src.id = relation.{src}) + INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) + """ + ) return set(cur.fetchall()) @@ -102,6 +118,11 @@ } cursor = provenance.storage.cursor + def maybe_path(path: str) -> str: + if provenance.storage.with_path: + return path + return "" + for synth_rev in synthetic_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( @@ -128,7 +149,8 @@ # check for R-C (direct) entries # these are added directly in the content_early_in_rev table rows["content_in_revision"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] + (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) + for x in synth_rev["R_C"] ) assert rows["content_in_revision"] == relations( cursor, "content", "revision" @@ -148,7 +170,8 @@ # ... + a number of rows in the "directory_in_rev" table... # check for R-D entries rows["directory_in_revision"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] + (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) + for x in synth_rev["R_D"] ) assert rows["directory_in_revision"] == relations( cursor, "directory", "revision" @@ -163,7 +186,8 @@ # for content of the directory. # check for D-C entries rows["content_in_directory"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] + (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) + for x in synth_rev["D_C"] ) assert rows["content_in_directory"] == relations( cursor, "content", "directory" @@ -174,11 +198,12 @@ rev_ts + dc["rel_ts"] ], synth_rev["msg"] - # check for location entries - rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) - rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) - rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) - assert rows["location"] == locations(cursor), synth_rev["msg"] + if provenance.storage.with_path: + # check for location entries + rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) + assert rows["location"] == locations(cursor), synth_rev["msg"] @pytest.mark.parametrize( @@ -206,6 +231,11 @@ for revision in data["revision"] ] + def maybe_path(path: str) -> str: + if provenance.storage.with_path: + return path + return "" + # XXX adding all revisions at once should be working just fine, but it does not... # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) # ...so add revisions one at a time for now @@ -222,12 +252,12 @@ for rc in synth_rev["R_C"]: expected_occurrences.setdefault(rc["dst"].hex(), []).append( - (rev_id, rev_ts, rc["path"]) + (rev_id, rev_ts, maybe_path(rc["path"])) ) for dc in synth_rev["D_C"]: assert dc["prefix"] is not None # to please mypy expected_occurrences.setdefault(dc["dst"].hex(), []).append( - (rev_id, rev_ts, dc["prefix"] + "/" + dc["path"]) + (rev_id, rev_ts, maybe_path(dc["prefix"] + "/" + dc["path"])) ) for content_id, results in expected_occurrences.items(): @@ -238,7 +268,11 @@ bytes.fromhex(content_id) ) ] - assert len(db_occurrences) == len(expected) + if provenance.storage.with_path: + # this is not true if the db stores no path, because a same content + # that appears several times in a given revision may be reported + # only once by content_find_all() + assert len(db_occurrences) == len(expected) assert set(db_occurrences) == set(expected) @@ -308,4 +342,5 @@ assert r_sha1.hex() == content_id assert r_rev_id.hex() == rev_id assert r_ts.timestamp() == ts - assert r_path.decode() in paths + if provenance.storage.with_path: + assert r_path.decode() in paths