diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -77,7 +77,7 @@ conn = BaseDb.connect(**kwargs["db"]).conn raise_on_commit = kwargs.get("raise_on_commit", False) - if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path": + if "with-path" in ProvenanceDBBase(conn, raise_on_commit).flavor: from .postgresql.provenancedb_with_path import ProvenanceWithPathDB return ProvenanceWithPathDB(conn, raise_on_commit) diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -49,7 +49,11 @@ return self._flavor def with_path(self) -> bool: - return self.flavor == "with-path" + return "with-path" in self.flavor + + @property + def denormalized(self) -> bool: + return "denormalized" in self.flavor def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: ... @@ -186,7 +190,12 @@ f"INNER JOIN {src} AS S ON (S.sha1=V.src)", f"INNER JOIN {dst} AS D ON (D.sha1=V.dst)", ] - selected = ["S.id", "D.id"] + nope = (RelationType.REV_BEFORE_REV, RelationType.REV_IN_ORG) + selected = ["S.id"] + if self.denormalized and relation not in nope: + selected.append("array_agg(D.id)") + else: + selected.append("D.id") if self._relation_uses_location_table(relation): locations = tuple(set((path,) for (_, _, path) in rows)) @@ -198,16 +207,30 @@ psycopg2.extras.execute_values(self.cursor, sql, locations) joins.append("INNER JOIN location AS L ON (L.path=V.path)") - selected.append("L.id") + if self.denormalized: + selected.append("array_agg(L.id)") + else: + selected.append("L.id") + sql_l = [ + f"INSERT INTO {table}", + f" SELECT {', '.join(selected)}", + " FROM (VALUES %s) AS V(src, dst, path)", + *joins, + ] - sql = f""" - INSERT INTO {table} - (SELECT {", ".join(selected)} - FROM (VALUES %s) AS V(src, dst, path) - {''' - '''.join(joins)}) - ON CONFLICT DO NOTHING + if self.denormalized and relation not in nope: + sql_l.append("GROUP BY S.id") + sql_l.append( + f"""ON CONFLICT ({src}) DO UPDATE + set {dst}=array( + select unnest({table}.{dst} || excluded.{dst})), + location=array( + select unnest({relation.value}.location || excluded.location)) """ + ) + else: + sql_l.append("ON CONFLICT DO NOTHING") + sql = "\n".join(sql_l) psycopg2.extras.execute_values(self.cursor, sql, rows) return True except: # noqa: E722 @@ -276,11 +299,16 @@ sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]] if ids is not None: sha1s = (tuple(ids),) - where = f"WHERE {'S.sha1' if not reverse else 'D.sha1'} IN %s" + where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s" else: sha1s = None where = "" + aggreg_dst = self.denormalized and relation in ( + RelationType.CNT_EARLY_IN_REV, + RelationType.CNT_IN_DIR, + RelationType.DIR_IN_REV, + ) if sha1s is None or sha1s[0]: table = relation.value src, *_, dst = table.split("_") @@ -293,24 +321,34 @@ src_field = src dst_field = dst - joins = [ - f"INNER JOIN {src} AS S ON (S.id=R.{src_field})", - f"INNER JOIN {dst} AS D ON (D.id=R.{dst_field})", - ] - selected = ["S.sha1 AS src", "D.sha1 AS dst"] + if aggreg_dst: + revloc = f"UNNEST(R.{dst_field}) as dst" + if self._relation_uses_location_table(relation): + revloc += ", UNNEST(R.location) as path" + else: + revloc = f"R.{dst_field} as dst" + if self._relation_uses_location_table(relation): + revloc += ", R.location as path" + + inner_sql = f""" + SELECT S.sha1 as src, {revloc} + FROM {table} as R + INNER JOIN {src} AS S ON (S.id=R.{src_field}) + {where} + """ if self._relation_uses_location_table(relation): - joins.append("INNER JOIN location AS L ON (L.id=R.location)") - selected.append("L.path AS path") + loc = "L.path AS path" else: - selected.append("NULL AS path") - + loc = "NULL AS path" sql = f""" - SELECT {", ".join(selected)} - FROM {table} AS R - {" ".join(joins)} - {where} - """ + SELECT CL.src, D.sha1 as dst, {loc} + FROM ({inner_sql}) as CL + INNER JOIN {dst} AS D ON (D.id=CL.dst) + """ + if self._relation_uses_location_table(relation): + sql += "INNER JOIN location AS L ON (L.id=CL.path)" + self.cursor.execute(sql, sha1s) result.update(RelationData(**row) for row in self.cursor.fetchall()) return result diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ b/swh/provenance/postgresql/provenancedb_with_path.py @@ -13,7 +13,27 @@ class ProvenanceWithPathDB(ProvenanceDBBase): def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - sql = """ + if self.denormalized: + sql = """ + select C_L.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url AS origin, + L.path as path + from ( + select C.sha1 as sha1, + unnest(revision) as revision, + unnest(location) as location + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=%s) as C_L + inner join revision as R on (R.id=C_L.revision) + inner join location as L on (L.id=C_L.location) + LEFT JOIN origin as O ON (R.origin=O.id) + order by date, revision, origin, path asc limit 1 + """ + else: + sql = """ SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, @@ -35,7 +55,61 @@ self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - sql = f""" + if self.denormalized: + sql = f""" + (SELECT C_L.sha1 AS content, + R.sha1 AS revision, + R.date AS date, + O.url AS origin, + L.path AS path + FROM ( + SELECT C.sha1 AS sha1, + unnest(revision) AS revision, + unnest(location) AS location + FROM content_in_revision AS C_R + INNER JOIN content AS C ON (C.id = C_R.content) + WHERE C.sha1=%s) AS C_L + INNER JOIN revision AS R ON (R.id = C_L.revision) + INNER JOIN location AS L ON (L.id = C_L.location) + LEFT JOIN origin as O ON (R.origin=O.id) + ) + UNION + (WITH + C_D as ( + SELECT C.sha1 AS content_sha1, + unnest(CD.directory) AS directory, + unnest(CD.location) AS location + FROM content AS C + INNER JOIN content_in_directory AS CD ON (CD.content = C.id) + WHERE C.sha1=%s + ), + D_R as ( + SELECT C_D.content_sha1 AS content_sha1, + DL.path AS file_path, + unnest(DR.revision) AS revision, + unnest(DR.location) AS prefix_location + FROM C_D + INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) + INNER JOIN location AS DL ON (DL.id = C_D.location) + ) + SELECT D_R.content_sha1 AS sha1, + R.sha1 AS revision, + R.date AS date, + O.url AS origin, + CASE DL.path + WHEN '' THEN D_R.file_path + WHEN '.' THEN D_R.file_path + ELSE (DL.path || '/' || D_R.file_path)::unix_path + END AS path + FROM D_R + INNER JOIN location AS DL ON (D_R.prefix_location = DL.id) + INNER JOIN revision AS R ON (D_R.revision = R.id) + LEFT JOIN origin as O ON (R.origin=O.id) + ) + ORDER BY date, revision, origin, path {early_cut} + """ + else: + sql = f""" (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ b/swh/provenance/postgresql/provenancedb_without_path.py @@ -13,19 +13,38 @@ class ProvenanceWithoutPathDB(ProvenanceDBBase): def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - sql = """ + if self.denormalized: + sql = """ + select C_L.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url AS origin, + '\\x'::bytea as path + FROM ( + SELECT C.sha1, unnest(revision) AS revision + FROM content_in_revision AS C_R + INNER JOIN content AS C ON (C.id=C_R.content) + WHERE C.sha1=%s + ) AS C_L + INNER JOIN revision as R on (R.id=C_L.revision) + LEFT JOIN origin as O ON (R.origin=O.id) + ORDER BY date, revision, origin ASC LIMIT 1 + """ + else: + sql = """ SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, O.url AS origin, - '\\x'::bytea as path + '\\x'::bytea AS path FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN revision as R ON (CR.revision=R.id) + INNER JOIN content_in_revision AS CR ON (CR.content = C.id) + INNER JOIN revision AS R ON (CR.revision = R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s ORDER BY date, revision, origin ASC LIMIT 1 """ + self.cursor.execute(sql, (id,)) row = self.cursor.fetchone() return ProvenanceResult(**row) if row is not None else None @@ -34,7 +53,49 @@ self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - sql = f""" + if self.denormalized: + sql = f""" + (SELECT C_L.sha1 AS content, + R.sha1 AS revision, + R.date AS date, + O.url AS origin, + '\\x'::bytea as path + FROM ( + SELECT C.sha1, unnest(revision) AS revision + FROM content_in_revision AS C_R + INNER JOIN content AS C ON (C.id=C_R.content) + WHERE C.sha1=%s) AS C_L + INNER JOIN revision AS R ON (R.id=C_L.revision) + LEFT JOIN origin as O ON (R.origin=O.id) + ) + UNION + (WITH + C_D as ( + SELECT C.sha1 AS content_sha1, + unnest(CD.directory) AS directory + FROM content AS C + INNER JOIN content_in_directory AS CD ON (CD.content = C.id) + WHERE C.sha1=%s + ), + D_R as ( + SELECT C_D.content_sha1 AS content_sha1, + unnest(DR.revision) AS revision + FROM C_D + INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) + ) + SELECT D_R.content_sha1 as content, + R.sha1 AS revision, + R.date AS date, + O.url AS origin, + '\\x'::bytea AS path + FROM D_R + INNER JOIN revision AS R ON (D_R.revision = R.id) + LEFT JOIN origin as O ON (R.origin=O.id) + ) + ORDER BY date, revision, path {early_cut} + """ + else: + sql = f""" (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql --- a/swh/provenance/sql/15-flavor.sql +++ b/swh/provenance/sql/15-flavor.sql @@ -1,7 +1,9 @@ -- database flavor create type database_flavor as enum ( 'with-path', - 'without-path' + 'without-path', + 'with-path-denormalized', + 'without-path-denormalized' ); comment on type database_flavor is 'Flavor of the current database'; diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql --- a/swh/provenance/sql/30-schema.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,4 +1,5 @@ -- psql variables to get the current database flavor +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset create table dbversion ( @@ -78,8 +79,13 @@ create table content_in_revision ( content bigint not null, -- internal identifier of the content blob +\if :dbflavor_norm revision bigint not null, -- internal identifier of the revision where the blob appears for the first time location bigint -- location of the content relative to the revision root directory +\else + revision bigint[], -- internal identifier of the revision where the blob appears for the first time + location bigint[] -- location of the content relative to the revision root directory +\endif -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) @@ -91,8 +97,13 @@ create table content_in_directory ( content bigint not null, -- internal identifier of the content blob +\if :dbflavor_norm directory bigint not null, -- internal identifier of the directory containing the blob location bigint -- location of the content relative to its parent directory in the isochrone frontier +\else + directory bigint[], + location bigint[] +\endif -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id), -- foreign key (loc) references location (id) @@ -104,8 +115,13 @@ create table directory_in_revision ( directory bigint not null, -- internal identifier of the directory appearing in the revision +\if :dbflavor_norm revision bigint not null, -- internal identifier of the revision containing the directory location bigint -- location of the directory relative to the revision root directory +\else + revision bigint[], + location bigint[] +\endif -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql --- a/swh/provenance/sql/60-indexes.sql +++ b/swh/provenance/sql/60-indexes.sql @@ -1,9 +1,18 @@ -- psql variables to get the current database flavor +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset -- create unique indexes (instead of pkey) because location might be null for -- the without-path flavor +\if :dbflavor_norm create unique index on content_in_revision(content, revision, location); create unique index on directory_in_revision(directory, revision, location); create unique index on content_in_directory(content, directory, location); +\else +create unique index on content_in_revision(content); +create unique index on directory_in_revision(directory); +create unique index on content_in_directory(content); +\endif + + alter table revision_in_origin add primary key (revision, origin); alter table revision_before_revision add primary key (prev, next); diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -27,19 +27,29 @@ from swh.storage.replay import process_replay_objects -@pytest.fixture(params=["with-path", "without-path"]) +@pytest.fixture( + params=[ + "with-path", + "without-path", + "with-path-denormalized", + "without-path-denormalized", + ] +) def populated_db( request, # TODO: add proper type annotation postgresql: psycopg2.extensions.connection, ) -> Dict[str, str]: + """return a working and initialized provenance db""" from swh.core.cli.db import populate_database_for_package - flavor = "with-path" if request.param == "client-server" else request.param - populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor) + # flavor = "with-path" if request.param == "client-server" else request.param + populate_database_for_package( + "swh.provenance", postgresql.dsn, flavor=request.param + ) return { - item.split("=")[0]: item.split("=")[1] - for item in postgresql.dsn.split() - if item.split("=")[0] != "options" + k: v + for (k, v) in (item.split("=") for item in postgresql.dsn.split()) + if k != "options" } diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -42,9 +42,14 @@ def test_provenance_flavor(provenance: ProvenanceInterface) -> None: if isinstance(provenance.storage, ProvenanceDBBase): - assert provenance.storage.flavor in ("with-path", "without-path") + assert provenance.storage.flavor in ( + "with-path", + "without-path", + "with-path-denormalized", + "without-path-denormalized", + ) backend_class: Type[ProvenanceStorageInterface] - if provenance.storage.flavor == "with-path": + if "with-path" in provenance.storage.flavor: backend_class = ProvenanceWithPathDB else: backend_class = ProvenanceWithoutPathDB