diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -27,7 +27,11 @@ @property def with_path(self) -> bool: - return self.flavor == "with-path" + return "with-path" in self.flavor + + @property + def denormalized(self) -> bool: + return "denormalized" in self.flavor def commit(self, data: Dict[str, Any], raise_on_commit: bool = False) -> bool: try: diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ b/swh/provenance/postgresql/provenancedb_with_path.py @@ -11,8 +11,25 @@ def content_find_first( self, blob: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: - self.cursor.execute( + if self.denormalized: + sql = """ + select C_L.sha1 as blob, + R.sha1 as rev, + R.date as date, + L.path as path + from ( + select C.sha1 as sha1, + unnest(revision) as revision, + unnest(location) as location + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=%s) as C_L + inner join revision as R on (R.id=C_L.revision) + inner join location as L on (L.id=C_L.location) + order by date, rev, path asc limit 1 """ + else: + sql = """ SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, @@ -23,19 +40,67 @@ INNER JOIN revision as R ON (CR.revision = R.id) WHERE C.sha1=%s ORDER BY date, rev, path ASC LIMIT 1 - """, - (blob,), - ) + """ + self.cursor.execute(sql, (blob,)) return self.cursor.fetchone() def content_find_all( self, blob: bytes, limit: Optional[int] = None ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - self.cursor.execute( - f""" + if self.denormalized: + sql = f""" + (SELECT C_L.sha1 AS content, + R.sha1 AS revision, + R.date AS date, + L.path AS path + FROM ( + SELECT C.sha1 AS sha1, + unnest(revision) AS revision, + unnest(location) AS location + FROM content_in_revision AS C_R + INNER JOIN content AS C ON (C.id = C_R.content) + WHERE C.sha1=%s) AS C_L + INNER JOIN revision AS R ON (R.id = C_L.revision) + INNER JOIN location AS L ON (L.id = C_L.location) + ) + UNION + (WITH + C_D as ( + SELECT C.sha1 AS content_sha1, + unnest(CD.directory) AS directory, + unnest(CD.location) AS location + FROM content AS C + INNER JOIN content_in_directory AS CD ON (CD.content = C.id) + WHERE C.sha1=%s + ), + D_R as ( + SELECT C_D.content_sha1 AS content_sha1, + DL.path AS file_path, + unnest(DR.revision) AS revision, + unnest(DR.location) AS prefix_location + FROM C_D + INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) + INNER JOIN location AS DL ON (DL.id = C_D.location) + ) + SELECT D_R.content_sha1 AS sha1, + R.sha1 AS revision, + R.date AS date, + CASE DL.path + WHEN '' THEN D_R.file_path + WHEN '.' THEN D_R.file_path + ELSE (DL.path || '/' || D_R.file_path)::unix_path + END AS path + FROM D_R + INNER JOIN location AS DL ON (D_R.prefix_location = DL.id) + INNER JOIN revision AS R ON (D_R.revision = R.id) + ) + ORDER BY date, revision, path {early_cut} + """ + else: + sql = f""" (SELECT C.sha1 AS blob, - R.sha1 AS rev, + R.sha1 AS revision, R.date AS date, L.path AS path FROM content AS C @@ -59,10 +124,9 @@ INNER JOIN location AS CL ON (CD.location = CL.id) INNER JOIN location AS DL ON (DR.location = DL.id) WHERE C.sha1=%s) - ORDER BY date, rev, path {early_cut} - """, - (blob, blob), - ) + ORDER BY date, revision, path {early_cut} + """ + self.cursor.execute(sql, (blob, blob)) yield from self.cursor.fetchall() def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]): @@ -89,9 +153,24 @@ """, locations, ) - psycopg2.extras.execute_values( - self.cursor, - f""" + if self.denormalized: + sql = f""" + LOCK TABLE ONLY {relation}; + insert into {relation} + SELECT {src}.id, array_agg({dst}.id), array_agg(location.id) + FROM (VALUES %s) AS V(src, dst, path) + INNER JOIN {src} on ({src}.sha1=V.src) + INNER JOIN {dst} on ({dst}.sha1=V.dst) + INNER JOIN location on (location.path=V.path) + GROUP BY {src}.id + on conflict ({src}) do update + set {dst}=array( + select unnest({relation}.{dst} || excluded.{dst})), + location=array( + select unnest({relation}.location || excluded.location)) + """ + else: + sql = f""" LOCK TABLE ONLY {relation}; INSERT INTO {relation} SELECT {src}.id, {dst}.id, location.id @@ -99,7 +178,7 @@ INNER JOIN {src} on ({src}.sha1=V.src) INNER JOIN {dst} on ({dst}.sha1=V.dst) INNER JOIN location on (location.path=V.path) - """, - data, - ) + ON CONFLICT DO NOTHING + """ + psycopg2.extras.execute_values(self.cursor, sql, data) data.clear() diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ b/swh/provenance/postgresql/provenancedb_without_path.py @@ -15,30 +15,81 @@ def content_find_first( self, blob: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: - self.cursor.execute( + if self.denormalized: + sql = """ + select C_L.sha1 as blob, + R.sha1 as rev, + R.date as date, + '\\x'::bytea as path + from ( + select C.sha1, unnest(revision) as revision + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=%s) as C_L + inner join revision as R on (R.id=C_L.revision) + order by date, rev, path asc limit 1 """ + else: + sql = """ SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, - '\\x'::bytea as path + '\\x'::bytea AS path FROM content AS C INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN revision as R ON (CR.revision = R.id) + INNER JOIN revision AS R ON (CR.revision = R.id) WHERE C.sha1=%s ORDER BY date, rev ASC LIMIT 1 - """, - (blob,), - ) + """ + + self.cursor.execute(sql, (blob,)) return self.cursor.fetchone() def content_find_all( self, blob: bytes, limit: Optional[int] = None ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - self.cursor.execute( - f""" - (SELECT C.sha1 AS blob, - R.sha1 AS rev, + if self.denormalized: + sql = f""" + (SELECT C_L.sha1 AS content, + R.sha1 AS revision, + R.date AS date, + '\\x'::bytea as path + FROM ( + SELECT C.sha1, unnest(revision) AS revision + FROM content_in_revision AS C_R + INNER JOIN content AS C ON (C.id=C_R.content) + WHERE C.sha1=%s) AS C_L + INNER JOIN revision AS R ON (R.id=C_L.revision) + ) + UNION + (WITH + C_D as ( + SELECT C.sha1 AS content_sha1, + unnest(CD.directory) AS directory + FROM content AS C + INNER JOIN content_in_directory AS CD ON (CD.content = C.id) + WHERE C.sha1=%s + ), + D_R as ( + SELECT C_D.content_sha1 AS content_sha1, + unnest(DR.revision) AS revision + FROM C_D + INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) + ) + SELECT D_R.content_sha1 as sha1, + R.sha1 AS revision, + R.date AS date, + '\\x'::bytea AS path + FROM D_R + INNER JOIN revision AS R ON (D_R.revision = R.id) + ) + ORDER BY date, revision, path {early_cut} + """ + else: + sql = f""" + (SELECT C.sha1 AS content, + R.sha1 AS revision, R.date AS date, '\\x'::bytea as path FROM content AS C @@ -51,14 +102,13 @@ R.date AS date, '\\x'::bytea as path FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id = CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory = DR.directory) - INNER JOIN revision AS R ON (DR.revision = R.id) + INNER JOIN content_in_directory AS CD ON (CD.content = C.id) + INNER JOIN directory_in_revision AS DR ON (DR.directory = CD.directory) + INNER JOIN revision AS R ON (R.id = DR.revision) WHERE C.sha1=%s) - ORDER BY date, rev, path {early_cut} - """, - (blob, blob), - ) + ORDER BY date, revision, path {early_cut} + """ + self.cursor.execute(sql, (blob, blob)) yield from self.cursor.fetchall() def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]): @@ -70,16 +120,28 @@ ) src, dst = relation.split("_in_") - psycopg2.extras.execute_values( - self.cursor, - f""" + if self.denormalized: + sql = f""" LOCK TABLE ONLY {relation}; - INSERT INTO {relation} + insert into {relation} ({src}, {dst}) + SELECT {src}.id, array_agg({dst}.id) + FROM (VALUES %s) AS V(src, dst) + INNER JOIN {src} on ({src}.sha1=V.src) + INNER JOIN {dst} on ({dst}.sha1=V.dst) + GROUP BY {src}.id + on conflict({src}) do update + set {dst}=array( + select unnest({relation}.{dst} || excluded.{dst})) + """ + else: + sql = f""" + LOCK TABLE ONLY {relation}; + INSERT INTO {relation} ({src}, {dst}) SELECT {src}.id, {dst}.id FROM (VALUES %s) AS V(src, dst) INNER JOIN {src} on ({src}.sha1=V.src) INNER JOIN {dst} on ({dst}.sha1=V.dst) - """, - data, - ) + ON CONFLICT DO NOTHING + """ + psycopg2.extras.execute_values(self.cursor, sql, data) data.clear() diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -148,8 +148,7 @@ # TODO: this class should not know what the actual used DB is. self.storage: ProvenanceDBBase - flavor = ProvenanceDBBase(conn).flavor - if flavor == "with-path": + if ProvenanceDBBase(conn).with_path: from .postgresql.provenancedb_with_path import ProvenanceWithPathDB self.storage = ProvenanceWithPathDB(conn) diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql --- a/swh/provenance/sql/15-flavor.sql +++ b/swh/provenance/sql/15-flavor.sql @@ -1,7 +1,9 @@ -- database flavor create type database_flavor as enum ( 'with-path', - 'without-path' + 'without-path', + 'with-path-denormalized', + 'without-path-denormalized' ); comment on type database_flavor is 'Flavor of the current database'; diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql --- a/swh/provenance/sql/30-schema.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,4 +1,5 @@ -- psql variables to get the current database flavor +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset create table dbversion ( @@ -76,8 +77,13 @@ create table content_in_revision ( content bigint not null, -- internal identifier of the content blob +\if :dbflavor_norm revision bigint not null, -- internal identifier of the revision where the blob appears for the first time - location bigint -- location of the content relative to the revision root directory + location bigint -- location of the content relative to the revision root directory +\else + revision bigint[], -- internal identifier of the revision where the blob appears for the first time + location bigint[] -- location of the content relative to the revision root directory +\endif -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) @@ -89,8 +95,13 @@ create table content_in_directory ( content bigint not null, -- internal identifier of the content blob +\if :dbflavor_norm directory bigint not null, -- internal identifier of the directory containing the blob location bigint -- location of the content relative to its parent directory in the isochrone frontier +\else + directory bigint[], + location bigint[] +\endif -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id), -- foreign key (loc) references location (id) @@ -102,8 +113,13 @@ create table directory_in_revision ( directory bigint not null, -- internal identifier of the directory appearing in the revision +\if :dbflavor_norm revision bigint not null, -- internal identifier of the revision containing the directory location bigint -- location of the directory relative to the revision root directory +\else + revision bigint[], + location bigint[] +\endif -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql --- a/swh/provenance/sql/60-indexes.sql +++ b/swh/provenance/sql/60-indexes.sql @@ -1,9 +1,18 @@ -- psql variables to get the current database flavor +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset -- create unique indexes (instead of pkey) because location might be null for -- the without-path flavor +\if :dbflavor_norm create unique index on content_in_revision(content, revision, location); create unique index on directory_in_revision(directory, revision, location); create unique index on content_in_directory(content, directory, location); +\else +create unique index on content_in_revision(content); +create unique index on directory_in_revision(directory); +create unique index on content_in_directory(content); +\endif + + alter table revision_in_origin add primary key (revision, origin); alter table revision_before_revision add primary key (prev, next); diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -19,7 +19,14 @@ from swh.storage.replay import process_replay_objects -@pytest.fixture(params=["with-path", "without-path"]) +@pytest.fixture( + params=[ + "with-path", + "without-path", + "with-path-denormalized", + "without-path-denormalized", + ] +) def provenance(request, postgresql): """return a working and initialized provenance db""" from swh.core.cli.db import populate_database_for_package diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -34,8 +34,13 @@ def test_provenance_flavor(provenance): - assert provenance.storage.flavor in ("with-path", "without-path") - if provenance.storage.flavor == "with-path": + assert provenance.storage.flavor in ( + "with-path", + "without-path", + "with-path-denormalized", + "without-path-denormalized", + ) + if "with-path" in provenance.storage.flavor: backend_class = ProvenanceWithPathDB else: backend_class = ProvenanceWithoutPathDB diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -46,33 +46,65 @@ """ relation = f"{src}_in_{dst}" cur.execute("select swh_get_dbflavor()") - with_path = cur.fetchone()[0] == "with-path" + flavor = cur.fetchone()[0] + with_path = "with-path" in flavor + denormalized = "denormalized" in flavor # note that the columns have the same name as the relations they refer to, # so we can write things like "rel.{dst}=src.id" in the query below if with_path: - cur.execute( - f""" - SELECT encode(src.sha1::bytea, 'hex'), - encode(dst.sha1::bytea, 'hex'), + if denormalized: + sql = f""" + WITH unnested AS ( + SELECT {src}.sha1 as sha1, + unnest({relation}.{dst}) as {dst}, + unnest({relation}.location) as location + FROM {relation} + INNER JOIN {src} ON ({src}.id = {relation}.{src}) + ) + SELECT encode(unnested.sha1::bytea, 'hex'), + encode({dst}.sha1::bytea, 'hex'), encode(location.path::bytea, 'escape') - FROM {relation} as relation - INNER JOIN {src} AS src ON (relation.{src} = src.id) - INNER JOIN {dst} AS dst ON (relation.{dst} = dst.id) - INNER JOIN location ON (relation.location = location.id) + FROM unnested + INNER JOIN {dst} ON (unnested.{dst} = {dst}.id) + INNER JOIN location ON (unnested.location = location.id) + """ + else: + sql = f""" + SELECT encode({src}.sha1::bytea, 'hex'), + encode({dst}.sha1::bytea, 'hex'), + encode(location.path::bytea, 'escape') + FROM {relation} + INNER JOIN {src} ON ({relation}.{src} = {src}.id) + INNER JOIN {dst} ON ({relation}.{dst} = {dst}.id) + INNER JOIN location ON ({relation}.location = location.id) """ - ) else: - cur.execute( - f""" - SELECT encode(src.sha1::bytea, 'hex'), - encode(dst.sha1::bytea, 'hex'), + if denormalized: + sql = f""" + WITH unnested AS ( + SELECT {src}.sha1 as sha1, + unnest({relation}.{dst}) as {dst} + FROM {relation} + INNER JOIN {src} ON ({relation}.{src} = {src}.id) + ) + SELECT encode(unnested.sha1::bytea, 'hex'), + encode({dst}.sha1::bytea, 'hex'), '' - FROM {relation} as relation - INNER JOIN {src} AS src ON (src.id = relation.{src}) - INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) + FROM unnested + INNER JOIN {dst} ON (unnested.{dst} = {dst}.id) """ - ) + else: + sql = f""" + SELECT encode({src}.sha1::bytea, 'hex'), + encode({dst}.sha1::bytea, 'hex'), + '' + FROM {relation} + INNER JOIN {src} ON ({src}.id = {relation}.{src}) + INNER JOIN {dst} ON ({dst}.id = {relation}.{dst}) + """ + + cur.execute(sql) return set(cur.fetchall())