diff --git a/swh/provenance/postgresql/provenancedb.py b/swh/provenance/postgresql/provenancedb.py --- a/swh/provenance/postgresql/provenancedb.py +++ b/swh/provenance/postgresql/provenancedb.py @@ -28,6 +28,7 @@ directory: Cache revision: Cache # below are insertion caches only + # triplets below are (sha1_src, sha1_dst, path) content_early_in_rev: Set[Tuple[bytes, bytes, bytes]] content_in_dir: Set[Tuple[bytes, bytes, bytes]] directory_in_rev: Set[Tuple[bytes, bytes, bytes]] @@ -62,6 +63,19 @@ self.cursor.execute("SET timezone TO 'UTC'") self.cache: ProvenanceCache = new_cache() self.clear_caches() + self._flavor: Optional[str] = None + + @property + def flavor(self) -> str: + if self._flavor is None: + self.cursor.execute("select swh_get_dbflavor()") + self._flavor = self.cursor.fetchone()[0] + assert self._flavor is not None + return self._flavor + + @property + def normalized(self) -> bool: + return self.flavor == "normalized" def clear_caches(self): self.cache = new_cache() @@ -304,58 +318,93 @@ def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: - self.cursor.execute( - """ + if self.normalized: + self.cursor.execute( + """ + SELECT C.sha1 AS blob, + R.sha1 AS rev, + R.date AS date, + L.path AS path + FROM content AS C + INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id) + INNER JOIN location as L ON (CR.loc = L.id) + INNER JOIN revision as R ON (CR.rev = R.id) + WHERE C.sha1=%s + ORDER BY date, rev, path ASC LIMIT 1 + """, + (blobid,), + ) + else: + self.cursor.execute( + """ + select C_L.sha1 as blob, + R.sha1 as rev, + R.date as date, + L.path as path + from ( + select C.sha1, unnest(revs) as rev, unnest(locs) as loc + from content_early_in_rev as C_R + inner join content as C on (C.id=C_R.blob) + where C.sha1=%s) as C_L + inner join revision as R on (R.id=C_L.rev) + inner join location as L on (L.id=C_L.loc) + order by date, rev, path asc limit 1 + """, + (blobid,), + ) + return self.cursor.fetchone() + 
+ def content_find_all( + self, blobid: bytes, limit: Optional[int] = None + ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: + early_cut = f"LIMIT {limit}" if limit is not None else "" + if self.normalized: + sql_CR = """ SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, L.path AS path FROM content AS C INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id) - INNER JOIN location as L ON (CR.loc = L.id) - INNER JOIN revision as R ON (CR.rev = R.id) + INNER JOIN location AS L ON (CR.loc = L.id) + INNER JOIN revision AS R ON (CR.rev = R.id) WHERE C.sha1=%s - ORDER BY date, rev, path ASC LIMIT 1 - """, - (blobid,), - ) - return self.cursor.fetchone() - - def content_find_all( - self, blobid: bytes, limit: Optional[int] = None - ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: - early_cut = f"LIMIT {limit}" if limit is not None else "" - self.cursor.execute( - f""" - (SELECT C.sha1 AS blob, - R.sha1 AS rev, - R.date AS date, - L.path AS path - FROM content AS C - INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id) - INNER JOIN location AS L ON (CR.loc = L.id) - INNER JOIN revision AS R ON (CR.rev = R.id) - WHERE C.sha1=%s) - UNION - (SELECT C.sha1 AS blob, - R.sha1 AS rev, - R.date AS date, - CASE DL.path - WHEN '' THEN CL.path - WHEN '.' 
THEN CL.path - ELSE (DL.path || '/' || CL.path)::unix_path - END AS path - FROM content AS C - INNER JOIN content_in_dir AS CD ON (C.id = CD.blob) - INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir) - INNER JOIN revision AS R ON (DR.rev = R.id) - INNER JOIN location AS CL ON (CD.loc = CL.id) - INNER JOIN location AS DL ON (DR.loc = DL.id) - WHERE C.sha1=%s) - ORDER BY date, rev, path {early_cut} - """, - (blobid, blobid), - ) + """ + else: + sql_CR = """ + select C_L.sha1 as blob, + R.sha1 as rev, + R.date as date, + L.path as path + from ( + select C.sha1, unnest(revs) as rev, unnest(locs) as loc + from content_early_in_rev as C_R + inner join content as C on (C.id=C_R.blob) + where C.sha1=%s) as C_L + inner join revision as R on (R.id=C_L.rev) + inner join location as L on (L.id=C_L.loc) + """ + sql = f""" + ({sql_CR}) + UNION + (SELECT C.sha1 AS blob, + R.sha1 AS rev, + R.date AS date, + CASE DL.path + WHEN '' THEN CL.path + WHEN '.' THEN CL.path + ELSE (DL.path || '/' || CL.path)::unix_path + END AS path + FROM content AS C + INNER JOIN content_in_dir AS CD ON (C.id = CD.blob) + INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir) + INNER JOIN revision AS R ON (DR.rev = R.id) + INNER JOIN location AS CL ON (CD.loc = CL.id) + INNER JOIN location AS DL ON (DR.loc = DL.id) + WHERE C.sha1=%s) + ORDER BY date, rev, path {early_cut} + """ + self.cursor.execute(sql, (blobid, blobid)) # TODO: use POSTGRESQL EXPLAIN looking for query optimizations. 
yield from self.cursor.fetchall() @@ -379,14 +428,28 @@ """, locations, ) + if dst_table == "content_early_in_rev" and not self.normalized: + sql = f""" + insert into {dst_table} + SELECT {src0_table}.id, array_agg({src1_table}.id), array_agg(location.id) + FROM (VALUES %s) AS V(src, dst, path) + INNER JOIN {src0_table} on ({src0_table}.sha1=V.src) + INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst) + INNER JOIN location on (location.path=V.path) + GROUP BY {src0_table}.id + on conflict (blob) do update + set revs=array(select unnest({dst_table}.revs || excluded.revs)), + locs=array(select unnest({dst_table}.locs || excluded.locs)) + """ - sql = f""" - INSERT INTO {dst_table} - SELECT {src0_table}.id, {src1_table}.id, location.id - FROM (VALUES %s) AS V(src, dst, path) - INNER JOIN {src0_table} on ({src0_table}.sha1=V.src) - INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst) - INNER JOIN location on (location.path=V.path) - """ + else: + sql = f""" + INSERT INTO {dst_table} + SELECT {src0_table}.id, {src1_table}.id, location.id + FROM (VALUES %s) AS V(src, dst, path) + INNER JOIN {src0_table} on ({src0_table}.sha1=V.src) + INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst) + INNER JOIN location on (location.path=V.path) + """ psycopg2.extras.execute_values(self.cursor, sql, self.cache[dst_table]) self.cache[dst_table].clear() diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/15-flavor.sql @@ -0,0 +1,21 @@ +-- database flavor +create type database_flavor as enum ( + 'normalized', + 'denormalized' +); +comment on type database_flavor is 'Flavor of the current database'; + +create table dbflavor ( + flavor database_flavor, + single_row char(1) primary key default 'x', + check (single_row = 'x') +); +comment on table dbflavor is 'Database flavor storage'; +comment on column dbflavor.flavor is 'Database flavor currently deployed'; +comment on column dbflavor.single_row is 'Bogus 
column to force the table to have a single row'; + +create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$ + select coalesce((select flavor from dbflavor), 'normalized'); +$$; + +comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed'; diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql --- a/swh/provenance/sql/30-schema.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,4 +1,5 @@ -- psql variables to get the current database flavor +select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset create table dbversion ( @@ -35,15 +36,25 @@ create table content_early_in_rev ( blob bigint not null, -- internal identifier of the content blob +\if :dbflavor_norm rev bigint not null, -- internal identifier of the revision where the blob appears for the first time loc bigint not null -- location of the content relative to the revision root directory +\else + revs bigint[], -- internal identifier of the revision where the blob appears for the first time + locs bigint[] -- location of the content relative to the revision root directory +\endif -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) ); comment on column content_early_in_rev.blob is 'Content internal identifier'; +\if :dbflavor_norm comment on column content_early_in_rev.rev is 'Revision internal identifier'; comment on column content_early_in_rev.loc is 'Location of content in revision'; +\else +comment on column content_early_in_rev.revs is 'Revision internal identifier'; +comment on column content_early_in_rev.locs is 'Location of content in revision'; +\endif create table content_in_dir ( diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql --- a/swh/provenance/sql/60-indexes.sql +++ b/swh/provenance/sql/60-indexes.sql @@ -1,5 +1,11 @@ -- psql variables to get the current database flavor 
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset +\if :dbflavor_norm alter table content_early_in_rev add primary key (blob, rev, loc); +\else +create unique index on content_early_in_rev(blob); +\endif + alter table content_in_dir add primary key (blob, dir, loc); alter table directory_in_rev add primary key (dir, rev, loc); diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import glob from os import path import re from typing import Iterable, Iterator, List, Optional @@ -14,28 +13,26 @@ from swh.core.api.serializers import msgpack_loads from swh.core.db import BaseDb from swh.core.db.pytest_plugin import postgresql_fact -from swh.core.utils import numfile_sortkey as sortkey from swh.model.model import Content, Directory, DirectoryEntry, Revision from swh.model.tests.swh_model_data import TEST_OBJECTS -import swh.provenance from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage -SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql") -SQL_FILES = [ - sqlfile - for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey) - if "-without-path-" not in sqlfile -] - provenance_db = postgresql_fact( - "postgresql_proc", dbname="provenance", dump_files=SQL_FILES + "postgresql_proc", + dbname="provenance", ) -@pytest.fixture -def provenance(provenance_db): +@pytest.fixture(params=["normalized", "denormalized"]) +def provenance(request, provenance_db): """return a working and initialized provenance db""" + from swh.core.cli.db import populate_database_for_package + + populate_database_for_package( + "swh.provenance", provenance_db.dsn, flavor=request.param + ) + from swh.provenance.postgresql.provenancedb import ProvenanceDB 
BaseDb.adapt_conn(provenance_db) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -45,6 +45,7 @@ TABLES = { + "dbflavor", "dbversion", "content", "content_early_in_rev",