D5843.id20924.diff
D5843: Add support for a denormalized version of the provenance DB
diff --git a/swh/provenance/postgresql/provenancedb.py b/swh/provenance/postgresql/provenancedb.py
--- a/swh/provenance/postgresql/provenancedb.py
+++ b/swh/provenance/postgresql/provenancedb.py
@@ -28,6 +28,7 @@
directory: Cache
revision: Cache
# below are insertion caches only
+ # triplets below are (sha1_src, sha1_dst, path)
content_early_in_rev: Set[Tuple[bytes, bytes, bytes]]
content_in_dir: Set[Tuple[bytes, bytes, bytes]]
directory_in_rev: Set[Tuple[bytes, bytes, bytes]]
@@ -62,6 +63,19 @@
self.cursor.execute("SET timezone TO 'UTC'")
self.cache: ProvenanceCache = new_cache()
self.clear_caches()
+ self._flavor: Optional[str] = None
+
+ @property
+ def flavor(self) -> str:
+ if self._flavor is None:
+ self.cursor.execute("select swh_get_dbflavor()")
+ self._flavor = self.cursor.fetchone()[0]
+ assert self._flavor is not None
+ return self._flavor
+
+ @property
+ def normalized(self) -> bool:
+ return self.flavor == "normalized"
def clear_caches(self):
self.cache = new_cache()
@@ -304,58 +318,93 @@
def content_find_first(
self, blobid: bytes
) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
- self.cursor.execute(
- """
+ if self.normalized:
+ self.cursor.execute(
+ """
+ SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ L.path AS path
+ FROM content AS C
+ INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
+ INNER JOIN location as L ON (CR.loc = L.id)
+ INNER JOIN revision as R ON (CR.rev = R.id)
+ WHERE C.sha1=%s
+ ORDER BY date, rev, path ASC LIMIT 1
+ """,
+ (blobid,),
+ )
+ else:
+ self.cursor.execute(
+ """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ order by date, rev, path asc limit 1
+ """,
+ (blobid,),
+ )
+ return self.cursor.fetchone()
+
+ def content_find_all(
+ self, blobid: bytes, limit: Optional[int] = None
+ ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ early_cut = f"LIMIT {limit}" if limit is not None else ""
+ if self.normalized:
+ sql_CR = """
SELECT C.sha1 AS blob,
R.sha1 AS rev,
R.date AS date,
L.path AS path
FROM content AS C
INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location as L ON (CR.loc = L.id)
- INNER JOIN revision as R ON (CR.rev = R.id)
+ INNER JOIN location AS L ON (CR.loc = L.id)
+ INNER JOIN revision AS R ON (CR.rev = R.id)
WHERE C.sha1=%s
- ORDER BY date, rev, path ASC LIMIT 1
- """,
- (blobid,),
- )
- return self.cursor.fetchone()
-
- def content_find_all(
- self, blobid: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
- early_cut = f"LIMIT {limit}" if limit is not None else ""
- self.cursor.execute(
- f"""
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- L.path AS path
- FROM content AS C
- INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location AS L ON (CR.loc = L.id)
- INNER JOIN revision AS R ON (CR.rev = R.id)
- WHERE C.sha1=%s)
- UNION
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- CASE DL.path
- WHEN '' THEN CL.path
- WHEN '.' THEN CL.path
- ELSE (DL.path || '/' || CL.path)::unix_path
- END AS path
- FROM content AS C
- INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
- INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
- INNER JOIN revision AS R ON (DR.rev = R.id)
- INNER JOIN location AS CL ON (CD.loc = CL.id)
- INNER JOIN location AS DL ON (DR.loc = DL.id)
- WHERE C.sha1=%s)
- ORDER BY date, rev, path {early_cut}
- """,
- (blobid, blobid),
- )
+ """
+ else:
+ sql_CR = """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ """
+ sql = f"""
+ ({sql_CR})
+ UNION
+ (SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ CASE DL.path
+ WHEN '' THEN CL.path
+ WHEN '.' THEN CL.path
+ ELSE (DL.path || '/' || CL.path)::unix_path
+ END AS path
+ FROM content AS C
+ INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
+ INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
+ INNER JOIN revision AS R ON (DR.rev = R.id)
+ INNER JOIN location AS CL ON (CD.loc = CL.id)
+ INNER JOIN location AS DL ON (DR.loc = DL.id)
+ WHERE C.sha1=%s)
+ ORDER BY date, rev, path {early_cut}
+ """
+ self.cursor.execute(sql, (blobid, blobid))
# TODO: use POSTGRESQL EXPLAIN looking for query optimizations.
yield from self.cursor.fetchall()
@@ -379,14 +428,28 @@
""",
locations,
)
+ if dst_table == "content_early_in_rev" and not self.normalized:
+ sql = f"""
+ insert into {dst_table}
+ SELECT {src0_table}.id, array_agg({src1_table}.id), array_agg(location.id)
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ GROUP BY {src0_table}.id
+ on conflict (blob) do update
+ set revs=array(select unnest({dst_table}.revs || excluded.revs)),
+ locs=array(select unnest({dst_table}.locs || excluded.locs))
+ """
- sql = f"""
- INSERT INTO {dst_table}
- SELECT {src0_table}.id, {src1_table}.id, location.id
- FROM (VALUES %s) AS V(src, dst, path)
- INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
- INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
- INNER JOIN location on (location.path=V.path)
- """
+ else:
+ sql = f"""
+ INSERT INTO {dst_table}
+ SELECT {src0_table}.id, {src1_table}.id, location.id
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ """
psycopg2.extras.execute_values(self.cursor, sql, self.cache[dst_table])
self.cache[dst_table].clear()
diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/15-flavor.sql
@@ -0,0 +1,21 @@
+-- database flavor
+create type database_flavor as enum (
+ 'normalized',
+ 'denormalized'
+);
+comment on type database_flavor is 'Flavor of the current database';
+
+create table dbflavor (
+ flavor database_flavor,
+ single_row char(1) primary key default 'x',
+ check (single_row = 'x')
+);
+comment on table dbflavor is 'Database flavor storage';
+comment on column dbflavor.flavor is 'Database flavor currently deployed';
+comment on column dbflavor.single_row is 'Bogus column to force the table to have a single row';
+
+create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$
+ select coalesce((select flavor from dbflavor), 'normalized');
+$$;
+
+comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed';
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,4 +1,5 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
create table dbversion
(
@@ -35,15 +36,25 @@
create table content_early_in_rev
(
blob bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
loc bigint not null -- location of the content relative to the revision root directory
+\else
+ revs bigint[], -- internal identifiers of the revisions where the blob appears for the first time
+ locs bigint[] -- locations of the content relative to the revision root directories
+\endif
-- foreign key (blob) references content (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
);
comment on column content_early_in_rev.blob is 'Content internal identifier';
+\if :dbflavor_norm
comment on column content_early_in_rev.rev is 'Revision internal identifier';
comment on column content_early_in_rev.loc is 'Location of content in revision';
+\else
+comment on column content_early_in_rev.revs is 'Revision internal identifiers';
+comment on column content_early_in_rev.locs is 'Locations of content in revisions';
+\endif
create table content_in_dir
(
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -1,5 +1,11 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
+\if :dbflavor_norm
alter table content_early_in_rev add primary key (blob, rev, loc);
+\else
+create unique index on content_early_in_rev(blob);
+\endif
+
alter table content_in_dir add primary key (blob, dir, loc);
alter table directory_in_rev add primary key (dir, rev, loc);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import glob
from os import path
import re
from typing import Iterable, Iterator, List, Optional
@@ -14,28 +13,26 @@
from swh.core.api.serializers import msgpack_loads
from swh.core.db import BaseDb
from swh.core.db.pytest_plugin import postgresql_fact
-from swh.core.utils import numfile_sortkey as sortkey
from swh.model.model import Content, Directory, DirectoryEntry, Revision
from swh.model.tests.swh_model_data import TEST_OBJECTS
-import swh.provenance
from swh.provenance.postgresql.archive import ArchivePostgreSQL
from swh.provenance.storage.archive import ArchiveStorage
-SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
-SQL_FILES = [
- sqlfile
- for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey)
- if "-without-path-" not in sqlfile
-]
-
provenance_db = postgresql_fact(
- "postgresql_proc", dbname="provenance", dump_files=SQL_FILES
+ "postgresql_proc",
+ dbname="provenance",
)
-@pytest.fixture
-def provenance(provenance_db):
+@pytest.fixture(params=["normalized", "denormalized"])
+def provenance(request, provenance_db):
"""return a working and initialized provenance db"""
+ from swh.core.cli.db import populate_database_for_package
+
+ populate_database_for_package(
+ "swh.provenance", provenance_db.dsn, flavor=request.param
+ )
+
from swh.provenance.postgresql.provenancedb import ProvenanceDB
BaseDb.adapt_conn(provenance_db)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -45,6 +45,7 @@
TABLES = {
+ "dbflavor",
"dbversion",
"content",
"content_early_in_rev",