Page MenuHomeSoftware Heritage

D5843.id20924.diff
No OneTemporary

D5843.id20924.diff

diff --git a/swh/provenance/postgresql/provenancedb.py b/swh/provenance/postgresql/provenancedb.py
--- a/swh/provenance/postgresql/provenancedb.py
+++ b/swh/provenance/postgresql/provenancedb.py
@@ -28,6 +28,7 @@
directory: Cache
revision: Cache
# below are insertion caches only
+ # triplets below are (sha1_src, sha1_dst, path)
content_early_in_rev: Set[Tuple[bytes, bytes, bytes]]
content_in_dir: Set[Tuple[bytes, bytes, bytes]]
directory_in_rev: Set[Tuple[bytes, bytes, bytes]]
@@ -62,6 +63,19 @@
self.cursor.execute("SET timezone TO 'UTC'")
self.cache: ProvenanceCache = new_cache()
self.clear_caches()
+ self._flavor: Optional[str] = None
+
+ @property
+ def flavor(self) -> str:
+ if self._flavor is None:
+ self.cursor.execute("select swh_get_dbflavor()")
+ self._flavor = self.cursor.fetchone()[0]
+ assert self._flavor is not None
+ return self._flavor
+
+ @property
+ def normalized(self) -> bool:
+ return self.flavor == "normalized"
def clear_caches(self):
self.cache = new_cache()
@@ -304,58 +318,93 @@
def content_find_first(
self, blobid: bytes
) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
- self.cursor.execute(
- """
+ if self.normalized:
+ self.cursor.execute(
+ """
+ SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ L.path AS path
+ FROM content AS C
+ INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
+ INNER JOIN location as L ON (CR.loc = L.id)
+ INNER JOIN revision as R ON (CR.rev = R.id)
+ WHERE C.sha1=%s
+ ORDER BY date, rev, path ASC LIMIT 1
+ """,
+ (blobid,),
+ )
+ else:
+ self.cursor.execute(
+ """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ order by date, rev, path asc limit 1
+ """,
+ (blobid,),
+ )
+ return self.cursor.fetchone()
+
+ def content_find_all(
+ self, blobid: bytes, limit: Optional[int] = None
+ ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ early_cut = f"LIMIT {limit}" if limit is not None else ""
+ if self.normalized:
+ sql_CR = """
SELECT C.sha1 AS blob,
R.sha1 AS rev,
R.date AS date,
L.path AS path
FROM content AS C
INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location as L ON (CR.loc = L.id)
- INNER JOIN revision as R ON (CR.rev = R.id)
+ INNER JOIN location AS L ON (CR.loc = L.id)
+ INNER JOIN revision AS R ON (CR.rev = R.id)
WHERE C.sha1=%s
- ORDER BY date, rev, path ASC LIMIT 1
- """,
- (blobid,),
- )
- return self.cursor.fetchone()
-
- def content_find_all(
- self, blobid: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
- early_cut = f"LIMIT {limit}" if limit is not None else ""
- self.cursor.execute(
- f"""
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- L.path AS path
- FROM content AS C
- INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location AS L ON (CR.loc = L.id)
- INNER JOIN revision AS R ON (CR.rev = R.id)
- WHERE C.sha1=%s)
- UNION
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- CASE DL.path
- WHEN '' THEN CL.path
- WHEN '.' THEN CL.path
- ELSE (DL.path || '/' || CL.path)::unix_path
- END AS path
- FROM content AS C
- INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
- INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
- INNER JOIN revision AS R ON (DR.rev = R.id)
- INNER JOIN location AS CL ON (CD.loc = CL.id)
- INNER JOIN location AS DL ON (DR.loc = DL.id)
- WHERE C.sha1=%s)
- ORDER BY date, rev, path {early_cut}
- """,
- (blobid, blobid),
- )
+ """
+ else:
+ sql_CR = """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ """
+ sql = f"""
+ ({sql_CR})
+ UNION
+ (SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ CASE DL.path
+ WHEN '' THEN CL.path
+ WHEN '.' THEN CL.path
+ ELSE (DL.path || '/' || CL.path)::unix_path
+ END AS path
+ FROM content AS C
+ INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
+ INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
+ INNER JOIN revision AS R ON (DR.rev = R.id)
+ INNER JOIN location AS CL ON (CD.loc = CL.id)
+ INNER JOIN location AS DL ON (DR.loc = DL.id)
+ WHERE C.sha1=%s)
+ ORDER BY date, rev, path {early_cut}
+ """
+ self.cursor.execute(sql, (blobid, blobid))
# TODO: use POSTGRESQL EXPLAIN looking for query optimizations.
yield from self.cursor.fetchall()
@@ -379,14 +428,28 @@
""",
locations,
)
+ if dst_table == "content_early_in_rev" and not self.normalized:
+ sql = f"""
+ insert into {dst_table}
+ SELECT {src0_table}.id, array_agg({src1_table}.id), array_agg(location.id)
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ GROUP BY {src0_table}.id
+ on conflict (blob) do update
+ set revs=array(select unnest({dst_table}.revs || excluded.revs)),
+ locs=array(select unnest({dst_table}.locs || excluded.locs))
+ """
- sql = f"""
- INSERT INTO {dst_table}
- SELECT {src0_table}.id, {src1_table}.id, location.id
- FROM (VALUES %s) AS V(src, dst, path)
- INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
- INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
- INNER JOIN location on (location.path=V.path)
- """
+ else:
+ sql = f"""
+ INSERT INTO {dst_table}
+ SELECT {src0_table}.id, {src1_table}.id, location.id
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ """
psycopg2.extras.execute_values(self.cursor, sql, self.cache[dst_table])
self.cache[dst_table].clear()
diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/15-flavor.sql
@@ -0,0 +1,21 @@
+-- database flavor
+create type database_flavor as enum (
+ 'normalized',
+ 'denormalized'
+);
+comment on type database_flavor is 'Flavor of the current database';
+
+create table dbflavor (
+ flavor database_flavor,
+ single_row char(1) primary key default 'x',
+ check (single_row = 'x')
+);
+comment on table dbflavor is 'Database flavor storage';
+comment on column dbflavor.flavor is 'Database flavor currently deployed';
+comment on column dbflavor.single_row is 'Bogus column to force the table to have a single row';
+
+create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$
+ select coalesce((select flavor from dbflavor), 'normalized');
+$$;
+
+comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed';
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,4 +1,5 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
create table dbversion
(
@@ -35,15 +36,25 @@
create table content_early_in_rev
(
blob bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
loc bigint not null -- location of the content relative to the revision root directory
+\else
+ revs bigint[], -- internal identifier of the revision where the blob appears for the first time
+ locs bigint[] -- location of the content relative to the revision root directory
+\endif
-- foreign key (blob) references content (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
);
comment on column content_early_in_rev.blob is 'Content internal identifier';
+\if :dbflavor_norm
comment on column content_early_in_rev.rev is 'Revision internal identifier';
comment on column content_early_in_rev.loc is 'Location of content in revision';
+\else
+comment on column content_early_in_rev.revs is 'Revision internal identifier';
+comment on column content_early_in_rev.locs is 'Location of content in revision';
+\endif
create table content_in_dir
(
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -1,5 +1,11 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
+\if :dbflavor_norm
alter table content_early_in_rev add primary key (blob, rev, loc);
+\else
+create unique index on content_early_in_rev(blob);
+\endif
+
alter table content_in_dir add primary key (blob, dir, loc);
alter table directory_in_rev add primary key (dir, rev, loc);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import glob
from os import path
import re
from typing import Iterable, Iterator, List, Optional
@@ -14,28 +13,26 @@
from swh.core.api.serializers import msgpack_loads
from swh.core.db import BaseDb
from swh.core.db.pytest_plugin import postgresql_fact
-from swh.core.utils import numfile_sortkey as sortkey
from swh.model.model import Content, Directory, DirectoryEntry, Revision
from swh.model.tests.swh_model_data import TEST_OBJECTS
-import swh.provenance
from swh.provenance.postgresql.archive import ArchivePostgreSQL
from swh.provenance.storage.archive import ArchiveStorage
-SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
-SQL_FILES = [
- sqlfile
- for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey)
- if "-without-path-" not in sqlfile
-]
-
provenance_db = postgresql_fact(
- "postgresql_proc", dbname="provenance", dump_files=SQL_FILES
+ "postgresql_proc",
+ dbname="provenance",
)
-@pytest.fixture
-def provenance(provenance_db):
+@pytest.fixture(params=["normalized", "denormalized"])
+def provenance(request, provenance_db):
"""return a working and initialized provenance db"""
+ from swh.core.cli.db import populate_database_for_package
+
+ populate_database_for_package(
+ "swh.provenance", provenance_db.dsn, flavor=request.param
+ )
+
from swh.provenance.postgresql.provenancedb import ProvenanceDB
BaseDb.adapt_conn(provenance_db)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -45,6 +45,7 @@
TABLES = {
+ "dbflavor",
"dbversion",
"content",
"content_early_in_rev",

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 6:07 PM (4 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3232616

Event Timeline