D5843.id20924.diff
D5843: Add support for a denormalized version of the provenance DB
diff --git a/swh/provenance/postgresql/provenancedb.py b/swh/provenance/postgresql/provenancedb.py
--- a/swh/provenance/postgresql/provenancedb.py
+++ b/swh/provenance/postgresql/provenancedb.py
@@ -28,6 +28,7 @@
directory: Cache
revision: Cache
# below are insertion caches only
+ # triplets below are (sha1_src, sha1_dst, path)
content_early_in_rev: Set[Tuple[bytes, bytes, bytes]]
content_in_dir: Set[Tuple[bytes, bytes, bytes]]
directory_in_rev: Set[Tuple[bytes, bytes, bytes]]
@@ -62,6 +63,19 @@
self.cursor.execute("SET timezone TO 'UTC'")
self.cache: ProvenanceCache = new_cache()
self.clear_caches()
+ self._flavor: Optional[str] = None
+
+ @property
+ def flavor(self) -> str:
+ if self._flavor is None:
+ self.cursor.execute("select swh_get_dbflavor()")
+ self._flavor = self.cursor.fetchone()[0]
+ assert self._flavor is not None
+ return self._flavor
+
+ @property
+ def normalized(self) -> bool:
+ return self.flavor == "normalized"
def clear_caches(self):
self.cache = new_cache()
@@ -304,58 +318,93 @@
def content_find_first(
self, blobid: bytes
) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
- self.cursor.execute(
- """
+ if self.normalized:
+ self.cursor.execute(
+ """
+ SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ L.path AS path
+ FROM content AS C
+ INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
+ INNER JOIN location as L ON (CR.loc = L.id)
+ INNER JOIN revision as R ON (CR.rev = R.id)
+ WHERE C.sha1=%s
+ ORDER BY date, rev, path ASC LIMIT 1
+ """,
+ (blobid,),
+ )
+ else:
+ self.cursor.execute(
+ """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ order by date, rev, path asc limit 1
+ """,
+ (blobid,),
+ )
+ return self.cursor.fetchone()
+
+ def content_find_all(
+ self, blobid: bytes, limit: Optional[int] = None
+ ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ early_cut = f"LIMIT {limit}" if limit is not None else ""
+ if self.normalized:
+ sql_CR = """
SELECT C.sha1 AS blob,
R.sha1 AS rev,
R.date AS date,
L.path AS path
FROM content AS C
INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location as L ON (CR.loc = L.id)
- INNER JOIN revision as R ON (CR.rev = R.id)
+ INNER JOIN location AS L ON (CR.loc = L.id)
+ INNER JOIN revision AS R ON (CR.rev = R.id)
WHERE C.sha1=%s
- ORDER BY date, rev, path ASC LIMIT 1
- """,
- (blobid,),
- )
- return self.cursor.fetchone()
-
- def content_find_all(
- self, blobid: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
- early_cut = f"LIMIT {limit}" if limit is not None else ""
- self.cursor.execute(
- f"""
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- L.path AS path
- FROM content AS C
- INNER JOIN content_early_in_rev AS CR ON (CR.blob = C.id)
- INNER JOIN location AS L ON (CR.loc = L.id)
- INNER JOIN revision AS R ON (CR.rev = R.id)
- WHERE C.sha1=%s)
- UNION
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
- R.date AS date,
- CASE DL.path
- WHEN '' THEN CL.path
- WHEN '.' THEN CL.path
- ELSE (DL.path || '/' || CL.path)::unix_path
- END AS path
- FROM content AS C
- INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
- INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
- INNER JOIN revision AS R ON (DR.rev = R.id)
- INNER JOIN location AS CL ON (CD.loc = CL.id)
- INNER JOIN location AS DL ON (DR.loc = DL.id)
- WHERE C.sha1=%s)
- ORDER BY date, rev, path {early_cut}
- """,
- (blobid, blobid),
- )
+ """
+ else:
+ sql_CR = """
+ select C_L.sha1 as blob,
+ R.sha1 as rev,
+ R.date as date,
+ L.path as path
+ from (
+ select C.sha1, unnest(revs) as rev, unnest(locs) as loc
+ from content_early_in_rev as C_R
+ inner join content as C on (C.id=C_R.blob)
+ where C.sha1=%s) as C_L
+ inner join revision as R on (R.id=C_L.rev)
+ inner join location as L on (L.id=C_L.loc)
+ """
+ sql = f"""
+ ({sql_CR})
+ UNION
+ (SELECT C.sha1 AS blob,
+ R.sha1 AS rev,
+ R.date AS date,
+ CASE DL.path
+ WHEN '' THEN CL.path
+ WHEN '.' THEN CL.path
+ ELSE (DL.path || '/' || CL.path)::unix_path
+ END AS path
+ FROM content AS C
+ INNER JOIN content_in_dir AS CD ON (C.id = CD.blob)
+ INNER JOIN directory_in_rev AS DR ON (CD.dir = DR.dir)
+ INNER JOIN revision AS R ON (DR.rev = R.id)
+ INNER JOIN location AS CL ON (CD.loc = CL.id)
+ INNER JOIN location AS DL ON (DR.loc = DL.id)
+ WHERE C.sha1=%s)
+ ORDER BY date, rev, path {early_cut}
+ """
+ self.cursor.execute(sql, (blobid, blobid))
# TODO: use POSTGRESQL EXPLAIN looking for query optimizations.
yield from self.cursor.fetchall()
@@ -379,14 +428,28 @@
""",
locations,
)
+ if dst_table == "content_early_in_rev" and not self.normalized:
+ sql = f"""
+ insert into {dst_table}
+ SELECT {src0_table}.id, array_agg({src1_table}.id), array_agg(location.id)
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ GROUP BY {src0_table}.id
+ on conflict (blob) do update
+ set revs=array(select unnest({dst_table}.revs || excluded.revs)),
+ locs=array(select unnest({dst_table}.locs || excluded.locs))
+ """
- sql = f"""
- INSERT INTO {dst_table}
- SELECT {src0_table}.id, {src1_table}.id, location.id
- FROM (VALUES %s) AS V(src, dst, path)
- INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
- INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
- INNER JOIN location on (location.path=V.path)
- """
+ else:
+ sql = f"""
+ INSERT INTO {dst_table}
+ SELECT {src0_table}.id, {src1_table}.id, location.id
+ FROM (VALUES %s) AS V(src, dst, path)
+ INNER JOIN {src0_table} on ({src0_table}.sha1=V.src)
+ INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst)
+ INNER JOIN location on (location.path=V.path)
+ """
psycopg2.extras.execute_values(self.cursor, sql, self.cache[dst_table])
self.cache[dst_table].clear()
diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/15-flavor.sql
@@ -0,0 +1,21 @@
+-- database flavor
+create type database_flavor as enum (
+ 'normalized',
+ 'denormalized'
+);
+comment on type database_flavor is 'Flavor of the current database';
+
+create table dbflavor (
+ flavor database_flavor,
+ single_row char(1) primary key default 'x',
+ check (single_row = 'x')
+);
+comment on table dbflavor is 'Database flavor storage';
+comment on column dbflavor.flavor is 'Database flavor currently deployed';
+comment on column dbflavor.single_row is 'Bogus column to force the table to have a single row';
+
+create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$
+ select coalesce((select flavor from dbflavor), 'normalized');
+$$;
+
+comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed';
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,4 +1,5 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
create table dbversion
(
@@ -35,15 +36,25 @@
create table content_early_in_rev
(
blob bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
loc bigint not null -- location of the content relative to the revision root directory
+\else
+ revs bigint[], -- internal identifiers of the revisions where the blob appears for the first time
+ locs bigint[] -- locations of the content relative to the revision root directories
+\endif
-- foreign key (blob) references content (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
);
comment on column content_early_in_rev.blob is 'Content internal identifier';
+\if :dbflavor_norm
comment on column content_early_in_rev.rev is 'Revision internal identifier';
comment on column content_early_in_rev.loc is 'Location of content in revision';
+\else
+comment on column content_early_in_rev.revs is 'Revision internal identifiers';
+comment on column content_early_in_rev.locs is 'Locations of content in revisions';
+\endif
create table content_in_dir
(
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -1,5 +1,11 @@
-- psql variables to get the current database flavor
+select swh_get_dbflavor() = 'normalized' as dbflavor_norm \gset
+\if :dbflavor_norm
alter table content_early_in_rev add primary key (blob, rev, loc);
+\else
+create unique index on content_early_in_rev(blob);
+\endif
+
alter table content_in_dir add primary key (blob, dir, loc);
alter table directory_in_rev add primary key (dir, rev, loc);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import glob
from os import path
import re
from typing import Iterable, Iterator, List, Optional
@@ -14,28 +13,26 @@
from swh.core.api.serializers import msgpack_loads
from swh.core.db import BaseDb
from swh.core.db.pytest_plugin import postgresql_fact
-from swh.core.utils import numfile_sortkey as sortkey
from swh.model.model import Content, Directory, DirectoryEntry, Revision
from swh.model.tests.swh_model_data import TEST_OBJECTS
-import swh.provenance
from swh.provenance.postgresql.archive import ArchivePostgreSQL
from swh.provenance.storage.archive import ArchiveStorage
-SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
-SQL_FILES = [
- sqlfile
- for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey)
- if "-without-path-" not in sqlfile
-]
-
provenance_db = postgresql_fact(
- "postgresql_proc", dbname="provenance", dump_files=SQL_FILES
+ "postgresql_proc",
+ dbname="provenance",
)
-@pytest.fixture
-def provenance(provenance_db):
+@pytest.fixture(params=["normalized", "denormalized"])
+def provenance(request, provenance_db):
"""return a working and initialized provenance db"""
+ from swh.core.cli.db import populate_database_for_package
+
+ populate_database_for_package(
+ "swh.provenance", provenance_db.dsn, flavor=request.param
+ )
+
from swh.provenance.postgresql.provenancedb import ProvenanceDB
BaseDb.adapt_conn(provenance_db)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -45,6 +45,7 @@
TABLES = {
+ "dbflavor",
"dbversion",
"content",
"content_early_in_rev",