Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066253
D5843.id21831.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
21 KB
Subscribers
None
D5843.id21831.diff
View Options
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -77,7 +77,7 @@
conn = BaseDb.connect(**kwargs["db"]).conn
raise_on_commit = kwargs.get("raise_on_commit", False)
- if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path":
+ if "with-path" in ProvenanceDBBase(conn, raise_on_commit).flavor:
from .postgresql.provenancedb_with_path import ProvenanceWithPathDB
return ProvenanceWithPathDB(conn, raise_on_commit)
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -49,7 +49,11 @@
return self._flavor
def with_path(self) -> bool:
- return self.flavor == "with-path"
+ return "with-path" in self.flavor
+
+ @property
+ def denormalized(self) -> bool:
+ return "denormalized" in self.flavor
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
...
@@ -186,7 +190,12 @@
f"INNER JOIN {src} AS S ON (S.sha1=V.src)",
f"INNER JOIN {dst} AS D ON (D.sha1=V.dst)",
]
- selected = ["S.id", "D.id"]
+ nope = (RelationType.REV_BEFORE_REV, RelationType.REV_IN_ORG)
+ selected = ["S.id"]
+ if self.denormalized and relation not in nope:
+ selected.append("ARRAY_AGG(D.id)")
+ else:
+ selected.append("D.id")
if self._relation_uses_location_table(relation):
locations = tuple(set((path,) for (_, _, path) in rows))
@@ -198,16 +207,30 @@
psycopg2.extras.execute_values(self.cursor, sql, locations)
joins.append("INNER JOIN location AS L ON (L.path=V.path)")
- selected.append("L.id")
+ if self.denormalized:
+ selected.append("ARRAY_AGG(L.id)")
+ else:
+ selected.append("L.id")
+ sql_l = [
+ f"INSERT INTO {table}",
+ f" SELECT {', '.join(selected)}",
+ " FROM (VALUES %s) AS V(src, dst, path)",
+ *joins,
+ ]
- sql = f"""
- INSERT INTO {table}
- (SELECT {", ".join(selected)}
- FROM (VALUES %s) AS V(src, dst, path)
- {'''
- '''.join(joins)})
- ON CONFLICT DO NOTHING
+ if self.denormalized and relation not in nope:
+ sql_l.append("GROUP BY S.id")
+ sql_l.append(
+ f"""ON CONFLICT ({src}) DO UPDATE
+ SET {dst}=ARRAY(
+ SELECT UNNEST({table}.{dst} || excluded.{dst})),
+ location=ARRAY(
+ SELECT UNNEST({relation.value}.location || excluded.location))
"""
+ )
+ else:
+ sql_l.append("ON CONFLICT DO NOTHING")
+ sql = "\n".join(sql_l)
psycopg2.extras.execute_values(self.cursor, sql, rows)
return True
except: # noqa: E722
@@ -276,11 +299,16 @@
sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]]
if ids is not None:
sha1s = (tuple(ids),)
- where = f"WHERE {'S.sha1' if not reverse else 'D.sha1'} IN %s"
+ where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s"
else:
sha1s = None
where = ""
+ aggreg_dst = self.denormalized and relation in (
+ RelationType.CNT_EARLY_IN_REV,
+ RelationType.CNT_IN_DIR,
+ RelationType.DIR_IN_REV,
+ )
if sha1s is None or sha1s[0]:
table = relation.value
src, *_, dst = table.split("_")
@@ -293,24 +321,34 @@
src_field = src
dst_field = dst
- joins = [
- f"INNER JOIN {src} AS S ON (S.id=R.{src_field})",
- f"INNER JOIN {dst} AS D ON (D.id=R.{dst_field})",
- ]
- selected = ["S.sha1 AS src", "D.sha1 AS dst"]
+ if aggreg_dst:
+ revloc = f"UNNEST(R.{dst_field}) AS dst"
+ if self._relation_uses_location_table(relation):
+ revloc += ", UNNEST(R.location) AS path"
+ else:
+ revloc = f"R.{dst_field} AS dst"
+ if self._relation_uses_location_table(relation):
+ revloc += ", R.location AS path"
+
+ inner_sql = f"""
+ SELECT S.sha1 AS src, {revloc}
+ FROM {table} AS R
+ INNER JOIN {src} AS S ON (S.id=R.{src_field})
+ {where}
+ """
if self._relation_uses_location_table(relation):
- joins.append("INNER JOIN location AS L ON (L.id=R.location)")
- selected.append("L.path AS path")
+ loc = "L.path AS path"
else:
- selected.append("NULL AS path")
-
+ loc = "NULL AS path"
sql = f"""
- SELECT {", ".join(selected)}
- FROM {table} AS R
- {" ".join(joins)}
- {where}
- """
+ SELECT CL.src, D.sha1 AS dst, {loc}
+ FROM ({inner_sql}) AS CL
+ INNER JOIN {dst} AS D ON (D.id=CL.dst)
+ """
+ if self._relation_uses_location_table(relation):
+ sql += "INNER JOIN location AS L ON (L.id=CL.path)"
+
self.cursor.execute(sql, sha1s)
result.update(RelationData(**row) for row in self.cursor.fetchall())
return result
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -13,7 +13,27 @@
class ProvenanceWithPathDB(ProvenanceDBBase):
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
- sql = """
+ if self.denormalized:
+ sql = """
+ SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ L.path AS path
+ FROM (
+ sELECT C.sha1 AS sha1,
+ UNNEST(revision) AS revision,
+ UNNEST(location) AS location
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ INNER JOIN location AS L ON (L.id=C_L.location)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ ORDER BY date, revision, origin, path ASC LIMIT 1
+ """
+ else:
+ sql = """
SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -23,7 +43,7 @@
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN location as L ON (CR.location=L.id)
INNER JOIN revision as R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s
ORDER BY date, revision, origin, path ASC LIMIT 1
"""
@@ -35,7 +55,61 @@
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- sql = f"""
+ if self.denormalized:
+ sql = f"""
+ (SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ L.path AS path
+ FROM (
+ SELECT C.sha1 AS sha1,
+ unnest(revision) AS revision,
+ unnest(location) AS location
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id = C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id = C_L.revision)
+ INNER JOIN location AS L ON (L.id = C_L.location)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ UNION
+ (WITH
+ C_D as (
+ SELECT C.sha1 AS content_sha1,
+ unnest(CD.directory) AS directory,
+ unnest(CD.location) AS location
+ FROM content AS C
+ INNER JOIN content_in_directory AS CD ON (CD.content = C.id)
+ WHERE C.sha1=%s
+ ),
+ D_R as (
+ SELECT C_D.content_sha1 AS content_sha1,
+ DL.path AS file_path,
+ unnest(DR.revision) AS revision,
+ unnest(DR.location) AS prefix_location
+ FROM C_D
+ INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory)
+ INNER JOIN location AS DL ON (DL.id = C_D.location)
+ )
+ SELECT D_R.content_sha1 AS sha1,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ CASE DL.path
+ WHEN '' THEN D_R.file_path
+ WHEN '.' THEN D_R.file_path
+ ELSE (DL.path || '/' || D_R.file_path)::unix_path
+ END AS path
+ FROM D_R
+ INNER JOIN location AS DL ON (D_R.prefix_location = DL.id)
+ INNER JOIN revision AS R ON (D_R.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ ORDER BY date, revision, origin, path {early_cut}
+ """
+ else:
+ sql = f"""
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -45,7 +119,7 @@
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN location AS L ON (CR.location=L.id)
INNER JOIN revision AS R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -13,19 +13,38 @@
class ProvenanceWithoutPathDB(ProvenanceDBBase):
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
- sql = """
+ if self.denormalized:
+ sql = """
+ SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea AS path
+ FROM (
+ SELECT C.sha1, UNNEST(revision) AS revision
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s
+ ) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ ORDER BY date, revision, origin ASC LIMIT 1
+ """
+ else:
+ sql = """
SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
O.url AS origin,
- '\\x'::bytea as path
+ '\\x'::bytea AS path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
- INNER JOIN revision as R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
+ INNER JOIN revision AS R ON (CR.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s
ORDER BY date, revision, origin ASC LIMIT 1
"""
+
self.cursor.execute(sql, (id,))
row = self.cursor.fetchone()
return ProvenanceResult(**row) if row is not None else None
@@ -34,7 +53,49 @@
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- sql = f"""
+ if self.denormalized:
+ sql = f"""
+ (SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea as path
+ FROM (
+ SELECT C.sha1, UNNEST(revision) AS revision
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ UNION
+ (WITH
+ C_D AS (
+ SELECT C.sha1 AS content_sha1,
+ unnest(CD.directory) AS directory
+ FROM content AS C
+ INNER JOIN content_in_directory AS CD ON (CD.content = C.id)
+ WHERE C.sha1=%s
+ ),
+ D_R AS (
+ SELECT C_D.content_sha1 AS content_sha1,
+ UNNEST(DR.revision) AS revision
+ FROM C_D
+ INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory)
+ )
+ SELECT D_R.content_sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea AS path
+ FROM D_R
+ INNER JOIN revision AS R ON (D_R.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ ORDER BY date, revision, path {early_cut}
+ """
+ else:
+ sql = f"""
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -43,19 +104,19 @@
FROM content AS C
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN revision AS R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
O.url AS origin,
- '\\x'::bytea as path
+ '\\x'::bytea AS path
FROM content AS C
INNER JOIN content_in_directory AS CD ON (C.id=CD.content)
INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory)
INNER JOIN revision AS R ON (DR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
ORDER BY date, revision, origin {early_cut}
"""
diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql
--- a/swh/provenance/sql/15-flavor.sql
+++ b/swh/provenance/sql/15-flavor.sql
@@ -1,7 +1,9 @@
-- database flavor
create type database_flavor as enum (
'with-path',
- 'without-path'
+ 'without-path',
+ 'with-path-denormalized',
+ 'without-path-denormalized'
);
comment on type database_flavor is 'Flavor of the current database';
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,4 +1,5 @@
-- psql variables to get the current database flavor
+select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
create table dbversion
(
@@ -78,8 +79,13 @@
create table content_in_revision
(
content bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
location bigint -- location of the content relative to the revision root directory
+\else
+ revision bigint[], -- internal identifier of the revision where the blob appears for the first time
+ location bigint[] -- location of the content relative to the revision root directory
+\endif
-- foreign key (blob) references content (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
@@ -91,8 +97,13 @@
create table content_in_directory
(
content bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
directory bigint not null, -- internal identifier of the directory containing the blob
location bigint -- location of the content relative to its parent directory in the isochrone frontier
+\else
+ directory bigint[],
+ location bigint[]
+\endif
-- foreign key (blob) references content (id),
-- foreign key (dir) references directory (id),
-- foreign key (loc) references location (id)
@@ -104,8 +115,13 @@
create table directory_in_revision
(
directory bigint not null, -- internal identifier of the directory appearing in the revision
+\if :dbflavor_norm
revision bigint not null, -- internal identifier of the revision containing the directory
location bigint -- location of the directory relative to the revision root directory
+\else
+ revision bigint[],
+ location bigint[]
+\endif
-- foreign key (dir) references directory (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -1,9 +1,18 @@
-- psql variables to get the current database flavor
+select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
-- create unique indexes (instead of pkey) because location might be null for
-- the without-path flavor
+\if :dbflavor_norm
create unique index on content_in_revision(content, revision, location);
create unique index on directory_in_revision(directory, revision, location);
create unique index on content_in_directory(content, directory, location);
+\else
+create unique index on content_in_revision(content);
+create unique index on directory_in_revision(directory);
+create unique index on content_in_directory(content);
+\endif
+
+
alter table revision_in_origin add primary key (revision, origin);
alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -27,19 +27,29 @@
from swh.storage.replay import process_replay_objects
-@pytest.fixture(params=["with-path", "without-path"])
+@pytest.fixture(
+ params=[
+ "with-path",
+ "without-path",
+ "with-path-denormalized",
+ "without-path-denormalized",
+ ]
+)
def populated_db(
request, # TODO: add proper type annotation
postgresql: psycopg2.extensions.connection,
) -> Dict[str, str]:
+ """return a working and initialized provenance db"""
from swh.core.cli.db import populate_database_for_package
- flavor = "with-path" if request.param == "client-server" else request.param
- populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)
+ # flavor = "with-path" if request.param == "client-server" else request.param
+ populate_database_for_package(
+ "swh.provenance", postgresql.dsn, flavor=request.param
+ )
return {
- item.split("=")[0]: item.split("=")[1]
- for item in postgresql.dsn.split()
- if item.split("=")[0] != "options"
+ k: v
+ for (k, v) in (item.split("=") for item in postgresql.dsn.split())
+ if k != "options"
}
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -42,9 +42,14 @@
def test_provenance_flavor(provenance: ProvenanceInterface) -> None:
if isinstance(provenance.storage, ProvenanceDBBase):
- assert provenance.storage.flavor in ("with-path", "without-path")
+ assert provenance.storage.flavor in (
+ "with-path",
+ "without-path",
+ "with-path-denormalized",
+ "without-path-denormalized",
+ )
backend_class: Type[ProvenanceStorageInterface]
- if provenance.storage.flavor == "with-path":
+ if "with-path" in provenance.storage.flavor:
backend_class = ProvenanceWithPathDB
else:
backend_class = ProvenanceWithoutPathDB
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 4:13 AM (19 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225087
Attached To
D5843: Add support for a denormalized version of the provenance DB
Event Timeline
Log In to Comment