Page MenuHomeSoftware Heritage

D5843.id21831.diff
No OneTemporary

D5843.id21831.diff

diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -77,7 +77,7 @@
conn = BaseDb.connect(**kwargs["db"]).conn
raise_on_commit = kwargs.get("raise_on_commit", False)
- if ProvenanceDBBase(conn, raise_on_commit).flavor == "with-path":
+ if "with-path" in ProvenanceDBBase(conn, raise_on_commit).flavor:
from .postgresql.provenancedb_with_path import ProvenanceWithPathDB
return ProvenanceWithPathDB(conn, raise_on_commit)
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -49,7 +49,11 @@
return self._flavor
def with_path(self) -> bool:
- return self.flavor == "with-path"
+ return "with-path" in self.flavor
+
+ @property
+ def denormalized(self) -> bool:
+ return "denormalized" in self.flavor
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
...
@@ -186,7 +190,12 @@
f"INNER JOIN {src} AS S ON (S.sha1=V.src)",
f"INNER JOIN {dst} AS D ON (D.sha1=V.dst)",
]
- selected = ["S.id", "D.id"]
+ nope = (RelationType.REV_BEFORE_REV, RelationType.REV_IN_ORG)
+ selected = ["S.id"]
+ if self.denormalized and relation not in nope:
+ selected.append("ARRAY_AGG(D.id)")
+ else:
+ selected.append("D.id")
if self._relation_uses_location_table(relation):
locations = tuple(set((path,) for (_, _, path) in rows))
@@ -198,16 +207,30 @@
psycopg2.extras.execute_values(self.cursor, sql, locations)
joins.append("INNER JOIN location AS L ON (L.path=V.path)")
- selected.append("L.id")
+ if self.denormalized:
+ selected.append("ARRAY_AGG(L.id)")
+ else:
+ selected.append("L.id")
+ sql_l = [
+ f"INSERT INTO {table}",
+ f" SELECT {', '.join(selected)}",
+ " FROM (VALUES %s) AS V(src, dst, path)",
+ *joins,
+ ]
- sql = f"""
- INSERT INTO {table}
- (SELECT {", ".join(selected)}
- FROM (VALUES %s) AS V(src, dst, path)
- {'''
- '''.join(joins)})
- ON CONFLICT DO NOTHING
+ if self.denormalized and relation not in nope:
+ sql_l.append("GROUP BY S.id")
+ sql_l.append(
+ f"""ON CONFLICT ({src}) DO UPDATE
+ SET {dst}=ARRAY(
+ SELECT UNNEST({table}.{dst} || excluded.{dst})),
+ location=ARRAY(
+ SELECT UNNEST({relation.value}.location || excluded.location))
"""
+ )
+ else:
+ sql_l.append("ON CONFLICT DO NOTHING")
+ sql = "\n".join(sql_l)
psycopg2.extras.execute_values(self.cursor, sql, rows)
return True
except: # noqa: E722
@@ -276,11 +299,16 @@
sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]]
if ids is not None:
sha1s = (tuple(ids),)
- where = f"WHERE {'S.sha1' if not reverse else 'D.sha1'} IN %s"
+ where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s"
else:
sha1s = None
where = ""
+ aggreg_dst = self.denormalized and relation in (
+ RelationType.CNT_EARLY_IN_REV,
+ RelationType.CNT_IN_DIR,
+ RelationType.DIR_IN_REV,
+ )
if sha1s is None or sha1s[0]:
table = relation.value
src, *_, dst = table.split("_")
@@ -293,24 +321,34 @@
src_field = src
dst_field = dst
- joins = [
- f"INNER JOIN {src} AS S ON (S.id=R.{src_field})",
- f"INNER JOIN {dst} AS D ON (D.id=R.{dst_field})",
- ]
- selected = ["S.sha1 AS src", "D.sha1 AS dst"]
+ if aggreg_dst:
+ revloc = f"UNNEST(R.{dst_field}) AS dst"
+ if self._relation_uses_location_table(relation):
+ revloc += ", UNNEST(R.location) AS path"
+ else:
+ revloc = f"R.{dst_field} AS dst"
+ if self._relation_uses_location_table(relation):
+ revloc += ", R.location AS path"
+
+ inner_sql = f"""
+ SELECT S.sha1 AS src, {revloc}
+ FROM {table} AS R
+ INNER JOIN {src} AS S ON (S.id=R.{src_field})
+ {where}
+ """
if self._relation_uses_location_table(relation):
- joins.append("INNER JOIN location AS L ON (L.id=R.location)")
- selected.append("L.path AS path")
+ loc = "L.path AS path"
else:
- selected.append("NULL AS path")
-
+ loc = "NULL AS path"
sql = f"""
- SELECT {", ".join(selected)}
- FROM {table} AS R
- {" ".join(joins)}
- {where}
- """
+ SELECT CL.src, D.sha1 AS dst, {loc}
+ FROM ({inner_sql}) AS CL
+ INNER JOIN {dst} AS D ON (D.id=CL.dst)
+ """
+ if self._relation_uses_location_table(relation):
+ sql += "INNER JOIN location AS L ON (L.id=CL.path)"
+
self.cursor.execute(sql, sha1s)
result.update(RelationData(**row) for row in self.cursor.fetchall())
return result
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -13,7 +13,27 @@
class ProvenanceWithPathDB(ProvenanceDBBase):
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
- sql = """
+ if self.denormalized:
+ sql = """
+ SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ L.path AS path
+ FROM (
+ sELECT C.sha1 AS sha1,
+ UNNEST(revision) AS revision,
+ UNNEST(location) AS location
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ INNER JOIN location AS L ON (L.id=C_L.location)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ ORDER BY date, revision, origin, path ASC LIMIT 1
+ """
+ else:
+ sql = """
SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -23,7 +43,7 @@
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN location as L ON (CR.location=L.id)
INNER JOIN revision as R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s
ORDER BY date, revision, origin, path ASC LIMIT 1
"""
@@ -35,7 +55,61 @@
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- sql = f"""
+ if self.denormalized:
+ sql = f"""
+ (SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ L.path AS path
+ FROM (
+ SELECT C.sha1 AS sha1,
+ unnest(revision) AS revision,
+ unnest(location) AS location
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id = C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id = C_L.revision)
+ INNER JOIN location AS L ON (L.id = C_L.location)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ UNION
+ (WITH
+ C_D as (
+ SELECT C.sha1 AS content_sha1,
+ unnest(CD.directory) AS directory,
+ unnest(CD.location) AS location
+ FROM content AS C
+ INNER JOIN content_in_directory AS CD ON (CD.content = C.id)
+ WHERE C.sha1=%s
+ ),
+ D_R as (
+ SELECT C_D.content_sha1 AS content_sha1,
+ DL.path AS file_path,
+ unnest(DR.revision) AS revision,
+ unnest(DR.location) AS prefix_location
+ FROM C_D
+ INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory)
+ INNER JOIN location AS DL ON (DL.id = C_D.location)
+ )
+ SELECT D_R.content_sha1 AS sha1,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ CASE DL.path
+ WHEN '' THEN D_R.file_path
+ WHEN '.' THEN D_R.file_path
+ ELSE (DL.path || '/' || D_R.file_path)::unix_path
+ END AS path
+ FROM D_R
+ INNER JOIN location AS DL ON (D_R.prefix_location = DL.id)
+ INNER JOIN revision AS R ON (D_R.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ ORDER BY date, revision, origin, path {early_cut}
+ """
+ else:
+ sql = f"""
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -45,7 +119,7 @@
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN location AS L ON (CR.location=L.id)
INNER JOIN revision AS R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -13,19 +13,38 @@
class ProvenanceWithoutPathDB(ProvenanceDBBase):
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
- sql = """
+ if self.denormalized:
+ sql = """
+ SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea AS path
+ FROM (
+ SELECT C.sha1, UNNEST(revision) AS revision
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s
+ ) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ ORDER BY date, revision, origin ASC LIMIT 1
+ """
+ else:
+ sql = """
SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
O.url AS origin,
- '\\x'::bytea as path
+ '\\x'::bytea AS path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
- INNER JOIN revision as R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
+ INNER JOIN revision AS R ON (CR.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s
ORDER BY date, revision, origin ASC LIMIT 1
"""
+
self.cursor.execute(sql, (id,))
row = self.cursor.fetchone()
return ProvenanceResult(**row) if row is not None else None
@@ -34,7 +53,49 @@
self, id: Sha1Git, limit: Optional[int] = None
) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- sql = f"""
+ if self.denormalized:
+ sql = f"""
+ (SELECT C_L.sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea as path
+ FROM (
+ SELECT C.sha1, UNNEST(revision) AS revision
+ FROM content_in_revision AS C_R
+ INNER JOIN content AS C ON (C.id=C_R.content)
+ WHERE C.sha1=%s) AS C_L
+ INNER JOIN revision AS R ON (R.id=C_L.revision)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ UNION
+ (WITH
+ C_D AS (
+ SELECT C.sha1 AS content_sha1,
+ unnest(CD.directory) AS directory
+ FROM content AS C
+ INNER JOIN content_in_directory AS CD ON (CD.content = C.id)
+ WHERE C.sha1=%s
+ ),
+ D_R AS (
+ SELECT C_D.content_sha1 AS content_sha1,
+ UNNEST(DR.revision) AS revision
+ FROM C_D
+ INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory)
+ )
+ SELECT D_R.content_sha1 AS content,
+ R.sha1 AS revision,
+ R.date AS date,
+ O.url AS origin,
+ '\\x'::bytea AS path
+ FROM D_R
+ INNER JOIN revision AS R ON (D_R.revision = R.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
+ )
+ ORDER BY date, revision, path {early_cut}
+ """
+ else:
+ sql = f"""
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
@@ -43,19 +104,19 @@
FROM content AS C
INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
INNER JOIN revision AS R ON (CR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
O.url AS origin,
- '\\x'::bytea as path
+ '\\x'::bytea AS path
FROM content AS C
INNER JOIN content_in_directory AS CD ON (C.id=CD.content)
INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory)
INNER JOIN revision AS R ON (DR.revision=R.id)
- LEFT JOIN origin as O ON (R.origin=O.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
ORDER BY date, revision, origin {early_cut}
"""
diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql
--- a/swh/provenance/sql/15-flavor.sql
+++ b/swh/provenance/sql/15-flavor.sql
@@ -1,7 +1,9 @@
-- database flavor
create type database_flavor as enum (
'with-path',
- 'without-path'
+ 'without-path',
+ 'with-path-denormalized',
+ 'without-path-denormalized'
);
comment on type database_flavor is 'Flavor of the current database';
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,4 +1,5 @@
-- psql variables to get the current database flavor
+select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
create table dbversion
(
@@ -78,8 +79,13 @@
create table content_in_revision
(
content bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
location bigint -- location of the content relative to the revision root directory
+\else
+ revision bigint[], -- internal identifier of the revision where the blob appears for the first time
+ location bigint[] -- location of the content relative to the revision root directory
+\endif
-- foreign key (blob) references content (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
@@ -91,8 +97,13 @@
create table content_in_directory
(
content bigint not null, -- internal identifier of the content blob
+\if :dbflavor_norm
directory bigint not null, -- internal identifier of the directory containing the blob
location bigint -- location of the content relative to its parent directory in the isochrone frontier
+\else
+ directory bigint[],
+ location bigint[]
+\endif
-- foreign key (blob) references content (id),
-- foreign key (dir) references directory (id),
-- foreign key (loc) references location (id)
@@ -104,8 +115,13 @@
create table directory_in_revision
(
directory bigint not null, -- internal identifier of the directory appearing in the revision
+\if :dbflavor_norm
revision bigint not null, -- internal identifier of the revision containing the directory
location bigint -- location of the directory relative to the revision root directory
+\else
+ revision bigint[],
+ location bigint[]
+\endif
-- foreign key (dir) references directory (id),
-- foreign key (rev) references revision (id),
-- foreign key (loc) references location (id)
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -1,9 +1,18 @@
-- psql variables to get the current database flavor
+select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset
-- create unique indexes (instead of pkey) because location might be null for
-- the without-path flavor
+\if :dbflavor_norm
create unique index on content_in_revision(content, revision, location);
create unique index on directory_in_revision(directory, revision, location);
create unique index on content_in_directory(content, directory, location);
+\else
+create unique index on content_in_revision(content);
+create unique index on directory_in_revision(directory);
+create unique index on content_in_directory(content);
+\endif
+
+
alter table revision_in_origin add primary key (revision, origin);
alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -27,19 +27,29 @@
from swh.storage.replay import process_replay_objects
-@pytest.fixture(params=["with-path", "without-path"])
+@pytest.fixture(
+ params=[
+ "with-path",
+ "without-path",
+ "with-path-denormalized",
+ "without-path-denormalized",
+ ]
+)
def populated_db(
request, # TODO: add proper type annotation
postgresql: psycopg2.extensions.connection,
) -> Dict[str, str]:
+ """return a working and initialized provenance db"""
from swh.core.cli.db import populate_database_for_package
- flavor = "with-path" if request.param == "client-server" else request.param
- populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)
+ # flavor = "with-path" if request.param == "client-server" else request.param
+ populate_database_for_package(
+ "swh.provenance", postgresql.dsn, flavor=request.param
+ )
return {
- item.split("=")[0]: item.split("=")[1]
- for item in postgresql.dsn.split()
- if item.split("=")[0] != "options"
+ k: v
+ for (k, v) in (item.split("=") for item in postgresql.dsn.split())
+ if k != "options"
}
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -42,9 +42,14 @@
def test_provenance_flavor(provenance: ProvenanceInterface) -> None:
if isinstance(provenance.storage, ProvenanceDBBase):
- assert provenance.storage.flavor in ("with-path", "without-path")
+ assert provenance.storage.flavor in (
+ "with-path",
+ "without-path",
+ "with-path-denormalized",
+ "without-path-denormalized",
+ )
backend_class: Type[ProvenanceStorageInterface]
- if provenance.storage.flavor == "with-path":
+ if "with-path" in provenance.storage.flavor:
backend_class = ProvenanceWithPathDB
else:
backend_class = ProvenanceWithoutPathDB

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 4:13 AM (19 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225087

Event Timeline