diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -6,7 +6,7 @@ from datetime import datetime import itertools import logging -from typing import Dict, Generator, Iterable, Optional, Set, Tuple +from typing import Dict, Generator, Iterable, List, Optional, Set import psycopg2.extensions import psycopg2.extras @@ -166,78 +166,39 @@ self, relation: RelationType, data: Iterable[RelationData] ) -> bool: try: - rows = tuple((rel.src, rel.dst, rel.path) for rel in data) + rows = [(rel.src, rel.dst, rel.path) for rel in data] if rows: - table = relation.value - src, *_, dst = table.split("_") + rel_table = relation.value + src_table, *_, dst_table = rel_table.split("_") - if src != "origin": + if src_table != "origin": # Origin entries should be inserted previously as they require extra # non-null information srcs = tuple(set((sha1,) for (sha1, _, _) in rows)) sql = f""" - LOCK TABLE ONLY {src}; - INSERT INTO {src}(sha1) VALUES %s + LOCK TABLE ONLY {src_table}; + INSERT INTO {src_table}(sha1) VALUES %s ON CONFLICT DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, srcs) - if dst != "origin": + + if dst_table != "origin": # Origin entries should be inserted previously as they require extra # non-null information dsts = tuple(set((sha1,) for (_, sha1, _) in rows)) sql = f""" - LOCK TABLE ONLY {dst}; - INSERT INTO {dst}(sha1) VALUES %s + LOCK TABLE ONLY {dst_table}; + INSERT INTO {dst_table}(sha1) VALUES %s ON CONFLICT DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, dsts) - joins = [ - f"INNER JOIN {src} AS S ON (S.sha1=V.src)", - f"INNER JOIN {dst} AS D ON (D.sha1=V.dst)", - ] - nope = (RelationType.REV_BEFORE_REV, RelationType.REV_IN_ORG) - selected = ["S.id"] - if self.denormalized and relation not in nope: - selected.append("ARRAY_AGG(D.id)") - else: - selected.append("D.id") - - if self._relation_uses_location_table(relation): - locations = tuple(set((path,) for (_, _, path) in rows)) - sql = """ - LOCK TABLE ONLY location; - INSERT INTO location(path) VALUES %s - ON CONFLICT (path) DO NOTHING - """ - psycopg2.extras.execute_values(self.cursor, sql, locations) - - joins.append("INNER JOIN location AS L ON (L.path=V.path)") - if self.denormalized: - selected.append("ARRAY_AGG(L.id)") - else: - selected.append("L.id") - sql_l = [ - f"INSERT INTO {table}", - f" SELECT {', '.join(selected)}", - " FROM (VALUES %s) AS V(src, dst, path)", - *joins, - ] - - if self.denormalized and relation not in nope: - sql_l.append("GROUP BY S.id") - sql_l.append( - f"""ON CONFLICT ({src}) DO UPDATE - SET {dst}=ARRAY( - SELECT UNNEST({table}.{dst} || EXCLUDED.{dst}) - ), location=ARRAY( - SELECT UNNEST({relation.value}.location || EXCLUDED.location) - ) - """ + + sql = """ + SELECT * FROM swh_provenance_relation_add( + %s, %s, %s, %s::rel_row[] ) - else: - sql_l.append("ON CONFLICT DO NOTHING") - sql = "\n".join(sql_l) - psycopg2.extras.execute_values(self.cursor, sql, rows) + """ + self.cursor.execute(sql, (rel_table, src_table, dst_table, rows)) return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message @@ -304,71 +265,22 @@ ) -> Set[RelationData]: result: Set[RelationData] = set() - sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]] + sha1s: List[Sha1Git] if ids is not None: - sha1s = (tuple(ids),) - where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s" + sha1s = list(ids) + filter = "filter-src" if not reverse else "filter-dst" else: - sha1s = None - where = "" - - aggreg_dst = self.denormalized and relation in ( - RelationType.CNT_EARLY_IN_REV, - RelationType.CNT_IN_DIR, - RelationType.DIR_IN_REV, - ) - if sha1s is None or sha1s[0]: - table = relation.value - src, *_, dst = table.split("_") - - # TODO: improve this! - if src == "revision" and dst == "revision": - src_field = "prev" - dst_field = "next" - else: - src_field = src - dst_field = dst - - if aggreg_dst: - revloc = f"UNNEST(R.{dst_field}) AS dst" - if self._relation_uses_location_table(relation): - revloc += ", UNNEST(R.location) AS path" - else: - revloc = f"R.{dst_field} AS dst" - if self._relation_uses_location_table(relation): - revloc += ", R.location AS path" - - inner_sql = f""" - SELECT S.sha1 AS src, {revloc} - FROM {table} AS R - INNER JOIN {src} AS S ON (S.id=R.{src_field}) - """ - if where != "" and not reverse: - inner_sql += where - - if self._relation_uses_location_table(relation): - loc = "L.path AS path" - else: - loc = "NULL AS path" - sql = f""" - SELECT CL.src, D.sha1 AS dst, {loc} - FROM ({inner_sql}) AS CL - INNER JOIN {dst} AS D ON (D.id=CL.dst) - """ - if self._relation_uses_location_table(relation): - sql += "INNER JOIN location AS L ON (L.id=CL.path)" - if where != "" and reverse: - sql += where + sha1s = [] + filter = "no-filter" - self.cursor.execute(sql, sha1s) + if filter == "no-filter" or sha1s: + rel_table = relation.value + src_table, *_, dst_table = rel_table.split("_") + + sql = "SELECT * FROM swh_provenance_relation_get(%s, %s, %s, %s, %s)" + self.cursor.execute(sql, (rel_table, src_table, dst_table, filter, sha1s)) result.update(RelationData(**row) for row in self.cursor.fetchall()) return result - def _relation_uses_location_table(self, relation: RelationType) -> bool: - if self.with_path(): - src = relation.value.split("_")[0] - return src in ("content", "directory") - return False - def with_path(self) -> bool: return "with-path" in self.flavor diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql --- a/swh/provenance/sql/30-schema.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,5 +1,6 @@ -- psql variables to get the current database flavor select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset +select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset create table dbversion ( @@ -23,6 +24,24 @@ -- UNIX path (absolute, relative, individual path component, etc.) create domain unix_path as bytea; +-- a relation destination ID (used for denormalized flavors: with-path vs. without-path) +\if :dbflavor_with_path +create type rel_dst as (id bigint, loc bigint); +\else +create domain rel_dst as bigint; +\endif + +-- relation filter options for querying +create type rel_flt as enum ( + 'filter-src', + 'filter-dst', + 'no-filter' +); +comment on type rel_flt is 'Relation get filter types'; + +-- a relation entry row, i.e. sr/dst Git object ID and optional UNIX path +create type rel_row as (src sha1_git, dst sha1_git, path unix_path); + -- entity tables create table content ( @@ -50,7 +69,7 @@ sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz, -- timestamp of the revision origin bigint -- id of the preferred origin - -- foreign key (org) references origin (id) + -- foreign key (origin) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; @@ -83,16 +102,19 @@ revision bigint not null, -- internal identifier of the revision where the blob appears for the first time location bigint -- location of the content relative to the revision root directory \else - revision bigint[], -- internal identifier of the revision where the blob appears for the first time - location bigint[] -- location of the content relative to the revision root directory + revision rel_dst[] -- internal reference of the revision (and location) where the blob appears for the first time \endif - -- foreign key (blob) references content (id), - -- foreign key (rev) references revision (id), - -- foreign key (loc) references location (id) + -- foreign key (content) references content (id), + -- foreign key (revision) references revision (id), + -- foreign key (location) references location (id) ); comment on column content_in_revision.content is 'Content internal identifier'; +\if :dbflavor_norm comment on column content_in_revision.revision is 'Revision internal identifier'; comment on column content_in_revision.location is 'Location of content in revision'; +\else +comment on column content_in_revision.revision is 'Revision/location internal identifiers'; +\endif create table content_in_directory ( @@ -101,16 +123,19 @@ directory bigint not null, -- internal identifier of the directory containing the blob location bigint -- location of the content relative to its parent directory in the isochrone frontier \else - directory bigint[], - location bigint[] + directory rel_dst[] -- internal reference of the directory (and location) containing the blob \endif - -- foreign key (blob) references content (id), - -- foreign key (dir) references directory (id), - -- foreign key (loc) references location (id) + -- foreign key (content) references content (id), + -- foreign key (directory) references directory (id), + -- foreign key (location) references location (id) ); comment on column content_in_directory.content is 'Content internal identifier'; +\if :dbflavor_norm comment on column content_in_directory.directory is 'Directory internal identifier'; comment on column content_in_directory.location is 'Location of content in directory'; +\else +comment on column content_in_directory.directory is 'Directory/location internal identifiers'; +\endif create table directory_in_revision ( @@ -119,23 +144,26 @@ revision bigint not null, -- internal identifier of the revision containing the directory location bigint -- location of the directory relative to the revision root directory \else - revision bigint[], - location bigint[] + revision rel_dst[] -- internal reference of the revision (and location) containing the directory \endif - -- foreign key (dir) references directory (id), - -- foreign key (rev) references revision (id), - -- foreign key (loc) references location (id) + -- foreign key (directory) references directory (id), + -- foreign key (revision) references revision (id), + -- foreign key (location) references location (id) ); comment on column directory_in_revision.directory is 'Directory internal identifier'; +\if :dbflavor_norm comment on column directory_in_revision.revision is 'Revision internal identifier'; -comment on column directory_in_revision.location is 'Location of directory in revision'; +comment on column directory_in_revision.location is 'Location of content in revision'; +\else +comment on column directory_in_revision.revision is 'Revision/location internal identifiers'; +\endif create table revision_in_origin ( revision bigint not null, -- internal identifier of the revision poined by the origin origin bigint not null -- internal identifier of the origin that points to the revision - -- foreign key (rev) references revision (id), - -- foreign key (org) references origin (id) + -- foreign key (revision) references revision (id), + -- foreign key (origin) references origin (id) ); comment on column revision_in_origin.revision is 'Revision internal identifier'; comment on column revision_in_origin.origin is 'Origin internal identifier'; diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql --- a/swh/provenance/sql/40-funcs.sql +++ b/swh/provenance/sql/40-funcs.sql @@ -1,8 +1,7 @@ +-- psql variables to get the current database flavor select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset select position('with-path' in swh_get_dbflavor()::text) != 0 as dbflavor_with_path \gset -create type relation_row as (src sha1_git, dst sha1_git, loc unix_path); - \if :dbflavor_norm \if :dbflavor_with_path @@ -46,36 +45,130 @@ language sql stable as $$ - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - L.path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join location as L on (L.id = CR.location) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - union - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - case DL.path - when '' then CL.path - when '.' then CL.path - else (DL.path || '/' || CL.path)::unix_path - end as path - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join revision as R on (R.id = DR.revision) - inner join location as CL on (CL.id = CD.location) - inner join location as DL on (DL.id = DR.location) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - order by date, revision, origin, path limit early_cut + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join location as L on (L.id = CR.location) + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + union + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + case DL.path + when '' then CL.path + when '.' then CL.path + else (DL.path || '/' || CL.path)::unix_path + end as path + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + inner join directory_in_revision as DR on (DR.directory = CD.directory) + inner join revision as R on (R.id = DR.revision) + inner join location as CL on (CL.id = CD.location) + inner join location as DL on (DL.id = DR.location) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_add( + rel_table regclass, src_table regclass, dst_table regclass, rel_data rel_row[] +) + returns void + language plpgsql + volatile +as $$ + declare + select_fields text; + join_location text; + begin + if src_table in ('content'::regclass, 'directory'::regclass) then + lock table only location; + insert into location(path) + select V.path + from unnest(rel_data) as V + on conflict (path) do nothing; + + select_fields := 'D.id, L.id'; + join_location := 'inner join location as L on (L.path = V.path)'; + else + select_fields := 'D.id'; + join_location := ''; + end if; + + execute format( + 'lock table only %s; + insert into %s + select S.id, ' || select_fields || ' + from unnest($1) as V + inner join %s as S on (S.sha1 = V.src) + inner join %s as D on (D.sha1 = V.dst) + ' || join_location || ' + on conflict do nothing', + rel_table, rel_table, src_table, dst_table + ) using rel_data; + end; +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path + ) + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + join_location text; + proj_location text; + filter_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + join_location := 'inner join location as L on (L.id = R.location)'; + proj_location := 'L.path'; + else + join_location := ''; + proj_location := 'NULL::unix_path'; + end if; + + case filter + when 'filter-src'::rel_flt then + filter_result := 'where S.sha1 = any($1)'; + when 'filter-dst'::rel_flt then + filter_result := 'where D.sha1 = any($1)'; + else + filter_result := ''; + end case; + + return query execute format( + 'select S.sha1 as src, D.sha1 as dst, ' || proj_location || ' as path + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + inner join %s as D on (D.id = R.' || dst_field || ') + ' || join_location || ' + ' || filter_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \else @@ -93,7 +186,31 @@ language sql stable as $$ - select C.sha1 as content, + select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id + order by date, revision, origin asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin text, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, @@ -102,44 +219,85 @@ inner join content_in_revision as CR on (CR.content = C.id) inner join revision as R on (R.id = CR.revision) left join origin as O on (O.id = R.origin) - where C.sha1 = content_id - order by date, revision, origin asc limit 1 + where C.sha1 = content_id) + union + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + inner join directory_in_revision as DR on (DR.directory = CD.directory) + inner join revision as R on (R.id = DR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + order by date, revision, origin, path limit early_cut $$; -create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) +create or replace function swh_provenance_relation_add( + rel_table regclass, src_table regclass, dst_table regclass, rel_data rel_row[] +) + returns void + language plpgsql + volatile +as $$ + begin + execute format( + 'lock table only %s; + insert into %s + select S.id, D.id + from unnest($1) as V + inner join %s as S on (S.sha1 = V.src) + inner join %s as D on (D.sha1 = V.dst) + on conflict do nothing', + rel_table, rel_table, src_table, dst_table + ) using rel_data; + end; +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] +) returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, + src sha1_git, + dst sha1_git, path unix_path ) - language sql + language plpgsql stable as $$ - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - union - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join revision as R on (R.id = DR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - order by date, revision, origin, path limit early_cut + declare + src_field text; + dst_field text; + filter_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + case filter + when 'filter-src'::rel_flt then + filter_result := 'where S.sha1 = any($1)'; + when 'filter-dst'::rel_flt then + filter_result := 'where D.sha1 = any($1)'; + else + filter_result := ''; + end case; + + return query execute format( + 'select S.sha1 as src, D.sha1 as dst, NULL::unix_path as path + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + inner join %s as D on (D.id = R.' || dst_field || ') + ' || filter_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; -- :dbflavor_with_path @@ -171,14 +329,13 @@ L.path as path from ( select C.sha1 as sha1, - unnest(CR.revision) as revision, - unnest(CR.location) as location + unnest(CR.revision) as revision from content_in_revision as CR inner join content as C on (C.id = CR.content) where C.sha1 = content_id ) as CL - inner join revision as R on (R.id = CL.revision) - inner join location as L on (L.id = CL.location) + inner join revision as R on (R.id = (CL.revision).id) + inner join location as L on (L.id = (CL.revision).loc) left join origin as O on (O.id = R.origin) order by date, revision, origin, path asc limit 1 $$; @@ -194,57 +351,172 @@ language sql stable as $$ - (with cntrev as ( + (with + cntrev as ( select C.sha1 as sha1, - unnest(CR.revision) as revision, - unnest(CR.location) as location + unnest(CR.revision) as revision from content_in_revision as CR inner join content as C on (C.id = CR.content) - where C.sha1 = content_id - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - L.path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - inner join location as L on (L.id = CR.location) - left join origin as O on (O.id = R.origin) - ) - union - (with cntdir as ( + where C.sha1 = content_id) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from cntrev as CR + inner join revision as R on (R.id = (CR.revision).id) + inner join location as L on (L.id = (CR.revision).loc) + left join origin as O on (O.id = R.origin)) + union + (with + cntdir as ( select C.sha1 as sha1, - unnest(CD.directory) as directory, - unnest(CD.location) as location + unnest(CD.directory) as directory from content as C inner join content_in_directory as CD on (CD.content = C.id) - where C.sha1 = content_id - ), - cntrev as ( - select CD.sha1 as sha1, - L.path as path, - unnest(DR.revision) as revision, - unnest(DR.location) as prefix - from cntdir as CD - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join location as L on (L.id = CD.location) - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - case DL.path - when '' then CR.path - when '.' then CR.path - else (DL.path || '/' || CR.path)::unix_path - end as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - inner join location as DL on (DL.id = CR.prefix) - left join origin as O on (O.id = R.origin) - ) - order by date, revision, origin, path limit early_cut + where C.sha1 = content_id), + cntrev as ( + select CD.sha1 as sha1, + L.path as path, + unnest(DR.revision) as revision + from cntdir as CD + inner join directory_in_revision as DR on (DR.directory = (CD.directory).id) + inner join location as L on (L.id = (CD.directory).loc)) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + case DL.path + when '' then CR.path + when '.' then CR.path + else (DL.path || '/' || CR.path)::unix_path + end as path + from cntrev as CR + inner join revision as R on (R.id = (CR.revision).id) + inner join location as DL on (DL.id = (CR.revision).loc) + left join origin as O on (O.id = R.origin)) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_add( + rel_table regclass, src_table regclass, dst_table regclass, rel_data rel_row[] +) + returns void + language plpgsql + volatile +as $$ + declare + select_fields text; + join_location text; + group_entries text; + on_conflict text; + begin + if src_table in ('content'::regclass, 'directory'::regclass) then + lock table only location; + insert into location(path) + select V.path + from unnest(rel_data) as V + on conflict (path) do nothing; + + select_fields := 'array_agg((D.id, L.id)::rel_dst)'; + join_location := 'inner join location as L on (L.path = V.path)'; + group_entries := 'group by S.id'; + on_conflict := format(' + (%s) do update + set %s=array( + select distinct unnest( + %s.' || dst_table::text || ' || excluded.' || dst_table::text || ' + ) + )', + src_table, dst_table, rel_table, rel_table, rel_table + ); + else + select_fields := 'D.id'; + join_location := ''; + group_entries := ''; + on_conflict := 'do nothing'; + end if; + + execute format( + 'lock table only %s; + insert into %s + select S.id, ' || select_fields || ' + from unnest($1) as V + inner join %s as S on (S.sha1 = V.src) + inner join %s as D on (D.sha1 = V.dst) + ' || join_location || ' + ' || group_entries || ' + on conflict ' || on_conflict, + rel_table, rel_table, src_table, dst_table + ) using rel_data; + end; +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path + ) + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + proj_dst_id text; + proj_unnested text; + proj_location text; + join_location text; + filter_inner_result text; + filter_outer_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + proj_unnested := 'unnest(R.' || dst_field || ') as dst'; + proj_dst_id := '(CL.dst).id'; + join_location := 'inner join location as L on (L.id = (CL.dst).loc)'; + proj_location := 'L.path'; + else + proj_unnested := 'R.' || dst_field || ' as dst'; + proj_dst_id := 'CL.dst'; + join_location := ''; + proj_location := 'NULL::unix_path'; + end if; + + case filter + when 'filter-src'::rel_flt then + filter_inner_result := 'where S.sha1 = any($1)'; + filter_outer_result := ''; + when 'filter-dst'::rel_flt then + filter_inner_result := ''; + filter_outer_result := 'where D.sha1 = any($1)'; + else + filter_inner_result := ''; + filter_outer_result := ''; + end case; + + return query execute format( + 'select CL.src, D.sha1 as dst, ' || proj_location || ' as path + from (select S.sha1 as src, ' || proj_unnested || ' + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + ' || filter_inner_result || ') as CL + inner join %s as D on (D.id = ' || proj_dst_id || ') + ' || join_location || ' + ' || filter_outer_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \else @@ -268,17 +540,16 @@ O.url as origin, '\x'::unix_path as path from ( - select C.sha1, unnest(revision) as revision - from content_in_revision as CR - inner join content as C on (C.id = CR.content) - where C.sha1=content_id + select C.sha1, unnest(revision) as revision + from content_in_revision as CR + inner join content as C on (C.id = CR.content) + where C.sha1=content_id ) as CL inner join revision as R on (R.id = CL.revision) left join origin as O on (O.id = R.origin) order by date, revision, origin, path asc limit 1 $$; - create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) returns table ( content sha1_git, @@ -290,46 +561,144 @@ language sql stable as $$ - (with cntrev as ( + (with + cntrev as ( select C.sha1 as sha1, unnest(CR.revision) as revision from content_in_revision as CR inner join content as C on (C.id = CR.content) - where C.sha1 = content_id - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin)) + union + (with + cntdir as ( + select C.sha1 as sha1, + unnest(CD.directory) as directory + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + where C.sha1 = content_id), + cntrev as ( + select CD.sha1 as sha1, + unnest(DR.revision) as revision + from cntdir as CD + inner join directory_in_revision as DR on (DR.directory = CD.directory)) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin)) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_add( + rel_table regclass, src_table regclass, dst_table regclass, rel_data rel_row[] +) + returns void + language plpgsql + volatile +as $$ + declare + select_fields text; + group_entries text; + on_conflict text; + begin + if src_table in ('content'::regclass, 'directory'::regclass) then + select_fields := 'array_agg(D.id)'; + group_entries := 'group by S.id'; + on_conflict := format(' + (%s) do update + set %s=array( + select distinct unnest( + %s.' || dst_table::text || ' || excluded.' || dst_table::text || ' + ) + )', + src_table, dst_table, rel_table, rel_table + ); + else + select_fields := 'D.id'; + group_entries := ''; + on_conflict := 'do nothing'; + end if; + + execute format( + 'lock table only %s; + insert into %s + select S.id, ' || select_fields || ' + from unnest($1) as V + inner join %s as S on (S.sha1 = V.src) + inner join %s as D on (D.sha1 = V.dst) + ' || group_entries || ' + on conflict ' || on_conflict, + rel_table, rel_table, src_table, dst_table + ) using rel_data; + end; +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path ) - union - (with cntdir as ( - select C.sha1 as sha1, - unnest(CD.directory) as directory - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - where C.sha1 = content_id - ), - cntrev as ( - select CD.sha1 as sha1, - unnest(DR.revision) as revision - from cntdir as CD - inner join directory_in_revision as DR on (DR.directory = CD.directory) - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - ) - order by date, revision, origin, path limit early_cut + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + proj_unnested text; + filter_inner_result text; + filter_outer_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + proj_unnested := 'unnest(R.' || dst_field || ') as dst'; + else + proj_unnested := 'R.' || dst_field || ' as dst'; + end if; + + case filter + when 'filter-src'::rel_flt then + filter_inner_result := 'where S.sha1 = any($1)'; + filter_outer_result := ''; + when 'filter-dst'::rel_flt then + filter_inner_result := ''; + filter_outer_result := 'where D.sha1 = any($1)'; + else + filter_inner_result := ''; + filter_outer_result := ''; + end case; + + return query execute format( + 'select CL.src, D.sha1 as dst, NULL::unix_path as path + from (select S.sha1 as src, ' || proj_unnested || ' + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + ' || filter_inner_result || ') as CL + inner join %s as D on (D.id = CL.dst) + ' || filter_outer_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \endif