diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -6,7 +6,7 @@ from datetime import datetime import itertools import logging -from typing import Dict, Generator, Iterable, Optional, Set, Tuple +from typing import Dict, Generator, Iterable, List, Optional, Set import psycopg2.extensions import psycopg2.extras @@ -304,63 +304,20 @@ ) -> Set[RelationData]: result: Set[RelationData] = set() - sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]] + sha1s: List[Sha1Git] if ids is not None: - sha1s = (tuple(ids),) - where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s" + sha1s = list(ids) + filter = 1 if not reverse else 2 else: - sha1s = None - where = "" - - aggreg_dst = self.denormalized and relation in ( - RelationType.CNT_EARLY_IN_REV, - RelationType.CNT_IN_DIR, - RelationType.DIR_IN_REV, - ) - if sha1s is None or sha1s[0]: - table = relation.value - src, *_, dst = table.split("_") - - # TODO: improve this! - if src == "revision" and dst == "revision": - src_field = "prev" - dst_field = "next" - else: - src_field = src - dst_field = dst - - if aggreg_dst: - revloc = f"UNNEST(R.{dst_field}) AS dst" - if self._relation_uses_location_table(relation): - revloc += ", UNNEST(R.location) AS path" - else: - revloc = f"R.{dst_field} AS dst" - if self._relation_uses_location_table(relation): - revloc += ", R.location AS path" - - inner_sql = f""" - SELECT S.sha1 AS src, {revloc} - FROM {table} AS R - INNER JOIN {src} AS S ON (S.id=R.{src_field}) - """ - if where != "" and not reverse: - inner_sql += where - - if self._relation_uses_location_table(relation): - loc = "L.path AS path" - else: - loc = "NULL AS path" - sql = f""" - SELECT CL.src, D.sha1 AS dst, {loc} - FROM ({inner_sql}) AS CL - INNER JOIN {dst} AS D ON (D.id=CL.dst) - """ - if self._relation_uses_location_table(relation): - sql += "INNER JOIN location AS L ON (L.id=CL.path)" - if where != "" and reverse: - sql += where + sha1s = [] + filter = 0 - self.cursor.execute(sql, sha1s) + if filter == 0 or sha1s: + rel_table = relation.value + src_table, *_, dst_table = rel_table.split("_") + + sql = "SELECT * FROM swh_provenance_relation_get(%s, %s, %s, %s, %s)" + self.cursor.execute(sql, (rel_table, src_table, dst_table, filter, sha1s)) result.update(RelationData(**row) for row in self.cursor.fetchall()) return result diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql --- a/swh/provenance/sql/40-funcs.sql +++ b/swh/provenance/sql/40-funcs.sql @@ -1,8 +1,6 @@ select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset select position('with-path' in swh_get_dbflavor()::text) != 0 as dbflavor_with_path \gset -create type relation_row as (src sha1_git, dst sha1_git, loc unix_path); - \if :dbflavor_norm \if :dbflavor_with_path @@ -46,36 +44,91 @@ language sql stable as $$ - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - L.path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join location as L on (L.id = CR.location) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - union - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - case DL.path - when '' then CL.path - when '.' then CL.path - else (DL.path || '/' || CL.path)::unix_path - end as path - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join revision as R on (R.id = DR.revision) - inner join location as CL on (CL.id = CD.location) - inner join location as DL on (DL.id = DR.location) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - order by date, revision, origin, path limit early_cut + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join location as L on (L.id = CR.location) + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + union + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + case DL.path + when '' then CL.path + when '.' then CL.path + else (DL.path || '/' || CL.path)::unix_path + end as path + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + inner join directory_in_revision as DR on (DR.directory = CD.directory) + inner join revision as R on (R.id = DR.revision) + inner join location as CL on (CL.id = CD.location) + inner join location as DL on (DL.id = DR.location) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter integer, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path + ) + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + join_location text; + proj_location text; + filter_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + join_location := 'inner join location as L on (L.id = R.location)'; + proj_location := 'L.path'; + else + join_location := ''; + proj_location := 'NULL::unix_path'; + end if; + + case filter + when 1 then + filter_result := 'where S.sha1 = any($1)'; + when 2 then + filter_result := 'where D.sha1 = any($1)'; + else + filter_result := ''; + end case; + + return query execute format( + 'select S.sha1 as src, D.sha1 as dst, ' || proj_location || ' as path + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + inner join %s as D on (D.id = R.' || dst_field || ') + ' || join_location || ' + ' || filter_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \else @@ -93,7 +146,31 @@ language sql stable as $$ - select C.sha1 as content, + select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id + order by date, revision, origin asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin text, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, @@ -102,44 +179,64 @@ inner join content_in_revision as CR on (CR.content = C.id) inner join revision as R on (R.id = CR.revision) left join origin as O on (O.id = R.origin) - where C.sha1 = content_id - order by date, revision, origin asc limit 1 + where C.sha1 = content_id) + union + (select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + inner join directory_in_revision as DR on (DR.directory = CD.directory) + inner join revision as R on (R.id = DR.revision) + left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + order by date, revision, origin, path limit early_cut $$; -create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter integer, sha1s sha1_git[] +) returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, + src sha1_git, + dst sha1_git, path unix_path ) - language sql + language plpgsql stable as $$ - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - union - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join revision as R on (R.id = DR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - order by date, revision, origin, path limit early_cut + declare + src_field text; + dst_field text; + filter_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + case filter + when 1 then + filter_result := 'where S.sha1 = any($1)'; + when 2 then + filter_result := 'where D.sha1 = any($1)'; + else + filter_result := ''; + end case; + + return query execute format( + 'select S.sha1 as src, D.sha1 as dst, NULL::unix_path as path + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + inner join %s as D on (D.id = R.' || dst_field || ') + ' || filter_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; -- :dbflavor_with_path @@ -194,57 +291,118 @@ language sql stable as $$ - (with cntrev as ( + (with + cntrev as ( select C.sha1 as sha1, unnest(CR.revision) as revision, unnest(CR.location) as location from content_in_revision as CR inner join content as C on (C.id = CR.content) - where C.sha1 = content_id - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - L.path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - inner join location as L on (L.id = CR.location) - left join origin as O on (O.id = R.origin) - ) - union - (with cntdir as ( + where C.sha1 = content_id) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + inner join location as L on (L.id = CR.location) + left join origin as O on (O.id = R.origin)) + union + (with + cntdir as ( select C.sha1 as sha1, unnest(CD.directory) as directory, unnest(CD.location) as location from content as C inner join content_in_directory as CD on (CD.content = C.id) - where C.sha1 = content_id - ), - cntrev as ( - select CD.sha1 as sha1, - L.path as path, - unnest(DR.revision) as revision, - unnest(DR.location) as prefix - from cntdir as CD - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join location as L on (L.id = CD.location) - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - case DL.path - when '' then CR.path - when '.' then CR.path - else (DL.path || '/' || CR.path)::unix_path - end as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - inner join location as DL on (DL.id = CR.prefix) - left join origin as O on (O.id = R.origin) - ) - order by date, revision, origin, path limit early_cut + where C.sha1 = content_id), + cntrev as ( + select CD.sha1 as sha1, + L.path as path, + unnest(DR.revision) as revision, + unnest(DR.location) as prefix + from cntdir as CD + inner join directory_in_revision as DR on (DR.directory = CD.directory) + inner join location as L on (L.id = CD.location)) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + case DL.path + when '' then CR.path + when '.' then CR.path + else (DL.path || '/' || CR.path)::unix_path + end as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + inner join location as DL on (DL.id = CR.prefix) + left join origin as O on (O.id = R.origin)) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter integer, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path + ) + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + proj_unnested text; + proj_location text; + join_location text; + filter_inner_result text; + filter_outer_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + proj_unnested := 'unnest(R.' || dst_field || ') as dst, unnest(R.location) as loc'; + join_location := 'inner join location as L on (L.id = CL.loc)'; + proj_location := 'L.path'; + else + proj_unnested := 'R.' || dst_field || ' as dst'; + join_location := ''; + proj_location := 'NULL::unix_path'; + end if; + + case filter + when 1 then + filter_inner_result := 'where S.sha1 = any($1)'; + filter_outer_result := ''; + when 2 then + filter_inner_result := ''; + filter_outer_result := 'where D.sha1 = any($1)'; + else + filter_inner_result := ''; + filter_outer_result := ''; + end case; + + return query execute format( + 'select CL.src, D.sha1 as dst, ' || proj_location || ' as path + from (select S.sha1 as src, ' || proj_unnested || ' + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + ' || filter_inner_result || ') as CL + inner join %s as D on (D.id = CL.dst) + ' || join_location || ' + ' || filter_outer_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \else @@ -268,17 +426,16 @@ O.url as origin, '\x'::unix_path as path from ( - select C.sha1, unnest(revision) as revision - from content_in_revision as CR - inner join content as C on (C.id = CR.content) - where C.sha1=content_id + select C.sha1, unnest(revision) as revision + from content_in_revision as CR + inner join content as C on (C.id = CR.content) + where C.sha1=content_id ) as CL inner join revision as R on (R.id = CL.revision) left join origin as O on (O.id = R.origin) order by date, revision, origin, path asc limit 1 $$; - create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) returns table ( content sha1_git, @@ -290,46 +447,102 @@ language sql stable as $$ - (with cntrev as ( + (with + cntrev as ( select C.sha1 as sha1, unnest(CR.revision) as revision from content_in_revision as CR inner join content as C on (C.id = CR.content) - where C.sha1 = content_id - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) + where C.sha1 = content_id) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin)) + union + (with + cntdir as ( + select C.sha1 as sha1, + unnest(CD.directory) as directory + from content as C + inner join content_in_directory as CD on (CD.content = C.id) + where C.sha1 = content_id), + cntrev as ( + select CD.sha1 as sha1, + unnest(DR.revision) as revision + from cntdir as CD + inner join directory_in_revision as DR on (DR.directory = CD.directory)) + select CR.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from cntrev as CR + inner join revision as R on (R.id = CR.revision) + left join origin as O on (O.id = R.origin)) + order by date, revision, origin, path limit early_cut +$$; + +create or replace function swh_provenance_relation_get( + rel_table regclass, src_table regclass, dst_table regclass, filter integer, sha1s sha1_git[] +) + returns table ( + src sha1_git, + dst sha1_git, + path unix_path ) - union - (with cntdir as ( - select C.sha1 as sha1, - unnest(CD.directory) as directory - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - where C.sha1 = content_id - ), - cntrev as ( - select CD.sha1 as sha1, - unnest(DR.revision) as revision - from cntdir as CD - inner join directory_in_revision as DR on (DR.directory = CD.directory) - ) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - ) - order by date, revision, origin, path limit early_cut + language plpgsql + stable +as $$ + declare + src_field text; + dst_field text; + proj_unnested text; + proj_location text; + join_location text; + filter_inner_result text; + filter_outer_result text; + begin + if rel_table = 'revision_before_revision'::regclass then + src_field := 'prev'; + dst_field := 'next'; + else + src_field := src_table::text; + dst_field := dst_table::text; + end if; + + if src_table in ('content'::regclass, 'directory'::regclass) then + proj_unnested := 'unnest(R.' || dst_field || ') as dst'; + else + proj_unnested := 'R.' || dst_field || ' as dst'; + end if; + + case filter + when 1 then + filter_inner_result := 'where S.sha1 = any($1)'; + filter_outer_result := ''; + when 2 then + filter_inner_result := ''; + filter_outer_result := 'where D.sha1 = any($1)'; + else + filter_inner_result := ''; + filter_outer_result := ''; + end case; + + return query execute format( + 'select CL.src, D.sha1 as dst, NULL::unix_path as path + from (select S.sha1 as src, ' || proj_unnested || ' + from %s as R + inner join %s as S on (S.id = R.' || src_field || ') + ' || filter_inner_result || ') as CL + inner join %s as D on (D.id = CL.dst) + ' || filter_outer_result, + rel_table, src_table, dst_table + ) using sha1s; + end; $$; \endif diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -64,7 +64,7 @@ return RemoteProvenanceStorage -@pytest.fixture(params=["mongodb"]) +@pytest.fixture(params=["mongodb", "postgresql", "remote"]) def provenance_storage( request: SubRequest, provenance_postgresqldb: Dict[str, str],