diff --git a/swh/provenance/sql/15-flavor.sql b/swh/provenance/sql/15-flavor.sql index 6a75587..e0ced6c 100644 --- a/swh/provenance/sql/15-flavor.sql +++ b/swh/provenance/sql/15-flavor.sql @@ -1,23 +1,21 @@ -- database flavor create type database_flavor as enum ( - 'with-path', - 'without-path', - 'with-path-denormalized', - 'without-path-denormalized' + 'normalized', + 'denormalized' ); comment on type database_flavor is 'Flavor of the current database'; create table dbflavor ( flavor database_flavor, single_row char(1) primary key default 'x', check (single_row = 'x') ); comment on table dbflavor is 'Database flavor storage'; comment on column dbflavor.flavor is 'Database flavor currently deployed'; comment on column dbflavor.single_row is 'Bogus column to force the table to have a single row'; create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$ - select coalesce((select flavor from dbflavor), 'with-path'); + select coalesce((select flavor from dbflavor), 'normalized'); $$; comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed'; diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql index 73551d7..3e3a4fc 100644 --- a/swh/provenance/sql/30-schema.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,157 +1,156 @@ -- psql variables to get the current database flavor select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset -select position('without-path' in swh_get_dbflavor()::text) = 0 as dbflavor_with_path \gset -- a Git object ID, i.e., a Git-style salted SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) create domain unix_path as bytea; -- relation filter options for querying create type rel_flt as enum ( 'filter-src', 'filter-dst', 'no-filter' ); comment on type rel_flt is 'Relation get filter types'; -- entity tables create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob first appears ); comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null, -- max timestamp among those of the directory's children flat boolean not null default false -- flag acknowledging if the directory is flattened in the model ); comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; create table revision ( id bigserial primary key, -- internal identifier of the revision sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz, -- timestamp of the revision origin bigint -- id of the preferred origin -- foreign key (origin) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment on column revision.date is 'Revision
timestamp'; comment on column revision.origin is 'preferred origin for the revision'; create table location ( id bigserial primary key, -- internal identifier of the location path unix_path -- path to the location ); comment on column location.id is 'Location internal identifier'; comment on column location.path is 'Path to the location'; create table origin ( id bigserial primary key, -- internal identifier of the origin sha1 sha1_git unique not null, -- intrinsic identifier of the origin url text -- url of the origin ); comment on column origin.id is 'Origin internal identifier'; comment on column origin.sha1 is 'Origin intrinsic identifier'; comment on column origin.url is 'URL of the origin'; -- relation tables create table content_in_revision ( content bigint not null, -- internal identifier of the content blob \if :dbflavor_norm revision bigint not null, -- internal identifier of the revision where the blob appears for the first time location bigint -- location of the content relative to the revision's root directory \else revision bigint[], -- internal identifiers of the revisions where the blob appears for the first time location bigint[] -- locations of the content relative to the revisions' root directory \endif -- foreign key (content) references content (id), -- foreign key (revision) references revision (id), -- foreign key (location) references location (id) ); comment on column content_in_revision.content is 'Content internal identifier'; \if :dbflavor_norm comment on column content_in_revision.revision is 'Revision internal identifier'; comment on column content_in_revision.location is 'Location of content in revision'; \else comment on column content_in_revision.revision is 'Revision/location internal identifiers'; \endif create table content_in_directory ( content bigint not null, -- internal identifier of the content blob \if :dbflavor_norm directory bigint not null, -- internal identifier of the directory containing the blob location bigint -- location of the content relative to its parent directory in the isochrone frontier \else directory bigint[], -- internal reference of the directories containing the blob location bigint[] -- locations of the content relative to its parent directories in the isochrone frontier \endif -- foreign key (content) references content (id), -- foreign key (directory) references directory (id), -- foreign key (location) references location (id) ); comment on column content_in_directory.content is 'Content internal identifier'; \if :dbflavor_norm comment on column content_in_directory.directory is 'Directory internal identifier'; comment on column content_in_directory.location is 'Location of content in directory'; \else comment on column content_in_directory.directory is 'Directory/location internal identifiers'; \endif create table directory_in_revision ( directory bigint not null, -- internal identifier of the directory appearing in the revision \if :dbflavor_norm revision bigint not null, -- internal identifier of the revision containing the directory location bigint -- location of the directory relative to the revision's root directory \else revision bigint[], -- internal identifiers of the revisions containing the directory location bigint[] -- locations of the directory relative to the revisions' root directory \endif -- foreign key (directory) references directory (id), -- foreign key (revision) references revision (id), -- foreign key (location) references location (id) ); comment on column directory_in_revision.directory is 
'Directory internal identifier'; \if :dbflavor_norm comment on column directory_in_revision.revision is 'Revision internal identifier'; comment on column directory_in_revision.location is 'Location of directory in revision'; \else comment on column directory_in_revision.revision is 'Revision/location internal identifiers'; \endif create table revision_in_origin ( revision bigint not null, -- internal identifier of the revision pointed to by the origin origin bigint not null -- internal identifier of the origin that points to the revision -- foreign key (revision) references revision (id), -- foreign key (origin) references origin (id) ); comment on column revision_in_origin.revision is 'Revision internal identifier'; comment on column revision_in_origin.origin is 'Origin internal identifier'; create table revision_before_revision ( prev bigserial not null, -- internal identifier of the source revision next bigserial not null -- internal identifier of the destination revision -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); comment on column revision_before_revision.prev is 'Source revision internal identifier'; comment on column revision_before_revision.next is 'Destination revision internal identifier'; diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql index 8ef93e9..d23eefa 100644 --- a/swh/provenance/sql/40-funcs.sql +++ b/swh/provenance/sql/40-funcs.sql @@ -1,709 +1,392 @@ -- psql variables to get the current database flavor select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset -select position('with-path' in swh_get_dbflavor()::text) != 0 as dbflavor_with_path \gset create or replace function swh_mktemp_relation_add() returns void language sql as $$ create temp table tmp_relation_add ( src sha1_git not null, dst sha1_git not null, path unix_path ) on commit drop $$; \if :dbflavor_norm -\if :dbflavor_with_path -- --- with path and normalized +-- normalized -- create or replace function swh_provenance_content_find_first(content_id sha1_git) returns table ( content sha1_git, revision sha1_git, date timestamptz, origin text, path unix_path ) language sql stable as $$ select C.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, L.path as path from content as C inner join content_in_revision as CR on (CR.content = C.id) inner join location as L on (L.id = CR.location) inner join revision as R on (R.id = CR.revision) left join origin as O on (O.id = R.origin) where C.sha1 = content_id order by date, revision, origin, path asc limit 1 $$; create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) returns table ( content sha1_git, revision sha1_git, date timestamptz, origin text, path unix_path ) language sql stable as $$ (select C.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, L.path as path from content as C inner join content_in_revision as CR on (CR.content = C.id) inner join location as L on (L.id = CR.location) inner join revision as R on (R.id = CR.revision) left join origin as O on (O.id = R.origin) where C.sha1 = content_id) union (select C.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, case DL.path when '' then CL.path when '.'
then CL.path else (DL.path || '/' || CL.path)::unix_path end as path from content as C inner join content_in_directory as CD on (CD.content = C.id) inner join directory_in_revision as DR on (DR.directory = CD.directory) inner join revision as R on (R.id = DR.revision) inner join location as CL on (CL.id = CD.location) inner join location as DL on (DL.id = DR.location) left join origin as O on (O.id = R.origin) where C.sha1 = content_id) order by date, revision, origin, path limit early_cut $$; create or replace function swh_provenance_relation_add_from_temp( rel_table regclass, src_table regclass, dst_table regclass ) returns void language plpgsql volatile as $$ declare select_fields text; join_location text; begin if src_table in ('content'::regclass, 'directory'::regclass) then select_fields := 'D.id, L.id'; join_location := 'inner join location as L on (digest(L.path,''sha1'') = digest(V.path,''sha1''))'; else select_fields := 'D.id'; join_location := ''; end if; execute format( 'insert into %s select S.id, ' || select_fields || ' from tmp_relation_add as V inner join %s as S on (S.sha1 = V.src) inner join %s as D on (D.sha1 = V.dst) ' || join_location || ' on conflict do nothing', rel_table, src_table, dst_table ); end; $$; create or replace function swh_provenance_relation_get( rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] ) returns table ( src sha1_git, dst sha1_git, path unix_path ) language plpgsql stable as $$ declare src_field text; dst_field text; join_location text; proj_location text; filter_result text; begin if rel_table = 'revision_before_revision'::regclass then src_field := 'prev'; dst_field := 'next'; else src_field := src_table::text; dst_field := dst_table::text; end if; if src_table in ('content'::regclass, 'directory'::regclass) then join_location := 'inner join location as L on (L.id = R.location)'; proj_location := 'L.path'; else join_location := ''; proj_location := 'NULL::unix_path'; end if; case filter when 'filter-src'::rel_flt then filter_result := 'where S.sha1 = any($1)'; when 'filter-dst'::rel_flt then filter_result := 'where D.sha1 = any($1)'; else filter_result := ''; end case; return query execute format( 'select S.sha1 as src, D.sha1 as dst, ' || proj_location || ' as path from %s as R inner join %s as S on (S.id = R.' || src_field || ') inner join %s as D on (D.id = R.' 
|| dst_field || ') ' || join_location || ' ' || filter_result, rel_table, src_table, dst_table ) using sha1s; end; $$; -\else --- --- without path and normalized --- -create or replace function swh_provenance_content_find_first(content_id sha1_git) - returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, - path unix_path - ) - language sql - stable -as $$ - select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id - order by date, revision, origin asc limit 1 -$$; - -create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) - returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, - path unix_path - ) - language sql - stable -as $$ - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_revision as CR on (CR.content = C.id) - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - union - (select C.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - inner join directory_in_revision as DR on (DR.directory = CD.directory) - inner join revision as R on (R.id = DR.revision) - left join origin as O on (O.id = R.origin) - where C.sha1 = content_id) - order by date, revision, origin, path limit early_cut -$$; - -create or replace function swh_provenance_relation_add_from_temp( - rel_table regclass, src_table regclass, dst_table regclass -) - returns void - language plpgsql - volatile -as $$ - begin - execute format( - 'insert into %s - select S.id, D.id - from tmp_relation_add as V - inner join %s as S on (S.sha1 = V.src) - inner join %s as D on (D.sha1 = V.dst) - on conflict do nothing', - rel_table, src_table, dst_table - ); - end; -$$; - -create or replace function swh_provenance_relation_get( - rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] -) - returns table ( - src sha1_git, - dst sha1_git, - path unix_path - ) - language plpgsql - stable -as $$ - declare - src_field text; - dst_field text; - filter_result text; - begin - if rel_table = 'revision_before_revision'::regclass then - src_field := 'prev'; - dst_field := 'next'; - else - src_field := src_table::text; - dst_field := dst_table::text; - end if; - - case filter - when 'filter-src'::rel_flt then - filter_result := 'where S.sha1 = any($1)'; - when 'filter-dst'::rel_flt then - filter_result := 'where D.sha1 = any($1)'; - else - filter_result := ''; - end case; - - return query execute format( - 'select S.sha1 as src, D.sha1 as dst, NULL::unix_path as path - from %s as R - inner join %s as S on (S.id = R.' || src_field || ') - inner join %s as D on (D.id = R.' 
|| dst_field || ') - ' || filter_result, - rel_table, src_table, dst_table - ) using sha1s; - end; -$$; - --- :dbflavor_with_path -\endif - -- :dbflavor_norm \else -\if :dbflavor_with_path -- --- with path and denormalized +-- denormalized -- create or replace function swh_provenance_content_find_first(content_id sha1_git) returns table ( content sha1_git, revision sha1_git, date timestamptz, origin text, path unix_path ) language sql stable as $$ select CL.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, L.path as path from ( select C.sha1 as sha1, unnest(CR.revision) as revision, unnest(CR.location) as location from content_in_revision as CR inner join content as C on (C.id = CR.content) where C.sha1 = content_id ) as CL inner join revision as R on (R.id = CL.revision) inner join location as L on (L.id = CL.location) left join origin as O on (O.id = R.origin) order by date, revision, origin, path asc limit 1 $$; create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) returns table ( content sha1_git, revision sha1_git, date timestamptz, origin text, path unix_path ) language sql stable as $$ (with cntrev as ( select C.sha1 as sha1, unnest(CR.revision) as revision, unnest(CR.location) as location from content_in_revision as CR inner join content as C on (C.id = CR.content) where C.sha1 = content_id) select CR.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, L.path as path from cntrev as CR inner join revision as R on (R.id = CR.revision) inner join location as L on (L.id = CR.location) left join origin as O on (O.id = R.origin)) union (with cntdir as ( select C.sha1 as sha1, unnest(CD.directory) as directory, unnest(CD.location) as location from content as C inner join content_in_directory as CD on (CD.content = C.id) where C.sha1 = content_id), cntrev as ( select CD.sha1 as sha1, L.path as path, unnest(DR.revision) as revision, unnest(DR.location) as location from cntdir as CD inner join directory_in_revision as DR on (DR.directory = CD.directory) inner join location as L on (L.id = CD.location)) select CR.sha1 as content, R.sha1 as revision, R.date as date, O.url as origin, case DL.path when '' then CR.path when '.' then CR.path else (DL.path || '/' || CR.path)::unix_path end as path from cntrev as CR inner join revision as R on (R.id = CR.revision) inner join location as DL on (DL.id = CR.location) left join origin as O on (O.id = R.origin)) order by date, revision, origin, path limit early_cut $$; create or replace function swh_provenance_relation_add_from_temp( rel_table regclass, src_table regclass, dst_table regclass ) returns void language plpgsql volatile as $$ declare select_fields text; join_location text; group_entries text; on_conflict text; begin if src_table in ('content'::regclass, 'directory'::regclass) then select_fields := 'array_agg(D.id), array_agg(L.id)'; join_location := 'inner join location as L on (digest(L.path,''sha1'') = digest(V.path,''sha1''))'; group_entries := 'group by S.id'; on_conflict := format(' (%s) do update set (%s, location) = ( with pairs as ( select distinct * from unnest( %s.' || dst_table::text || ' || excluded.' 
|| dst_table::text || ', %s.location || excluded.location ) as pair(dst, loc) ) select array(select pairs.dst from pairs), array(select pairs.loc from pairs) )', src_table, dst_table, rel_table, rel_table, rel_table, rel_table ); else select_fields := 'D.id'; join_location := ''; group_entries := ''; on_conflict := 'do nothing'; end if; execute format( 'insert into %s select S.id, ' || select_fields || ' from tmp_relation_add as V inner join %s as S on (S.sha1 = V.src) inner join %s as D on (D.sha1 = V.dst) ' || join_location || ' ' || group_entries || ' on conflict ' || on_conflict, rel_table, src_table, dst_table ); end; $$; create or replace function swh_provenance_relation_get( rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] ) returns table ( src sha1_git, dst sha1_git, path unix_path ) language plpgsql stable as $$ declare src_field text; dst_field text; proj_dst_id text; proj_unnested text; proj_location text; join_location text; filter_inner_result text; filter_outer_result text; begin if rel_table = 'revision_before_revision'::regclass then src_field := 'prev'; dst_field := 'next'; else src_field := src_table::text; dst_field := dst_table::text; end if; if src_table in ('content'::regclass, 'directory'::regclass) then proj_unnested := 'unnest(R.' || dst_field || ') as dst, unnest(R.location) as loc'; proj_dst_id := 'CL.dst'; join_location := 'inner join location as L on (L.id = CL.loc)'; proj_location := 'L.path'; else proj_unnested := 'R.' || dst_field || ' as dst'; proj_dst_id := 'CL.dst'; join_location := ''; proj_location := 'NULL::unix_path'; end if; case filter when 'filter-src'::rel_flt then filter_inner_result := 'where S.sha1 = any($1)'; filter_outer_result := ''; when 'filter-dst'::rel_flt then filter_inner_result := ''; filter_outer_result := 'where D.sha1 = any($1)'; else filter_inner_result := ''; filter_outer_result := ''; end case; return query execute format( 'select CL.src, D.sha1 as dst, ' || proj_location || ' as path from (select S.sha1 as src, ' || proj_unnested || ' from %s as R inner join %s as S on (S.id = R.' 
|| src_field || ') ' || filter_inner_result || ') as CL inner join %s as D on (D.id = ' || proj_dst_id || ') ' || join_location || ' ' || filter_outer_result, rel_table, src_table, dst_table ) using sha1s; end; $$; -\else --- --- without path and denormalized --- -create or replace function swh_provenance_content_find_first(content_id sha1_git) - returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, - path unix_path - ) - language sql - stable -as $$ - select CL.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from ( - select C.sha1, unnest(revision) as revision - from content_in_revision as CR - inner join content as C on (C.id = CR.content) - where C.sha1=content_id - ) as CL - inner join revision as R on (R.id = CL.revision) - left join origin as O on (O.id = R.origin) - order by date, revision, origin, path asc limit 1 -$$; - -create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) - returns table ( - content sha1_git, - revision sha1_git, - date timestamptz, - origin text, - path unix_path - ) - language sql - stable -as $$ - (with - cntrev as ( - select C.sha1 as sha1, - unnest(CR.revision) as revision - from content_in_revision as CR - inner join content as C on (C.id = CR.content) - where C.sha1 = content_id) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin)) - union - (with - cntdir as ( - select C.sha1 as sha1, - unnest(CD.directory) as directory - from content as C - inner join content_in_directory as CD on (CD.content = C.id) - where C.sha1 = content_id), - cntrev as ( - select CD.sha1 as sha1, - unnest(DR.revision) as revision - from cntdir as CD - inner join directory_in_revision as DR on (DR.directory = CD.directory)) - select CR.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url as origin, - '\x'::unix_path as path - from cntrev as CR - inner join revision as R on (R.id = CR.revision) - left join origin as O on (O.id = R.origin)) - order by date, revision, origin, path limit early_cut -$$; - -create or replace function swh_provenance_relation_add_from_temp( - rel_table regclass, src_table regclass, dst_table regclass -) - returns void - language plpgsql - volatile -as $$ - declare - select_fields text; - group_entries text; - on_conflict text; - begin - if src_table in ('content'::regclass, 'directory'::regclass) then - select_fields := 'array_agg(D.id)'; - group_entries := 'group by S.id'; - on_conflict := format(' - (%s) do update - set %s=array( - select distinct unnest( - %s.' || dst_table::text || ' || excluded.' 
|| dst_table::text || ' - ) - )', - src_table, dst_table, rel_table, rel_table - ); - else - select_fields := 'D.id'; - group_entries := ''; - on_conflict := 'do nothing'; - end if; - - execute format( - 'insert into %s - select S.id, ' || select_fields || ' - from tmp_relation_add as V - inner join %s as S on (S.sha1 = V.src) - inner join %s as D on (D.sha1 = V.dst) - ' || group_entries || ' - on conflict ' || on_conflict, - rel_table, src_table, dst_table - ); - end; -$$; - -create or replace function swh_provenance_relation_get( - rel_table regclass, src_table regclass, dst_table regclass, filter rel_flt, sha1s sha1_git[] -) - returns table ( - src sha1_git, - dst sha1_git, - path unix_path - ) - language plpgsql - stable -as $$ - declare - src_field text; - dst_field text; - proj_unnested text; - filter_inner_result text; - filter_outer_result text; - begin - if rel_table = 'revision_before_revision'::regclass then - src_field := 'prev'; - dst_field := 'next'; - else - src_field := src_table::text; - dst_field := dst_table::text; - end if; - - if src_table in ('content'::regclass, 'directory'::regclass) then - proj_unnested := 'unnest(R.' || dst_field || ') as dst'; - else - proj_unnested := 'R.' || dst_field || ' as dst'; - end if; - - case filter - when 'filter-src'::rel_flt then - filter_inner_result := 'where S.sha1 = any($1)'; - filter_outer_result := ''; - when 'filter-dst'::rel_flt then - filter_inner_result := ''; - filter_outer_result := 'where D.sha1 = any($1)'; - else - filter_inner_result := ''; - filter_outer_result := ''; - end case; - - return query execute format( - 'select CL.src, D.sha1 as dst, NULL::unix_path as path - from (select S.sha1 as src, ' || proj_unnested || ' - from %s as R - inner join %s as S on (S.id = R.' || src_field || ') - ' || filter_inner_result || ') as CL - inner join %s as D on (D.id = CL.dst) - ' || filter_outer_result, - rel_table, src_table, dst_table - ) using sha1s; - end; -$$; - -\endif --- :dbflavor_with_path \endif -- :dbflavor_norm diff --git a/swh/provenance/sql/upgrades/004.sql b/swh/provenance/sql/upgrades/004.sql new file mode 100644 index 0000000..bf1b62f --- /dev/null +++ b/swh/provenance/sql/upgrades/004.sql @@ -0,0 +1,26 @@ +-- SWH provenance DB schema upgrade +-- from_version: 3 +-- to_version: 4 +-- description: rename db flavor (without-path having been removed) +-- will fail if the db is using a without-path flavor.
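-- (Aside, not part of the patch: the "will fail" above happens at the
-- "alter column flavor type" cast further down, since a stored 'without-path'
-- or 'without-path-denormalized' value has no counterpart in the new enum and
-- the flavor::text::database_flavor conversion raises an error. A hedged
-- pre-flight check that can be run before applying this upgrade:
--
--   select swh_get_dbflavor();  -- abort unless this reports a with-path flavor
-- )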
+alter type database_flavor rename value 'with-path' to 'normalized'; +alter type database_flavor rename value 'with-path-denormalized' to 'denormalized'; + +alter type database_flavor rename to database_flavor_old; + +create type database_flavor as enum ( + 'normalized', + 'denormalized' +); +comment on type database_flavor is 'Flavor of the current database'; + +drop function swh_get_dbflavor; + +alter table dbflavor + alter column flavor type database_flavor using flavor::text::database_flavor; + +create function swh_get_dbflavor() returns database_flavor language sql stable as $$ + select coalesce((select flavor from dbflavor), 'normalized'); +$$; + +drop type database_flavor_old; diff --git a/swh/provenance/storage/interface.py b/swh/provenance/storage/interface.py index f238adb..023d9e5 100644 --- a/swh/provenance/storage/interface.py +++ b/swh/provenance/storage/interface.py @@ -1,229 +1,225 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from dataclasses import dataclass from datetime import datetime import enum from types import TracebackType from typing import Dict, Generator, Iterable, List, Optional, Set, Type from typing_extensions import Protocol, runtime_checkable from swh.core.api import remote_api_endpoint from swh.model.model import Sha1Git class EntityType(enum.Enum): CONTENT = "content" DIRECTORY = "directory" REVISION = "revision" ORIGIN = "origin" class RelationType(enum.Enum): CNT_EARLY_IN_REV = "content_in_revision" CNT_IN_DIR = "content_in_directory" DIR_IN_REV = "directory_in_revision" REV_IN_ORG = "revision_in_origin" REV_BEFORE_REV = "revision_before_revision" @dataclass(eq=True, frozen=True) class ProvenanceResult: content: Sha1Git revision: Sha1Git date: datetime origin: Optional[str] path: bytes @dataclass(eq=True, frozen=True) class DirectoryData: """Object representing the data associated to a directory in the provenance model, where `date` is the date of the directory in the isochrone frontier, and `flat` is a flag acknowledging that a flat model for the elements outside the frontier has already been created. """ date: datetime flat: bool @dataclass(eq=True, frozen=True) class RevisionData: """Object representing the data associated to a revision in the provenance model, where `date` is the optional date of the revision (specifying it acknowledges that the revision was already processed by the revision-content algorithm); and `origin` identifies the preferred origin for the revision, if any. """ date: Optional[datetime] origin: Optional[Sha1Git] @dataclass(eq=True, frozen=True) class RelationData: """Object representing a relation entry in the provenance model, where `src` and `dst` are the sha1 ids of the entities being related, and `path` is optional depending on the relation being represented. """ dst: Sha1Git path: Optional[bytes] @runtime_checkable class ProvenanceStorageInterface(Protocol): def __enter__(self) -> ProvenanceStorageInterface: ... def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: ... @remote_api_endpoint("close") def close(self) -> None: """Close connection to the storage and release resources.""" ... 
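    # (Aside, not part of the patch: implementations wire open/close to
    # __enter__/__exit__, as the PostgreSQL backend below does, so a typical,
    # hypothetical usage is:
    #
    #     with get_provenance_storage(cls="postgresql", db=dsn_params) as storage:
    #         storage.content_add(cnt_dates)
    #
    # get_provenance_storage is the factory the test fixtures in this diff use.)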
@remote_api_endpoint("content_add") def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool: """Add blobs identified by sha1 ids, with an associated date (as paired in `cnts`) to the provenance storage. Return a boolean stating whether the information was successfully stored. """ ... @remote_api_endpoint("content_find_first") def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: """Retrieve the first occurrence of the blob identified by `id`.""" ... @remote_api_endpoint("content_find_all") def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: """Retrieve all the occurrences of the blob identified by `id`.""" ... @remote_api_endpoint("content_get") def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: """Retrieve the associated date for each blob sha1 in `ids`.""" ... @remote_api_endpoint("directory_add") def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool: """Add directories identified by sha1 ids, with associated date and (optional) flatten flag (as paired in `dirs`) to the provenance storage. If the flatten flag is set to None, the previous value present in the storage is preserved. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("directory_get") def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]: """Retrieve the associated date and (optional) flatten flag for each directory sha1 in `ids`. If some directories has no associated date, it is not present in the resulting dictionary. """ ... @remote_api_endpoint("directory_iter_not_flattened") def directory_iter_not_flattened( self, limit: int, start_id: Sha1Git ) -> List[Sha1Git]: """Retrieve the unflattened directories after ``start_id`` up to ``limit`` entries.""" ... @remote_api_endpoint("entity_get_all") def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: """Retrieve all sha1 ids for entities of type `entity` present in the provenance model. This method is used only in tests. """ ... @remote_api_endpoint("location_add") def location_add(self, paths: Dict[Sha1Git, bytes]) -> bool: """Register the given `paths` in the storage.""" ... @remote_api_endpoint("location_get_all") def location_get_all(self) -> Dict[Sha1Git, bytes]: """Retrieve all paths present in the provenance model. This method is used only in tests.""" ... @remote_api_endpoint("open") def open(self) -> None: """Open connection to the storage and allocate necessary resources.""" ... @remote_api_endpoint("origin_add") def origin_add(self, orgs: Dict[Sha1Git, str]) -> bool: """Add origins identified by sha1 ids, with their corresponding url (as paired in `orgs`) to the provenance storage. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("origin_get") def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: """Retrieve the associated url for each origin sha1 in `ids`.""" ... @remote_api_endpoint("revision_add") def revision_add(self, revs: Dict[Sha1Git, RevisionData]) -> bool: """Add revisions identified by sha1 ids, with optional associated date or origin (as paired in `revs`) to the provenance storage. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("revision_get") def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: """Retrieve the associated date and origin for each revision sha1 in `ids`. 
If some revision has no associated date nor origin, it is not present in the resulting dictionary. """ ... @remote_api_endpoint("relation_add") def relation_add( self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] ) -> bool: """Add entries in the selected `relation`. This method assumes all entities being related are already registered in the storage. See `content_add`, `directory_add`, `origin_add`, and `revision_add`. """ ... @remote_api_endpoint("relation_get") def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Dict[Sha1Git, Set[RelationData]]: """Retrieve all entries in the selected `relation` whose source entities are identified by some sha1 id in `ids`. If `reverse` is set, destination entities are matched instead. """ ... @remote_api_endpoint("relation_get_all") def relation_get_all( self, relation: RelationType ) -> Dict[Sha1Git, Set[RelationData]]: """Retrieve all entries in the selected `relation` that are present in the provenance model. This method is used only in tests. """ ... - - @remote_api_endpoint("with_path") - def with_path(self) -> bool: - ... diff --git a/swh/provenance/storage/journal.py b/swh/provenance/storage/journal.py index b05ba88..8e96bd6 100644 --- a/swh/provenance/storage/journal.py +++ b/swh/provenance/storage/journal.py @@ -1,152 +1,149 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from dataclasses import asdict from datetime import datetime from types import TracebackType from typing import Dict, Generator, Iterable, List, Optional, Set, Type from swh.model.model import Sha1Git from swh.provenance.storage.interface import ( DirectoryData, EntityType, ProvenanceResult, ProvenanceStorageInterface, RelationData, RelationType, RevisionData, ) class JournalMessage: def __init__(self, id, value): self.id = id self.value = value def anonymize(self): return None def unique_key(self): return self.id def to_dict(self): return { "id": self.id, "value": self.value, } class ProvenanceStorageJournal: def __init__(self, storage, journal): self.storage = storage self.journal = journal def __enter__(self) -> ProvenanceStorageInterface: self.storage.__enter__() return self def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: return self.storage.__exit__(exc_type, exc_val, exc_tb) def open(self) -> None: self.storage.open() def close(self) -> None: self.storage.close() def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool: self.journal.write_additions( "content", [JournalMessage(key, value) for (key, value) in cnts.items()] ) return self.storage.content_add(cnts) def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: return self.storage.content_find_first(id) def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: return self.storage.content_find_all(id, limit) def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: return self.storage.content_get(ids) def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool: self.journal.write_additions( "directory", [JournalMessage(key, asdict(value)) for (key, value) in dirs.items()], ) return self.storage.directory_add(dirs) def 
directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]: return self.storage.directory_get(ids) def directory_iter_not_flattened( self, limit: int, start_id: Sha1Git ) -> List[Sha1Git]: return self.storage.directory_iter_not_flattened(limit, start_id) def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: return self.storage.entity_get_all(entity) def location_add(self, paths: Dict[Sha1Git, bytes]) -> bool: self.journal.write_additions( "location", [JournalMessage(key, value) for (key, value) in paths.items()] ) return self.storage.location_add(paths) def location_get_all(self) -> Dict[Sha1Git, bytes]: return self.storage.location_get_all() def origin_add(self, orgs: Dict[Sha1Git, str]) -> bool: self.journal.write_additions( "origin", [JournalMessage(key, value) for (key, value) in orgs.items()] ) return self.storage.origin_add(orgs) def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: return self.storage.origin_get(ids) def revision_add(self, revs: Dict[Sha1Git, RevisionData]) -> bool: self.journal.write_additions( "revision", [JournalMessage(key, asdict(value)) for (key, value) in revs.items()], ) return self.storage.revision_add(revs) def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: return self.storage.revision_get(ids) def relation_add( self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] ) -> bool: self.journal.write_additions( relation.value, [ JournalMessage(key, [asdict(reldata) for reldata in value]) for (key, value) in data.items() ], ) return self.storage.relation_add(relation, data) def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Dict[Sha1Git, Set[RelationData]]: return self.storage.relation_get(relation, ids, reverse) def relation_get_all( self, relation: RelationType ) -> Dict[Sha1Git, Set[RelationData]]: return self.storage.relation_get_all(relation) - - def with_path(self) -> bool: - return self.storage.with_path() diff --git a/swh/provenance/storage/postgresql.py b/swh/provenance/storage/postgresql.py index bb24a87..9ce88c6 100644 --- a/swh/provenance/storage/postgresql.py +++ b/swh/provenance/storage/postgresql.py @@ -1,398 +1,393 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from contextlib import contextmanager from datetime import datetime from functools import wraps from hashlib import sha1 import itertools import logging from types import TracebackType from typing import Dict, Generator, Iterable, List, Optional, Set, Type import psycopg2.extensions import psycopg2.extras from swh.core.db import BaseDb from swh.core.statsd import statsd from swh.model.model import Sha1Git from swh.provenance.storage.interface import ( DirectoryData, EntityType, ProvenanceResult, ProvenanceStorageInterface, RelationData, RelationType, RevisionData, ) LOGGER = logging.getLogger(__name__) STORAGE_DURATION_METRIC = "swh_provenance_storage_postgresql_duration_seconds" def handle_raise_on_commit(f): @wraps(f) def handle(self, *args, **kwargs): try: return f(self, *args, **kwargs) except BaseException as ex: # Unexpected error occurred, rollback all changes and log message LOGGER.exception("Unexpected error") if self.raise_on_commit: raise ex return False return handle class ProvenanceStoragePostgreSql: - current_version 
= 3 + current_version = 4 def __init__( self, page_size: Optional[int] = None, raise_on_commit: bool = False, **kwargs ) -> None: self.conn: Optional[psycopg2.extensions.connection] = None self.conn_args = kwargs self._flavor: Optional[str] = None self.page_size = page_size self.raise_on_commit = raise_on_commit def __enter__(self) -> ProvenanceStorageInterface: self.open() return self def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: self.close() @contextmanager def transaction( self, readonly: bool = False ) -> Generator[psycopg2.extras.RealDictCursor, None, None]: if self.conn is None: raise RuntimeError( "Tried to access ProvenanceStoragePostgreSQL transaction() without opening it" ) self.conn.set_session(readonly=readonly) with self.conn: with self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: yield cur @property def flavor(self) -> str: if self._flavor is None: with self.transaction(readonly=True) as cursor: cursor.execute("SELECT swh_get_dbflavor() AS flavor") flavor = cursor.fetchone() assert flavor # please mypy self._flavor = flavor["flavor"] assert self._flavor is not None return self._flavor @property def denormalized(self) -> bool: return "denormalized" in self.flavor @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "close"}) def close(self) -> None: assert self.conn is not None self.conn.close() @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"}) @handle_raise_on_commit def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool: if cnts: sql = """ INSERT INTO content(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date) """ page_size = self.page_size or len(cnts) with self.transaction() as cursor: psycopg2.extras.execute_values( cursor, sql, argslist=cnts.items(), page_size=page_size ) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_find_first"}) def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: sql = "SELECT * FROM swh_provenance_content_find_first(%s)" with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=(id,)) row = cursor.fetchone() return ProvenanceResult(**row) if row is not None else None @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_find_all"}) def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: sql = "SELECT * FROM swh_provenance_content_find_all(%s, %s)" with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=(id, limit)) yield from (ProvenanceResult(**row) for row in cursor) @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_get"}) def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: dates: Dict[Sha1Git, datetime] = {} sha1s = tuple(ids) if sha1s: # TODO: consider splitting this query in several ones if sha1s is too big! 
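    # (Aside, not part of the patch: the split suggested above could batch
    # sha1s into fixed-size chunks, e.g. 1000 ids each, run one IN (...) query
    # per chunk, and merge the rows into `dates`; this mirrors the page_size
    # batching that execute_values already gives the write paths in this class.)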
values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, date FROM content WHERE sha1 IN ({values}) AND date IS NOT NULL """ with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=sha1s) dates.update((row["sha1"], row["date"]) for row in cursor) return dates @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"}) @handle_raise_on_commit def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool: data = [(sha1, rev.date, rev.flat) for sha1, rev in dirs.items()] if data: sql = """ INSERT INTO directory(sha1, date, flat) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date, directory.date), flat=(EXCLUDED.flat OR directory.flat) """ page_size = self.page_size or len(data) with self.transaction() as cursor: psycopg2.extras.execute_values( cur=cursor, sql=sql, argslist=data, page_size=page_size ) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"}) def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]: result: Dict[Sha1Git, DirectoryData] = {} sha1s = tuple(ids) if sha1s: # TODO: consider splitting this query in several ones if sha1s is too big! values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, date, flat FROM directory WHERE sha1 IN ({values}) AND date IS NOT NULL """ with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=sha1s) result.update( (row["sha1"], DirectoryData(date=row["date"], flat=row["flat"])) for row in cursor ) return result @statsd.timed( metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattened"} ) def directory_iter_not_flattened( self, limit: int, start_id: Sha1Git ) -> List[Sha1Git]: sql = """ SELECT sha1 FROM directory WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s """ with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=(start_id, limit)) return [row["sha1"] for row in cursor] @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"}) def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: with self.transaction(readonly=True) as cursor: cursor.execute(f"SELECT sha1 FROM {entity.value}") return {row["sha1"] for row in cursor} @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "location_add"}) @handle_raise_on_commit def location_add(self, paths: Dict[Sha1Git, bytes]) -> bool: - if self.with_path(): - values = [(path,) for path in paths.values()] - if values: - sql = """ - INSERT INTO location(path) VALUES %s - ON CONFLICT DO NOTHING - """ - page_size = self.page_size or len(values) - with self.transaction() as cursor: - psycopg2.extras.execute_values( - cursor, sql, argslist=values, page_size=page_size - ) + values = [(path,) for path in paths.values()] + if values: + sql = """ + INSERT INTO location(path) VALUES %s + ON CONFLICT DO NOTHING + """ + page_size = self.page_size or len(values) + with self.transaction() as cursor: + psycopg2.extras.execute_values( + cursor, sql, argslist=values, page_size=page_size + ) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "location_get_all"}) def location_get_all(self) -> Dict[Sha1Git, bytes]: with self.transaction(readonly=True) as cursor: cursor.execute("SELECT location.path AS path FROM location") return {sha1(row["path"]).digest(): row["path"] for row in cursor} @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "origin_add"}) @handle_raise_on_commit def origin_add(self, orgs: Dict[Sha1Git, str]) 
-> bool: if orgs: sql = """ INSERT INTO origin(sha1, url) VALUES %s ON CONFLICT DO NOTHING """ page_size = self.page_size or len(orgs) with self.transaction() as cursor: psycopg2.extras.execute_values( cur=cursor, sql=sql, argslist=orgs.items(), page_size=page_size, ) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "open"}) def open(self) -> None: self.conn = BaseDb.connect(**self.conn_args).conn BaseDb.adapt_conn(self.conn) with self.transaction() as cursor: cursor.execute("SET timezone TO 'UTC'") @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "origin_get"}) def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: urls: Dict[Sha1Git, str] = {} sha1s = tuple(ids) if sha1s: # TODO: consider splitting this query in several ones if sha1s is too big! values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, url FROM origin WHERE sha1 IN ({values}) """ with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=sha1s) urls.update((row["sha1"], row["url"]) for row in cursor) return urls @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "revision_add"}) @handle_raise_on_commit def revision_add(self, revs: Dict[Sha1Git, RevisionData]) -> bool: if revs: data = [(sha1, rev.date, rev.origin) for sha1, rev in revs.items()] sql = """ INSERT INTO revision(sha1, date, origin) (SELECT V.rev AS sha1, V.date::timestamptz AS date, O.id AS origin FROM (VALUES %s) AS V(rev, date, org) LEFT JOIN origin AS O ON (O.sha1=V.org::sha1_git)) ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date, revision.date), origin=COALESCE(EXCLUDED.origin, revision.origin) """ page_size = self.page_size or len(data) with self.transaction() as cursor: psycopg2.extras.execute_values( cur=cursor, sql=sql, argslist=data, page_size=page_size ) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "revision_get"}) def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: result: Dict[Sha1Git, RevisionData] = {} sha1s = tuple(ids) if sha1s: # TODO: consider splitting this query in several ones if sha1s is too big! 
values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT R.sha1, R.date, O.sha1 AS origin FROM revision AS R LEFT JOIN origin AS O ON (O.id=R.origin) WHERE R.sha1 IN ({values}) AND (R.date is not NULL OR O.sha1 is not NULL) """ with self.transaction(readonly=True) as cursor: cursor.execute(query=sql, vars=sha1s) result.update( (row["sha1"], RevisionData(date=row["date"], origin=row["origin"])) for row in cursor ) return result @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "relation_add"}) @handle_raise_on_commit def relation_add( self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] ) -> bool: rows = [(src, rel.dst, rel.path) for src, dsts in data.items() for rel in dsts] if rows: rel_table = relation.value src_table, *_, dst_table = rel_table.split("_") page_size = self.page_size or len(rows) # Put the next three queries in a manual single transaction: # they use the same temp table with self.transaction() as cursor: cursor.execute("SELECT swh_mktemp_relation_add()") psycopg2.extras.execute_values( cur=cursor, sql="INSERT INTO tmp_relation_add(src, dst, path) VALUES %s", argslist=rows, page_size=page_size, ) sql = "SELECT swh_provenance_relation_add_from_temp(%s, %s, %s)" cursor.execute(query=sql, vars=(rel_table, src_table, dst_table)) return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "relation_get"}) def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Dict[Sha1Git, Set[RelationData]]: return self._relation_get(relation, ids, reverse) @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "relation_get_all"}) def relation_get_all( self, relation: RelationType ) -> Dict[Sha1Git, Set[RelationData]]: return self._relation_get(relation, None) def _relation_get( self, relation: RelationType, ids: Optional[Iterable[Sha1Git]], reverse: bool = False, ) -> Dict[Sha1Git, Set[RelationData]]: result: Dict[Sha1Git, Set[RelationData]] = {} sha1s: List[Sha1Git] if ids is not None: sha1s = list(ids) filter = "filter-src" if not reverse else "filter-dst" else: sha1s = [] filter = "no-filter" if filter == "no-filter" or sha1s: rel_table = relation.value src_table, *_, dst_table = rel_table.split("_") sql = "SELECT * FROM swh_provenance_relation_get(%s, %s, %s, %s, %s)" with self.transaction(readonly=True) as cursor: cursor.execute( query=sql, vars=(rel_table, src_table, dst_table, filter, sha1s) ) for row in cursor: src = row.pop("src") result.setdefault(src, set()).add(RelationData(**row)) return result - - @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "with_path"}) - def with_path(self) -> bool: - return "with-path" in self.flavor diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py index cffeb44..60ae464 100644 --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,179 +1,179 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from contextlib import contextmanager from datetime import datetime from functools import partial import multiprocessing from os import path from pathlib import Path from typing import Any, Dict, Generator, List from _pytest.fixtures import SubRequest from aiohttp.test_utils import TestClient, TestServer, loop_context import msgpack import psycopg2.extensions import pytest from 
pytest_postgresql import factories from swh.core.db.db_utils import initialize_database_for_module from swh.graph.http_rpc_server import make_app from swh.journal.serializers import msgpack_ext_hook from swh.model.model import BaseModel, TimestampWithTimezone from swh.provenance import get_provenance from swh.provenance.archive.interface import ArchiveInterface from swh.provenance.archive.storage import ArchiveStorage from swh.provenance.interface import ProvenanceInterface from swh.provenance.storage import get_provenance_storage from swh.provenance.storage.interface import ProvenanceStorageInterface from swh.provenance.storage.postgresql import ProvenanceStoragePostgreSql from swh.storage.interface import StorageInterface from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects provenance_postgresql_proc = factories.postgresql_proc( load=[ partial( initialize_database_for_module, modname="provenance", - flavor="with-path", + flavor="normalized", version=ProvenanceStoragePostgreSql.current_version, ) ], ) postgres_provenance = factories.postgresql("provenance_postgresql_proc") @pytest.fixture() def provenance_postgresqldb(request, postgres_provenance): return postgres_provenance.get_dsn_parameters() @pytest.fixture() def provenance_storage( request: SubRequest, provenance_postgresqldb: Dict[str, str], ) -> Generator[ProvenanceStorageInterface, None, None]: """Return a working and initialized ProvenanceStorageInterface object""" # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance_storage( cls="postgresql", db=provenance_postgresqldb, raise_on_commit=True ) as storage: yield storage @pytest.fixture def provenance( postgres_provenance: psycopg2.extensions.connection, ) -> Generator[ProvenanceInterface, None, None]: """Return a working and initialized ProvenanceInterface object""" from swh.core.db.db_utils import ( init_admin_extensions, populate_database_for_package, ) init_admin_extensions("swh.provenance", postgres_provenance.dsn) populate_database_for_package( - "swh.provenance", postgres_provenance.dsn, flavor="with-path" + "swh.provenance", postgres_provenance.dsn, flavor="normalized" ) # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance( cls="postgresql", db=postgres_provenance.get_dsn_parameters(), raise_on_commit=True, ) as provenance: yield provenance @pytest.fixture def archive(swh_storage: StorageInterface) -> ArchiveInterface: """Return an ArchiveStorage-based ArchiveInterface object""" return ArchiveStorage(swh_storage) def fill_storage(storage: StorageInterface, data: Dict[str, List[dict]]) -> None: objects = { objtype: [objs_from_dict(objtype, d) for d in dicts] for objtype, dicts in data.items() } process_replay_objects(objects, storage=storage) def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) # TODO: this should return Dict[str, List[BaseModel]] directly, but it requires # refactoring several tests def load_repo_data(repo: str) -> Dict[str, List[dict]]: data: Dict[str, List[dict]] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, raw=False, ext_hook=msgpack_ext_hook, strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) for msg in unpacker: if len(msg) == 2: # old format objtype, objd = msg else: # now we should have a triplet (type, key, value) objtype, _, objd = msg data.setdefault(objtype, []).append(objd) return data def 
objs_from_dict(object_type: str, dict_repr: dict) -> BaseModel: if object_type in OBJECT_FIXERS: dict_repr = OBJECT_FIXERS[object_type](dict_repr) obj = OBJECT_CONVERTERS[object_type](dict_repr) return obj def ts2dt(ts: Dict[str, Any]) -> datetime: return TimestampWithTimezone.from_dict(ts).to_datetime() def run_grpc_server(queue, dataset_path): try: config = { "graph": { "cls": "local", "grpc_server": {"path": dataset_path}, "http_rpc_server": {"debug": True}, } } with loop_context() as loop: app = make_app(config=config) client = TestClient(TestServer(app), loop=loop) loop.run_until_complete(client.start_server()) url = client.make_url("/graph/") queue.put((url, app["rpc_url"])) loop.run_forever() except Exception as e: queue.put(e) @contextmanager def grpc_server(dataset): dataset_path = ( Path(__file__).parents[0] / "data/swhgraph" / dataset / "compressed/example" ) queue = multiprocessing.Queue() server = multiprocessing.Process( target=run_grpc_server, kwargs={"queue": queue, "dataset_path": dataset_path} ) server.start() res = queue.get() if isinstance(res, Exception): raise res grpc_url = res[1] try: yield grpc_url finally: server.terminate() diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py index 6336483..fbf86ec 100644 --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -1,165 +1,160 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Dict, List, Set +from typing import Dict, List from _pytest.monkeypatch import MonkeyPatch from click.testing import CliRunner import psycopg2.extensions import pytest from swh.core.cli import swh as swhmain import swh.core.cli.db # noqa ; ensure cli is loaded from swh.core.db import BaseDb from swh.core.db.db_utils import init_admin_extensions from swh.model.hashutil import MultiHash import swh.provenance.cli # noqa ; ensure cli is loaded from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from .conftest import get_datafile from .test_utils import invoke, write_configuration_path def test_cli_swh_db_help() -> None: # swhmain.add_command(provenance_cli) result = CliRunner().invoke(swhmain, ["provenance", "-h"]) assert result.exit_code == 0 assert "Commands:" in result.output commands = result.output.split("Commands:")[1] for command in ( "find-all", "find-first", "iter-frontiers", "iter-origins", "iter-revisions", ): assert f" {command} " in commands TABLES = { "dbflavor", "dbmodule", "dbversion", "content", "content_in_revision", "content_in_directory", "directory", "directory_in_revision", "location", "origin", "revision", "revision_before_revision", "revision_in_origin", } -@pytest.mark.parametrize( - "flavor, dbtables", (("with-path", TABLES), ("without-path", TABLES)) -) +@pytest.mark.parametrize("flavor", ("normalized", "denormalized")) def test_cli_db_create_and_init_db_with_flavor( monkeypatch: MonkeyPatch, postgresql: psycopg2.extensions.connection, flavor: str, - dbtables: Set[str], ) -> None: - """Test that 'swh db init provenance' works with flavors - - for both with-path and without-path flavors""" + """Test that 'swh db init provenance' works with flavors""" dbname = f"{flavor}-db" # DB creation using 'swh db create' db_params = postgresql.get_dsn_parameters() monkeypatch.setenv("PGHOST", 
db_params["host"]) monkeypatch.setenv("PGUSER", db_params["user"]) monkeypatch.setenv("PGPORT", db_params["port"]) result = CliRunner().invoke(swhmain, ["db", "create", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output # DB init using 'swh db init' result = CliRunner().invoke( swhmain, ["db", "init", "-d", dbname, "--flavor", flavor, "provenance"] ) assert result.exit_code == 0, result.output assert f"(flavor {flavor})" in result.output db_params["dbname"] = dbname cnx = BaseDb.connect(**db_params).conn # check the DB looks OK (check for db_flavor and expected tables) with cnx.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == (flavor,) cur.execute( "select table_name from information_schema.tables " "where table_schema = 'public' " f"and table_catalog = '{dbname}'" ) tables = set(x for (x,) in cur.fetchall()) - assert tables == dbtables + assert tables == TABLES def test_cli_init_db_default_flavor(postgresql: psycopg2.extensions.connection) -> None: - "Test that 'swh db init provenance' defaults to a with-path flavored DB" + "Test that 'swh db init provenance' defaults to a normalized flavored DB" dbname = postgresql.dsn init_admin_extensions("swh.provenance", dbname) result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output with postgresql.cursor() as cur: cur.execute("select swh_get_dbflavor()") - assert cur.fetchone() == ("with-path",) + assert cur.fetchone() == ("normalized",) @pytest.mark.origin_layer @pytest.mark.parametrize( "subcommand", (["origin", "from-csv"], ["iter-origins"]), ) def test_cli_origin_from_csv( swh_storage: StorageInterface, subcommand: List[str], swh_storage_backend_config: Dict, provenance, tmp_path, ): repo = "cmdbts2" origin_url = f"https://{repo}" data = load_repo_data(repo) fill_storage(swh_storage, data) assert len(data["origin"]) >= 1 assert origin_url in [o["url"] for o in data["origin"]] cfg = { "provenance": { "archive": { "cls": "api", "storage": swh_storage_backend_config, }, "storage": { "cls": "postgresql", # "db": provenance.storage.conn.dsn, "db": provenance.storage.conn.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) csv_filepath = get_datafile("origins.csv") subcommand = subcommand + [csv_filepath] result = invoke(subcommand, config_path) assert result.exit_code == 0, f"Unexpected result: {result.output}" origin_sha1 = MultiHash.from_data( origin_url.encode(), hash_names=["sha1"] ).digest()["sha1"] actual_result = provenance.storage.origin_get([origin_sha1]) assert actual_result == {origin_sha1: origin_url} diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py index a0a0c5f..a98be40 100644 --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -1,17 +1,15 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.provenance.interface import ProvenanceInterface from swh.provenance.storage.postgresql import ProvenanceStoragePostgreSql def test_provenance_flavor(provenance: ProvenanceInterface) -> None: if isinstance(provenance.storage, ProvenanceStoragePostgreSql): assert provenance.storage.flavor in ( - "with-path", - "without-path", - "with-path-denormalized", - "without-path-denormalized", + 
"normalized", + "denormalized", ) diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py index a76f922..0f85103 100644 --- a/swh/provenance/tests/test_provenance_storage.py +++ b/swh/provenance/tests/test_provenance_storage.py @@ -1,531 +1,487 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import hashlib import inspect import os from typing import Any, Dict, Iterable, Optional, Set, Tuple import pytest from swh.model.hashutil import hash_to_bytes from swh.model.model import Origin, Sha1Git from swh.provenance.algos.origin import origin_add from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry from swh.provenance.provenance import Provenance from swh.provenance.storage.interface import ( DirectoryData, EntityType, ProvenanceResult, ProvenanceStorageInterface, RelationData, RelationType, RevisionData, ) from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt class TestProvenanceStorage: def test_provenance_storage_content( self, provenance_storage: ProvenanceStorageInterface, ) -> None: """Tests content methods for every `ProvenanceStorageInterface` implementation.""" # Read data/README.md for more details on how these datasets are generated. data = load_repo_data("cmdbts2") # Add all content present in the current repo to the storage, just assigning their # creation dates. Then check that the returned results when querying are the same. cnt_dates = { cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"]) } assert provenance_storage.content_add(cnt_dates) assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates assert provenance_storage.entity_get_all(EntityType.CONTENT) == set( cnt_dates.keys() ) def test_provenance_storage_directory( self, provenance_storage: ProvenanceStorageInterface, ) -> None: """Tests directory methods for every `ProvenanceStorageInterface` implementation.""" # Read data/README.md for more details on how these datasets are generated. data = load_repo_data("cmdbts2") # Of all directories present in the current repo, only assign a date to those # containing blobs (picking the max date among the available ones). Then check that # the returned results when querying are the same. 
    def test_provenance_storage_directory(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests directory methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Of all directories present in the current repo, only assign a date to those
        # containing blobs (picking the max date among the available ones). Then check
        # that the returned results when querying are the same.
        def getmaxdate(
            directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
        ) -> Optional[datetime]:
            dates = [
                content["ctime"]
                for entry in directory["entries"]
                for content in contents
                if entry["type"] == "file" and entry["target"] == content["sha1_git"]
            ]
            return max(dates) if dates else None

        flat_values = (False, True)
        dir_dates = {}
        for idx, dir in enumerate(data["directory"]):
            date = getmaxdate(dir, data["content"])
            if date is not None:
                dir_dates[dir["id"]] = DirectoryData(
                    date=date, flat=flat_values[idx % 2]
                )
        assert provenance_storage.directory_add(dir_dates)
        assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
        assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
            dir_dates.keys()
        )

    def test_provenance_storage_location(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests location methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Add all names of entries present in the directories of the current repo as
        # paths to the storage. Then check that the returned results when querying
        # are the same.
        paths = {
            hashlib.sha1(entry["name"]).digest(): entry["name"]
            for dir in data["directory"]
            for entry in dir["entries"]
        }
        assert provenance_storage.location_add(paths)
-
-        if provenance_storage.with_path():
-            assert provenance_storage.location_get_all() == paths
-        else:
-            assert not provenance_storage.location_get_all()
+        assert provenance_storage.location_get_all() == paths
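
    # Worth noting: location_add expects paths keyed by their own SHA1, which is what
    # the `paths` comprehension above builds. A minimal sketch of the convention
    # (the entry name is illustrative):
    #
    #     name = b"README.md"
    #     key = hashlib.sha1(name).digest()  # 20-byte key, same scheme as `paths`
    #     assert provenance_storage.location_add({key: name})
    #     assert provenance_storage.location_get_all()[key] == name
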
    @pytest.mark.origin_layer
    def test_provenance_storage_origin(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests origin methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Test origin methods.
        # Add all origins present in the current repo to the storage. Then check that
        # the returned results when querying are the same.
        orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
        assert orgs
        assert provenance_storage.origin_add(orgs)
        assert provenance_storage.origin_get(set(orgs.keys())) == orgs
        assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())

    def test_provenance_storage_revision(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests revision methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Test revision methods.
        # Add all revisions present in the current repo to the storage, assigning
        # their dates and an arbitrary origin to each one. Then check that the
        # returned results when querying are the same.
        origin = Origin(url=next(iter(data["origin"]))["url"])
        # Origin must be inserted in advance.
        assert provenance_storage.origin_add({origin.id: origin.url})

        revs = {rev["id"] for idx, rev in enumerate(data["revision"])}
        rev_data = {
            rev["id"]: RevisionData(
                date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
                origin=origin.id if idx % 3 != 0 else None,
            )
            for idx, rev in enumerate(data["revision"])
        }
        assert revs
        assert provenance_storage.revision_add(rev_data)
        assert provenance_storage.revision_get(set(rev_data.keys())) == {
            k: v
            for (k, v) in rev_data.items()
            if v.date is not None or v.origin is not None
        }
        assert provenance_storage.entity_get_all(EntityType.REVISION) == set(rev_data)

    def test_provenance_storage_relation_revision_layer(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests relation methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Test content-in-revision relation.
        # Create flat models of every root directory for the revisions in the dataset.
        cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
        for rev in data["revision"]:
            root = next(
                subdir
                for subdir in data["directory"]
                if subdir["id"] == rev["directory"]
            )
            for cnt, rel in dircontent(data, rev["id"], root):
                cnt_in_rev.setdefault(cnt, set()).add(rel)
        relation_add_and_compare_result(
            provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
        )

        # Test content-in-directory relation.
        # Create flat models for every directory in the dataset.
        cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
        for dir in data["directory"]:
            for cnt, rel in dircontent(data, dir["id"], dir):
                cnt_in_dir.setdefault(cnt, set()).add(rel)
        relation_add_and_compare_result(
            provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
        )

        # Test directory-in-revision relation.
        # Add root directories to their corresponding revision in the dataset.
        dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
        for rev in data["revision"]:
            dir_in_rev.setdefault(rev["directory"], set()).add(
                RelationData(dst=rev["id"], path=b".")
            )
        relation_add_and_compare_result(
            provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
        )
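
    # The relation blocks above lean on the `dircontent` helper (defined near the end
    # of this file) to flatten a directory tree into (content sha1, RelationData)
    # pairs. A sketch of its behavior on a one-file tree; all hashes are placeholders:
    #
    #     blob, root, rev = b"\x01" * 20, b"\x02" * 20, b"\x03" * 20
    #     fake_data = {
    #         "directory": [
    #             {"id": root, "entries": [{"type": "file", "name": b"a", "target": blob}]}
    #         ]
    #     }
    #     pairs = dircontent(fake_data, rev, fake_data["directory"][0])
    #     assert pairs == {(blob, RelationData(dst=rev, path=b"a"))}
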
    @pytest.mark.origin_layer
    def test_provenance_storage_relation_origin_layer(
        self,
        provenance_storage: ProvenanceStorageInterface,
    ) -> None:
        """Tests relation methods for every `ProvenanceStorageInterface`
        implementation."""

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")

        # Test revision-in-origin relation.
        # Origins must be inserted in advance (cannot be done by `entity_add` inside
        # `relation_add_and_compare_result`).
        orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
        assert provenance_storage.origin_add(orgs)
        # Add all revisions that are head of some snapshot branch to the corresponding
        # origin.
        rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
        for status in data["origin_visit_status"]:
            if status["snapshot"] is not None:
                for snapshot in data["snapshot"]:
                    if snapshot["id"] == status["snapshot"]:
                        for branch in snapshot["branches"].values():
                            if branch["target_type"] == "revision":
                                rev_in_org.setdefault(branch["target"], set()).add(
                                    RelationData(
                                        dst=Origin(url=status["origin"]).id,
                                        path=None,
                                    )
                                )
        relation_add_and_compare_result(
            provenance_storage, RelationType.REV_IN_ORG, rev_in_org
        )

        # Test revision-before-revision relation.
        # For each revision in the dataset, add an entry for each parent to the
        # relation.
        rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
        for rev in data["revision"]:
            for parent in rev["parents"]:
                rev_before_rev.setdefault(parent, set()).add(
                    RelationData(dst=rev["id"], path=None)
                )
        relation_add_and_compare_result(
            provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
        )

    def test_provenance_storage_find_revision_layer(
        self,
        provenance: ProvenanceInterface,
        provenance_storage: ProvenanceStorageInterface,
        archive: ArchiveInterface,
    ) -> None:
        """Tests `content_find_first` and `content_find_all` methods for every
        `ProvenanceStorageInterface` implementation.
        """

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")
        fill_storage(archive.storage, data)

        # Test content_find_first and content_find_all, first only executing the
        # revision-content algorithm, then adding the origin-revision layer.
-        def adapt_result(
-            result: Optional[ProvenanceResult], with_path: bool
-        ) -> Optional[ProvenanceResult]:
-            if result is not None:
-                return ProvenanceResult(
-                    result.content,
-                    result.revision,
-                    result.date,
-                    result.origin,
-                    result.path if with_path else b"",
-                )
-            return result

        # Execute the revision-content algorithm on both storages.
        revisions = [
            RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
            for rev in data["revision"]
        ]
        revision_add(provenance, archive, revisions)
        revision_add(Provenance(provenance_storage), archive, revisions)

-        assert adapt_result(
-            ProvenanceResult(
-                content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-                revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
-                date=datetime.fromtimestamp(1000000000.0, timezone.utc),
-                origin=None,
-                path=b"A/B/C/a",
-            ),
-            provenance_storage.with_path(),
+        assert ProvenanceResult(
+            content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+            revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+            date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+            origin=None,
+            path=b"A/B/C/a",
        ) == provenance_storage.content_find_first(
            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
        )

        for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
-            assert adapt_result(
-                provenance.storage.content_find_first(cnt),
-                provenance_storage.with_path(),
+            assert provenance.storage.content_find_first(
+                cnt
            ) == provenance_storage.content_find_first(cnt)
-            assert {
-                adapt_result(occur, provenance_storage.with_path())
-                for occur in provenance.storage.content_find_all(cnt)
-            } == set(provenance_storage.content_find_all(cnt))
+            assert set(provenance.storage.content_find_all(cnt)) == set(
+                provenance_storage.content_find_all(cnt)
+            )
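
    # For reference, content_find_first returns a single ProvenanceResult (or None)
    # whose fields mirror the literal built above. A minimal sketch of inspecting one:
    #
    #     occur = provenance_storage.content_find_first(
    #         hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
    #     )
    #     assert occur is not None
    #     # fields: content, revision, date, origin, path — origin stays None until
    #     # the origin-revision layer has been run, as the next test demonstrates.
    #     print(occur.revision.hex(), occur.date.isoformat(), occur.path)
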
    @pytest.mark.origin_layer
    def test_provenance_storage_find_origin_layer(
        self,
        provenance: ProvenanceInterface,
        provenance_storage: ProvenanceStorageInterface,
        archive: ArchiveInterface,
    ) -> None:
        """Tests `content_find_first` and `content_find_all` methods for every
        `ProvenanceStorageInterface` implementation.
        """

        # Read data/README.md for more details on how these datasets are generated.
        data = load_repo_data("cmdbts2")
        fill_storage(archive.storage, data)

        # Execute the revision-content algorithm on both storages.
        revisions = [
            RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
            for rev in data["revision"]
        ]
        revision_add(provenance, archive, revisions)
        revision_add(Provenance(provenance_storage), archive, revisions)

        # Test content_find_first and content_find_all, first only executing the
        # revision-content algorithm, then adding the origin-revision layer.
-        def adapt_result(
-            result: Optional[ProvenanceResult], with_path: bool
-        ) -> Optional[ProvenanceResult]:
-            if result is not None:
-                return ProvenanceResult(
-                    result.content,
-                    result.revision,
-                    result.date,
-                    result.origin,
-                    result.path if with_path else b"",
-                )
-            return result

        # Execute the origin-revision algorithm on both storages.
        origins = [
            OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
            for sta in data["origin_visit_status"]
            if sta["snapshot"] is not None
        ]
        origin_add(provenance, archive, origins)
        origin_add(Provenance(provenance_storage), archive, origins)

-        assert adapt_result(
-            ProvenanceResult(
-                content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-                revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
-                date=datetime.fromtimestamp(1000000000.0, timezone.utc),
-                origin="https://cmdbts2",
-                path=b"A/B/C/a",
-            ),
-            provenance_storage.with_path(),
+        assert ProvenanceResult(
+            content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+            revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+            date=datetime.fromtimestamp(1000000000.0, timezone.utc),
+            origin="https://cmdbts2",
+            path=b"A/B/C/a",
        ) == provenance_storage.content_find_first(
            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
        )

        for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
-            assert adapt_result(
-                provenance.storage.content_find_first(cnt),
-                provenance_storage.with_path(),
+            assert provenance.storage.content_find_first(
+                cnt
            ) == provenance_storage.content_find_first(cnt)
-            assert {
-                adapt_result(occur, provenance_storage.with_path())
-                for occur in provenance.storage.content_find_all(cnt)
-            } == set(provenance_storage.content_find_all(cnt))
+            assert set(provenance.storage.content_find_all(cnt)) == set(
+                provenance_storage.content_find_all(cnt)
+            )

    def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
        """Checks all methods of ProvenanceStorageInterface are implemented by this
        backend, and that they have the same signature."""
        # Create an instance of the protocol (which cannot be instantiated
        # directly, so this creates a subclass, then instantiates it)
        interface = type("_", (ProvenanceStorageInterface,), {})()

        assert "content_find_first" in dir(interface)

        missing_methods = []

        for meth_name in dir(interface):
            if meth_name.startswith("_"):
                continue
            interface_meth = getattr(interface, meth_name)
            try:
                concrete_meth = getattr(provenance_storage, meth_name)
            except AttributeError:
                if not getattr(interface_meth, "deprecated_endpoint", False):
                    # The backend is missing a (non-deprecated) endpoint
                    missing_methods.append(meth_name)
                continue

            expected_signature = inspect.signature(interface_meth)
            actual_signature = inspect.signature(concrete_meth)

            assert expected_signature == actual_signature, meth_name

        assert missing_methods == []

        # If all the assertions above succeed, then this one should too.
        # But there's no harm in double-checking.
        # And we could replace the assertions above by this one, but unlike
        # the assertions above, it doesn't explain what is missing.
        assert isinstance(provenance_storage, ProvenanceStorageInterface)


def dircontent(
    data: Dict[str, Any],
    ref: Sha1Git,
    dir: Dict[str, Any],
    prefix: bytes = b"",
) -> Iterable[Tuple[Sha1Git, RelationData]]:
    content = {
        (
            entry["target"],
            RelationData(dst=ref, path=os.path.join(prefix, entry["name"])),
        )
        for entry in dir["entries"]
        if entry["type"] == "file"
    }
    for entry in dir["entries"]:
        if entry["type"] == "dir":
            child = next(
                subdir
                for subdir in data["directory"]
                if subdir["id"] == entry["target"]
            )
            content.update(
                dircontent(data, ref, child, os.path.join(prefix, entry["name"]))
            )
    return content


def entity_add(
    storage: ProvenanceStorageInterface, entity: EntityType, ids: Set[Sha1Git]
) -> bool:
    now = datetime.now(tz=timezone.utc)
    if entity == EntityType.CONTENT:
        return storage.content_add({sha1: now for sha1 in ids})
    elif entity == EntityType.DIRECTORY:
        return storage.directory_add(
            {sha1: DirectoryData(date=now, flat=False) for sha1 in ids}
        )
    else:  # entity == EntityType.REVISION
        return storage.revision_add(
            {sha1: RevisionData(date=None, origin=None) for sha1 in ids}
        )


def relation_add_and_compare_result(
    storage: ProvenanceStorageInterface,
    relation: RelationType,
    data: Dict[Sha1Git, Set[RelationData]],
) -> None:
    # Source, destinations and locations must be added in advance.
    src, *_, dst = relation.value.split("_")
    srcs = {sha1 for sha1 in data}
    if src != "origin":
        assert entity_add(storage, EntityType(src), srcs)
    dsts = {rel.dst for rels in data.values() for rel in rels}
    if dst != "origin":
        assert entity_add(storage, EntityType(dst), dsts)
-    if storage.with_path():
-        assert storage.location_add(
-            {
-                hashlib.sha1(rel.path).digest(): rel.path
-                for rels in data.values()
-                for rel in rels
-                if rel.path is not None
-            }
-        )
+    assert storage.location_add(
+        {
+            hashlib.sha1(rel.path).digest(): rel.path
+            for rels in data.values()
+            for rel in rels
+            if rel.path is not None
+        }
+    )

    assert data
    assert storage.relation_add(relation, data)

    for src_sha1 in srcs:
        relation_compare_result(
            storage.relation_get(relation, [src_sha1]),
            {src_sha1: data[src_sha1]},
-            storage.with_path(),
        )
    for dst_sha1 in dsts:
        relation_compare_result(
            storage.relation_get(relation, [dst_sha1], reverse=True),
            {
                src_sha1: {
                    RelationData(dst=dst_sha1, path=rel.path)
                    for rel in rels
                    if dst_sha1 == rel.dst
                }
                for src_sha1, rels in data.items()
                if dst_sha1 in {rel.dst for rel in rels}
            },
-            storage.with_path(),
        )
    relation_compare_result(
-        storage.relation_get_all(relation), data, storage.with_path()
+        storage.relation_get_all(relation),
+        data,
    )


def relation_compare_result(
    computed: Dict[Sha1Git, Set[RelationData]],
    expected: Dict[Sha1Git, Set[RelationData]],
-    with_path: bool,
) -> None:
    assert {
-        src_sha1: {
-            RelationData(dst=rel.dst, path=rel.path if with_path else None)
-            for rel in rels
-        }
+        src_sha1: {RelationData(dst=rel.dst, path=rel.path) for rel in rels}
        for src_sha1, rels in expected.items()
    } == computed
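
The `src, *_, dst = relation.value.split("_")` line above works because the relation enum values are assumed to match the table names listed in TABLES in test_cli.py: the first and last components name the source and destination entities. A quick sketch of that assumption:

# Sketch only: relation enum values are assumed to equal the SQL table names.
src, *_, dst = RelationType.CNT_EARLY_IN_REV.value.split("_")
assert (src, dst) == ("content", "revision")   # content_in_revision
src, *_, dst = RelationType.REV_BEFORE_REV.value.split("_")
assert (src, dst) == ("revision", "revision")  # revision_before_revision
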
diff --git a/swh/provenance/tests/test_provenance_storage_without_path.py b/swh/provenance/tests/test_provenance_storage_denormalized.py
similarity index 95%
rename from swh/provenance/tests/test_provenance_storage_without_path.py
rename to swh/provenance/tests/test_provenance_storage_denormalized.py
index b423e0c..8fc9062 100644
--- a/swh/provenance/tests/test_provenance_storage_without_path.py
+++ b/swh/provenance/tests/test_provenance_storage_denormalized.py
@@ -1,24 +1,24 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from functools import partial

from pytest_postgresql import factories

from swh.core.db.db_utils import initialize_database_for_module
from swh.provenance.storage.postgresql import ProvenanceStoragePostgreSql

from .test_provenance_storage import TestProvenanceStorage  # noqa: F401

provenance_postgresql_proc = factories.postgresql_proc(
    load=[
        partial(
            initialize_database_for_module,
            modname="provenance",
-            flavor="without-path",
+            flavor="denormalized",
            version=ProvenanceStoragePostgreSql.current_version,
        )
    ],
)
diff --git a/swh/provenance/tests/test_provenance_storage_rabbitmq.py b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
index 217fb9d..bfe2ecb 100644
--- a/swh/provenance/tests/test_provenance_storage_rabbitmq.py
+++ b/swh/provenance/tests/test_provenance_storage_rabbitmq.py
@@ -1,43 +1,48 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict, Generator

import pytest

from swh.provenance import get_provenance_storage
from swh.provenance.storage.interface import ProvenanceStorageInterface

-from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
+from .test_provenance_storage import TestProvenanceStorage as _TestProvenanceStorage


@pytest.fixture()
def provenance_storage(
    provenance_postgresqldb: Dict[str, str],
    rabbitmq,
) -> Generator[ProvenanceStorageInterface, None, None]:
    """Return a working and initialized ProvenanceStorageInterface object"""
    from swh.provenance.storage.rabbitmq.server import ProvenanceStorageRabbitMQServer

    host = rabbitmq.args["host"]
    port = rabbitmq.args["port"]
    rabbitmq_params: Dict[str, Any] = {
        "url": f"amqp://guest:guest@{host}:{port}/%2f",
        "storage_config": {
            "cls": "postgresql",
            "db": provenance_postgresqldb,
            "raise_on_commit": True,
        },
    }
    server = ProvenanceStorageRabbitMQServer(
        url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"]
    )
    server.start()
    try:
        with get_provenance_storage(cls="rabbitmq", **rabbitmq_params) as storage:
            yield storage
    finally:
        server.stop()
+
+
+@pytest.mark.rabbitmq
+class TestProvenanceStorageRabbitMQ(_TestProvenanceStorage):
+    pass
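
These two files illustrate the intended reuse pattern: import the shared test class (under a private name where a subclass is declared) and instantiate it per backend. A hypothetical third backend module could do the same; all names below are illustrative, not part of this changeset:

# Sketch only: a hypothetical module reusing the shared storage test suite.
import pytest

from .test_provenance_storage import TestProvenanceStorage as _TestProvenanceStorage


@pytest.fixture()
def provenance_storage():  # override with a backend-specific fixture
    raise NotImplementedError("provide a backend-specific storage here")


@pytest.mark.mybackend  # hypothetical marker
class TestProvenanceStorageMyBackend(_TestProvenanceStorage):
    pass
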
diff --git a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
deleted file mode 100644
index cb3fa61..0000000
--- a/swh/provenance/tests/test_provenance_storage_with_path_denormalized.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (C) 2021-2022 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from functools import partial
-
-from pytest_postgresql import factories
-
-from swh.core.db.db_utils import initialize_database_for_module
-from swh.provenance.storage.postgresql import ProvenanceStoragePostgreSql
-
-from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
-
-provenance_postgresql_proc = factories.postgresql_proc(
-    load=[
-        partial(
-            initialize_database_for_module,
-            modname="provenance",
-            flavor="with-path-denormalized",
-            version=ProvenanceStoragePostgreSql.current_version,
-        )
-    ],
-)
diff --git a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py b/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
deleted file mode 100644
index 8c282fc..0000000
--- a/swh/provenance/tests/test_provenance_storage_without_path_denormalized.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (C) 2021-2022 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from functools import partial
-
-from pytest_postgresql import factories
-
-from swh.core.db.db_utils import initialize_database_for_module
-from swh.provenance.storage.postgresql import ProvenanceStoragePostgreSql
-
-from .test_provenance_storage import TestProvenanceStorage  # noqa: F401
-
-provenance_postgresql_proc = factories.postgresql_proc(
-    load=[
-        partial(
-            initialize_database_for_module,
-            modname="provenance",
-            flavor="without-path-denormalized",
-            version=ProvenanceStoragePostgreSql.current_version,
-        )
-    ],
-)
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
index d566a91..958f7c4 100644
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -1,482 +1,466 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple

import pytest
from typing_extensions import TypedDict

from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
from swh.provenance.algos.directory import directory_add
from swh.provenance.algos.revision import revision_add
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import ProvenanceInterface
from swh.provenance.model import DirectoryEntry, RevisionEntry
from swh.provenance.storage.interface import EntityType, RelationType
from swh.provenance.tests.conftest import (
    fill_storage,
    get_datafile,
    load_repo_data,
    ts2dt,
)


class SynthRelation(TypedDict):
    prefix: Optional[str]
    path: str
    src: Sha1Git
    dst: Sha1Git
    rel_ts: float


class SynthRevision(TypedDict):
    sha1: Sha1Git
    date: float
    msg: str
    R_C: List[SynthRelation]
    R_D: List[SynthRelation]
    D_C: List[SynthRelation]


def synthetic_revision_content_result(filename: str) -> Iterator[SynthRevision]:
    """Generates dict representations of synthetic revisions found in the synthetic
    file (from the data/ directory) given as argument of the generator.

    Each generated SynthRevision (typed dict) has the following elements:

      "sha1": (Sha1Git) sha1 of the revision,
      "date": (float) timestamp of the revision,
      "msg": (str) commit message of the revision,
      "R_C": (list) new R---C relations added by this revision
      "R_D": (list) new R-D relations added by this revision
      "D_C": (list) new D-C relations added by this revision

    Each relation above is a SynthRelation typed dict with:

      "path": (str) location
      "src": (Sha1Git) sha1 of the source of the relation
      "dst": (Sha1Git) sha1 of the destination of the relation
      "rel_ts": (float) timestamp of the target of the relation (related to the
                timestamp of the revision)

    """
    with open(get_datafile(filename), "r") as fobj:
        yield from _parse_synthetic_revision_content_file(fobj)


def _parse_synthetic_revision_content_file(
    fobj: Iterable[str],
) -> Iterator[SynthRevision]:
    """Read a 'synthetic' file and generate a dict representation of the synthetic
    revision for each revision listed in the synthetic file.
    """
    regs = [
        "(?P<revname>R[0-9]{2,4})?",
        "(?P<reltype>[^| ]*)",
        "([+] )?(?P<path>[^| +]*?)[/]?",
        "(?P<type>[RDC]) (?P<sha1>[0-9a-f]{40})",
        "(?P<ts>-?[0-9]+(.[0-9]+)?)",
    ]
    regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")
    current_rev: List[dict] = []
    for m in (regex.match(line) for line in fobj):
        if m:
            d = m.groupdict()
            if d["revname"]:
                if current_rev:
                    yield _mk_synth_rev(current_rev)
                current_rev.clear()
            current_rev.append(d)
    if current_rev:
        yield _mk_synth_rev(current_rev)


def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision:
    assert synth_rev[0]["type"] == "R"
    rev = SynthRevision(
        sha1=hash_to_bytes(synth_rev[0]["sha1"]),
        date=float(synth_rev[0]["ts"]),
        msg=synth_rev[0]["revname"],
        R_C=[],
        R_D=[],
        D_C=[],
    )
    current_path = None
    # path of the last R-D relation we parsed, used as a prefix for the next D-C
    # relations
    for row in synth_rev[1:]:
        if row["reltype"] == "R---C":
            assert row["type"] == "C"
            rev["R_C"].append(
                SynthRelation(
                    prefix=None,
                    path=row["path"],
                    src=rev["sha1"],
                    dst=hash_to_bytes(row["sha1"]),
                    rel_ts=float(row["ts"]),
                )
            )
            current_path = None
        elif row["reltype"] == "R-D":
            assert row["type"] == "D"
            rev["R_D"].append(
                SynthRelation(
                    prefix=None,
                    path=row["path"],
                    src=rev["sha1"],
                    dst=hash_to_bytes(row["sha1"]),
                    rel_ts=float(row["ts"]),
                )
            )
            current_path = row["path"]
        elif row["reltype"] == "D-C":
            assert row["type"] == "C"
            rev["D_C"].append(
                SynthRelation(
                    prefix=current_path,
                    path=row["path"],
                    src=rev["R_D"][-1]["dst"],
                    dst=hash_to_bytes(row["sha1"]),
                    rel_ts=float(row["ts"]),
                )
            )
    return rev
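
For concreteness, a hypothetical relation row accepted by the regex above (real rows live in the data/synthetic_*.txt files; revision header rows additionally carry the R<nn> name in the first column):

import re

regs = [
    "(?P<revname>R[0-9]{2,4})?",
    "(?P<reltype>[^| ]*)",
    "([+] )?(?P<path>[^| +]*?)[/]?",
    "(?P<type>[RDC]) (?P<sha1>[0-9a-f]{40})",
    "(?P<ts>-?[0-9]+(.[0-9]+)?)",
]
regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")
# A made-up R---C row: a content sha1 appearing at path A/B/C/a, 0s after the revision.
row = " | R---C | + A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0"
m = regex.match(row)
assert m is not None
assert m.groupdict()["reltype"] == "R---C" and m.groupdict()["path"] == "A/B/C/a"
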
@pytest.mark.parametrize(
    "repo, lower, mindepth, flatten",
    (
        ("cmdbts2", True, 1, True),
        ("cmdbts2", True, 1, False),
        ("cmdbts2", False, 1, True),
        ("cmdbts2", False, 1, False),
        ("cmdbts2", True, 2, True),
        ("cmdbts2", True, 2, False),
        ("cmdbts2", False, 2, True),
        ("cmdbts2", False, 2, False),
        ("out-of-order", True, 1, True),
        ("out-of-order", True, 1, False),
    ),
)
def test_revision_content_result(
    provenance: ProvenanceInterface,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    flatten: bool,
) -> None:
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(archive.storage, data)
    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )

    revisions = {rev["id"]: rev for rev in data["revision"]}

    rows: Dict[str, Set[Any]] = {
        "content": set(),
        "content_in_directory": set(),
        "content_in_revision": set(),
        "directory": set(),
        "directory_in_revision": set(),
        "location": set(),
        "revision": set(),
    }

-    def maybe_path(path: str) -> Optional[bytes]:
-        if provenance.storage.with_path():
-            return path.encode("utf-8")
-        return None
-
    for synth_rev in synthetic_revision_content_result(syntheticfile):
        revision = revisions[synth_rev["sha1"]]
        entry = RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )

        if flatten:
            revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)
        else:
            prev_directories = provenance.storage.entity_get_all(EntityType.DIRECTORY)
            revision_add(
                provenance,
                archive,
                [entry],
                lower=lower,
                mindepth=mindepth,
                flatten=False,
            )
            directories = [
                DirectoryEntry(id=sha1)
                for sha1 in provenance.storage.entity_get_all(
                    EntityType.DIRECTORY
                ).difference(prev_directories)
            ]
            for directory in directories:
                assert not provenance.directory_already_flattened(directory)
            directory_add(provenance, archive, directories)

        # each "entry" in the synth file is one new revision
        rows["revision"].add(synth_rev["sha1"])
        assert rows["revision"] == provenance.storage.entity_get_all(
            EntityType.REVISION
        ), synth_rev["msg"]
        # check the timestamp of the revision
        rev_ts = synth_rev["date"]
        rev_data = provenance.storage.revision_get([synth_rev["sha1"]])[
            synth_rev["sha1"]
        ]
        assert (
            rev_data.date is not None and rev_ts == rev_data.date.timestamp()
        ), synth_rev["msg"]

        # this revision might have added new content objects
        rows["content"] |= set(x["dst"] for x in synth_rev["R_C"])
        rows["content"] |= set(x["dst"] for x in synth_rev["D_C"])
        assert rows["content"] == provenance.storage.entity_get_all(
            EntityType.CONTENT
        ), synth_rev["msg"]

        # check for R-C (direct) entries
        # these are added directly in the content_early_in_rev table
        rows["content_in_revision"] |= set(
-            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"]
+            (x["dst"], x["src"], x["path"].encode()) for x in synth_rev["R_C"]
        )
        assert rows["content_in_revision"] == {
            (src, rel.dst, rel.path)
            for src, rels in provenance.storage.relation_get_all(
                RelationType.CNT_EARLY_IN_REV
            ).items()
            for rel in rels
        }, synth_rev["msg"]
        # check timestamps
        for rc in synth_rev["R_C"]:
            assert (
                rev_ts + rc["rel_ts"]
                == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp()
            ), synth_rev["msg"]

        # check directories
        # each directory stored in the provenance index is an entry
        # in the "directory" table...
        rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"])
        assert rows["directory"] == provenance.storage.entity_get_all(
            EntityType.DIRECTORY
        ), synth_rev["msg"]

        # ... + a number of rows in the "directory_in_rev" table...
        # check for R-D entries
        rows["directory_in_revision"] |= set(
-            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"]
+            (x["dst"], x["src"], x["path"].encode()) for x in synth_rev["R_D"]
        )
        assert rows["directory_in_revision"] == {
            (src, rel.dst, rel.path)
            for src, rels in provenance.storage.relation_get_all(
                RelationType.DIR_IN_REV
            ).items()
            for rel in rels
        }, synth_rev["msg"]
        # check timestamps
        for rd in synth_rev["R_D"]:
            dir_data = provenance.storage.directory_get([rd["dst"]])[rd["dst"]]
            assert rev_ts + rd["rel_ts"] == dir_data.date.timestamp(), synth_rev["msg"]
            assert dir_data.flat, synth_rev["msg"]

        # ... + a number of rows in the "content_in_dir" table
        # for content of the directory.
        # check for D-C entries
        rows["content_in_directory"] |= set(
-            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"]
+            (x["dst"], x["src"], x["path"].encode()) for x in synth_rev["D_C"]
        )
        assert rows["content_in_directory"] == {
            (src, rel.dst, rel.path)
            for src, rels in provenance.storage.relation_get_all(
                RelationType.CNT_IN_DIR
            ).items()
            for rel in rels
        }, synth_rev["msg"]
        # check timestamps
        for dc in synth_rev["D_C"]:
            assert (
                rev_ts + dc["rel_ts"]
                == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
            ), synth_rev["msg"]

-        if provenance.storage.with_path():
-            # check for location entries
-            rows["location"] |= set(x["path"].encode() for x in synth_rev["R_C"])
-            rows["location"] |= set(x["path"].encode() for x in synth_rev["D_C"])
-            rows["location"] |= set(x["path"].encode() for x in synth_rev["R_D"])
-            assert rows["location"] == set(
-                provenance.storage.location_get_all().values()
-            ), synth_rev["msg"]
+        # check for location entries
+        rows["location"] |= set(x["path"].encode() for x in synth_rev["R_C"])
+        rows["location"] |= set(x["path"].encode() for x in synth_rev["D_C"])
+        rows["location"] |= set(x["path"].encode() for x in synth_rev["R_D"])
+        assert rows["location"] == set(
+            provenance.storage.location_get_all().values()
+        ), synth_rev["msg"]
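
Since every flavor now stores paths, `location_get_all` is expected to return the complete sha1-of-path to path mapping. A small sketch of the invariant these assertions build on, assuming (as throughout these tests) that locations were keyed by the SHA1 of the path:

import hashlib

locations = provenance.storage.location_get_all()  # {sha1(path): path}
assert all(hashlib.sha1(path).digest() == key for key, path in locations.items())
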
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_all(
    provenance: ProvenanceInterface,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(archive.storage, data)
    revisions = [
        RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        for revision in data["revision"]
    ]

-    def maybe_path(path: str) -> str:
-        if provenance.storage.with_path():
-            return path
-        return ""
-
    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )
    expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {}
    for synth_rev in synthetic_revision_content_result(syntheticfile):
        rev_id = synth_rev["sha1"].hex()
        rev_ts = synth_rev["date"]

        for rc in synth_rev["R_C"]:
            expected_occurrences.setdefault(rc["dst"].hex(), []).append(
-                (rev_id, rev_ts, None, maybe_path(rc["path"]))
+                (rev_id, rev_ts, None, rc["path"])
            )
        for dc in synth_rev["D_C"]:
            assert dc["prefix"] is not None  # to please mypy
            expected_occurrences.setdefault(dc["dst"].hex(), []).append(
-                (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
+                (rev_id, rev_ts, None, dc["prefix"] + "/" + dc["path"])
            )

    for content_id, results in expected_occurrences.items():
        expected = [(content_id, *result) for result in results]
        db_occurrences = [
            (
                occur.content.hex(),
                occur.revision.hex(),
                occur.date.timestamp(),
                occur.origin,
                occur.path.decode(),
            )
            for occur in provenance.content_find_all(hash_to_bytes(content_id))
        ]
-        if provenance.storage.with_path():
-            # this is not true if the db stores no path, because a same content
-            # that appears several times in a given revision may be reported
-            # only once by content_find_all()
-            assert len(db_occurrences) == len(expected)
+        assert len(db_occurrences) == len(expected)
        assert set(db_occurrences) == set(expected)


@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_first(
    provenance: ProvenanceInterface,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(archive.storage, data)
    revisions = [
        RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        for revision in data["revision"]
    ]

    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )
    expected_first: Dict[str, Tuple[str, float, List[str]]] = {}
    # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path
    # is a list because a content can be added at several places in a single
    # revision, in which case the result of content_find_first() is one of
    # those paths, but we have no guarantee which one it will return.
    for synth_rev in synthetic_revision_content_result(syntheticfile):
        rev_id = synth_rev["sha1"].hex()
        rev_ts = synth_rev["date"]

        for rc in synth_rev["R_C"]:
            sha1 = rc["dst"].hex()
            if sha1 not in expected_first:
                assert rc["rel_ts"] == 0
                expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])
            else:
                if rev_ts == expected_first[sha1][1]:
                    expected_first[sha1][2].append(rc["path"])
                elif rev_ts < expected_first[sha1][1]:
                    expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])

        for dc in synth_rev["D_C"]:
            sha1 = dc["dst"].hex()
            assert sha1 in expected_first
            # nothing to do there, this content cannot be a "first seen file"

    for content_id, (rev_id, ts, paths) in expected_first.items():
        occur = provenance.content_find_first(hash_to_bytes(content_id))
        assert occur is not None
        assert occur.content.hex() == content_id
        assert occur.revision.hex() == rev_id
        assert occur.date.timestamp() == ts
        assert occur.origin is None
-        if provenance.storage.with_path():
-            assert occur.path.decode() in paths
+        assert occur.path.decode() in paths