diff --git a/sql/upgrades/116.sql b/sql/upgrades/116.sql new file mode 100644 index 0000000..991f81f --- /dev/null +++ b/sql/upgrades/116.sql @@ -0,0 +1,81 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 115 +-- to_version: 116 +-- description: + +insert into dbversion(version, release, description) +values(116, now(), 'Work In Progress'); + +drop table origin_metadata_translation; + +create table origin_intrinsic_metadata( + origin_id bigserial not null, + metadata jsonb, + indexer_configuration_id bigint not null, + from_revision sha1_git not null +); + +comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; +comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; +comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; +comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; + +-- create a temporary table for retrieving origin_intrinsic_metadata +create or replace function swh_mktemp_origin_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table tmp_origin_intrinsic_metadata ( + like origin_intrinsic_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; + + +-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, +-- overwriting duplicates if conflict_update is true, skipping duplicates +-- otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_origin_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_origin_intrinsic_metadata, 2. call this function +create or replace function swh_origin_intrinsic_metadata_add( + conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) + select origin_id, metadata, indexer_configuration_id, from_revision + from tmp_origin_intrinsic_metadata + on conflict(origin_id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + else + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) + select origin_id, metadata, indexer_configuration_id, from_revision + from tmp_origin_intrinsic_metadata + on conflict(origin_id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; + + +-- origin_intrinsic_metadata +create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); +alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; + +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; +alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql index 2131a03..6f45f14 100644 --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -1,140 +1,140 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(115, now(), 'Work In Progress'); + values(116, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_version is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) create table content_mimetype ( id sha1 not null, mimetype bytea not null, encoding bytea not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containning metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.translated_metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_metadata provides a minimal set of intrinsic metadata -- detected with the detection tool (indexer_configuration_id) and aggregated -- from the content_metadata translation. create table revision_metadata( id sha1_git not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_metadata.id is 'sha1_git of revision'; comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; create table origin_intrinsic_metadata( origin_id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';