diff --git a/sql/swh-func.sql b/sql/swh-func.sql --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -1945,6 +1945,116 @@ comment on function swh_content_fossology_license_get() IS 'List content licenses'; +-- content_metadata functions +-- +-- create a temporary table for content_metadata tmp_content_metadata, +create or replace function swh_mktemp_content_metadata_missing() + returns void + language sql +as $$ + create temporary table tmp_content_metadata_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; + +-- check which entries of tmp_bytea are missing from content_metadata +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_metadata_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_content_metadata_missing as tmp + where not exists + (select 1 from content_metadata as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; + +-- add tmp_content_metadata entries to content_metadata, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_content_metadata, 2. call this function +create or replace function swh_content_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set translated_metadata = excluded.translated_metadata; + + else + insert into content_language (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_language tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; + +-- create a temporary table for retrieving content_metadata +create or replace function swh_mktemp_content_metadata() + returns void + language sql +as $$ + create temporary table tmp_content_metadata ( + like content_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; + +-- +create type content_metadata_signature as ( + id sha1, + translated_metadata jsonb, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content metadata from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_content_metadata_get() + returns setof content_metadata_signature + language plpgsql +as $$ +begin + return query + select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_metadata c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +comment on function swh_content_metadata_get() is 'List content''s metadata'; +-- end content_metadata functions -- simple counter mapping a textual label to an integer value create type counter as ( diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql --- a/sql/swh-indexes.sql +++ b/sql/swh-indexes.sql @@ -272,3 +272,17 @@ alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; + +-- content_metadata +create unique index concurrently content_metadata_pkey on content_metadata(id, indexer_configuration_id); +alter table content_metadata add primary key using index content_metadata_pkey; + +alter table content_metadata add constraint content_metadata_id_fkey foreign key (id) references content(sha1) not valid; +alter table content_metadata validate constraint content_metadata_id_fkey; + +alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; + +-- origin_metadata_history and origin_metadata +-- TODO PK: origin_id, discovery_date +-- TODO FK: origin_id, indexer_configuration_id diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -463,3 +463,54 @@ comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; + + +-- The table content_metadata provides a translation to files +-- identified as potentially containning metadata with a translation tool (indexer_configuration_id) +create table content_metadata( + id sha1 not null, + translated_metadata jsonb not null, + indexer_configuration_id bigint not null +); + +comment on table content_metadata is 'metadata semantically translated from a content file'; +comment on column content_metadata.id is 'sha1 of content file'; +comment on column content_metadata.translated_metadata is 'result of translation with defined format'; +comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; + + +-- Discovery of metadata during a listing or a deposit of an origin +-- also provides a translation to a defined json schema using a translation tool (indexer_configuration_id) +create table origin_metadata_history( + origin_id bigint not null, + discovery_date timestamptz not null, + translation_date timestamptz, + provenance_type text not null, -- TODO use an enum (?) + raw_metadata jsonb not null, + translated_metadata jsonb, + indexer_configuration_id bigint, + object_id bigserial -- short object identifier +); + + +comment on table origin_metadata_history is 'keeps latest metadata concerning an origin'; +comment on column origin_metadata_history.origin_id is 'the origin id for which the metadata was found'; +comment on column origin_metadata_history.discovery_date is 'the date of retrieval'; +comment on column origin_metadata_history.translation_date is 'the date of translation'; +comment on column origin_metadata_history.provenance_type is 'lister, publisher, etc' +comment on column origin_metadata_history.raw_metadata is 'metadata in json format but with original terms'; +comment on column origin_metadata_history.translated_metadata is 'metadata in defined terms in json schema'; +comment on column origin_metadata_history.indexer_configuration_id is 'tool used for translation'; + + +-- Materialized view of origin_metadata_history, storing the *current* value of +-- metadata, as last seen by SWH. +create table origin_metadata( + origin_id bigint not null, + discovery_date timestamptz not null, + translation_date timestamptz, + provenance_type text not null, -- TODO use an enum (?) + raw_metadata jsonb not null, + translated_metadata jsonb, + indexer_configuration_id bigint, +); diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -967,6 +967,35 @@ cur.execute(query) yield from cursor_to_bytes(cur) + content_metadata_cols = [ + 'id', 'translated_metadata', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_metadata') + def mktemp_content_metadata(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_metadata_missing') + def mktemp_content_metadata_missing(self, cur=None): pass + + def content_metadata_missing_from_temp(self, cur=None): + """List missing metadatas. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_metadata_missing()") + yield from cursor_to_bytes(cur) + + def content_metadata_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", + (conflict_update, )) + + def content_metadata_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_metadata_get()" % ( + ','.join(self.content_metadata_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1558,6 +1558,61 @@ cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) + @db_transaction_generator + def content_metadata_missing(self, metadatas, cur=None): + """List metadatas missing from storage. + + Args: + metadatas: iterable of dict with keys: + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + an iterable of missing id + + """ + db = self.db + db.mktemp_content_metadata_missing(cur) + db.copy_to(metadatas, 'tmp_content_metadata_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.content_metadata_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_metadata_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_metadata_get_from_temp(): + yield converters.db_to_metadata( + dict(zip(db.content_metadata_cols, c))) + + @db_transaction + def content_metadata_add(self, metadatas, conflict_update=False, cur=None): + """Add metadatas not present in storage. + + Args: + metadatas: iterable of dictionary with keys: + - id: sha1 + - translated_metadata: bytes / jsonb ? + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_metadata(cur) + # empty metadata is mapped to 'unknown' + db.copy_to( + ({ + 'id': m['id'], + 'translated_metadata': m['translated_metadata'], + 'indexer_configuration_id': m['indexer_configuration_id'], + } for m in metadatas), + 'tmp_content_metadata', + ['id', 'translated_metadata', 'indexer_configuration_id'], cur) + + db.content_metadata_add_from_temp(conflict_update, cur) + @db_transaction def indexer_configuration_get(self, tool, cur=None): db = self.db