diff --git a/sql/json/revision_metadata.translated_metadata.json b/sql/json/revision_metadata.translated_metadata.json
new file mode 100644
--- /dev/null
+++ b/sql/json/revision_metadata.translated_metadata.json
@@ -0,0 +1,59 @@
+{
+  "$schema": "http://json-schema.org/schema#",
+  "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json",
+
+  "type": "object",
+  "properties": {
+    "developmentStatus": {
+      "type": "array"
+    },
+    "version": {
+      "type": "array"
+    },
+    "operatingSystem": {
+      "type": "array"
+    },
+    "description": {
+      "type": "array"
+    },
+    "keywords": {
+      "type": "array"
+    },
+    "issueTracker": {
+      "type": "array"
+    },
+    "name": {
+      "type": "array"
+    },
+    "author": {
+      "type": "array"
+    },
+    "relatedLink": {
+      "type": "array"
+    },
+    "url": {
+      "type": "array"
+    },
+    "type": {
+      "type": "array"
+    },
+    "license": {
+      "type": "array"
+    },
+    "maintainer": {
+      "type": "array"
+    },
+    "email": {
+      "type": "array"
+    },
+    "softwareRequirements": {
+      "type": "array"
+    },
+    "identifier": {
+      "type": "array"
+    },
+    "codeRepository": {
+      "type": "array"
+    }
+  }
+}
diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -875,3 +875,6 @@
 
 insert into indexer_configuration(tool_name, tool_version, tool_configuration)
 values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1665,8 +1665,8 @@
 as $$
 begin
     if conflict_update then
-        insert into content_language (id, lang, indexer_configuration_id) 
-        select id, lang, indexer_configuration_id 
+        insert into content_language (id, lang, indexer_configuration_id)
+        select id, lang, indexer_configuration_id
         from tmp_content_language tcl
             on conflict(id, indexer_configuration_id)
                 do update set lang = excluded.lang;
@@ -1674,7 +1674,7 @@
     else
         insert into content_language (id, lang, indexer_configuration_id)
         select id, lang, indexer_configuration_id
-        from tmp_content_language tcl 
+        from tmp_content_language tcl
             on conflict(id, indexer_configuration_id)
             do nothing;
     end if;
@@ -1995,8 +1995,8 @@
 as $$
 begin
     if conflict_update then
-        insert into content_metadata (id, translated_metadata, indexer_configuration_id) 
-        select id, translated_metadata, indexer_configuration_id 
+        insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+        select id, translated_metadata, indexer_configuration_id
         from tmp_content_metadata tcm
             on conflict(id, indexer_configuration_id)
                 do update set translated_metadata = excluded.translated_metadata;
@@ -2056,6 +2056,117 @@
 comment on function swh_content_metadata_get() is 'List content''s metadata';
 -- end content_metadata functions
 
+-- revision_metadata functions
+--
+-- create the temporary table tmp_revision_metadata_missing used to filter missing entries
+create or replace function swh_mktemp_revision_metadata_missing()
+    returns void
+    language sql
+as $$
+  create temporary table tmp_revision_metadata_missing (
+    id sha1_git,
+    indexer_configuration_id integer
+  ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
+
+-- check which entries of tmp_revision_metadata_missing are missing from revision_metadata
+--
+-- operates in bulk: 0. swh_mktemp_revision_metadata_missing(),
+-- 1. COPY to tmp_revision_metadata_missing, 2. call this function
+create or replace function swh_revision_metadata_missing()
+    returns setof sha1_git
+    language plpgsql
+as $$
+begin
+    return query
+        select id from tmp_revision_metadata_missing as tmp
+        where not exists
+            (select 1 from revision_metadata as c
+             where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_missing() IS 'Filter missing revision metadata';
+
+-- add tmp_revision_metadata entries to revision_metadata, overwriting
+-- duplicates if conflict_update is true, skipping duplicates otherwise.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_revision_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_revision_metadata(), 1. COPY to
+-- tmp_revision_metadata, 2. call this function
+create or replace function swh_revision_metadata_add(conflict_update boolean)
+    returns void
+    language plpgsql
+as $$
+begin
+    if conflict_update then
+      insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+      select id, translated_metadata, indexer_configuration_id
+        from tmp_revision_metadata trm
+            on conflict(id, indexer_configuration_id)
+                do update set translated_metadata = excluded.translated_metadata;
+
+    else
+        insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+        select id, translated_metadata, indexer_configuration_id
+        from tmp_revision_metadata trm
+            on conflict(id, indexer_configuration_id)
+            do nothing;
+    end if;
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata';
+
+-- create the temporary table tmp_revision_metadata used to add revision_metadata
+create or replace function swh_mktemp_revision_metadata()
+    returns void
+    language sql
+as $$
+  create temporary table tmp_revision_metadata (
+    like revision_metadata including defaults
+  ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';
+
+-- row type returned by swh_revision_metadata_get()
+create type revision_metadata_signature as (
+    id sha1_git,
+    translated_metadata jsonb,
+    tool_id integer,
+    tool_name text,
+    tool_version text,
+    tool_configuration jsonb
+);
+
+-- Retrieve list of revision metadata from the temporary table.
+--
+-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_revision_metadata_get()
+    returns setof revision_metadata_signature
+    language plpgsql
+as $$
+begin
+    return query
+        select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
+        from tmp_bytea t
+        inner join revision_metadata c on c.id = t.id
+        inner join indexer_configuration i on i.id=c.indexer_configuration_id;
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_get() is 'List revision''s metadata';
+-- end revision_metadata functions
+
 -- simple counter mapping a textual label to an integer value
 create type counter as (
     label text,
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -283,6 +283,13 @@
 
 alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
 alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- origin_metadata_history and origin_metadata
--- TODO PK: origin_id, discovery_date
--- TODO FK: origin_id, indexer_configuration_id
+
+-- revision_metadata
+create unique index concurrently revision_metadata_pkey on revision_metadata(id, indexer_configuration_id);
+alter table revision_metadata add primary key using index revision_metadata_pkey;
+
+alter table revision_metadata add constraint revision_metadata_id_fkey foreign key (id) references revision(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_id_fkey;
+
+alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -477,3 +477,17 @@
 comment on column content_metadata.id is 'sha1 of content file';
 comment on column content_metadata.translated_metadata is 'result of translation with defined format';
 comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+-- The table revision_metadata provides a minimal set of intrinsic metadata
+-- detected with the detection tool (indexer_configuration_id) and aggregated
+-- from the content_metadata translation.
+create table revision_metadata(
+  id                       sha1_git   not null,
+  translated_metadata      jsonb      not null,
+  indexer_configuration_id bigint     not null
+);
+
+comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
+comment on column revision_metadata.id is 'sha1_git of revision';
+comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format';
+comment on column revision_metadata.indexer_configuration_id is 'tool used for detection';
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -243,5 +243,17 @@
     def content_metadata_get(self, ids):
         return self.post('content_metadata', {'ids': ids})
 
+    def revision_metadata_add(self, metadatas, conflict_update=False):
+        return self.post('revision_metadata/add', {
+            'metadatas': metadatas,
+            'conflict_update': conflict_update,
+        })
+
+    def revision_metadata_missing(self, metadatas):
+        return self.post('revision_metadata/missing', {'metadatas': metadatas})
+
+    def revision_metadata_get(self, ids):
+        return self.post('revision_metadata', {'ids': ids})
+
     def indexer_configuration_get(self, tool):
         return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -398,6 +398,24 @@
         g.storage.content_metadata_get(**decode_request(request)))
 
 
+@app.route('/revision_metadata/add', methods=['POST'])
+def revision_metadata_add():
+    return encode_data(
+        g.storage.revision_metadata_add(**decode_request(request)))
+
+
+@app.route('/revision_metadata/missing', methods=['POST'])
+def revision_metadata_missing():
+    return encode_data(
+        g.storage.revision_metadata_missing(**decode_request(request)))
+
+
+@app.route('/revision_metadata', methods=['POST'])
+def revision_metadata_get():
+    return encode_data(
+        g.storage.revision_metadata_get(**decode_request(request)))
+
+
 @app.route('/stat/counters', methods=['GET'])
 def stat_counters():
     return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -996,6 +996,35 @@
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
+    revision_metadata_cols = [
+        'id', 'translated_metadata',
+        'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+    @stored_procedure('swh_mktemp_revision_metadata')
+    def mktemp_revision_metadata(self, cur=None): pass
+
+    @stored_procedure('swh_mktemp_revision_metadata_missing')
+    def mktemp_revision_metadata_missing(self, cur=None): pass
+
+    def revision_metadata_missing_from_temp(self, cur=None):
+        """List missing metadatas.
+
+        """
+        cur = self._cursor(cur)
+        cur.execute("SELECT * FROM swh_revision_metadata_missing()")
+        yield from cursor_to_bytes(cur)
+
+    def revision_metadata_add_from_temp(self, conflict_update, cur=None):
+        self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
+                                  (conflict_update, ))
+
+    def revision_metadata_get_from_temp(self, cur=None):
+        cur = self._cursor(cur)
+        query = "SELECT %s FROM swh_revision_metadata_get()" % (
+            ','.join(self.revision_metadata_cols))
+        cur.execute(query)
+        yield from cursor_to_bytes(cur)
+
     indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
                                   'tool_configuration']
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1608,6 +1608,66 @@
                    cur)
         db.content_metadata_add_from_temp(conflict_update, cur)
 
+    @db_transaction_generator
+    def revision_metadata_missing(self, metadatas, cur=None):
+        """List metadatas missing from storage.
+
+        Args:
+            metadatas: iterable of dict with keys:
+              - id (bytes): sha1_git revision identifier
+              - indexer_configuration_id (int): identifier of the tool
+                used to compute the results
+
+        Returns:
+            an iterable of missing id
+
+        """
+        db = self.db
+        db.mktemp_revision_metadata_missing(cur)
+        db.copy_to(metadatas, 'tmp_revision_metadata_missing',
+                   ['id', 'indexer_configuration_id'], cur)
+        for obj in db.revision_metadata_missing_from_temp(cur):
+            yield obj[0]
+
+    @db_transaction_generator
+    def revision_metadata_get(self, ids, cur=None):
+        """Retrieve the metadata stored for the given revisions.
+
+        Args:
+            ids: iterable of sha1_git revision identifiers
+
+        Yields:
+            one converted metadata entry (see converters.db_to_metadata)
+            per row found; ids without stored metadata yield nothing
+
+        """
+        db = self.db
+        db.store_tmp_bytea(ids, cur)
+        for c in db.revision_metadata_get_from_temp(cur):
+            yield converters.db_to_metadata(
+                dict(zip(db.revision_metadata_cols, c)))
+
+    @db_transaction
+    def revision_metadata_add(self, metadatas,
+                              conflict_update=False, cur=None):
+        """Add metadatas not present in storage.
+
+        Args:
+            metadatas: iterable of dictionary with keys:
+                - id: sha1_git of revision
+                - translated_metadata: dict of translated metadata
+                - indexer_configuration_id: identifier of the tool used
+            conflict_update: Flag to determine if we want to overwrite (true)
+              or skip duplicates (false, the default)
+
+        """
+        db = self.db
+        db.mktemp_revision_metadata(cur)
+        db.copy_to(metadatas, 'tmp_revision_metadata',
+                   ['id', 'translated_metadata', 'indexer_configuration_id'],
+                   cur)
+        db.revision_metadata_add_from_temp(conflict_update, cur)
+
     @db_transaction
     def indexer_configuration_get(self, tool, cur=None):
         db = self.db
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3129,6 +3129,247 @@
         # metadata did change as the v2 was used to overwrite v1
         self.assertEqual(actual_metadatas, expected_metadatas_v2)
 
+    @istest
+    def revision_metadata_missing(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-detector']['id']
+
+        rev = self.revision
+        missing_rev = self.revision2
+        self.storage.revision_add([rev])
+
+        metadatas = [
+            {
+                'id': rev['id'],
+                'indexer_configuration_id': tool_id,
+            },
+            {
+                'id': missing_rev['id'],
+                'indexer_configuration_id': tool_id,
+            }
+        ]
+
+        # when
+        actual_missing = list(self.storage.revision_metadata_missing(
+            metadatas))
+
+        # then
+        self.assertEqual(list(actual_missing), [
+            rev['id'],
+            missing_rev['id'],
+        ])
+
+        # given
+        self.storage.revision_metadata_add([{
+            'id': rev['id'],
+            'translated_metadata': {
+                'developmentStatus': None,
+                'version': None,
+                'operatingSystem': None,
+                'description': None,
+                'keywords': None,
+                'issueTracker': None,
+                'name': None,
+                'author': None,
+                'relatedLink': None,
+                'url': None,
+                'type': None,
+                'license': None,
+                'maintainer': None,
+                'email': None,
+                'softwareRequirements': None,
+                'identifier': None
+            },
+            'indexer_configuration_id': tool_id
+        }])
+
+        # when
+        actual_missing = list(self.storage.revision_metadata_missing(
+            metadatas))
+
+        # then
+        self.assertEqual(actual_missing, [missing_rev['id']])
+
+    @istest
+    def revision_metadata_get(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-detector']['id']
+        rev = self.revision2
+        self.storage.revision_add([rev])
+
+        metadata_rev = {
+            'id': rev['id'],
+            'translated_metadata': {
+                'developmentStatus': None,
+                'version': None,
+                'operatingSystem': None,
+                'description': None,
+                'keywords': None,
+                'issueTracker': None,
+                'name': None,
+                'author': None,
+                'relatedLink': None,
+                'url': None,
+                'type': None,
+                'license': None,
+                'maintainer': None,
+                'email': None,
+                'softwareRequirements': None,
+                'identifier': None
+            },
+            'indexer_configuration_id': tool_id
+        }
+
+        # when
+        self.storage.revision_metadata_add([metadata_rev])
+
+        # then
+        actual_metadatas = list(self.storage.revision_metadata_get(
+            [self.revision2['id'], self.revision['id']]))
+
+        expected_metadatas = [{
+            'id': rev['id'],
+            'translated_metadata': metadata_rev['translated_metadata'],
+            'tool': tools['swh-metadata-detector']
+        }]
+
+        self.assertEqual(actual_metadatas, expected_metadatas)
+
+    @istest
+    def revision_metadata_add_drop_duplicate(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-detector']['id']
+        revision = self.revision
+        self.storage.revision_add([revision])
+
+        metadata_v1 = {
+            'id': self.revision['id'],
+            'translated_metadata': {
+                'developmentStatus': None,
+                'version': None,
+                'operatingSystem': None,
+                'description': None,
+                'keywords': None,
+                'issueTracker': None,
+                'name': None,
+                'author': None,
+                'relatedLink': None,
+                'url': None,
+                'type': None,
+                'license': None,
+                'maintainer': None,
+                'email': None,
+                'softwareRequirements': None,
+                'identifier': None
+            },
+            'indexer_configuration_id': tool_id,
+        }
+
+        # given
+        self.storage.revision_metadata_add([metadata_v1])
+
+        # when
+        actual_metadatas = list(self.storage.revision_metadata_get(
+            [self.revision['id']]))
+
+        expected_metadatas_v1 = [{
+            'id': self.revision['id'],
+            'translated_metadata': metadata_v1['translated_metadata'],
+            'tool': tools['swh-metadata-detector']
+        }]
+
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+        # given
+        metadata_v2 = metadata_v1.copy()
+        metadata_v2.update({
+            'translated_metadata': {
+                'name': 'test_metadata',
+                'author': 'MG',
+            },
+        })
+
+        self.storage.revision_metadata_add([metadata_v2])
+
+        # then
+        actual_metadatas = list(self.storage.revision_metadata_get(
+            [self.revision['id']]))
+
+        # metadata did not change as the v2 was dropped.
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+    @istest
+    def revision_metadata_add_update_in_place_duplicate(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-detector']['id']
+        revision = self.revision2
+        self.storage.revision_add([revision])
+
+        metadata_v1 = {
+            'id': self.revision2['id'],
+            'translated_metadata': {
+                'developmentStatus': None,
+                'version': None,
+                'operatingSystem': None,
+                'description': None,
+                'keywords': None,
+                'issueTracker': None,
+                'name': None,
+                'author': None,
+                'relatedLink': None,
+                'url': None,
+                'type': None,
+                'license': None,
+                'maintainer': None,
+                'email': None,
+                'softwareRequirements': None,
+                'identifier': None
+            },
+            'indexer_configuration_id': tool_id,
+        }
+
+        # given
+        self.storage.revision_metadata_add([metadata_v1])
+
+        # when
+        actual_metadatas = list(self.storage.revision_metadata_get(
+            [self.revision2['id']]))
+
+        # then
+        expected_metadatas_v1 = [{
+            'id': self.revision2['id'],
+            'translated_metadata': metadata_v1['translated_metadata'],
+            'tool': tools['swh-metadata-detector']
+        }]
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+        # given
+        metadata_v2 = metadata_v1.copy()
+        metadata_v2.update({
+            'translated_metadata': {
+                'name': 'test_update_duplicated_metadata',
+                'author': 'MG'
+            },
+        })
+        self.storage.revision_metadata_add([metadata_v2], conflict_update=True)
+
+        actual_metadatas = list(self.storage.revision_metadata_get(
+            [self.revision2['id']]))
+
+        # with conflict_update=True the stored entry must now hold v2
+        expected_metadatas_v2 = [{
+            'id': self.revision2['id'],
+            'translated_metadata': metadata_v2['translated_metadata'],
+            'tool': tools['swh-metadata-detector']
+        }]
+
+        # metadata did change as the v2 was used to overwrite v1
+        self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
 
 class TestLocalStorage(CommonTestStorage, unittest.TestCase):
     """Test the local storage"""