diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -872,3 +872,9 @@
 
 insert into indexer_configuration(tool_name, tool_version, tool_configuration)
 values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}');
+
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+-- insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+-- values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "maven"}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1945,6 +1945,116 @@
 
 comment on function swh_content_fossology_license_get() IS 'List content licenses';
 
+-- content_metadata functions
+--
+-- create the temporary table tmp_content_metadata_missing (dropped on commit)
+create or replace function swh_mktemp_content_metadata_missing()
+    returns void
+    language sql
+as $$
+  create temporary table tmp_content_metadata_missing (
+    id sha1,
+    indexer_configuration_id integer
+  ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
+
+-- check which entries of tmp_content_metadata_missing are missing from content_metadata
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata_missing(), 1. COPY to tmp_content_metadata_missing,
+-- 2. 
call this function
+create or replace function swh_content_metadata_missing()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select id::sha1 from tmp_content_metadata_missing as tmp
+        where not exists
+            (select 1 from content_metadata as c
+             where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+    return;
+end
+$$;
+
+comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';
+
+-- add tmp_content_metadata entries to content_metadata, overwriting
+-- duplicates if conflict_update is true, skipping duplicates otherwise.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_content_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata(), 1. COPY to
+-- tmp_content_metadata, 2. call this function
+create or replace function swh_content_metadata_add(conflict_update boolean)
+    returns void
+    language plpgsql
+as $$
+begin
+    if conflict_update then
+        insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+        select id, translated_metadata, indexer_configuration_id
+        from tmp_content_metadata tcm
+            on conflict(id, indexer_configuration_id)
+                do update set translated_metadata = excluded.translated_metadata;
+
+    else
+        insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+        select id, translated_metadata, indexer_configuration_id
+        from tmp_content_metadata tcm
+            on conflict(id, indexer_configuration_id)
+                do nothing;
+    end if;
+    return;
+end
+$$;
+
+comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+
+-- create the temporary table tmp_content_metadata read by swh_content_metadata_add
+create or replace function swh_mktemp_content_metadata()
+    returns void
+    language sql
+as $$
+  create temporary table tmp_content_metadata (
+    like content_metadata including defaults
+  ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata() is 
'Helper table to add content metadata';
+
+-- row type returned by swh_content_metadata_get()
+create type content_metadata_signature as (
+    id sha1,
+    translated_metadata jsonb,
+    tool_id integer,
+    tool_name text,
+    tool_version text,
+    tool_configuration jsonb
+);
+
+-- Retrieve the content metadata whose id is listed in the temporary table tmp_bytea.
+--
+-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_content_metadata_get()
+    returns setof content_metadata_signature
+    language plpgsql
+as $$
+begin
+    return query
+        select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
+        from tmp_bytea t
+        inner join content_metadata c on c.id = t.id
+        inner join indexer_configuration i on i.id=c.indexer_configuration_id;
+    return;
+end
+$$;
+
+comment on function swh_content_metadata_get() is 'List content''s metadata';
+-- end content_metadata functions
 
 -- simple counter mapping a textual label to an integer value
 create type counter as (
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -272,3 +272,17 @@
 
 alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
 alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey;
+
+-- content_metadata: primary key on (id, indexer_configuration_id), then foreign keys
+create unique index concurrently content_metadata_pkey on content_metadata(id, indexer_configuration_id);
+alter table content_metadata add primary key using index content_metadata_pkey;
+
+alter table content_metadata add constraint content_metadata_id_fkey foreign key (id) references content(sha1) not valid;
+alter table content_metadata validate constraint content_metadata_id_fkey;
+
+alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references 
indexer_configuration(id) not valid;
+alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
+
+-- origin_metadata_history and origin_metadata
+-- TODO PK: origin_id, discovery_date
+-- TODO FK: origin_id, indexer_configuration_id
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -463,3 +463,54 @@
 comment on column content_fossology_license.id is 'Raw content identifier';
 comment on column content_fossology_license.license_id is 'One of the content''s license identifier';
 comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information';
+
+
+-- The table content_metadata stores the translation of files identified as
+-- potentially containing metadata, one row per file and translation tool (indexer_configuration_id)
+create table content_metadata(
+  id                       sha1   not null,
+  translated_metadata      jsonb  not null,
+  indexer_configuration_id bigint not null
+);
+
+comment on table content_metadata is 'metadata semantically translated from a content file';
+comment on column content_metadata.id is 'sha1 of content file';
+comment on column content_metadata.translated_metadata is 'result of translation with defined format';
+comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+
+-- Discovery of metadata during a listing or a deposit of an origin
+-- also provides a translation to a defined json schema using a translation tool (indexer_configuration_id)
+create table origin_metadata_history(
+  origin_id                bigint        not null,
+  discovery_date           timestamptz   not null,
+  translation_date         timestamptz,
+  provenance_type          text          not null, -- TODO use an enum (?) 
+  raw_metadata             jsonb         not null,
+  translated_metadata      jsonb,
+  indexer_configuration_id bigint,
+  object_id                bigserial     -- short object identifier
+);
+
+
+comment on table origin_metadata_history is 'keeps all metadata found concerning an origin over time';
+comment on column origin_metadata_history.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata_history.discovery_date is 'the date of retrieval';
+comment on column origin_metadata_history.translation_date is 'the date of translation';
+comment on column origin_metadata_history.provenance_type is 'lister, publisher, etc';
+comment on column origin_metadata_history.raw_metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata_history.translated_metadata is 'metadata in defined terms in json schema';
+comment on column origin_metadata_history.indexer_configuration_id is 'tool used for translation';
+
+
+-- Companion of origin_metadata_history with the same columns minus object_id,
+-- meant to store only the *current* value of metadata, as last seen by SWH.
+create table origin_metadata(
+  origin_id                bigint        not null,
+  discovery_date           timestamptz   not null,
+  translation_date         timestamptz,
+  provenance_type          text          not null, -- TODO use an enum (?) 
+  raw_metadata             jsonb         not null,
+  translated_metadata      jsonb,
+  indexer_configuration_id bigint
+);
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -231,5 +231,17 @@
     def content_fossology_license_get(self, ids):
         return self.post('content/fossology_license', {'ids': ids})
 
+    def content_metadata_add(self, metadatas, conflict_update=False):
+        return self.post('content_metadata/add', {
+            'metadatas': metadatas,
+            'conflict_update': conflict_update,
+        })
+
+    def content_metadata_missing(self, metadatas):
+        return self.post('content_metadata/missing', {'metadatas': metadatas})
+
+    def content_metadata_get(self, ids):
+        return self.post('content_metadata', {'ids': ids})
+
     def indexer_configuration_get(self, tool):
         return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -380,6 +380,24 @@
         **decode_request(request)))
 
 
+@app.route('/content_metadata/add', methods=['POST'])
+def content_metadata_add():
+    return encode_data(
+        g.storage.content_metadata_add(**decode_request(request)))
+
+
+@app.route('/content_metadata/missing', methods=['POST'])
+def content_metadata_missing():
+    return encode_data(
+        g.storage.content_metadata_missing(**decode_request(request)))
+
+
+@app.route('/content_metadata', methods=['POST'])
+def content_metadata_get():
+    return encode_data(
+        g.storage.content_metadata_get(**decode_request(request)))
+
+
 @app.route('/stat/counters', methods=['GET'])
 def stat_counters():
     return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -416,6 +416,22 @@
     }
 
 
+def db_to_metadata(metadata):
+    """Convert a flat content metadata row (dict) into its nested output form. 
+
+    """
+    return {
+        'id': metadata['id'],
+        'translated_metadata': metadata['translated_metadata'],
+        'tool': {
+            'id': metadata['tool_id'],
+            'name': metadata['tool_name'],
+            'version': metadata['tool_version'],
+            'configuration': metadata['tool_configuration']
+        }
+    }
+
+
 def db_to_fossology_license(license):
     return {
         'id': license['id'],
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -967,6 +967,35 @@
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
+    content_metadata_cols = [
+        'id', 'translated_metadata',
+        'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+    @stored_procedure('swh_mktemp_content_metadata')
+    def mktemp_content_metadata(self, cur=None): pass
+
+    @stored_procedure('swh_mktemp_content_metadata_missing')
+    def mktemp_content_metadata_missing(self, cur=None): pass
+
+    def content_metadata_missing_from_temp(self, cur=None):
+        """Yield the rows returned by swh_content_metadata_missing().
+
+        """
+        cur = self._cursor(cur)
+        cur.execute("SELECT * FROM swh_content_metadata_missing()")
+        yield from cursor_to_bytes(cur)
+
+    def content_metadata_add_from_temp(self, conflict_update, cur=None):
+        self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
+                                  (conflict_update, ))
+
+    def content_metadata_get_from_temp(self, cur=None):
+        cur = self._cursor(cur)
+        query = "SELECT %s FROM swh_content_metadata_get()" % (
+            ','.join(self.content_metadata_cols))
+        cur.execute(query)
+        yield from cursor_to_bytes(cur)
+
     indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
                                   'tool_configuration']
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1558,6 +1558,61 @@
                                                   cur=cur)
         db.content_fossology_license_add_from_temp(conflict_update, cur)
 
+    @db_transaction_generator
+    def content_metadata_missing(self, metadatas, cur=None):
+        """List metadatas missing from storage. 
+
+        Args:
+            metadatas: iterable of dict with keys:
+              - id (bytes): sha1 identifier
+              - indexer_configuration_id (int): id of the tool used
+                to compute the results
+
+        Returns:
+            an iterable of missing id
+
+        """
+        db = self.db
+        db.mktemp_content_metadata_missing(cur)
+        db.copy_to(metadatas, 'tmp_content_metadata_missing',
+                   ['id', 'indexer_configuration_id'], cur)
+        for obj in db.content_metadata_missing_from_temp(cur):
+            yield obj[0]
+
+    @db_transaction_generator
+    def content_metadata_get(self, ids, cur=None):
+        db = self.db
+        db.store_tmp_bytea(ids, cur)
+        for c in db.content_metadata_get_from_temp(cur):
+            yield converters.db_to_metadata(
+                dict(zip(db.content_metadata_cols, c)))
+
+    @db_transaction
+    def content_metadata_add(self, metadatas, conflict_update=False, cur=None):
+        """Add metadatas not present in storage.
+
+        Args:
+            metadatas: iterable of dictionary with keys id (sha1),
+                translated_metadata (dict, stored as jsonb) and
+                indexer_configuration_id (id of the translating tool)
+            conflict_update: Flag to determine if we want to overwrite (true)
+            or skip duplicates (false, the default)
+
+        """
+        db = self.db
+        db.mktemp_content_metadata(cur)
+        # keep only the content_metadata columns; extra keys are dropped
+        db.copy_to(
+            ({
+                'id': m['id'],
+                'translated_metadata': m['translated_metadata'],
+                'indexer_configuration_id': m['indexer_configuration_id'],
+            } for m in metadatas),
+            'tmp_content_metadata',
+            ['id', 'translated_metadata', 'indexer_configuration_id'], cur)
+
+        db.content_metadata_add_from_temp(conflict_update, cur)
+
     @db_transaction
     def indexer_configuration_get(self, tool, cur=None):
         db = self.db
diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py
--- a/swh/storage/tests/test_converters.py
+++ b/swh/storage/tests/test_converters.py
@@ -284,3 +284,29 @@
         actual_license = converters.db_to_fossology_license(input_license)
 
         self.assertEquals(actual_license, expected_license)
+
+    @istest
+    def db_to_metadata(self):
+        input_metadata = {
+            'id': b'some-id',
+            'tool_id': 20, 
+            'tool_name': 'some-toolname',
+            'tool_version': 'some-toolversion',
+            'tool_configuration': {},
+            'translated_metadata': b'translated_metadata',
+        }
+
+        expected_metadata = {
+            'id': b'some-id',
+            'translated_metadata': b'translated_metadata',
+            'tool': {
+                'id': 20,
+                'name': 'some-toolname',
+                'version': 'some-toolversion',
+                'configuration': {},
+            }
+        }
+
+        actual_metadata = converters.db_to_metadata(input_metadata)
+
+        self.assertEqual(actual_metadata, expected_metadata)
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2947,6 +2947,225 @@
         })
         self.assertEqual(actual_licenses, [expected_license])
 
+    @istest
+    def content_metadata_missing(self):
+        # given: one stored content, one content unknown to the storage
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-translator']['id']
+
+        cont2 = self.cont2
+        self.storage.content_add([cont2])
+
+        metadatas = [
+            {
+                'id': self.cont2['sha1'],
+                'indexer_configuration_id': tool_id,
+            },
+            {
+                'id': self.missing_cont['sha1'],
+                'indexer_configuration_id': tool_id,
+            }
+        ]
+
+        # when: no metadata has been added yet
+        actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+        # then: both ids are reported missing
+        self.assertEqual(list(actual_missing), [
+            self.cont2['sha1'],
+            self.missing_cont['sha1'],
+        ])
+
+        # given: metadata is now stored for cont2 with the same tool
+        self.storage.content_metadata_add([{
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'indexer_configuration_id': tool_id
+        }])
+
+        # when
+        actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+        # then: only the content without metadata is still missing
+        self.assertEqual(actual_missing, [self.missing_cont['sha1']])
+
+    @istest
+    def content_metadata_get(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-translator']['id']
+        cont2 = self.cont2 
+        self.storage.content_add([cont2])
+
+        metadata1 = {
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'indexer_configuration_id': tool_id,
+        }
+
+        # when
+        self.storage.content_metadata_add([metadata1])
+
+        # then: only the id that has metadata comes back, with its tool nested
+        actual_metadatas = list(self.storage.content_metadata_get(
+            [self.cont2['sha1'], self.missing_cont['sha1']]))
+
+        expected_metadatas = [{
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'tool': tools['swh-metadata-translator']
+        }]
+
+        self.assertEqual(actual_metadatas, expected_metadatas)
+
+    @istest
+    def content_metadata_add_drop_duplicate(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-translator']['id']
+        cont2 = self.cont2
+        self.storage.content_add([cont2])
+
+        metadata_v1 = {
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'indexer_configuration_id': tool_id,
+        }
+
+        # given: v1 is stored
+        self.storage.content_metadata_add([metadata_v1])
+
+        # when: reading it back
+        actual_metadatas = list(self.storage.content_metadata_get(
+            [self.cont2['sha1']]))
+
+        expected_metadatas_v1 = [{
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'tool': tools['swh-metadata-translator']
+        }]
+
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+        # given: a v2 for the same id and tool, with a different name
+        metadata_v2 = metadata_v1.copy()
+        metadata_v2.update({
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_drop_duplicated_metadata',
+                'version': '0.0.1'
+            },
+        })
+
+        self.storage.content_metadata_add([metadata_v2])
+
+        # then
+        actual_metadatas = list(self.storage.content_metadata_get(
+            [self.cont2['sha1']]))
+
+        # metadata did not change as the v2 was dropped.
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+    @istest
+    def content_metadata_add_update_in_place_duplicate(self):
+        # given
+        tools = self.fetch_tools()
+        tool_id = tools['swh-metadata-translator']['id']
+        cont2 = self.cont2
+        self.storage.content_add([cont2])
+
+        metadata_v1 = {
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'indexer_configuration_id': tool_id,
+        }
+
+        # given: v1 is stored
+        self.storage.content_metadata_add([metadata_v1])
+
+        # when: reading it back
+        actual_metadatas = list(self.storage.content_metadata_get(
+            [self.cont2['sha1']]))
+
+        # then
+        expected_metadatas_v1 = [{
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'tool': tools['swh-metadata-translator']
+        }]
+        self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+        # given: a v2 for the same id and tool, added with conflict_update
+        metadata_v2 = metadata_v1.copy()
+        metadata_v2.update({
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_update_duplicated_metadata',
+                'version': '0.0.1'
+            },
+        })
+        self.storage.content_metadata_add([metadata_v2], conflict_update=True)
+
+        actual_metadatas = list(self.storage.content_metadata_get(
+            [self.cont2['sha1']]))
+
+        # then: v2 is what is read back, since conflict_update=True was passed 
+        expected_metadatas_v2 = [{
+            'id': self.cont2['sha1'],
+            'translated_metadata': {
+                'other': {},
+                'name': 'test_update_duplicated_metadata',
+                'version': '0.0.1'
+            },
+            'tool': tools['swh-metadata-translator']
+        }]
+
+        # metadata did change as the v2 was used to overwrite v1
+        self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
 
 class TestLocalStorage(CommonTestStorage, unittest.TestCase):
     """Test the local storage"""
@@ -3040,6 +3259,33 @@
 
         self.assertEqual(expected_tool, actual_tool)
 
+    @istest
+    def indexer_configuration_metadata_get_missing_context(self):
+        tool = {
+            'tool_name': 'swh-metadata-translator',
+            'tool_version': '0.0.1',
+            'tool_configuration': {"context": "unknown-context"},
+        }
+
+        actual_tool = self.storage.indexer_configuration_get(tool)
+
+        self.assertIsNone(actual_tool)
+
+    @istest
+    def indexer_configuration_metadata_get(self):
+        tool = {
+            'tool_name': 'swh-metadata-translator',
+            'tool_version': '0.0.1',
+            'tool_configuration': {"type": "local", "context": "npm"},
+        }
+
+        actual_tool = self.storage.indexer_configuration_get(tool)
+
+        expected_tool = tool.copy()
+        expected_tool['id'] = actual_tool['id']
+
+        self.assertEqual(expected_tool, actual_tool)
+
 
 class AlteringSchemaTest(BaseTestStorage, unittest.TestCase):
     """This class is dedicated for the rare case where the schema needs to