diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -878,3 +878,6 @@
 
 insert into indexer_configuration(tool_name, tool_version, tool_configuration)
 values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}');
+
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-deposit', '0.0.1', '{"sword_version": "2"}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -2166,6 +2166,38 @@
 comment on function swh_revision_metadata_get() is 'List revision''s metadata';
 
 -- end revision_metadata functions
+-- origin_metadata functions
+create type origin_metadata_signature as (
+    id bigint,
+    origin_id bigint,
+    discovery_date timestamptz,
+    tool_id bigint,
+    metadata jsonb,
+    provider_id integer,
+    provider_name text,
+    provider_type text,
+    provider_url text
+);
+
+
+-- List the metadata of one origin restricted to one provider type, along
+-- with the provider's description, most recently discovered first.
+create or replace function swh_origin_metadata_get_by_provider_type(
+       origin bigint,
+       type text)
+    returns setof origin_metadata_signature
+    language sql
+    stable
+as $$
+    select om.id as id, origin_id, discovery_date, tool_id, om.metadata,
+           mp.id as provider_id, provider_name, provider_type, provider_url
+    from origin_metadata as om
+    inner join metadata_provider mp on om.provider_id = mp.id
+    where om.origin_id = origin
+    and mp.provider_type = type
+    order by discovery_date desc;
+$$;
+-- end origin_metadata functions
 
 -- simple counter mapping a textual label to an integer value
 create type counter as (
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -294,6 +294,34 @@
 alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
 alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey;
 
+-- metadata_provider
+create unique index concurrently metadata_provider_pkey on metadata_provider(id);
+alter table metadata_provider add primary key using index metadata_provider_pkey;
+
+create index concurrently on metadata_provider(provider_name, provider_url);
+
+-- origin_metadata
+create unique index concurrently origin_metadata_pkey on origin_metadata(id);
+alter table origin_metadata add primary key using index origin_metadata_pkey;
+
+create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
+
+alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_origin_fkey;
+
+alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_provider_fkey;
+
+alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references indexer_configuration(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_tool_fkey;
+
+-- origin_metadata_translation
+create unique index concurrently origin_metadata_translation_pkey on origin_metadata_translation(id, indexer_configuration_id);
+alter table origin_metadata_translation add primary key using index origin_metadata_translation_pkey;
+
+alter table origin_metadata_translation add constraint origin_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_metadata_translation validate constraint origin_metadata_indexer_configuration_id_fkey;
+
 -- object_counts
 create unique index concurrently object_counts_pkey on object_counts(object_type);
 alter table object_counts add primary key using index object_counts_pkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(109, now(), 'Work In Progress');
+      values(111, now(), 'Work In Progress');
 
 -- a SHA1 checksum (not necessarily originating from Git)
 create domain sha1 as bytea check (length(value) = 20);
@@ -492,6 +492,52 @@
 comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format';
 comment on column revision_metadata.indexer_configuration_id is 'tool used for detection';
 
+
+create table metadata_provider (
+  id serial not null,
+  provider_name text not null,
+  provider_type text not null,
+  provider_url text,
+  metadata jsonb
+);
+
+comment on table metadata_provider is 'Metadata provider information';
+comment on column metadata_provider.id is 'Provider''s identifier';
+comment on column metadata_provider.provider_name is 'Provider''s name';
+comment on column metadata_provider.provider_url is 'Provider''s url';
+comment on column metadata_provider.metadata is 'Other metadata about provider';
+
+
+-- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin
+-- also provides a translation to a defined json schema using a translation tool (indexer_configuration_id)
+create table origin_metadata(
+  id bigserial not null,  -- PK object identifier
+  origin_id bigint not null, -- references origin(id)
+  discovery_date timestamptz not null, -- when it was extracted
+  provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github'
+  tool_id bigint not null,
+  metadata jsonb not null
+);
+
+comment on table origin_metadata is 'keeps all metadata found concerning an origin';
+comment on column origin_metadata.id is 'the origin_metadata object''s id';
+comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata.discovery_date is 'the date of retrieval';
+comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.';
+comment on column origin_metadata.metadata is 'metadata in json format but with original terms';
+
+create table origin_metadata_translation(
+  id bigserial not null,  -- PK origin_metadata identifier
+  result jsonb,
+  indexer_configuration_id bigint
+);
+
+comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry';
+comment on column origin_metadata_translation.id is 'the entry id in origin_metadata';
+comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool';
+comment on column origin_metadata_translation.indexer_configuration_id is 'tool used for translation';
+
 -- Keep a cache of object counts
 create table object_counts (
   object_type text,
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -257,3 +257,31 @@
 
     def indexer_configuration_get(self, tool):
         return self.post('indexer_configuration/data', {'tool': tool})
+
+    def origin_metadata_add(self, origin_id, ts, provider, tool, metadata):
+        return self.post('origin/metadata/add', {'origin_id': origin_id,
+                                                 'ts': ts,
+                                                 'provider': provider,
+                                                 'tool': tool,
+                                                 'metadata': metadata})
+
+    def origin_metadata_get(self, id):
+        return self.post('origin/metadata/get', {'id': id})
+
+    def origin_metadata_get_all(self, origin_id):
+        return self.post('origin/metadata/getall', {'origin_id': origin_id})
+
+    def origin_metadata_get_by_provider_type(self, origin_id, provider_type):
+        return self.post('origin/metadata/byprovidertype', {
+            'origin_id': origin_id,
+            'provider_type': provider_type})
+
+    def metadata_provider_add(self, provider_name, provider_type, provider_url,
+                              metadata):
+        return self.post('provider/add', {'provider_name': provider_name,
+                                          'provider_type': provider_type,
+                                          'provider_url': provider_url,
+                                          'metadata': metadata})
+
+    def metadata_provider_get(self, provider_id):
+        return self.post('provider/get', {'provider_id': provider_id})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -416,6 +416,43 @@
         g.storage.revision_metadata_get(**decode_request(request)))
 
 
+@app.route('/origin/metadata/add', methods=['POST'])
+def origin_metadata_add():
+    return encode_data(g.storage.origin_metadata_add(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/get', methods=['POST'])
+def origin_metadata_get():
+    return encode_data(g.storage.origin_metadata_get(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/getall', methods=['POST'])
+def origin_metadata_getall():
+    return encode_data(g.storage.origin_metadata_get_all(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/byprovidertype', methods=['POST'])
+def origin_metadata_get_by_provider():
+    return encode_data(
+        g.storage.origin_metadata_get_by_provider_type(**decode_request(
+            request)))
+
+
+@app.route('/provider/add', methods=['POST'])
+def metadata_provider_add():
+    return encode_data(g.storage.metadata_provider_add(**decode_request(
+        request)))
+
+
+@app.route('/provider/get', methods=['POST'])
+def metadata_provider_get():
+    return encode_data(g.storage.metadata_provider_get(**decode_request(
+        request)))
+
+
 @app.route('/stat/counters', methods=['GET'])
 def stat_counters():
     return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1028,6 +1028,86 @@
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
+    def origin_metadata_add(self, origin, ts, provider, tool,
+                            metadata, cur=None):
+        """Add an origin_metadata for the origin at ts with provider, tool and
+        metadata.
+
+        Args:
+            origin (int): the origin's id for which the metadata is added
+            ts (date): timestamp of the found metadata
+            provider (int): the metadata provider identifier
+            tool (int): the tool's identifier used to extract metadata
+            metadata (jsonb): the metadata retrieved at the time and location
+
+        Returns:
+            id (int): the origin_metadata unique id
+
+        """
+        cur = self._cursor(cur)
+        insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
+                    provider_id, tool_id, metadata) values (%s, %s, %s, %s, %s)
+                    RETURNING id"""
+        cur.execute(insert, (origin, ts, provider, tool, jsonize(metadata)))
+
+        return cur.fetchone()[0]
+
+    origin_metadata_get_cols = ['id', 'origin_id', 'discovery_date',
+                                'provider_id', 'tool_id', 'metadata']
+
+    def origin_metadata_get(self, id, cur=None):
+        """Retrieve the unique entry of one origin_metadata by its identifier
+
+        """
+        cur = self._cursor(cur)
+
+        query = """SELECT %s
+                   FROM origin_metadata
+                   WHERE id=%%s
+                """ % (', '.join(self.origin_metadata_get_cols))
+
+        cur.execute(query, (id, ))
+
+        r = cur.fetchone()
+        if not r:
+            return None
+        return line_to_bytes(r)
+
+    def origin_metadata_get_all(self, origin_id, cur=None):
+        """Retrieve all origin_metadata entries of one origin, ordered by id
+
+        """
+        cur = self._cursor(cur)
+
+        query = """SELECT %s
+                   FROM origin_metadata
+                   WHERE origin_id=%%s
+                   ORDER BY id""" % (
+            ', '.join(self.origin_metadata_get_cols))
+
+        cur.execute(query, (origin_id, ))
+
+        yield from cursor_to_bytes(cur)
+
+    origin_metadata_provider_cols = ['id', 'origin_id', 'discovery_date',
+                                     'tool_id', 'metadata', 'provider_id',
+                                     'provider_name', 'provider_type',
+                                     'provider_url']
+
+    def origin_metadata_get_by_provider_type(self, origin_id, provider_type,
+                                             cur=None):
+        """Retrieve all entries for one origin_id and from one provider
+
+        """
+        cur = self._cursor(cur)
+        query = '''SELECT %s
+                   FROM swh_origin_metadata_get_by_provider_type(
+                       %%s, %%s)''' % (','.join(
+                           self.origin_metadata_provider_cols))
+
+        cur.execute(query, (origin_id, provider_type))
+        yield from cursor_to_bytes(cur)
+
     indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
                                   'tool_configuration']
 
@@ -1046,3 +1126,32 @@
         if not data:
             return None
         return line_to_bytes(data)
+
+    metadata_provider_cols = ['id', 'provider_name', 'provider_type',
+                              'provider_url', 'metadata']
+
+    def metadata_provider_get(self, provider_id, cur=None):
+        """Retrieve one metadata provider row by its identifier, or None."""
+        cur = self._cursor(cur)
+        cur.execute('''select %s
+                       from metadata_provider
+                       where id=%%s ''' % (
+                           ','.join(self.metadata_provider_cols)),
+                    (provider_id, ))
+
+        data = cur.fetchone()
+        if not data:
+            return None
+        return line_to_bytes(data)
+
+    def metadata_provider_add(self, provider_name, provider_type,
+                              provider_url, metadata, cur=None):
+        """Insert a new provider and return the new identifier."""
+        cur = self._cursor(cur)
+        insert = """INSERT INTO metadata_provider (provider_name, provider_type,
+                    provider_url, metadata) values (%s, %s, %s, %s)
+                    RETURNING id"""
+
+        cur.execute(insert, (provider_name, provider_type, provider_url,
+                             jsonize(metadata)))
+        return cur.fetchone()[0]
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1724,6 +1724,108 @@
         db.revision_metadata_add_from_temp(conflict_update, cur)
 
     @db_transaction
+    def origin_metadata_add(self, origin_id, ts, provider, tool, metadata,
+                            cur=None):
+        """Add an origin_metadata for the origin at ts with provider, tool and
+        metadata.
+
+        Args:
+            origin_id (int): the origin's id for which the metadata is added
+            ts (datetime): timestamp of the found metadata
+            provider (int): the provider of metadata (ex:'hal')
+            tool (int): tool used to extract metadata
+            metadata (jsonb): the metadata retrieved at the time and location
+
+        Returns:
+            id (int): the origin_metadata unique id
+        """
+        if isinstance(ts, str):
+            ts = dateutil.parser.parse(ts)
+
+        return self.db.origin_metadata_add(origin_id, ts, provider, tool,
+                                           metadata, cur)
+
+    @db_transaction
+    def origin_metadata_get(self, id, cur=None):
+        """Return the origin_metadata entry for the unique id
+
+        Args:
+            id (int): the unique metadata identifier
+
+        Returns:
+            dict: the origin_metadata dictionary with the keys:
+
+            - id: origin_metadata's id
+            - origin_id: origin's id
+            - discovery_date: timestamp of discovery
+            - provider_id (int): metadata's provider
+            - tool_id (int): tool used for this metadata
+            - metadata (jsonb)
+
+        """
+        db = self.db
+
+        om = db.origin_metadata_get(id, cur)
+
+        if om:
+            return dict(zip(db.origin_metadata_get_cols, om))
+        return None
+
+    @db_transaction_generator
+    def origin_metadata_get_all(self, origin_id, cur=None):
+        """Retrieve list of all origin_metadata entries for the origin_id
+
+        Args:
+            origin_id (int): the unique origin identifier
+
+        Returns:
+            list of dicts: the origin_metadata dictionary with the keys:
+
+            - id: origin_metadata's id
+            - origin_id: origin's id
+            - discovery_date: timestamp of discovery
+            - provider_id (int): metadata's provider
+            - tool_id (int): tool used for this metadata
+            - metadata (jsonb):
+
+        """
+        db = self.db
+        for line in db.origin_metadata_get_all(origin_id, cur):
+            data = dict(zip(db.origin_metadata_get_cols, line))
+            yield data
+
+    @db_transaction_generator
+    def origin_metadata_get_by_provider_type(self, origin_id, provider_type,
+                                             cur=None):
+        """Retrieve list of origin_metadata entries for an origin and
+        a specific provider_type (e.g: 'registry', 'deposit-client')
+
+        Args:
+            origin_id (int): the unique origin identifier
+            provider_type (text): the type of provider
+
+        Returns:
+            list of dicts: the origin_metadata dictionary with the keys:
+
+            - id (int): origin_metadata's id
+            - origin_id (int): origin's id
+            - discovery_date (datetime): timestamp of discovery
+            - tool_id (int): metadata's extracting tool
+            - metadata (jsonb)
+            - provider_id (int): metadata's provider
+            - provider_name (str)
+            - provider_type (str)
+            - provider_url (str)
+
+        """
+        db = self.db
+        for line in db.origin_metadata_get_by_provider_type(origin_id,
+                                                            provider_type,
+                                                            cur):
+            data = dict(zip(self.db.origin_metadata_provider_cols, line))
+            yield data
+
+    @db_transaction
     def indexer_configuration_get(self, tool, cur=None):
         db = self.db
         tool_conf = tool['tool_configuration']
@@ -1735,3 +1837,40 @@
         if not idx:
             return None
         return dict(zip(self.db.indexer_configuration_cols, idx))
+
+    @db_transaction
+    def metadata_provider_add(self, provider_name, provider_type, provider_url,
+                              metadata, cur=None):
+        """Add a metadata provider.
+
+        Args:
+            provider_name (str): its name
+            provider_type (str): its type (e.g. 'deposit-client', 'registry')
+            provider_url (str): its url
+            metadata (dict): other metadata about the provider
+
+        Returns:
+            int: the identifier of the inserted provider
+
+        """
+        db = self.db
+        return db.metadata_provider_add(provider_name, provider_type,
+                                        provider_url, metadata, cur)
+
+    @db_transaction
+    def metadata_provider_get(self, provider_id, cur=None):
+        """Get a metadata provider by its identifier.
+
+        Args:
+            provider_id (int): the provider's identifier
+
+        Returns:
+            dict: the provider's columns (id, provider_name, provider_type,
+            provider_url, metadata), or None if no provider matches
+
+        """
+        db = self.db
+        result = db.metadata_provider_get(provider_id, cur)
+        if not result:
+            return None
+        return dict(zip(self.db.metadata_provider_cols, result))
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -335,6 +335,47 @@
             'type': 'git',
         }
 
+        self.provider = {
+            'name': 'hal',
+            'type': 'deposit-client',
+            'url': 'http://hal/inria',
+            'metadata': {
+                'location': 'France'
+            }
+        }
+
+        self.metadata_tool = {
+            'tool_name': 'swh-deposit',
+            'tool_version': '0.0.1',
+            'tool_configuration': {
+                'sword_version': '2'
+            }
+        }
+
+        self.origin_metadata = {
+            'origin': self.origin,
+            'discovery_date': datetime.datetime(2015, 1, 1, 23, 0, 0,
+                                                tzinfo=datetime.timezone.utc),
+            'provider': self.provider,
+            'tool': 'swh-deposit',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }
+
+        self.origin_metadata2 = {
+            'origin': self.origin,
+            'discovery_date': datetime.datetime(2017, 1, 1, 23, 0, 0,
+                                                tzinfo=datetime.timezone.utc),
+            'provider': self.provider,
+            'tool': 'swh-deposit',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }
+
         self.date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0,
                                              tzinfo=datetime.timezone.utc)
 
@@ -3429,6 +3470,199 @@
 
         self.assertEqual(expected_tool, actual_tool)
 
+    @istest
+    def origin_metadata_add(self):
+        # given
+        metadata_id = 1
+        origin_metadata0 = self.storage.origin_metadata_get(metadata_id)
+        self.assertIsNone(origin_metadata0)
+
+        origin_id = self.storage.origin_add([self.origin])[0]
+        provider_id = self.storage.metadata_provider_add(
+            self.provider['name'],
+            self.provider['type'],
+            self.provider['url'],
+            self.provider['metadata'])
+        tool = self.storage.indexer_configuration_get(self.metadata_tool)
+
+        # when adding for the same origin 2 metadatas
+        o_m1 = self.storage.origin_metadata_add(
+            origin_id,
+            self.origin_metadata['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata['metadata'])
+        o_m2 = self.storage.origin_metadata_add(
+            origin_id,
+            self.origin_metadata2['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata2['metadata'])
+        actual_om1 = self.storage.origin_metadata_get(o_m1)
+        actual_om2 = self.storage.origin_metadata_get(o_m2)
+
+        # then
+        self.assertEqual(actual_om1['id'], o_m1)
+        self.assertEqual(actual_om2['id'], o_m2)
+        self.assertEqual(actual_om1['origin_id'], origin_id)
+        self.assertEqual(actual_om2['origin_id'], origin_id)
+
+    @istest
+    def origin_metadata_get_all(self):
+        # given
+        origin_id = self.storage.origin_add([self.origin])[0]
+        origin_id2 = self.storage.origin_add([self.origin2])[0]
+
+        provider_id = self.storage.metadata_provider_add(
+            self.provider['name'],
+            self.provider['type'],
+            self.provider['url'],
+            self.provider['metadata'])
+        tool = self.storage.indexer_configuration_get(self.metadata_tool)
+        # when adding for the same origin 2 metadatas
+        o_m1 = self.storage.origin_metadata_add(
+            origin_id,
+            self.origin_metadata['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata['metadata'])
+        o_m2 = self.storage.origin_metadata_add(
+            origin_id2,
+            self.origin_metadata2['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata2['metadata'])
+        o_m3 = self.storage.origin_metadata_add(
+            origin_id,
+            self.origin_metadata2['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata2['metadata'])
+        all_metadatas = list(self.storage.origin_metadata_get_all(origin_id))
+        metadatas_for_origin2 = list(self.storage.origin_metadata_get_all(
+            origin_id2))
+        expected_results = [{
+            'origin_id': origin_id,
+            'discovery_date': datetime.datetime(
+                2015, 1, 2, 0, 0,
+                tzinfo=psycopg2.tz.FixedOffsetTimezone(
+                    offset=60,
+                    name=None)),
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            },
+            'id': o_m1,
+            'provider_id': provider_id,
+            'tool_id': tool['id']
+        }, {
+            'origin_id': origin_id,
+            'discovery_date': datetime.datetime(
+                2017, 1, 2, 0, 0,
+                tzinfo=psycopg2.tz.FixedOffsetTimezone(
+                    offset=60,
+                    name=None)),
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            },
+            'id': o_m3,
+            'provider_id': provider_id,
+            'tool_id': tool['id']
+        }]
+
+        # then
+        self.assertEqual(len(all_metadatas), 2)
+        self.assertEqual(len(metadatas_for_origin2), 1)
+        self.assertEqual(metadatas_for_origin2[0]['id'], o_m2)
+        self.assertEqual(all_metadatas, expected_results)
+
+    @istest
+    def origin_metadata_get_by_provider_type(self):
+        # given
+        origin_id = self.storage.origin_add([self.origin])[0]
+        origin_id2 = self.storage.origin_add([self.origin2])[0]
+        provider_id = self.storage.metadata_provider_add(
+            self.provider['name'],
+            self.provider['type'],
+            self.provider['url'],
+            self.provider['metadata'])
+
+        provider_id2 = self.storage.metadata_provider_add(
+            'swMATH',
+            'registry',
+            'http://www.swmath.org/',
+            {'email': 'contact@swmath.org',
+             'license': 'All rights reserved'})
+
+        # using the only tool now inserted in the data.sql, but for this
+        # provider should be a crawler tool (not yet implemented)
+        tool = self.storage.indexer_configuration_get(self.metadata_tool)
+
+        # when adding for the same origin 2 metadatas
+        o_m1 = self.storage.origin_metadata_add(
+            origin_id,
+            self.origin_metadata['discovery_date'],
+            provider_id,
+            tool['id'],
+            self.origin_metadata['metadata'])
+        o_m2 = self.storage.origin_metadata_add(
+            origin_id2,
+            self.origin_metadata2['discovery_date'],
+            provider_id2,
+            tool['id'],
+            self.origin_metadata2['metadata'])
+        provider_type = 'registry'
+        m_by_provider = list(self.storage.
+                             origin_metadata_get_by_provider_type(
+                                 origin_id2,
+                                 provider_type))
+        expected_results = [{
+            'origin_id': origin_id2,
+            'discovery_date': datetime.datetime(
+                2017, 1, 2, 0, 0,
+                tzinfo=psycopg2.tz.FixedOffsetTimezone(
+                    offset=60,
+                    name=None)),
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            },
+            'id': o_m2,
+            'provider_id': provider_id2,
+            'provider_name': 'swMATH',
+            'provider_type': provider_type,
+            'provider_url': 'http://www.swmath.org/',
+            'tool_id': tool['id']
+        }]
+        # then
+
+        self.assertEqual(len(m_by_provider), 1)
+        self.assertEqual(m_by_provider, expected_results)
+        self.assertEqual(m_by_provider[0]['id'], o_m2)
+        self.assertIsNotNone(o_m1)
+
+    @istest
+    def metadata_provider_get(self):
+        # given
+        provider_id = self.storage.metadata_provider_add(
+            self.provider['name'],
+            self.provider['type'],
+            self.provider['url'],
+            self.provider['metadata'])
+
+        # when
+        actual_provider = self.storage.metadata_provider_get(provider_id)
+
+        # then
+        self.assertEqual(actual_provider, {
+            'id': provider_id,
+            'provider_name': self.provider['name'],
+            'provider_type': self.provider['type'],
+            'provider_url': self.provider['url'],
+            'metadata': self.provider['metadata'],
+        })
+
 
 class TestLocalStorage(CommonTestStorage, unittest.TestCase):
     """Test the local storage"""