diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -294,6 +294,22 @@
 alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
 alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey;
 
+-- origin_metadata
+create unique index concurrently origin_metadata_pkey on origin_metadata(id);
+alter table origin_metadata add primary key using index origin_metadata_pkey;
+
+create index concurrently on origin_metadata(origin_id, provenance);
+
+alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_origin_fkey;
+
+-- origin_metadata_translation
+create unique index concurrently origin_metadata_translation_pkey on origin_metadata_translation(id, indexer_configuration_id);
+alter table origin_metadata_translation add primary key using index origin_metadata_translation_pkey;
+
+alter table origin_metadata_translation add constraint origin_metadata_translation_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_metadata_translation validate constraint origin_metadata_translation_indexer_configuration_id_fkey;
+
 -- object_counts
 create unique index concurrently object_counts_pkey on object_counts(object_type);
 alter table object_counts add primary key using index object_counts_pkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -492,6 +492,34 @@
 comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format';
 comment on column revision_metadata.indexer_configuration_id is 'tool used for detection';
 
+-- Metadata discovered during a listing, loading, deposit or external_catalog of an origin.
+-- origin_metadata_translation holds its translation to a defined json schema, made with a translation tool (indexer_configuration_id).
+create table origin_metadata(
+  id         bigserial     not null, -- PK object identifier
+  origin_id  bigint        not null, -- references origin(id)
+  date       timestamptz   not null, -- when it was extracted
+  provenance text          not null, -- ex: 'deposit-hal', 'lister-github', 'loader-github'
+  metadata   jsonb         not null
+);
+
+comment on table origin_metadata is 'keeps all metadata found concerning an origin';
+comment on column origin_metadata.id is 'the origin_metadata object''s id';
+comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata.date is 'the date of retrieval';
+comment on column origin_metadata.provenance is 'the type of metadata provenance: deposit, lister, loader, publisher, etc';
+comment on column origin_metadata.metadata is 'metadata in json format but with original terms';
+
+create table origin_metadata_translation(
+  id                       bigserial not null, -- PK origin_metadata identifier; NOTE(review): confirm bigserial is intended for an id that refers to origin_metadata
+  result                   jsonb,              -- nullable: translated metadata
+  indexer_configuration_id bigint              -- tool used; becomes not null once it is part of the primary key
+);
+
+comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry';
+comment on column origin_metadata_translation.id is 'the entry id in origin_metadata';
+comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool';
+comment on column origin_metadata_translation.indexer_configuration_id is 'tool used for translation';
+
 -- Keep a cache of object counts
 create table object_counts (
   object_type text,
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -255,5 +255,22 @@
     def revision_metadata_get(self, ids):
         return self.post('revision_metadata', {'ids': ids})
 
+    def origin_metadata_add(self, origin, ts, provenance, metadata):
+        return self.post('origin/metadata/add', {'origin': origin,
+                                                 'ts': ts,
+                                                 'provenance': provenance,
+                                                 'metadata': metadata})
+
+    def origin_metadata_get(self, origin_metadata_id):
+        return self.post('origin/metadata/get', {'id': origin_metadata_id})
+
+    def origin_metadata_get_all(self, origin_id):
+        return self.post('origin/metadata/getall', {'origin_id': origin_id})
+
+    def origin_metadata_get_by_provenance(self, origin_id, provenance):
+        return self.post('origin/metadata/getbyprovenance', {
+            'origin_id': origin_id,
+            'provenance': provenance})
+
     def indexer_configuration_get(self, tool):
         return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -416,6 +416,30 @@
         g.storage.revision_metadata_get(**decode_request(request)))
 
 
+@app.route('/origin/metadata/add', methods=['POST'])
+def origin_metadata_add():
+    return encode_data(g.storage.origin_metadata_add(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/get', methods=['POST'])
+def origin_metadata_get():
+    return encode_data(g.storage.origin_metadata_get(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/getall', methods=['POST'])
+def origin_metadata_getall():
+    return encode_data(g.storage.origin_metadata_get_all(**decode_request(
+        request)))
+
+
+@app.route('/origin/metadata/getbyprovenance', methods=['POST'])
+def origin_metadata_get_by():
+    return encode_data(
+        g.storage.origin_metadata_get_by_provenance(**decode_request(request)))
+
+
 @app.route('/stat/counters', methods=['GET'])
 def stat_counters():
     return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -335,6 +335,28 @@
             'type': 'git',
         }
 
+        self.origin_metadata = {
+            'origin': self.origin,
+            'ts': datetime.datetime(2015, 1, 1, 23, 0, 0,
+                                    tzinfo=datetime.timezone.utc),
+            'provenance': 'deposit-hal',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }
+
+        self.origin_metadata2 = {
+            'origin': self.origin,
+            'ts': datetime.datetime(2017, 1, 1, 23, 0, 0,
+                                    tzinfo=datetime.timezone.utc),
+            'provenance': 'lister-github',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }
+
         self.date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0,
                                              tzinfo=datetime.timezone.utc)
 
@@ -3429,6 +3451,68 @@
 
         self.assertEqual(expected_tool, actual_tool)
 
+    @istest
+    def origin_metadata_add(self):
+        # given -- NOTE(review): get() is passed an origin dict, not an id
+        origin_metadata0 = self.storage.origin_metadata_get(self.origin)
+        self.assertIsNone(origin_metadata0)
+
+        origin_id = self.storage.origin_add([self.origin])
+
+        # when adding 2 metadata entries for the same origin
+        o_m_id1 = self.storage.origin_metadata_add(**self.origin_metadata)
+        o_m_id2 = self.storage.origin_metadata_add(**self.origin_metadata2)
+        actual_origin_metadata1 = self.storage.origin_metadata_get(o_m_id1)
+        actual_origin_metadata2 = self.storage.origin_metadata_get(o_m_id2)
+
+        # then each entry is retrievable by its own id and keeps the origin
+        self.assertEqual(actual_origin_metadata1['id'], o_m_id1)
+        self.assertEqual(actual_origin_metadata2['id'], o_m_id2)
+        self.assertEqual(actual_origin_metadata1['origin_id'], origin_id)
+        self.assertEqual(actual_origin_metadata2['origin_id'], origin_id)
+
+    @istest
+    def origin_metadata_get(self):
+        # given
+        origin_id = self.storage.origin_add([self.origin])
+
+        # when adding 2 metadata entries for the same origin
+        o_m_id1 = self.storage.origin_metadata_add(**self.origin_metadata)
+        o_m_id2 = self.storage.origin_metadata_add(**self.origin_metadata2)
+        all_metadatas = self.storage.origin_metadata_get_all(
+            origin_id=origin_id,
+        )
+        m_by_provenance = self.storage.origin_metadata_get_by_provenance(
+            origin_id=origin_id,
+            provenance=self.origin_metadata['provenance']
+        )
+        expected_results = [{
+            'id': o_m_id1,
+            'origin_id': origin_id,
+            'date': datetime.datetime(2015, 1, 1, 23, 0, 0,
+                                      tzinfo=datetime.timezone.utc),
+            'provenance': 'deposit-hal',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }, {
+            'id': o_m_id2,
+            'origin_id': origin_id,
+            'date': datetime.datetime(2017, 1, 1, 23, 0, 0,
+                                      tzinfo=datetime.timezone.utc),
+            'provenance': 'lister-github',
+            'metadata': {
+                'name': 'test_origin_metadata',
+                'version': '0.0.1'
+            }
+        }]
+
+        # then both entries are listed; only one matches 'deposit-hal'
+        self.assertEqual(len(all_metadatas), 2)
+        self.assertEqual(len(m_by_provenance), 1)
+        self.assertEqual(all_metadatas, expected_results)
+
 
 class TestLocalStorage(CommonTestStorage, unittest.TestCase):
     """Test the local storage"""