diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql --- a/sql/swh-indexes.sql +++ b/sql/swh-indexes.sql @@ -294,6 +294,22 @@ alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey; +-- origin_metadata +create unique index concurrently origin_metadata_pkey on origin_metadata(id); +alter table origin_metadata add primary key using index origin_metadata_pkey; + +create index concurrently on origin_metadata(origin_id, provenance); + +alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid; +alter table origin_metadata validate constraint origin_metadata_origin_fkey; + +-- origin_metadata_translation +create unique index concurrently origin_metadata_translation_pkey on origin_metadata_translation(id, indexer_configuration_id); +alter table origin_metadata_translation add primary key using index origin_metadata_translation_pkey; + +alter table origin_metadata_translation add constraint origin_metadata_translation_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table origin_metadata_translation validate constraint origin_metadata_translation_indexer_configuration_id_fkey; + -- object_counts create unique index concurrently object_counts_pkey on object_counts(object_type); alter table object_counts add primary key using index object_counts_pkey; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(109, now(), 'Work In Progress'); + values(110, now(), 'Work In Progress'); -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); @@ -492,6 +492,34 
@@ comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; +-- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin +-- also provides a translation to a defined json schema using a translation tool (indexer_configuration_id) +create table origin_metadata( + id bigserial not null, -- PK object identifier + origin_id bigint not null, -- references origin(id) + discovery_date timestamptz not null, -- when it was extracted + provenance text not null, -- ex: 'deposit-hal', 'lister-github', 'loader-github' + metadata jsonb not null +); + +comment on table origin_metadata is 'keeps all metadata found concerning an origin'; +comment on column origin_metadata.id is 'the origin_metadata object''s id'; +comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found'; +comment on column origin_metadata.discovery_date is 'the date of retrieval'; +comment on column origin_metadata.provenance is 'the type of metadata provenance: deposit, lister, loader, publisher, etc'; +comment on column origin_metadata.metadata is 'metadata in json format but with original terms'; + +create table origin_metadata_translation( + id bigserial not null, -- PK origin_metadata identifier + result jsonb, + indexer_configuration_id bigint +); + +comment on table origin_metadata_translation is 'keeps the translated metadata for an origin_metadata entry'; +comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; +comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; +comment on column origin_metadata_translation.indexer_configuration_id is 'tool used for translation'; + -- Keep a cache of object counts create table object_counts ( object_type text, diff --git a/swh/storage/api/client.py 
b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -255,5 +255,22 @@ def revision_metadata_get(self, ids): return self.post('revision_metadata', {'ids': ids}) + def origin_metadata_add(self, origin_id, ts, provenance, metadata): + return self.post('origin/metadata/add', {'origin_id': origin_id, + 'ts': ts, + 'provenance': provenance, + 'metadata': metadata}) + + def origin_metadata_get(self, id): + return self.post('origin/metadata/get', {'id': id}) + + def origin_metadata_get_all(self, origin_id): + return self.post('origin/metadata/getall', {'origin_id': origin_id}) + + def origin_metadata_get_by_provenance(self, origin_id, provenance): + return self.post('origin/metadata/getbyprovenance', { + 'origin_id': origin_id, + 'provenance': provenance}) + def indexer_configuration_get(self, tool): return self.post('indexer_configuration/data', {'tool': tool}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -416,6 +416,30 @@ g.storage.revision_metadata_get(**decode_request(request))) +@app.route('/origin/metadata/add', methods=['POST']) +def origin_metadata_add(): + return encode_data(g.storage.origin_metadata_add(**decode_request( + request))) + + +@app.route('/origin/metadata/get', methods=['POST']) +def origin_metadata_get(): + return encode_data(g.storage.origin_metadata_get(**decode_request( + request))) + + +@app.route('/origin/metadata/getall', methods=['POST']) +def origin_metadata_getall(): + return encode_data(g.storage.origin_metadata_get_all(**decode_request( + request))) + + +@app.route('/origin/metadata/getbyprovenance', methods=['POST']) +def origin_metadata_get_by(): + return encode_data( + g.storage.origin_metadata_get_by_provenance(**decode_request(request))) + + @app.route('/stat/counters', methods=['GET']) def stat_counters(): return encode_data(g.storage.stat_counters()) diff --git a/swh/storage/db.py b/swh/storage/db.py 
--- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1043,3 +1043,81 @@ if not data: return None return line_to_bytes(data) + + def origin_metadata_add(self, origin, ts, provenance, metadata, cur=None): + """ Add an origin_metadata for the origin at ts with provenance and + metadata. + + Args: + origin: the origin's id for which the metadata is added + ts: timestamp of the found metadata + provenance (text): the tool and location where it was found + (ex:'deposit-hal') + metadata (jsonb): the metadata retrieved at the time and location + + Returns: + id (int): the origin_metadata unique id + + """ + cur = self._cursor(cur) + insert = """INSERT INTO origin_metadata (origin_id, discovery_date, + provenance, metadata) values (%s, %s, %s, %s) + RETURNING id""" + cur.execute(insert, (origin, ts, provenance, jsonize(metadata))) + + return cur.fetchone()[0] + + origin_metadata_get_cols = ['id', 'origin_id', 'discovery_date', + 'provenance', 'metadata'] + + def origin_metadata_get(self, id, cur=None): + """Retrieve the unique entry of one origin_metadata by its identifier + + """ + cur = self._cursor(cur) + + query = """\ + SELECT %s + FROM origin_metadata + WHERE id=%%s + """ % (', '.join(self.origin_metadata_get_cols)) + + cur.execute(query, (id, )) + + r = cur.fetchone() + if not r: + return None + return line_to_bytes(r) + + def origin_metadata_get_all(self, origin_id, cur=None): + """Retrieve all origin_metadata entries for one origin_id + + """ + cur = self._cursor(cur) + + query = """\ + SELECT %s + FROM origin_metadata + WHERE origin_id=%%s """ % ( + ', '.join(self.origin_metadata_get_cols)) + + cur.execute(query, (origin_id, )) + + yield from cursor_to_bytes(cur) + + def origin_metadata_get_by_provenance(self, origin_id, provenance, + cur=None): + """Retrieve all entries for one origin_id and from one provenance + + """ + cur = self._cursor(cur) + + query = """\ + SELECT %s + FROM origin_metadata + WHERE origin_id=%%s AND provenance=%%s + """ % (', '.join(self.origin_metadata_get_cols)) + + 
cur.execute(query, (origin_id, provenance)) + + yield from cursor_to_bytes(cur) diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1724,6 +1724,91 @@ db.revision_metadata_add_from_temp(conflict_update, cur) @db_transaction + def origin_metadata_add(self, origin_id, ts, provenance, metadata, + cur=None): + """ Add an origin_metadata for the origin at ts with provenance and + metadata. + + Args: + origin_id: the origin's id for which the metadata is added + ts: timestamp of the found metadata + provenance (text): the tool and location where it was found + (ex:'deposit-hal') + metadata (jsonb): the metadata retrieved at the time and location + + Returns: + id (int): the origin_metadata unique id + """ + if isinstance(ts, str): + ts = dateutil.parser.parse(ts) + + return self.db.origin_metadata_add(origin_id, ts, provenance, + metadata, cur) + + @db_transaction + def origin_metadata_get(self, id, cur=None): + """Return the origin_metadata entry for the unique id + + Returns: + dict: the origin_metadata dictionary with the keys: + + - id: origin_metadata's id + - origin_id: origin's id + - discovery_date: timestamp of discovery + - provenance (text): metadata's provenance + - metadata (jsonb): + + """ + db = self.db + + om = db.origin_metadata_get(id, cur) + + if om: + return dict(zip(self.db.origin_metadata_get_cols, om)) + return None + + @db_transaction_generator + def origin_metadata_get_all(self, origin_id, cur=None): + """Retrieve list of all origin_metadata entries for the origin_id + + Returns: + list of dicts: the origin_metadata dictionary with the keys: + + - id: origin_metadata's id + - origin_id: origin's id + - discovery_date: timestamp of discovery + - provenance (text): metadata's provenance + - metadata (jsonb): + + """ + db = self.db + for line in db.origin_metadata_get_all(origin_id, cur): + data = dict(zip(self.db.origin_metadata_get_cols, line)) + yield data + + 
@db_transaction_generator + def origin_metadata_get_by_provenance(self, origin_id, provenance, + cur=None): + """Retrieve list of origin_metadata entries for an origin and + a specific provenance + + Returns: + list of dicts: the origin_metadata dictionary with the keys: + + - id: origin_metadata's id + - origin_id: origin's id + - discovery_date: timestamp of discovery + - provenance (text): metadata's provenance + - metadata (jsonb): + + """ + db = self.db + for line in db.origin_metadata_get_by_provenance(origin_id, provenance, + cur): + data = dict(zip(self.db.origin_metadata_get_cols, line)) + yield data + + @db_transaction def indexer_configuration_get(self, tool, cur=None): db = self.db tool_conf = tool['tool_configuration'] diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -335,6 +335,28 @@ 'type': 'git', } + self.origin_metadata = { + 'origin': self.origin, + 'discovery_date': datetime.datetime(2015, 1, 1, 23, 0, 0, + tzinfo=datetime.timezone.utc), + 'provenance': 'deposit-hal', + 'metadata': { + 'name': 'test_origin_metadata', + 'version': '0.0.1' + } + } + + self.origin_metadata2 = { + 'origin': self.origin, + 'discovery_date': datetime.datetime(2017, 1, 1, 23, 0, 0, + tzinfo=datetime.timezone.utc), + 'provenance': 'lister-github', + 'metadata': { + 'name': 'test_origin_metadata', + 'version': '0.0.1' + } + } + self.date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) @@ -3429,6 +3451,118 @@ self.assertEqual(expected_tool, actual_tool) + @istest + def origin_metadata_add(self): + # given + metadata_id = 1 + origin_metadata0 = self.storage.origin_metadata_get(metadata_id) + self.assertIsNone(origin_metadata0) + + origin_id = self.storage.origin_add([self.origin])[0] + # when adding for the same origin 2 metadatas + o_m1 = self.storage.origin_metadata_add( + origin_id, + self.origin_metadata['discovery_date'], + 
self.origin_metadata['provenance'], + self.origin_metadata['metadata']) + o_m2 = self.storage.origin_metadata_add( + origin_id, + self.origin_metadata2['discovery_date'], + self.origin_metadata2['provenance'], + self.origin_metadata2['metadata']) + actual_om1 = self.storage.origin_metadata_get(o_m1) + actual_om2 = self.storage.origin_metadata_get(o_m2) + + # then + self.assertEqual(actual_om1['id'], o_m1) + self.assertEqual(actual_om2['id'], o_m2) + self.assertEqual(actual_om1['origin_id'], origin_id) + self.assertEqual(actual_om2['origin_id'], origin_id) + + @istest + def origin_metadata_get_all(self): + # given + origin_id = self.storage.origin_add([self.origin])[0] + + # when adding for the same origin 2 metadatas + o_m1 = self.storage.origin_metadata_add( + origin_id, + self.origin_metadata['discovery_date'], + self.origin_metadata['provenance'], + self.origin_metadata['metadata']) + o_m2 = self.storage.origin_metadata_add( + origin_id, + self.origin_metadata2['discovery_date'], + self.origin_metadata2['provenance'], + self.origin_metadata2['metadata']) + all_metadatas = list(self.storage.origin_metadata_get_all(origin_id)) + + expected_results = [{ + 'origin_id': origin_id, + 'discovery_date': datetime.datetime( + 2015, 1, 2, 0, 0, + tzinfo=psycopg2.tz.FixedOffsetTimezone( + offset=60, + name=None)), + 'metadata': { + 'name': 'test_origin_metadata', + 'version': '0.0.1' + }, + 'id': o_m1, + 'provenance': 'deposit-hal' + }, { + 'origin_id': origin_id, + 'discovery_date': datetime.datetime( + 2017, 1, 2, 0, 0, + tzinfo=psycopg2.tz.FixedOffsetTimezone( + offset=60, + name=None)), + 'metadata': { + 'name': 'test_origin_metadata', + 'version': '0.0.1' + }, + 'id': o_m2, + 'provenance': 'lister-github' + }] + + # then + self.assertEqual(len(all_metadatas), 2) + # self.assertEqual(len(m_by_provenance), 1) + self.assertEqual(all_metadatas, expected_results) + + @istest + def origin_metadata_get_by_provenance(self): + # given + origin_id = 
self.storage.origin_add([self.origin])[0] + + # when adding for the same origin 2 metadatas + o_m1 = self.storage.origin_metadata_add( + origin_id, + self.origin_metadata['discovery_date'], + self.origin_metadata['provenance'], + self.origin_metadata['metadata']) + m_by_provenance = list(self.storage.origin_metadata_get_by_provenance( + origin_id, self.origin_metadata['provenance'])) + + expected_results = [{ + 'origin_id': origin_id, + 'discovery_date': datetime.datetime( + 2015, 1, 2, 0, 0, + tzinfo=psycopg2.tz.FixedOffsetTimezone( + offset=60, + name=None)), + 'metadata': { + 'name': 'test_origin_metadata', + 'version': '0.0.1' + }, + 'id': o_m1, + 'provenance': 'deposit-hal' + }] + # then + + self.assertEqual(len(m_by_provenance), 1) + self.assertEqual(m_by_provenance, expected_results) + class TestLocalStorage(CommonTestStorage, unittest.TestCase): """Test the local storage"""