diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -126,13 +126,15 @@ comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; -create table origin_metadata_translation( - id bigserial not null, -- PK origin_metadata identifier - result jsonb, - tool_id bigint +create table origin_intrinsic_metadata( + origin_id bigserial not null, + metadata jsonb, + indexer_configuration_id bigint not null, + from_revision sha1_git not null ); -comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry'; -comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; -comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; -comment on column origin_metadata_translation.tool_id is 'tool used for translation'; +comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; +comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; +comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; +comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -346,6 +346,18 @@ comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; +-- create a temporary table for retrieving origin_intrinsic_metadata +create or replace function swh_mktemp_origin_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table tmp_origin_intrinsic_metadata ( + like origin_intrinsic_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; + create or replace function swh_mktemp_indexer_configuration() returns void language sql @@ -379,3 +391,39 @@ return; end $$; + +-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, +-- overwriting duplicates if conflict_update is true, skipping duplicates +-- otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_origin_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_origin_intrinsic_metadata, 2. call this function +create or replace function swh_origin_intrinsic_metadata_add( + conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) + select origin_id, metadata, indexer_configuration_id, from_revision + from tmp_origin_intrinsic_metadata + on conflict(origin_id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + else + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) + select origin_id, metadata, indexer_configuration_id, from_revision + from tmp_origin_intrinsic_metadata + on conflict(origin_id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -55,3 +55,12 @@ alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; + +-- origin_intrinsic_metadata +create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); +alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; + +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; +alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -428,14 +428,13 @@ metadata (iterable): dictionaries with keys: id: sha1 - translated_metadata: bytes / jsonb ? + translated_metadata: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_metadata(cur) - # empty metadata is mapped to 'unknown' db.copy_to(metadata, 'tmp_content_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], @@ -491,20 +490,66 @@ metadata (iterable): dictionaries with keys: - id: sha1_git of revision - - translated_metadata: bytes / jsonb ? + - translated_metadata: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_revision_metadata(cur) - # empty metadata is mapped to 'unknown' db.copy_to(metadata, 'tmp_revision_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) + @remote_api_endpoint('origin_intrinsic_metadata') + @db_transaction_generator() + def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): + """Retrieve origin metadata per id. + + Args: + ids (iterable): origin identifiers + + Yields: + list: dictionaries with the following keys: + + id (int) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): + yield converters.db_to_metadata( + dict(zip(db.origin_intrinsic_metadata_cols, c))) + + @remote_api_endpoint('origin_intrinsic_metadata/add') + @db_transaction() + def origin_intrinsic_metadata_add(self, metadata, + conflict_update=False, db=None, + cur=None): + """Add origin metadata not present in storage. + + Args: + metadata (iterable): dictionaries with keys: + + - origin_id: origin identifier + - from_revision: sha1 id of the revision used to generate + these metadata. + - metadata: arbitrary dict + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db.mktemp_origin_intrinsic_metadata(cur) + + db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', + ['origin_id', 'metadata', 'indexer_configuration_id', + 'from_revision'], + cur) + db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py --- a/swh/indexer/storage/converters.py +++ b/swh/indexer/storage/converters.py @@ -115,16 +115,15 @@ """Convert a metadata entry into a ready metadata output. """ - return { - 'id': metadata['id'], - 'translated_metadata': metadata['translated_metadata'], - 'tool': { - 'id': metadata['tool_id'], - 'name': metadata['tool_name'], - 'version': metadata['tool_version'], - 'configuration': metadata['tool_configuration'] - } + metadata['tool'] = { + 'id': metadata['tool_id'], + 'name': metadata['tool_name'], + 'version': metadata['tool_version'], + 'configuration': metadata['tool_configuration'] } + del metadata['tool_id'], metadata['tool_configuration'] + del metadata['tool_version'], metadata['tool_name'] + return metadata def db_to_fossology_license(license): diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -89,18 +89,22 @@ array_agg(%s.license_id))) as licenses''' % main_table return key - def _get_from_list(self, table, ids, cols, cur=None): + def _get_from_list(self, table, ids, cols, cur=None, id_col='id'): cur = self._cursor(cur) keys = map(self._convert_key, cols) - yield from execute_values_to_bytes( - cur, """ - select %s - from (values %%s) as t(id) - inner join %s c - on c.id=t.id + query = """ + select {keys} + from (values %s) as t(id) + inner join {table} c + on c.{id_col}=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id; - """ % (', '.join(keys), table), + """.format( + keys=', '.join(keys), + id_col=id_col, + table=table) + yield from execute_values_to_bytes( + cur, query, ((_id,) for _id in ids) ) @@ -275,6 +279,26 @@ yield from self._get_from_list( 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) + origin_intrinsic_metadata_cols = [ + 'origin_id', 'metadata', 'from_revision', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_origin_intrinsic_metadata') + def mktemp_origin_intrinsic_metadata(self, cur=None): pass + + def origin_intrinsic_metadata_add_from_temp( + self, conflict_update, cur=None): + cur = self._cursor(cur) + cur.execute( + "SELECT swh_origin_intrinsic_metadata_add(%s)", + (conflict_update, )) + + def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None): + yield from self._get_from_list( + 'origin_intrinsic_metadata', orig_ids, + self.origin_intrinsic_metadata_cols, cur=cur, + id_col='origin_id') + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -41,6 +41,7 @@ '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') + self.origin_id_1 = 54974445 cur = self.test_db[self.TEST_DB_NAME].cursor tools = {} @@ -1129,7 +1130,6 @@ 'author': None, 'relatedLink': None, 'url': None, - 'type': None, 'license': None, 'maintainer': None, 'email': None, @@ -1163,7 +1163,6 @@ 'author': None, 'relatedLink': None, 'url': None, - 'type': None, 'license': None, 'maintainer': None, 'email': None, @@ -1205,7 +1204,6 @@ 'author': None, 'relatedLink': None, 'url': None, - 'type': None, 'license': None, 'maintainer': None, 'email': None, @@ -1265,7 +1263,6 @@ 'author': None, 'relatedLink': None, 'url': None, - 'type': None, 'license': None, 'maintainer': None, 'email': None, @@ -1303,7 +1300,6 @@ actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) - # language did not change as the v2 was dropped. expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], @@ -1313,6 +1309,206 @@ # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) + def test_origin_intrinsic_metadata_get(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata = { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None, + } + metadata_rev = { + 'id': self.revision_id_2, + 'translated_metadata': metadata, + 'indexer_configuration_id': tool_id, + } + metadata_origin = { + 'origin_id': self.origin_id_1, + 'metadata': metadata, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_2, + } + + # when + self.storage.revision_metadata_add([metadata_rev]) + self.storage.origin_intrinsic_metadata_add([metadata_origin]) + + # then + actual_metadata = list(self.storage.origin_intrinsic_metadata_get( + [self.origin_id_1, 42])) + + expected_metadata = [{ + 'origin_id': self.origin_id_1, + 'metadata': metadata, + 'tool': self.tools['swh-metadata-detector'], + 'from_revision': self.revision_id_2, + }] + + self.assertEqual(actual_metadata, expected_metadata) + + def test_origin_intrinsic_metadata_add_drop_duplicate(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + } + metadata_rev_v1 = { + 'id': self.revision_id_1, + 'translated_metadata': metadata_v1.copy(), + 'indexer_configuration_id': tool_id, + } + metadata_origin_v1 = { + 'origin_id': self.origin_id_1, + 'metadata': metadata_v1.copy(), + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_1, + } + + # given + self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) + + # when + actual_metadata = list(self.storage.origin_intrinsic_metadata_get( + [self.origin_id_1, 42])) + + expected_metadata_v1 = [{ + 'origin_id': self.origin_id_1, + 'metadata': metadata_v1, + 'tool': self.tools['swh-metadata-detector'], + 'from_revision': self.revision_id_1, + }] + + self.assertEqual(actual_metadata, expected_metadata_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'name': 'test_metadata', + 'author': 'MG', + }) + metadata_rev_v2 = metadata_rev_v1.copy() + metadata_origin_v2 = metadata_origin_v1.copy() + metadata_rev_v2['translated_metadata'] = metadata_v2 + metadata_origin_v2['translated_metadata'] = metadata_v2 + + self.storage.revision_metadata_add([metadata_rev_v2]) + self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) + + # then + actual_metadata = list(self.storage.origin_intrinsic_metadata_get( + [self.origin_id_1])) + + # metadata did not change as the v2 was dropped. + self.assertEqual(actual_metadata, expected_metadata_v1) + + def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + } + metadata_rev_v1 = { + 'id': self.revision_id_2, + 'translated_metadata': metadata_v1, + 'indexer_configuration_id': tool_id, + } + metadata_origin_v1 = { + 'origin_id': self.origin_id_1, + 'metadata': metadata_v1.copy(), + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_2, + } + + # given + self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) + + # when + actual_metadata = list(self.storage.origin_intrinsic_metadata_get( + [self.origin_id_1])) + + # then + expected_metadata_v1 = [{ + 'origin_id': self.origin_id_1, + 'metadata': metadata_v1, + 'tool': self.tools['swh-metadata-detector'], + 'from_revision': self.revision_id_2, + }] + self.assertEqual(actual_metadata, expected_metadata_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'name': 'test_update_duplicated_metadata', + 'author': 'MG', + }) + metadata_rev_v2 = metadata_rev_v1.copy() + metadata_origin_v2 = metadata_origin_v1.copy() + metadata_rev_v2['translated_metadata'] = metadata_v2 + metadata_origin_v2['metadata'] = metadata_v2 + + self.storage.revision_metadata_add([metadata_rev_v2], + conflict_update=True) + self.storage.origin_intrinsic_metadata_add([metadata_origin_v2], + conflict_update=True) + + actual_metadata = list(self.storage.origin_intrinsic_metadata_get( + [self.origin_id_1])) + + expected_metadata_v2 = [{ + 'origin_id': self.origin_id_1, + 'metadata': metadata_v2, + 'tool': self.tools['swh-metadata-detector'], + 'from_revision': self.revision_id_2, + }] + + # metadata did change as the v2 was used to overwrite v1 + self.assertEqual(actual_metadata, expected_metadata_v2) + def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool',