diff --git a/sql/upgrades/119.sql b/sql/upgrades/119.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/119.sql
@@ -0,0 +1,29 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 118
+-- to_version: 119
+-- description: metadata tables: add 'mappings' column
+
+insert into dbversion(version, release, description)
+values(119, now(), 'Work In Progress');
+
+-- The column is added with a temporary default so that existing rows are
+-- backfilled with an empty array ('{}' is the array literal; a bare {} is
+-- a syntax error). The default is then dropped so that new rows have to
+-- provide their mappings explicitly.
+alter table revision_metadata
+  add column mappings text array not null default '{}';
+alter table revision_metadata
+  alter column mappings
+    drop default;
+
+alter table origin_intrinsic_metadata
+  add column mappings text array not null default '{}';
+alter table origin_intrinsic_metadata
+  alter column mappings
+    drop default;
+
+-- NOTE(review): this script does not re-create the stored procedures
+-- changed in 40-swh-func.sql (the inserts from tmp_revision_metadata and
+-- tmp_origin_intrinsic_metadata). Unless that file is reloaded separately,
+-- the old procedures omit 'mappings' and will violate the not null
+-- constraint on upgraded databases once the default is dropped.
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(118, now(), 'Work In Progress');
+      values(119, now(), 'Work In Progress');
 -- Computing metadata on sha1's contents
 
 -- a SHA1 checksum (not necessarily originating from Git)
@@ -118,7 +118,8 @@
 create table revision_metadata(
   id                       sha1_git   not null,
   translated_metadata      jsonb      not null,
-  indexer_configuration_id bigint     not null
+  indexer_configuration_id bigint     not null,
+  mappings                 text array not null
 );
 
 comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
@@ -131,7 +132,8 @@
   metadata                  jsonb,
   indexer_configuration_id  bigint     not null,
   from_revision             sha1_git   not null,
-  metadata_tsvector         tsvector
+  metadata_tsvector         tsvector,
+  mappings                  text array not null
 );
 
 comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -315,15 +315,17 @@
 as $$
 begin
     if conflict_update then
-      insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
-      select id, translated_metadata, indexer_configuration_id
+      insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+      select id, translated_metadata, mappings, indexer_configuration_id
         from tmp_revision_metadata tcm
             on conflict(id, indexer_configuration_id)
-                do update set translated_metadata = excluded.translated_metadata;
+                -- overwrite mappings too, so they keep describing the stored metadata
+                do update set translated_metadata = excluded.translated_metadata,
+                              mappings = excluded.mappings;
 
     else
-        insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
-        select id, translated_metadata, indexer_configuration_id
+        insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+        select id, translated_metadata, mappings, indexer_configuration_id
         from tmp_revision_metadata tcm
             on conflict(id, indexer_configuration_id)
             do nothing;
@@ -410,17 +412,19 @@
 begin
     perform swh_origin_intrinsic_metadata_compute_tsvector();
     if conflict_update then
-      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
       select origin_id, metadata, indexer_configuration_id, from_revision,
-             metadata_tsvector
+             metadata_tsvector, mappings
         from tmp_origin_intrinsic_metadata
             on conflict(origin_id, indexer_configuration_id)
-                do update set metadata = excluded.metadata;
+                -- overwrite mappings too, so they keep describing the stored metadata
+                do update set metadata = excluded.metadata,
+                              mappings = excluded.mappings;
 
     else
-        insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+        insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
         select origin_id, metadata, indexer_configuration_id, from_revision,
-               metadata_tsvector
+               metadata_tsvector, mappings
         from tmp_origin_intrinsic_metadata
             on conflict(origin_id, indexer_configuration_id)
             do nothing;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- 
a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -586,6 +586,8 @@ - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.revision_metadata_get_from_list(ids, cur): @@ -604,6 +606,8 @@ - **id**: sha1_git of revision - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -612,7 +616,8 @@ db.mktemp_revision_metadata(cur) db.copy_to(metadata, 'tmp_revision_metadata', - ['id', 'translated_metadata', 'indexer_configuration_id'], + ['id', 'translated_metadata', 'mappings', + 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) @@ -630,6 +635,8 @@ - **origin_id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): @@ -651,6 +658,8 @@ these metadata. 
- **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -660,7 +669,7 @@ db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['origin_id', 'metadata', 'indexer_configuration_id', - 'from_revision'], + 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @@ -680,6 +689,8 @@ - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.origin_intrinsic_metadata_search_fulltext( diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -295,7 +295,8 @@ yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) - revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] + revision_metadata_hash_keys = [ + 'id', 'indexer_configuration_id'] def revision_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. 
@@ -306,7 +307,7 @@ cur=cur) revision_metadata_cols = [ - 'id', 'translated_metadata', + 'id', 'translated_metadata', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') @@ -321,7 +322,7 @@ 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ - 'origin_id', 'metadata', 'from_revision', + 'origin_id', 'metadata', 'from_revision', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1217,6 +1217,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id }]) @@ -1250,6 +1251,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': ['mapping1', 'mapping2'], 'indexer_configuration_id': tool_id } @@ -1263,9 +1265,11 @@ expected_metadata = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_rev['translated_metadata'], + 'mappings': ['mapping1', 'mapping2'], 'tool': self.tools['swh-metadata-detector'] }] + self.maxDiff = None self.assertEqual(actual_metadata, expected_metadata) def test_revision_metadata_add_drop_duplicate(self): @@ -1291,6 +1295,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id, } @@ -1304,6 +1309,7 @@ expected_metadata_v1 = [{ 'id': self.revision_id_1, 'translated_metadata': metadata_v1['translated_metadata'], + 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] @@ -1350,6 +1356,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id, } @@ -1364,6 +1371,7 @@ expected_metadata_v1 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v1['translated_metadata'], 
+ 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1384,6 +1392,7 @@ expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], + 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] @@ -1414,12 +1423,14 @@ metadata_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata, + 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, + 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } @@ -1436,6 +1447,7 @@ 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': ['mapping1'], }] self.assertEqual(actual_metadata, expected_metadata) @@ -1464,12 +1476,14 @@ metadata_rev_v1 = { 'id': self.revision_id_1, 'translated_metadata': metadata_v1.copy(), + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, + 'mappings': [], 'from_revision': self.revision_id_1, } @@ -1486,6 +1500,7 @@ 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, + 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1535,12 +1550,14 @@ metadata_rev_v1 = { 'id': self.revision_id_2, 'translated_metadata': metadata_v1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, + 'mappings': [], 'from_revision': self.revision_id_2, } @@ -1558,6 +1575,7 @@ 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1585,6 +1603,7 @@ 
'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': [], }] # metadata did change as the v2 was used to overwrite v1 @@ -1600,11 +1619,13 @@ metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } @@ -1614,11 +1635,13 @@ metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } @@ -1662,11 +1685,13 @@ metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } @@ -1679,11 +1704,13 @@ metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, }