diff --git a/sql/upgrades/119.sql b/sql/upgrades/119.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/119.sql @@ -0,0 +1,19 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 118 +-- to_version: 119 +-- description: metadata tables: add 'mappings' column + +insert into dbversion(version, release, description) +values(119, now(), 'Work In Progress'); + +alter table revision_metadata + add column mappings text array not null default '{}'; +alter table revision_metadata + alter column mappings + drop default; + +alter table origin_intrinsic_metadata + add column mappings text array not null default '{}'; +alter table origin_intrinsic_metadata + alter column mappings + drop default; diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -150,6 +150,7 @@ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], + 'mappings': None, 'translated_metadata': None } @@ -158,10 +159,11 @@ dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) - result['translated_metadata'] = self.translate_revision_metadata( + (mappings, metadata) = self.translate_revision_metadata( detected_files, - log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']) - ) + log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) + result['mappings'] = mappings + result['translated_metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) @@ -194,10 +196,11 @@ "npm", "authors") to list of sha1 Returns: - dict: dict with translated metadata according to the CodeMeta - vocabulary + (List[str], dict): list of mappings used and dict with + translated metadata according to the CodeMeta vocabulary """ + used_mappings = [MAPPINGS[context].name for context in detected_files] translated_metadata = [] tool = { 'name': 'swh-metadata-translator', @@ -252,7 +255,7 @@ # 
transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) - return min_metadata + return (used_mappings, min_metadata) class OriginMetadataIndexer(OriginIndexer): @@ -287,6 +290,7 @@ 'from_revision': rev_metadata['id'], 'origin_id': origin['id'], 'metadata': rev_metadata['translated_metadata'], + 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], } diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -76,6 +76,13 @@ self.__class__.__module__, self.__class__.__name__)) + @property + @abc.abstractmethod + def name(self): + """A name of this mapping, used as an identifier in the + indexer storage.""" + pass + @classmethod @abc.abstractmethod def detect_metadata_files(cls, files): @@ -203,6 +210,7 @@ """ dedicated class for NPM (package.json) mapping and translation """ + name = 'npm' mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' @@ -341,6 +349,7 @@ """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ + name = 'codemeta' filename = b'codemeta.json' def translate(self, content): @@ -352,6 +361,7 @@ """ dedicated class for Maven (pom.xml) mapping and translation """ + name = 'maven' filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] @@ -484,6 +494,7 @@ """Dedicated class for Python's PKG-INFO mapping and translation. 
https://www.python.org/dev/peps/pep-0314/""" + name = 'pkg-info' filename = b'PKG-INFO' mapping = {_normalize_pkginfo_key(k): v for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} @@ -520,11 +531,12 @@ @register_mapping class GemspecMapping(DictMapping): + name = 'gemspec' + mapping = CROSSWALK_TABLE['Ruby Gem'] + _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*') _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') - mapping = CROSSWALK_TABLE['Ruby Gem'] - @classmethod def detect_metadata_files(cls, file_entries): for entry in file_entries: diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(118, now(), 'Work In Progress'); + values(119, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) @@ -118,7 +118,8 @@ create table revision_metadata( id sha1_git not null, translated_metadata jsonb not null, - indexer_configuration_id bigint not null + indexer_configuration_id bigint not null, + mappings text array not null ); comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; @@ -131,7 +132,8 @@ metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, - metadata_tsvector tsvector + metadata_tsvector tsvector, + mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -315,15 +315,15 @@ as $$ begin if conflict_update then - insert into revision_metadata (id, translated_metadata, indexer_configuration_id) - select id, translated_metadata, indexer_configuration_id + insert into 
revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) + select id, translated_metadata, mappings, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) - do update set translated_metadata = excluded.translated_metadata; + do update set translated_metadata = excluded.translated_metadata, mappings = excluded.mappings; else - insert into revision_metadata (id, translated_metadata, indexer_configuration_id) - select id, translated_metadata, indexer_configuration_id + insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) + select id, translated_metadata, mappings, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) do nothing; @@ -410,17 +410,17 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector) + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) select origin_id, metadata, indexer_configuration_id, from_revision, - metadata_tsvector + metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(origin_id, indexer_configuration_id) - do update set metadata = excluded.metadata; + do update set metadata = excluded.metadata, mappings = excluded.mappings; else - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector) + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) select origin_id, metadata, indexer_configuration_id, from_revision, - metadata_tsvector + metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(origin_id, indexer_configuration_id) do nothing; diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -586,6 +586,8 @@ - **id** (bytes) - **translated_metadata** (str): 
associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.revision_metadata_get_from_list(ids, cur): @@ -604,6 +606,8 @@ - **id**: sha1_git of revision - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -612,7 +616,8 @@ db.mktemp_revision_metadata(cur) db.copy_to(metadata, 'tmp_revision_metadata', - ['id', 'translated_metadata', 'indexer_configuration_id'], + ['id', 'translated_metadata', 'mappings', + 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) @@ -630,6 +635,8 @@ - **origin_id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): @@ -651,6 +658,8 @@ these metadata. 
- **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -660,7 +669,7 @@ db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['origin_id', 'metadata', 'indexer_configuration_id', - 'from_revision'], + 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @@ -680,6 +689,8 @@ - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - **mappings** (List[str]): list of mappings used to translate + these metadata """ for c in db.origin_intrinsic_metadata_search_fulltext( diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -295,7 +295,8 @@ yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) - revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] + revision_metadata_hash_keys = [ + 'id', 'indexer_configuration_id'] def revision_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. 
@@ -306,7 +307,7 @@ cur=cur) revision_metadata_cols = [ - 'id', 'translated_metadata', + 'id', 'translated_metadata', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') @@ -321,7 +322,7 @@ 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ - 'origin_id', 'metadata', 'from_revision', + 'origin_id', 'metadata', 'from_revision', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1217,6 +1217,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id }]) @@ -1250,6 +1251,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': ['mapping1', 'mapping2'], 'indexer_configuration_id': tool_id } @@ -1263,9 +1265,11 @@ expected_metadata = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_rev['translated_metadata'], + 'mappings': ['mapping1', 'mapping2'], 'tool': self.tools['swh-metadata-detector'] }] + self.maxDiff = None self.assertEqual(actual_metadata, expected_metadata) def test_revision_metadata_add_drop_duplicate(self): @@ -1291,6 +1295,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id, } @@ -1304,6 +1309,7 @@ expected_metadata_v1 = [{ 'id': self.revision_id_1, 'translated_metadata': metadata_v1['translated_metadata'], + 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] @@ -1350,6 +1356,7 @@ 'softwareRequirements': None, 'identifier': None }, + 'mappings': [], 'indexer_configuration_id': tool_id, } @@ -1364,6 +1371,7 @@ expected_metadata_v1 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v1['translated_metadata'], 
+ 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1384,6 +1392,7 @@ expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], + 'mappings': [], 'tool': self.tools['swh-metadata-detector'] }] @@ -1414,12 +1423,14 @@ metadata_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata, + 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, + 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } @@ -1436,6 +1447,7 @@ 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': ['mapping1'], }] self.assertEqual(actual_metadata, expected_metadata) @@ -1464,12 +1476,14 @@ metadata_rev_v1 = { 'id': self.revision_id_1, 'translated_metadata': metadata_v1.copy(), + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, + 'mappings': [], 'from_revision': self.revision_id_1, } @@ -1486,6 +1500,7 @@ 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, + 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1535,12 +1550,14 @@ metadata_rev_v1 = { 'id': self.revision_id_2, 'translated_metadata': metadata_v1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, + 'mappings': [], 'from_revision': self.revision_id_2, } @@ -1558,6 +1575,7 @@ 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) @@ -1585,6 +1603,7 @@ 
'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, + 'mappings': [], }] # metadata did change as the v2 was used to overwrite v1 @@ -1600,11 +1619,13 @@ metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } @@ -1614,11 +1635,13 @@ metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } @@ -1662,11 +1685,13 @@ metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } @@ -1679,11 +1704,13 @@ metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, + 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -254,7 +254,7 @@ 'name': 'test_metadata', 'version': '0.0.1' }, - 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') + 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', @@ -927,6 +927,7 @@ 'name': 'yarn-parser', 
'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, + 'mappings': ['npm'], }] for result in results: diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -94,11 +94,13 @@ rev_metadata = { 'id': rev_id, 'translated_metadata': metadata, + 'mappings': ['npm'], } origin_metadata = { 'origin_id': origin['id'], 'from_revision': rev_id, 'metadata': metadata, + 'mappings': ['npm'], } results = list(indexer.idx_storage.revision_metadata_get([rev_id]))