diff --git a/sql/upgrades/127.sql b/sql/upgrades/127.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/127.sql
@@ -0,0 +1,14 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 126
+-- to_version: 127
+-- description: Remove content_metadata table
+
+insert into dbversion(version, release, description)
+values(127, now(), 'Remove content_metadata table');
+
+drop function swh_content_metadata_add;
+drop function swh_mktemp_content_metadata;
+
+drop index content_metadata_pkey;
+
+drop table content_metadata;
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -3,18 +3,15 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from copy import deepcopy
-
 from swh.core.utils import grouper
+from swh.objstorage.exc import ObjNotFoundError
+from swh.model import hashutil
 
 from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
+from swh.indexer.indexer import RevisionIndexer, OriginIndexer
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.storage import INDEXER_CFG_KEY
-
-from swh.model import hashutil
 
 
 REVISION_GET_BATCH_SIZE = 10
@@ -29,75 +26,6 @@
         yield from f(list(group))
 
 
-class ContentMetadataIndexer(ContentIndexer):
-    """Content-level indexer
-
-    This indexer is in charge of:
-
-    - filtering out content already indexed in content_metadata
-    - reading content from objstorage with the content's id sha1
-    - computing metadata by given context
-    - using the metadata_dictionary as the 'swh-metadata-translator' tool
-    - store result in content_metadata table
-
-    """
-    def filter(self, ids):
-        """Filter out known sha1s and return only missing ones.
-        """
-        yield from self.idx_storage.content_metadata_missing((
-            {
-                'id': sha1,
-                'indexer_configuration_id': self.tool['id'],
-            } for sha1 in ids
-        ))
-
-    def index(self, id, data, log_suffix='unknown revision'):
-        """Index sha1s' content and store result.
-
-        Args:
-            id (bytes): content's identifier
-            data (bytes): raw content in bytes
-
-        Returns:
-            dict: dictionary representing a content_metadata. If the
-            translation wasn't successful the metadata keys will
-            be returned as None
-
-        """
-        result = {
-            'id': id,
-            'indexer_configuration_id': self.tool['id'],
-            'metadata': None
-        }
-        try:
-            mapping_name = self.tool['tool_configuration']['context']
-            log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
-            result['metadata'] = \
-                MAPPINGS[mapping_name](log_suffix).translate(data)
-        except Exception:
-            self.log.exception(
-                "Problem during metadata translation "
-                "for content %s" % hashutil.hash_to_hex(id))
-        if result['metadata'] is None:
-            return None
-        return result
-
-    def persist_index_computations(self, results, policy_update):
-        """Persist the results in storage.
-
-        Args:
-            results ([dict]): list of content_metadata, dict with the
-              following keys:
-
-              - id (bytes): content's identifier (sha1)
-              - metadata (jsonb): detected metadata
-            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
-              respectively update duplicates or ignore them
-
-        """
-        self.idx_storage.content_metadata_add(
-            results, conflict_update=(policy_update == 'update-dups'))
-
-
 class RevisionMetadataIndexer(RevisionIndexer):
     """Revision-level indexer
@@ -212,60 +140,47 @@
             translated metadata according to the CodeMeta vocabulary
 
         """
-        used_mappings = [MAPPINGS[context].name for context in detected_files]
+        used_mappings = [MAPPINGS[mapping_name].name
+                         for mapping_name in detected_files]
         metadata = []
-        tool = {
-            'name': 'swh-metadata-translator',
-            'version': '0.0.2',
-            'configuration': {
-            },
-        }
-        # TODO: iterate on each context, on each file
-        # -> get raw_contents
-        # -> translate each content
-        config = {
-            k: self.config[k]
-            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
-        }
-        config['tools'] = [tool]
-        for context in detected_files.keys():
-            cfg = deepcopy(config)
-            cfg['tools'][0]['configuration']['context'] = context
-            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
-            # sha1s that are in content_metadata table
-            sha1s_in_storage = []
-            metadata_generator = self.idx_storage.content_metadata_get(
-                detected_files[context])
-            for c in metadata_generator:
-                # extracting metadata
-                sha1 = c['id']
-                sha1s_in_storage.append(sha1)
-                local_metadata = c['metadata']
-                # local metadata is aggregated
-                if local_metadata:
-                    metadata.append(local_metadata)
-
-            sha1s_filtered = [item for item in detected_files[context]
-                              if item not in sha1s_in_storage]
-
-            if sha1s_filtered:
-                # content indexing
-                try:
-                    c_metadata_indexer.run(sha1s_filtered,
-                                           policy_update='ignore-dups',
-                                           log_suffix=log_suffix)
-                    # on the fly possibility:
-                    for result in c_metadata_indexer.results:
-                        local_metadata = result['metadata']
-                        metadata.append(local_metadata)
-
-                except Exception:
-                    self.log.exception(
-                        "Exception while indexing metadata on contents")
+
+        for (mapping_name, sha1s) in detected_files.items():
+            for sha1 in sha1s:
+                result = self.index_content(
+                    sha1, mapping_name, log_suffix=log_suffix)
+                if result:
+                    metadata.append(result)
 
         metadata = merge_documents(metadata)
         return (used_mappings, metadata)
 
+    def index_content(self, id, mapping_name, log_suffix='unknown revision'):
+        """Translate the raw content of a sha1 with the given mapping.
+
+        Args:
+            id (bytes): content's identifier
+            mapping_name (str): name of the mapping to use for the
+              translation
+
+        Returns:
+            dict: the translated metadata, or None if the content is
+            missing from the objstorage or the translation failed
+
+        """
+        try:
+            raw_content = self.objstorage.get(id)
+        except ObjNotFoundError:
+            self.log.warning('Content %s not found in objstorage' %
+                             hashutil.hash_to_hex(id))
+            return
+        try:
+            log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
+            return MAPPINGS[mapping_name](log_suffix).translate(raw_content)
+        except Exception:
+            self.log.exception(
+                "Problem during metadata translation "
+                "for content %s" % hashutil.hash_to_hex(id))
+
 
 class OriginMetadataIndexer(OriginIndexer):
     ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -98,23 +98,8 @@
 comment on column content_fossology_license.license_id is 'One of the content''s license identifier';
 comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information';
 
-
--- The table content_metadata provides a translation to files
--- identified as potentially containning metadata with a translation tool (indexer_configuration_id)
-create table content_metadata(
-  id sha1 not null,
-  metadata jsonb not null,
-  indexer_configuration_id bigint not null
-);
-
-comment on table content_metadata is 'metadata semantically translated from a content file';
-comment on column content_metadata.id is 'sha1 of content file';
-comment on column content_metadata.metadata is 'result of translation with defined format';
-comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
-
 -- The table revision_intrinsic_metadata provides a minimal set of intrinsic
--- metadata detected with the detection tool (indexer_configuration_id) and
--- aggregated from the content_metadata translation.
+-- metadata detected with the detection tool (indexer_configuration_id)
 create table revision_intrinsic_metadata(
   id sha1_git not null,
   metadata jsonb not null,
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -250,56 +250,6 @@
 
 comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
 
--- content_metadata functions
-
--- add tmp_content_metadata entries to content_metadata, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
---
--- If filtering duplicates is in order, the call to
--- swh_content_metadata_missing must take place before calling this
--- function.
---
--- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
--- tmp_content_metadata, 2. call this function
-create or replace function swh_content_metadata_add(conflict_update boolean)
-    returns void
-    language plpgsql
-as $$
-begin
-    if conflict_update then
-      insert into content_metadata (id, metadata, indexer_configuration_id)
-      select id, metadata, indexer_configuration_id
-      from tmp_content_metadata tcm
-          on conflict(id, indexer_configuration_id)
-              do update set metadata = excluded.metadata;
-
-    else
-        insert into content_metadata (id, metadata, indexer_configuration_id)
-        select id, metadata, indexer_configuration_id
-        from tmp_content_metadata tcm
-            on conflict(id, indexer_configuration_id)
-            do nothing;
-    end if;
-    return;
-end
-$$;
-
-comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
-
--- create a temporary table for retrieving content_metadata
-create or replace function swh_mktemp_content_metadata()
-    returns void
-    language sql
-as $$
-  create temporary table tmp_content_metadata (
-    like content_metadata including defaults
-  ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';
-
--- end content_metadata functions
-
 -- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
 -- overwriting duplicates if conflict_update is true, skipping duplicates
 -- otherwise.
diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql
--- a/swh/indexer/sql/60-swh-indexes.sql
+++ b/swh/indexer/sql/60-swh-indexes.sql
@@ -18,13 +18,6 @@
 alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
 alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey;
 
--- content_metadata
-create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id);
-alter table content_metadata add primary key using index content_metadata_pkey;
-
-alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
-alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
-
 -- revision_intrinsic_metadata
 create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id);
 alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -531,71 +531,6 @@
             indexer_configuration_id, limit=limit, with_textual_data=True,
             db=db, cur=cur)
 
-    @remote_api_endpoint('content_metadata/missing')
-    @db_transaction_generator()
-    def content_metadata_missing(self, metadata, db=None, cur=None):
-        """List metadata missing from storage.
-
-        Args:
-            metadata (iterable): dictionaries with keys:
-
-            - **id** (bytes): sha1 identifier
-            - **indexer_configuration_id** (int): tool used to compute
-              the results
-
-        Yields:
-            missing sha1s
-
-        """
-        for obj in db.content_metadata_missing_from_list(metadata, cur):
-            yield obj[0]
-
-    @remote_api_endpoint('content_metadata')
-    @db_transaction_generator()
-    def content_metadata_get(self, ids, db=None, cur=None):
-        """Retrieve metadata per id.
-
-        Args:
-            ids (iterable): sha1 checksums
-
-        Yields:
-            dictionaries with the following keys:
-
-            id (bytes)
-            metadata (str): associated metadata
-            tool (dict): tool used to compute metadata
-
-        """
-        for c in db.content_metadata_get_from_list(ids, cur):
-            yield converters.db_to_metadata(
-                dict(zip(db.content_metadata_cols, c)))
-
-    @remote_api_endpoint('content_metadata/add')
-    @db_transaction()
-    def content_metadata_add(self, metadata, conflict_update=False, db=None,
-                             cur=None):
-        """Add metadata not present in storage.
-
-        Args:
-            metadata (iterable): dictionaries with keys:
-
-            - **id**: sha1
-            - **metadata**: arbitrary dict
-
-            conflict_update: Flag to determine if we want to overwrite (true)
-              or skip duplicates (false, the default)
-
-        """
-        _check_id_duplicates(metadata)
-        metadata.sort(key=lambda m: m['id'])
-
-        db.mktemp_content_metadata(cur)
-
-        db.copy_to(metadata, 'tmp_content_metadata',
-                   ['id', 'metadata', 'indexer_configuration_id'],
-                   cur)
-        db.content_metadata_add_from_temp(conflict_update, cur)
-
     @remote_api_endpoint('revision_intrinsic_metadata/missing')
     @db_transaction_generator()
     def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -195,7 +195,6 @@
         self._languages = SubStorage(self._tools)
         self._content_ctags = SubStorage(self._tools)
         self._licenses = SubStorage(self._tools)
-        self._content_metadata = SubStorage(self._tools)
         self._revision_intrinsic_metadata = SubStorage(self._tools)
         self._origin_intrinsic_metadata = SubStorage(self._tools)
 
@@ -487,57 +486,6 @@
         return self._licenses.get_range(
             start, end, indexer_configuration_id, limit)
 
-    def content_metadata_missing(self, metadata):
-        """List metadata missing from storage.
-
-        Args:
-            metadata (iterable): dictionaries with keys:
-
-            - **id** (bytes): sha1 identifier
-            - **indexer_configuration_id** (int): tool used to compute
-              the results
-
-        Yields:
-            missing sha1s
-
-        """
-        yield from self._content_metadata.missing(metadata)
-
-    def content_metadata_get(self, ids):
-        """Retrieve metadata per id.
-
-        Args:
-            ids (iterable): sha1 checksums
-
-        Yields:
-            dictionaries with the following keys:
-
-            - **id** (bytes)
-            - **metadata** (str): associated metadata
-            - **tool** (dict): tool used to compute metadata
-
-        """
-        yield from self._content_metadata.get(ids)
-
-    def content_metadata_add(self, metadata, conflict_update=False):
-        """Add metadata not present in storage.
-
-        Args:
-            metadata (iterable): dictionaries with keys:
-
-            - **id**: sha1
-            - **metadata**: arbitrary dict
-            - **indexer_configuration_id**: tool used to compute the
-              results
-
-            conflict_update: Flag to determine if we want to overwrite (true)
-              or skip duplicates (false, the default)
-
-        """
-        if not all(isinstance(x['id'], bytes) for x in metadata):
-            raise TypeError('identifiers must be bytes.')
-        self._content_metadata.add(metadata, conflict_update)
-
     def revision_intrinsic_metadata_missing(self, metadata):
         """List metadata missing from storage.
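
For context: with the content_metadata storage gone, content translation happens inline in RevisionMetadataIndexer. A minimal sketch of the new flow (not part of the patch), assuming a configured indexer instance `indexer` and a `detected_files` dict as returned by detect_metadata, mapping mapping names to lists of content sha1s:

    from swh.indexer.codemeta import merge_documents
    from swh.indexer.metadata_dictionary import MAPPINGS

    metadata = []
    for mapping_name, sha1s in detected_files.items():
        for sha1 in sha1s:
            # index_content fetches the raw bytes from the objstorage and
            # translates them with MAPPINGS[mapping_name]; it returns None
            # if the content is missing or the translation fails.
            result = indexer.index_content(sha1, mapping_name)
            if result:
                metadata.append(result)
    used_mappings = [MAPPINGS[name].name for name in detected_files]
    metadata = merge_documents(metadata)

This trades the old persist-then-read round trip through the indexer storage for direct objstorage reads, at the cost of re-translating contents shared between revisions.
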
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -852,7 +852,7 @@
         _,  # test_content_fossology_license_add__drop_duplicate,
         _,  # test_content_fossology_license_add__update_in_place_duplicate,
         _,  # test_content_fossology_license_add__update_in_place_deadlock,
-        _,  # test_content_metadata_add__duplicate_twice,
+        _,  # test_content_fossology_license_add__duplicate_twice,
         _,  # test_content_fossology_license_get,
         _,  # test_content_fossology_license_delete,
         _,  # test_content_fossology_license_delete_nonexisting,
@@ -917,40 +917,6 @@
         # license did not change as the v2 was dropped.
         self.assertEqual(actual_licenses, [expected_license])
 
-    # content_metadata tests
-    (
-        test_content_metadata_missing,
-        test_content_metadata_add__drop_duplicate,
-        test_content_metadata_add__update_in_place_duplicate,
-        test_content_metadata_add__update_in_place_deadlock,
-        test_content_metadata_add__duplicate_twice,
-        test_content_metadata_get,
-        _,  # test_content_metadata_delete,
-        _,  # test_content_metadata_delete_nonexisting,
-    ) = gen_generic_endpoint_tests(
-        endpoint_type='content_metadata',
-        tool_name='swh-metadata-detector',
-        example_data1={
-            'metadata': {
-                'other': {},
-                'codeRepository': {
-                    'type': 'git',
-                    'url': 'https://github.com/moranegg/metadata_test'
-                },
-                'description': 'Simple package.json test for indexer',
-                'name': 'test_metadata',
-                'version': '0.0.1'
-            },
-        },
-        example_data2={
-            'metadata': {
-                'other': {},
-                'name': 'test_metadata',
-                'version': '0.0.1'
-            },
-        },
-    )
-
     # revision_intrinsic_metadata tests
     (
         test_revision_intrinsic_metadata_missing,
diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py
--- a/swh/indexer/tests/tasks.py
+++ b/swh/indexer/tests/tasks.py
@@ -4,7 +4,6 @@
     OriginMetadataIndexer, RevisionMetadataIndexer
 )
 from .test_origin_head import OriginHeadTestIndexer
-from .test_metadata import ContentMetadataTestIndexer
 from .utils import BASE_TEST_CONFIG
 
 
@@ -12,8 +11,6 @@
     """Specific indexer whose configuration is enough to satisfy the
        indexing tests.
    """
-    ContentMetadataIndexer = ContentMetadataTestIndexer
-
     def parse_config_file(self, *args, **kwargs):
         return {
             **BASE_TEST_CONFIG,
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -5,6 +5,7 @@
 
 import json
 import unittest
+import unittest.mock
 
 from hypothesis import given, strategies, settings, HealthCheck
 
@@ -17,9 +18,7 @@
 from swh.indexer.metadata_detector import (
     detect_metadata
 )
-from swh.indexer.metadata import (
-    ContentMetadataIndexer, RevisionMetadataIndexer
-)
+from swh.indexer.metadata import RevisionMetadataIndexer
 
 from .utils import (
     BASE_TEST_CONFIG, fill_obj_storage, fill_storage,
@@ -38,14 +37,6 @@
 }
 
 
-class ContentMetadataTestIndexer(ContentMetadataIndexer):
-    """Specific Metadata whose configuration is enough to satisfy the
-       indexing tests.
-    """
-    def parse_config_file(self, *args, **kwargs):
-        assert False, 'should not be called; the rev indexer configures it.'
-
-
 REVISION_METADATA_CONFIG = {
     **BASE_TEST_CONFIG,
     'tools': TRANSLATOR_TOOL,
@@ -226,77 +217,6 @@
         }
         self.assertEqual(expected_results, results)
 
-    def test_index_content_metadata_npm(self):
-        """
-        testing NPM with package.json
-        - one sha1 uses a file that can't be translated to metadata and
-          should return None in the translated metadata
-        """
-        # given
-        sha1s = [
-            hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
-            hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'),
-            hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'),
-        ]
-        # this metadata indexer computes only metadata for package.json
-        # in npm context with a hard mapping
-        config = BASE_TEST_CONFIG.copy()
-        config['tools'] = [TRANSLATOR_TOOL]
-        metadata_indexer = ContentMetadataTestIndexer(config=config)
-        fill_obj_storage(metadata_indexer.objstorage)
-        fill_storage(metadata_indexer.storage)
-
-        # when
-        metadata_indexer.run(sha1s, policy_update='ignore-dups')
-        results = list(metadata_indexer.idx_storage.content_metadata_get(
-            sha1s))
-
-        expected_results = [{
-            'metadata': {
-                '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
-                'type': 'SoftwareSourceCode',
-                'codeRepository':
-                    'git+https://github.com/moranegg/metadata_test',
-                'description': 'Simple package.json test for indexer',
-                'name': 'test_metadata',
-                'version': '0.0.1'
-            },
-            'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
-        }, {
-            'metadata': {
-                '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
-                'type': 'SoftwareSourceCode',
-                'issueTracker':
-                    'https://github.com/npm/npm/issues',
-                'author': [{
-                    'type': 'Person',
-                    'name': 'Isaac Z. Schlueter',
-                    'email': 'i@izs.me',
-                    'url': 'http://blog.izs.me',
-                }],
-                'codeRepository':
-                    'git+https://github.com/npm/npm',
-                'description': 'a package manager for JavaScript',
-                'license': 'https://spdx.org/licenses/Artistic-2.0',
-                'version': '5.0.3',
-                'name': 'npm',
-                'keywords': [
-                    'install',
-                    'modules',
-                    'package manager',
-                    'package.json'
-                ],
-                'url': 'https://docs.npmjs.com/'
-            },
-            'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607')
-        }]
-
-        for result in results:
-            del result['tool']
-
-        # The assertion below returns False sometimes because of nested lists
-        self.assertEqual(expected_results, results)
-
     def test_npm_bugs_normalization(self):
         # valid dictionary
         package_json = b"""{
@@ -1120,20 +1040,13 @@
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
-        tool = metadata_indexer.idx_storage.indexer_configuration_get(
-            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
-        assert tool is not None
-
-        metadata_indexer.idx_storage.content_metadata_add([{
-            'indexer_configuration_id': tool['id'],
-            'id': b'cde',
-            'metadata': YARN_PARSER_METADATA,
-        }])
-
         sha1_gits = [
             hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
         ]
-        metadata_indexer.run(sha1_gits, 'update-dups')
+        with unittest.mock.patch(
+                'swh.indexer.metadata.RevisionMetadataIndexer.index_content',
+                return_value=YARN_PARSER_METADATA):
+            metadata_indexer.run(sha1_gits, 'update-dups')
 
         results = list(
             metadata_indexer.idx_storage.
@@ -1174,20 +1087,13 @@
             }],
         }])
 
-        tool = metadata_indexer.idx_storage.indexer_configuration_get(
-            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
-        assert tool is not None
-
-        metadata_indexer.idx_storage.content_metadata_add([{
-            'indexer_configuration_id': tool['id'],
-            'id': b'cde',
-            'metadata': YARN_PARSER_METADATA,
-        }])
-
         sha1_gits = [
             hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
         ]
-        metadata_indexer.run(sha1_gits, 'update-dups')
+        with unittest.mock.patch(
+                'swh.indexer.metadata.RevisionMetadataIndexer.index_content',
+                return_value=YARN_PARSER_METADATA):
+            metadata_indexer.run(sha1_gits, 'update-dups')
 
         results = list(
             metadata_indexer.idx_storage.
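
For context: since per-content translation no longer goes through the indexer storage, the two tests above stub the new method instead of pre-filling a content_metadata table. A standalone sketch of that pattern (not part of the patch), assuming a configured `metadata_indexer`, the `sha1_gits` list and the YARN_PARSER_METADATA fixture from the test module:

    import unittest.mock

    with unittest.mock.patch(
            'swh.indexer.metadata.RevisionMetadataIndexer.index_content',
            return_value=YARN_PARSER_METADATA) as index_content:
        metadata_indexer.run(sha1_gits, 'update-dups')

    # every content detected in the revision now "translates" to the
    # fixture, without touching the objstorage
    assert index_content.called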