diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -26,7 +26,7 @@
     }),
     'origin_visit_tasks': ('List[dict]', [
         {
-            'type': 'indexer_origin_head',
+            'type': 'indexer_full_origin_metadata',
             'kwargs': {
                 'policy_update': 'update-dups',
                 'parse_ids': False,
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -9,6 +9,7 @@
 from copy import deepcopy
 
 from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
+from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
@@ -136,7 +137,7 @@
            - if multiple file detected -> translation needed at revision level
 
         Args:
-            rev (bytes): revision artifact from storage
+            rev (dict): revision artifact from storage
 
         Returns:
             dict: dictionary representing a revision_metadata, with keys:
@@ -149,6 +150,7 @@
         result = {
             'id': rev['id'],
             'indexer_configuration_id': self.tool['id'],
+            'mappings': None,
             'translated_metadata': None
         }
 
@@ -157,8 +159,10 @@
             dir_ls = self.storage.directory_ls(root_dir, recursive=False)
             files = [entry for entry in dir_ls if entry['type'] == 'file']
             detected_files = detect_metadata(files)
-            result['translated_metadata'] = self.translate_revision_metadata(
-                detected_files)
+            (mappings, metadata) = self.translate_revision_metadata(
+                detected_files)
+            result['mappings'] = mappings
+            result['translated_metadata'] = metadata
         except Exception as e:
             self.log.exception(
                 'Problem when indexing rev: %r', e)
@@ -191,10 +195,11 @@
                 "npm", "authors") to list of sha1
 
         Returns:
-            dict: dict with translated metadata according to the CodeMeta
-            vocabulary
+            (List[str], dict): list of mappings used and dict with
+            translated metadata according to the CodeMeta vocabulary
 
         """
+        used_mappings = [MAPPINGS[context].name for context in detected_files]
         translated_metadata = []
         tool = {
             'name': 'swh-metadata-translator',
@@ -248,7 +253,7 @@
 
         # transform translated_metadata into min set with swh-metadata-detector
         min_metadata = extract_minimal_metadata_dict(translated_metadata)
-        return min_metadata
+        return (used_mappings, min_metadata)
 
 
 class OriginMetadataIndexer(OriginIndexer):
@@ -296,6 +301,7 @@
                 'origin_id': origin['id'],
                 'metadata': item['translated_metadata'],
                 'from_revision': revision_id,
+                'mappings': item['mappings'],
                 'indexer_configuration_id': item['tool']['id'],
             })
 
@@ -307,6 +313,57 @@
             conflict_update=(policy_update == 'update-dups'))
 
 
+class FullOriginMetadataIndexer(OriginIndexer):
+    """Indexer that, for each origin, chains head-revision detection,
+    revision metadata translation, and persistence of both the revision
+    and origin intrinsic metadata in a single pass (no scheduler hop)."""
+
+    CONFIG_BASE_FILENAME = 'indexer/full_origin_intrinsic_metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('list', [])
+    }
+
+    USE_TOOLS = False
+
+    def __init__(self):
+        super().__init__()
+        self._prepare_sub_indexers()
+
+    def _prepare_sub_indexers(self):
+        self.origin_head_indexer = OriginHeadIndexer()
+        self.revision_metadata_indexer = RevisionMetadataIndexer()
+
+    def index(self, origin):
+        head_result = self.origin_head_indexer.index(origin)
+        if not head_result:
+            return
+        rev_id = head_result['revision_id']
+
+        rev = list(self.storage.revision_get([rev_id]))
+        if not rev:
+            # NOTE(review): was `self.warning(...)` with a single tuple
+            # argument; indexers log through `self.log` and the format
+            # string takes two arguments.
+            self.log.warning('Missing head revision %s of origin %r',
+                             rev_id, origin)
+            return
+        assert len(rev) == 1
+        rev = rev[0]
+        rev_metadata = self.revision_metadata_indexer.index(rev)
+        orig_metadata = {
+            'from_revision': rev_metadata['id'],
+            'origin_id': origin['id'],
+            'metadata': rev_metadata['translated_metadata'],
+            'mappings': rev_metadata['mappings'],
+            'indexer_configuration_id':
+                rev_metadata['indexer_configuration_id'],
+        }
+        return (orig_metadata, rev_metadata)
+
+    def persist_index_computations(self, results, policy_update):
+        self.idx_storage.revision_metadata_add(
+            [rev_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+        self.idx_storage.origin_intrinsic_metadata_add(
+            [orig_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+
 @click.command()
 @click.option('--revs', '-i',
               help='Default sha1_git to lookup', multiple=True)
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -38,6 +38,13 @@
                 self.__class__.__module__,
                 self.__class__.__name__))
 
+    @property
+    @abc.abstractmethod
+    def name(self):
+        """A name of this mapping, used as an identifier in the
+        indexer storage."""
+        pass
+
     @abc.abstractmethod
     def detect_metadata_files(self, files):
         """
@@ -158,6 +165,7 @@
     """
     dedicated class for NPM (package.json) mapping and translation
     """
+    name = 'npm'
     mapping = CROSSWALK_TABLE['NodeJS']
     filename = b'package.json'
 
@@ -295,6 +303,7 @@
     """
     dedicated class for CodeMeta (codemeta.json) mapping and translation
     """
+    name = 'codemeta'
     filename = b'codemeta.json'
 
     def translate(self, content):
@@ -306,6 +315,7 @@
     """
     dedicated class for Maven (pom.xml) mapping and translation
     """
+    name = 'maven'
     filename = b'pom.xml'
     mapping = CROSSWALK_TABLE['Java (Maven)']
 
@@ -430,6 +440,7 @@
     """Dedicated class for Python's PKG-INFO mapping and translation.
 
     https://www.python.org/dev/peps/pep-0314/"""
+    name = 'pkg-info'
     filename = b'PKG-INFO'
     mapping = {_normalize_pkginfo_key(k): v
                for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()}
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -118,7 +118,8 @@
 create table revision_metadata(
   id sha1_git not null,
   translated_metadata jsonb not null,
-  indexer_configuration_id bigint not null
+  indexer_configuration_id bigint not null,
+  mappings text array not null
 );
 
 comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
@@ -131,7 +132,8 @@
   metadata jsonb,
   indexer_configuration_id bigint not null,
   from_revision sha1_git not null,
-  metadata_tsvector tsvector
+  metadata_tsvector tsvector,
+  mappings text array not null
 );
 
 comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -315,15 +315,16 @@
 as $$
 begin
     if conflict_update then
-      insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
-      select id, translated_metadata, indexer_configuration_id
+      insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+      select id, translated_metadata, mappings, indexer_configuration_id
       from tmp_revision_metadata tcm
       on conflict(id, indexer_configuration_id)
-      do update set translated_metadata = excluded.translated_metadata;
+      do update set translated_metadata = excluded.translated_metadata,
+                    mappings = excluded.mappings;
     else
-      insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
-      select id, translated_metadata, indexer_configuration_id
+      insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+      select id, translated_metadata, mappings, indexer_configuration_id
       from tmp_revision_metadata tcm
       on conflict(id, indexer_configuration_id)
       do nothing;
@@ -410,17 +410,18 @@
 begin
     perform swh_origin_intrinsic_metadata_compute_tsvector();
     if conflict_update then
-      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
       select origin_id, metadata, indexer_configuration_id, from_revision,
-             metadata_tsvector
+             metadata_tsvector, mappings
       from tmp_origin_intrinsic_metadata
       on conflict(origin_id, indexer_configuration_id)
-      do update set metadata = excluded.metadata;
+      do update set metadata = excluded.metadata,
+                    mappings = excluded.mappings;
     else
-      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+      insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
       select origin_id, metadata, indexer_configuration_id, from_revision,
-             metadata_tsvector
+             metadata_tsvector, mappings
       from tmp_origin_intrinsic_metadata
       on conflict(origin_id, indexer_configuration_id)
       do nothing;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -586,6 +586,8 @@
               - **id** (bytes)
               - **translated_metadata** (str): associated metadata
               - **tool** (dict): tool used to compute metadata
+              - **mappings** (List[str]): list of mappings used to translate
+                these metadata
 
         """
         for c in db.revision_metadata_get_from_list(ids, cur):
@@ -604,6 +606,8 @@
                 - **id**: sha1_git of revision
                 - **translated_metadata**: arbitrary dict
                 - **indexer_configuration_id**: tool used to compute metadata
+                - **mappings** (List[str]): list of mappings used to translate
+                  these metadata
 
             conflict_update: Flag to determine if we want to overwrite (true)
               or skip duplicates (false, the default)
@@ -611,8 +615,9 @@
         """
         db.mktemp_revision_metadata(cur)
 
         db.copy_to(metadata, 'tmp_revision_metadata',
-                   ['id', 'translated_metadata', 'indexer_configuration_id'],
+                   ['id', 'translated_metadata', 'mappings',
+                    'indexer_configuration_id'],
                    cur)
         db.revision_metadata_add_from_temp(conflict_update, cur)
@@ -630,6 +635,8 @@
               - **origin_id** (int)
               - **metadata** (str): associated metadata
               - **tool** (dict): tool used to compute metadata
+              - **mappings** (List[str]): list of mappings used to translate
+                these metadata
 
         """
         for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
@@ -651,6 +658,8 @@
                 these metadata.
                 - **metadata**: arbitrary dict
                 - **indexer_configuration_id**: tool used to compute metadata
+                - **mappings** (List[str]): list of mappings used to translate
+                  these metadata
 
             conflict_update: Flag to determine if we want to overwrite (true)
               or skip duplicates (false, the default)
@@ -660,7 +669,7 @@
 
         db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
                    ['origin_id', 'metadata', 'indexer_configuration_id',
-                    'from_revision'],
+                    'from_revision', 'mappings'],
                    cur)
 
         db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
@@ -680,6 +689,8 @@
               - **id** (int)
               - **metadata** (str): associated metadata
               - **tool** (dict): tool used to compute metadata
+              - **mappings** (List[str]): list of mappings used to translate
+                these metadata
 
         """
         for c in db.origin_intrinsic_metadata_search_fulltext(
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -306,7 +306,7 @@
             cur=cur)
 
     revision_metadata_cols = [
-        'id', 'translated_metadata',
+        'id', 'translated_metadata', 'mappings',
         'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
 
     @stored_procedure('swh_mktemp_revision_metadata')
@@ -321,7 +321,7 @@
             'revision_metadata', ids, self.revision_metadata_cols, cur=cur)
 
     origin_intrinsic_metadata_cols = [
-        'origin_id', 'metadata', 'from_revision',
+        'origin_id', 'metadata', 'from_revision', 'mappings',
         'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
 
     origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -13,7 +13,9 @@
     FossologyLicenseIndexer, FossologyLicenseRangeIndexer
 )
 from .rehash import RecomputeChecksums
-from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
+from .metadata import (
+    RevisionMetadataIndexer, OriginMetadataIndexer, FullOriginMetadataIndexer,
+)
 from .origin_head import OriginHeadIndexer
 
 
@@ -29,6 +31,12 @@
     return getattr(results, 'results', results)
 
 
+@app.task(name=__name__ + '.FullOriginMetadata')
+def full_origin_metadata(*args, **kwargs):
+    results = FullOriginMetadataIndexer().run(*args, **kwargs)
+    return getattr(results, 'results', results)
+
+
 @app.task(name=__name__ + '.OriginHead')
 def origin_head(*args, **kwargs):
     results = OriginHeadIndexer().run(*args, **kwargs)
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1217,6 +1217,7 @@
                 'softwareRequirements': None,
                 'identifier': None
             },
+            'mappings': [],
             'indexer_configuration_id': tool_id
         }])
 
@@ -1250,6 +1251,7 @@
                 'softwareRequirements': None,
                 'identifier': None
             },
+            'mappings': ['mapping1', 'mapping2'],
            'indexer_configuration_id': tool_id
         }
 
@@ -1263,9 +1265,10 @@
         expected_metadata = [{
             'id': self.revision_id_2,
             'translated_metadata': metadata_rev['translated_metadata'],
+            'mappings': ['mapping1', 'mapping2'],
             'tool': self.tools['swh-metadata-detector']
         }]
 
         self.assertEqual(actual_metadata, expected_metadata)
 
     def test_revision_metadata_add_drop_duplicate(self):
@@ -1291,6 +1294,7 @@
                 'softwareRequirements': None,
                 'identifier': None
             },
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
 
@@ -1304,6 +1308,7 @@
         expected_metadata_v1 = [{
             'id': self.revision_id_1,
             'translated_metadata': metadata_v1['translated_metadata'],
+            'mappings': [],
             'tool': self.tools['swh-metadata-detector']
         }]
 
@@ -1350,6 +1355,7 @@
                 'softwareRequirements': None,
                 'identifier': None
             },
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
 
@@ -1364,6 +1370,7 @@
         expected_metadata_v1 = [{
             'id': self.revision_id_2,
             'translated_metadata': metadata_v1['translated_metadata'],
+            'mappings': [],
             'tool': self.tools['swh-metadata-detector']
         }]
         self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1384,6 +1391,7 @@
         expected_metadata_v2 = [{
             'id': self.revision_id_2,
             'translated_metadata': metadata_v2['translated_metadata'],
+            'mappings': [],
             'tool': self.tools['swh-metadata-detector']
         }]
 
@@ -1414,12 +1422,14 @@
         metadata_rev = {
             'id': self.revision_id_2,
             'translated_metadata': metadata,
+            'mappings': ['mapping1'],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
             'origin_id': self.origin_id_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
+            'mappings': ['mapping1'],
             'from_revision': self.revision_id_2,
         }
 
@@ -1436,6 +1446,7 @@
             'metadata': metadata,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_2,
+            'mappings': ['mapping1'],
         }]
 
         self.assertEqual(actual_metadata, expected_metadata)
@@ -1464,12 +1475,14 @@
         metadata_rev_v1 = {
             'id': self.revision_id_1,
             'translated_metadata': metadata_v1.copy(),
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
             'origin_id': self.origin_id_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
+            'mappings': [],
             'from_revision': self.revision_id_1,
         }
 
@@ -1486,6 +1499,7 @@
             'metadata': metadata_v1,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_1,
+            'mappings': [],
         }]
 
         self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1535,12 +1549,14 @@
         metadata_rev_v1 = {
             'id': self.revision_id_2,
             'translated_metadata': metadata_v1,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
             'origin_id': self.origin_id_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
+            'mappings': [],
             'from_revision': self.revision_id_2,
         }
 
@@ -1558,6 +1574,7 @@
             'metadata': metadata_v1,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_2,
+            'mappings': [],
         }]
 
         self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1585,6 +1602,7 @@
             'metadata': metadata_v2,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_2,
+            'mappings': [],
         }]
 
         # metadata did change as the v2 was used to overwrite v1
@@ -1600,11 +1618,13 @@
         metadata1_rev = {
             'id': self.revision_id_1,
             'translated_metadata': metadata1,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
             'origin_id': self.origin_id_1,
             'metadata': metadata1,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': self.revision_id_1,
         }
@@ -1614,11 +1634,13 @@
         metadata2_rev = {
             'id': self.revision_id_2,
             'translated_metadata': metadata2,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
             'origin_id': self.origin_id_2,
             'metadata': metadata2,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': self.revision_id_2,
         }
@@ -1662,11 +1684,13 @@
         metadata1_rev = {
             'id': self.revision_id_1,
             'translated_metadata': metadata1,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
             'origin_id': self.origin_id_1,
             'metadata': metadata1,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': self.revision_id_1,
         }
@@ -1679,11 +1703,13 @@
         metadata2_rev = {
             'id': self.revision_id_2,
             'translated_metadata': metadata2,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
             'origin_id': self.origin_id_2,
             'metadata': metadata2,
+            'mappings': [],
             'indexer_configuration_id': tool_id,
             'from_revision': self.revision_id_2,
         }
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -38,18 +38,10 @@
         assert False, 'should not be called; the rev indexer configures it.'
 
 
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
-    """Specific indexer whose configuration is enough to satisfy the
-       indexing tests.
-    """
-
-    ContentMetadataIndexer = ContentMetadataTestIndexer
-
-    def parse_config_file(self, *args, **kwargs):
-        return {
-            **BASE_TEST_CONFIG,
-            'tools': TRANSLATOR_TOOL,
-        }
+REVISION_METADATA_CONFIG = {
+    **BASE_TEST_CONFIG,
+    'tools': TRANSLATOR_TOOL,
+}
 
 
 class Metadata(unittest.TestCase):
@@ -213,7 +205,7 @@
                     'name': 'test_metadata',
                     'version': '0.0.1'
                 },
-                'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5')
+                'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
             }, {
                 'translated_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
@@ -728,7 +720,8 @@
         })
 
     def test_revision_metadata_indexer(self):
-        metadata_indexer = RevisionMetadataTestIndexer()
+        metadata_indexer = RevisionMetadataIndexer(
+            config=REVISION_METADATA_CONFIG)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
@@ -786,6 +779,7 @@
                 'name': 'yarn-parser',
                 'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
             },
+            'mappings': ['npm'],
         }]
 
         for result in results:
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -10,24 +10,26 @@
     BASE_TEST_CONFIG, fill_storage
 )
 
+ORIGIN_HEAD_CONFIG = {
+    **BASE_TEST_CONFIG,
+    'tools': {
+        'name': 'origin-metadata',
+        'version': '0.0.1',
+        'configuration': {},
+    },
+    'tasks': {
+        'revision_metadata': None,
+        'origin_intrinsic_metadata': None,
+    }
+}
+
 
 class OriginHeadTestIndexer(OriginHeadIndexer):
     """Specific indexer whose configuration is enough to satisfy the
       indexing tests.
    """
     def parse_config_file(self, *args, **kwargs):
-        return {
-            **BASE_TEST_CONFIG,
-            'tools': {
-                'name': 'origin-metadata',
-                'version': '0.0.1',
-                'configuration': {},
-            },
-            'tasks': {
-                'revision_metadata': None,
-                'origin_intrinsic_metadata': None,
-            }
-        }
+        return ORIGIN_HEAD_CONFIG
 
     def persist_index_computations(self, results, policy_update):
         self.results = results
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,28 +11,39 @@
 from swh.indexer.storage.in_memory import IndexerStorage
+from swh.indexer.metadata import FullOriginMetadataIndexer
 from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
 from swh.scheduler.celery_backend.runner import run_ready_tasks
-
-from .utils import fill_storage, fill_obj_storage
-from .test_origin_head import OriginHeadTestIndexer
-from swh.indexer.tests.tasks import (
-    RevisionMetadataTestIndexer, OriginMetadataTestIndexer)
-
-
-class OriginHeadTestIndexer(OriginHeadTestIndexer):
-    def prepare(self):
-        super().prepare()
-        self.config['tasks'] = {
-            'revision_metadata': 'revision_metadata',
-            'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
-        }
+from swh.indexer.metadata import (
+    OriginMetadataIndexer, RevisionMetadataIndexer
+)
+from swh.indexer.origin_head import OriginHeadIndexer
+
+from .utils import fill_storage, fill_obj_storage, BASE_TEST_CONFIG
+from .test_metadata import REVISION_METADATA_CONFIG
+
+
+ORIGIN_HEAD_CONFIG = {
+    **BASE_TEST_CONFIG,
+    'tools': {
+        'name': 'origin-metadata',
+        'version': '0.0.1',
+        'configuration': {},
+    },
+    'tasks': {
+        'revision_metadata': 'revision_metadata',
+        'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
+    }
+}
 
 
+@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file')
+@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file')
 @mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
 @mock.patch('swh.storage.in_memory.Storage')
 def test_pipeline(storage_mock, idx_storage_mock,
+                  origin_head_parse_config, revision_metadata_parse_config,
                   swh_app, celery_session_worker, indexer_scheduler):
     scheduler = indexer_scheduler
     # Always returns the same instance of the idx storage, because
@@ -41,6 +52,8 @@
     storage = Storage()
     idx_storage = IndexerStorage()
 
+    origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG
+    revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG
     storage_mock.return_value = storage
     idx_storage_mock.return_value = idx_storage
 
@@ -53,9 +66,9 @@
     old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
     swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
     try:
-        RevisionMetadataTestIndexer.scheduler = scheduler
-        OriginMetadataTestIndexer.scheduler = scheduler
-        indexer = OriginHeadTestIndexer()
+        RevisionMetadataIndexer.scheduler = scheduler
+        OriginMetadataIndexer.scheduler = scheduler
+        indexer = OriginHeadIndexer()
         indexer.scheduler = scheduler
         indexer.run(["git+https://github.com/librariesio/yarn-parser"])
         tasks = []
@@ -74,8 +87,90 @@
             promise.wait()
     finally:
         swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
-        del RevisionMetadataTestIndexer.scheduler
-        del OriginMetadataTestIndexer.scheduler
+        del RevisionMetadataIndexer.scheduler
+        del OriginMetadataIndexer.scheduler
+
+    origin = storage.origin_get({
+        'type': 'git',
+        'url': 'https://github.com/librariesio/yarn-parser'})
+    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+    metadata = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'url':
+            'https://github.com/librariesio/yarn-parser#readme',
+        'codeRepository':
+            'git+git+https://github.com/librariesio/yarn-parser.git',
+        'author': [{
+            'type': 'Person',
+            'name': 'Andrew Nesbitt'
+        }],
+        'license': 'https://spdx.org/licenses/AGPL-3.0',
+        'version': '1.0.0',
+        'description':
+            'Tiny web service for parsing yarn.lock files',
+        'issueTracker':
+            'https://github.com/librariesio/yarn-parser/issues',
+        'name': 'yarn-parser',
+        'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
+    }
+    rev_metadata = {
+        'id': rev_id,
+        'translated_metadata': metadata,
+        'mappings': ['npm'],
+    }
+    origin_metadata = {
+        'origin_id': origin['id'],
+        'from_revision': rev_id,
+        'metadata': metadata,
+        'mappings': ['npm'],
+    }
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    for result in results:
+        del result['tool']
+    assert results == [rev_metadata]
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    for result in results:
+        del result['tool']
+    assert results == [origin_metadata]
+
+
+@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file')
+@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file')
+@mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
+@mock.patch('swh.storage.in_memory.Storage')
+def test_full_origin_metadata_indexer(
+        storage_mock, idx_storage_mock, origin_head_parse_config,
+        revision_metadata_parse_config):
+    # Always returns the same instance of the idx storage, because
+    # this function is called by each of the three indexers.
+    objstorage = InMemoryObjStorage()
+    storage = Storage()
+    idx_storage = IndexerStorage()
+
+    origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG
+    revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG
+    storage_mock.return_value = storage
+    idx_storage_mock.return_value = idx_storage
+
+    fill_obj_storage(objstorage)
+    fill_storage(storage)
+
+    # TODO: find a better way to share the ContentMetadataIndexer use
+    # the same objstorage instance.
+    import swh.objstorage
+    old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
+    swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
+    try:
+        indexer = FullOriginMetadataIndexer()
+        indexer.storage = storage
+        indexer.idx_storage = idx_storage
+        indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+    finally:
+        swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
 
     origin = storage.origin_get({
         'type': 'git',
@@ -104,11 +199,13 @@
     rev_metadata = {
         'id': rev_id,
         'translated_metadata': metadata,
+        'mappings': ['npm'],
     }
     origin_metadata = {
         'origin_id': origin['id'],
         'from_revision': rev_id,
         'metadata': metadata,
+        'mappings': ['npm'],
     }
 
     results = list(indexer.idx_storage.revision_metadata_get([rev_id]))