diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -26,7 +26,7 @@
         }),
         'origin_visit_tasks': ('List[dict]', [
             {
-                'type': 'indexer_origin_head',
+                'type': 'indexer_full_origin_metadata',
                 'kwargs': {
                     'policy_update': 'update-dups',
                     'parse_ids': False,
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -9,6 +9,7 @@
 from copy import deepcopy
 
 from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
+from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
@@ -136,7 +137,7 @@
         - if multiple file detected -> translation needed at revision level
 
         Args:
-            rev (bytes): revision artifact from storage
+            rev (dict): revision artifact from storage
 
         Returns:
             dict: dictionary representing a revision_metadata, with keys:
@@ -307,6 +308,85 @@
             conflict_update=(policy_update == 'update-dups'))
 
 
+class FullOriginMetadataIndexer(OriginIndexer):
+    """Compute revision and origin intrinsic metadata in one pass.
+
+    For each origin, the head revision is looked up with an
+    OriginHeadIndexer, that revision is indexed with a
+    RevisionMetadataIndexer, and the origin metadata is derived from
+    the revision metadata. Both results are stored together.
+    """
+
+    CONFIG_BASE_FILENAME = 'indexer/full_origin_intrinsic_metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('list', [])
+    }
+
+    USE_TOOLS = False
+
+    def __init__(self):
+        super().__init__()
+        self._prepare_sub_indexers()
+
+    def _prepare_sub_indexers(self):
+        self.origin_head_indexer = OriginHeadIndexer()
+        self.revision_metadata_indexer = RevisionMetadataIndexer()
+
+    def index(self, origin):
+        """Index the head revision of an origin.
+
+        Args:
+            origin (dict): origin to index; its 'id' key is read here
+
+        Returns:
+            tuple: (origin metadata, revision metadata), or None when
+            no head revision is found or the storage returns nothing
+            for it
+
+        """
+        head_result = self.origin_head_indexer.index(origin)
+        if not head_result:
+            return
+        rev_id = head_result['revision_id']
+
+        rev = list(self.storage.revision_get([rev_id]))
+        # NOTE(review): this only catches an empty result; confirm
+        # whether revision_get can yield None for an unknown id.
+        if not rev:
+            # Logger arguments are passed positionally, not as a tuple.
+            self.log.warning('Missing head revision %s of origin %r',
+                             hashutil.hash_to_bytes(rev_id), origin)
+            return
+        assert len(rev) == 1
+        rev = rev[0]
+        rev_metadata = self.revision_metadata_indexer.index(rev)
+        orig_metadata = {
+            'from_revision': rev_metadata['id'],
+            'origin_id': origin['id'],
+            'metadata': rev_metadata['translated_metadata'],
+            'indexer_configuration_id':
+                rev_metadata['indexer_configuration_id'],
+        }
+        return (orig_metadata, rev_metadata)
+
+    def persist_index_computations(self, results, policy_update):
+        """Store the revision metadata, then the origin metadata.
+
+        Args:
+            results (list): (origin metadata, revision metadata) pairs
+            policy_update (str): 'update-dups' to overwrite on conflict
+
+        """
+        self.idx_storage.revision_metadata_add(
+            [rev_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+        self.idx_storage.origin_intrinsic_metadata_add(
+            [orig_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+
 @click.command()
 @click.option('--revs', '-i',
               help='Default sha1_git to lookup', multiple=True)
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -13,7 +13,9 @@
     FossologyLicenseIndexer, FossologyLicenseRangeIndexer
 )
 from .rehash import RecomputeChecksums
-from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
+from .metadata import (
+    RevisionMetadataIndexer, OriginMetadataIndexer, FullOriginMetadataIndexer,
+)
 from .origin_head import OriginHeadIndexer
 
 
@@ -29,6 +31,12 @@
     return getattr(results, 'results', results)
 
 
+@app.task(name=__name__ + '.FullOriginMetadata')
+def full_origin_metadata(*args, **kwargs):
+    results = FullOriginMetadataIndexer().run(*args, **kwargs)
+    return getattr(results, 'results', results)
+
+
 @app.task(name=__name__ + '.OriginHead')
 def origin_head(*args, **kwargs):
     results = OriginHeadIndexer().run(*args, **kwargs)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -38,18 +38,10 @@
         assert False, 'should not be called; the rev indexer configures it.'
 
 
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
-    """Specific indexer whose configuration is enough to satisfy the
-       indexing tests.
- """ - - ContentMetadataIndexer = ContentMetadataTestIndexer - - def parse_config_file(self, *args, **kwargs): - return { - **BASE_TEST_CONFIG, - 'tools': TRANSLATOR_TOOL, - } +REVISION_METADATA_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': TRANSLATOR_TOOL, +} class Metadata(unittest.TestCase): @@ -728,7 +720,8 @@ }) def test_revision_metadata_indexer(self): - metadata_indexer = RevisionMetadataTestIndexer() + metadata_indexer = RevisionMetadataIndexer( + config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -10,24 +10,26 @@ BASE_TEST_CONFIG, fill_storage ) +ORIGIN_HEAD_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }, + 'tasks': { + 'revision_metadata': None, + 'origin_intrinsic_metadata': None, + } +} + class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): - return { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'origin-metadata', - 'version': '0.0.1', - 'configuration': {}, - }, - 'tasks': { - 'revision_metadata': None, - 'origin_intrinsic_metadata': None, - } - } + return ORIGIN_HEAD_CONFIG def persist_index_computations(self, results, policy_update): self.results = results diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -11,28 +11,39 @@ from swh.indexer.storage.in_memory import IndexerStorage +from swh.indexer.metadata import FullOriginMetadataIndexer from swh.objstorage.objstorage_in_memory import InMemoryObjStorage from swh.scheduler.celery_backend.runner import run_ready_tasks - -from .utils import fill_storage, fill_obj_storage -from .test_origin_head import OriginHeadTestIndexer -from swh.indexer.tests.tasks import ( - RevisionMetadataTestIndexer, OriginMetadataTestIndexer) - - -class OriginHeadTestIndexer(OriginHeadTestIndexer): - def prepare(self): - super().prepare() - self.config['tasks'] = { - 'revision_metadata': 'revision_metadata', - 'origin_intrinsic_metadata': 'origin_intrinsic_metadata', - } +from swh.indexer.metadata import ( + OriginMetadataIndexer, RevisionMetadataIndexer +) +from swh.indexer.origin_head import OriginHeadIndexer + +from .utils import fill_storage, fill_obj_storage, BASE_TEST_CONFIG +from .test_metadata import REVISION_METADATA_CONFIG + + +ORIGIN_HEAD_CONFIG = { + **BASE_TEST_CONFIG, + 'tools': { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }, + 'tasks': { + 'revision_metadata': 'revision_metadata', + 'origin_intrinsic_metadata': 'origin_intrinsic_metadata', + } +} +@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file') +@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file') 
@mock.patch('swh.indexer.storage.in_memory.IndexerStorage') @mock.patch('swh.storage.in_memory.Storage') def test_pipeline(storage_mock, idx_storage_mock, + origin_head_parse_config, revision_metadata_parse_config, swh_app, celery_session_worker, indexer_scheduler): scheduler = indexer_scheduler # Always returns the same instance of the idx storage, because @@ -41,6 +52,8 @@ storage = Storage() idx_storage = IndexerStorage() + origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG + revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG storage_mock.return_value = storage idx_storage_mock.return_value = idx_storage @@ -53,9 +66,9 @@ old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory'] swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage try: - RevisionMetadataTestIndexer.scheduler = scheduler - OriginMetadataTestIndexer.scheduler = scheduler - indexer = OriginHeadTestIndexer() + RevisionMetadataIndexer.scheduler = scheduler + OriginMetadataIndexer.scheduler = scheduler + indexer = OriginHeadIndexer() indexer.scheduler = scheduler indexer.run(["git+https://github.com/librariesio/yarn-parser"]) tasks = [] @@ -74,8 +87,88 @@ promise.wait() finally: swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage - del RevisionMetadataTestIndexer.scheduler - del OriginMetadataTestIndexer.scheduler + del RevisionMetadataIndexer.scheduler + del OriginMetadataIndexer.scheduler + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + metadata = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'url': + 'https://github.com/librariesio/yarn-parser#readme', + 'codeRepository': + 'git+git+https://github.com/librariesio/yarn-parser.git', + 'author': [{ + 'type': 'Person', + 'name': 'Andrew Nesbitt' + }], + 'license': 'https://spdx.org/licenses/AGPL-3.0', + 'version': '1.0.0', + 'description': + 'Tiny 
web service for parsing yarn.lock files', + 'issueTracker': + 'https://github.com/librariesio/yarn-parser/issues', + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + } + rev_metadata = { + 'id': rev_id, + 'translated_metadata': metadata, + } + origin_metadata = { + 'origin_id': origin['id'], + 'from_revision': rev_id, + 'metadata': metadata, + } + + results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + for result in results: + del result['tool'] + assert results == [rev_metadata] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + for result in results: + del result['tool'] + assert results == [origin_metadata] + + +@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file') +@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file') +@mock.patch('swh.indexer.storage.in_memory.IndexerStorage') +@mock.patch('swh.storage.in_memory.Storage') +def test_full_origin_metadata_indexer( + storage_mock, idx_storage_mock, origin_head_parse_config, + revision_metadata_parse_config): + # Always returns the same instance of the idx storage, because + # this function is called by each of the three indexers. + objstorage = InMemoryObjStorage() + storage = Storage() + idx_storage = IndexerStorage() + + origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG + revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG + storage_mock.return_value = storage + idx_storage_mock.return_value = idx_storage + + fill_obj_storage(objstorage) + fill_storage(storage) + + # TODO: find a better way to share the ContentMetadataIndexer use + # the same objstorage instance. 
+ import swh.objstorage + old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory'] + swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage + try: + indexer = FullOriginMetadataIndexer() + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + finally: + swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage origin = storage.origin_get({ 'type': 'git',