diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -26,7 +26,7 @@
     }),
     'origin_visit_tasks': ('List[dict]', [
         {
-            'type': 'indexer_origin_head',
+            'type': 'indexer_full_origin_metadata',
             'kwargs': {
                 'policy_update': 'update-dups',
                 'parse_ids': False,
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -9,6 +9,7 @@
 from copy import deepcopy
 
 from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
+from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
@@ -137,7 +138,7 @@
         - if multiple file detected -> translation needed at revision level
 
         Args:
-            rev (bytes): revision artifact from storage
+            rev (dict): revision artifact from storage
 
         Returns:
             dict: dictionary representing a revision_metadata, with keys:
@@ -311,6 +312,58 @@
             conflict_update=(policy_update == 'update-dups'))
 
 
+class FullOriginMetadataIndexer(OriginIndexer):
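+    """Indexer computing an origin's intrinsic metadata in a single
+    task: it finds the origin's head revision with an OriginHeadIndexer,
+    translates its metadata with a RevisionMetadataIndexer, and stores
+    the result as both revision metadata and origin intrinsic metadata.
+    """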
+    CONFIG_BASE_FILENAME = 'indexer/full_origin_intrinsic_metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('list', [])
+    }
+
+    USE_TOOLS = False
+
+    def __init__(self):
+        super().__init__()
+        self.origin_head_indexer = OriginHeadIndexer()
+        self.revision_metadata_indexer = RevisionMetadataIndexer()
+
+    def index(self, origin):
+        head_result = self.origin_head_indexer.index(origin)
+        if not head_result:
+            return
+        rev_id = head_result['revision_id']
+
+        rev = list(self.storage.revision_get([rev_id]))
+        if not rev:
+            self.log.warning('Missing head revision %s of origin %r',
+                             rev_id, origin)
+            return
+        assert len(rev) == 1
+        rev = rev[0]
+        rev_metadata = self.revision_metadata_indexer.index(rev)
+        orig_metadata = {
+            'from_revision': rev_metadata['id'],
+            'origin_id': origin['id'],
+            'metadata': rev_metadata['translated_metadata'],
+            'indexer_configuration_id':
+                rev_metadata['indexer_configuration_id'],
+        }
+        return (orig_metadata, rev_metadata)
+
+    def persist_index_computations(self, results, policy_update):
+        self.idx_storage.revision_metadata_add(
+            [rev_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+        self.idx_storage.origin_intrinsic_metadata_add(
+            [orig_item for (orig_item, rev_item) in results],
+            conflict_update=(policy_update == 'update-dups'))
+
+
 @click.command()
 @click.option('--revs', '-i',
               help='Default sha1_git to lookup', multiple=True)
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -13,7 +13,9 @@
     FossologyLicenseIndexer, FossologyLicenseRangeIndexer
 )
 from .rehash import RecomputeChecksums
-from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
+from .metadata import (
+    RevisionMetadataIndexer, OriginMetadataIndexer, FullOriginMetadataIndexer,
+)
 from .origin_head import OriginHeadIndexer
 
 
@@ -29,6 +31,12 @@
     return getattr(results, 'results', results)
 
 
+@app.task(name=__name__ + '.FullOriginMetadata')
+def full_origin_metadata(*args, **kwargs):
+    results = FullOriginMetadataIndexer().run(*args, **kwargs)
+    return getattr(results, 'results', results)
+
+
 @app.task(name=__name__ + '.OriginHead')
 def origin_head(*args, **kwargs):
     results = OriginHeadIndexer().run(*args, **kwargs)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -771,7 +771,7 @@
             Description: foo
             Hydrology N\xc2\xb083
             ''') # noqa
-        result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content)
+        result = self.pkginfo_mapping.translate(raw_content)
         self.assertEqual(result, {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'type': 'SoftwareSourceCode',
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -8,18 +8,17 @@
 from celery.result import AsyncResult
 from unittest import mock
 
+from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
 from swh.model.hashutil import hash_to_bytes
+from swh.scheduler.celery_backend.runner import run_ready_tasks
 from swh.storage.in_memory import Storage
 
-
-from swh.indexer.storage.in_memory import IndexerStorage
-from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
-
-from swh.scheduler.celery_backend.runner import run_ready_tasks
 from swh.indexer.metadata import (
-    OriginMetadataIndexer, RevisionMetadataIndexer
+    OriginMetadataIndexer, RevisionMetadataIndexer,
+    FullOriginMetadataIndexer
 )
 from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.storage.in_memory import IndexerStorage
 
 from .utils import fill_storage, fill_obj_storage, BASE_TEST_CONFIG
 from .test_metadata import REVISION_METADATA_CONFIG
@@ -136,3 +135,87 @@
     for result in results:
         del result['tool']
     assert results == [origin_metadata]
+
+
+@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file')
+@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file')
+@mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
+@mock.patch('swh.storage.in_memory.Storage')
+def test_full_origin_metadata_indexer(
+        storage_mock, idx_storage_mock, origin_head_parse_config,
+        revision_metadata_parse_config):
+    # Always return the same storage instances: the mocked classes are
+    # instantiated once by each of the three indexers involved.
+    objstorage = InMemoryObjStorage()
+    storage = Storage()
+    idx_storage = IndexerStorage()
+
+    origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG
+    revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG
+    storage_mock.return_value = storage
+    idx_storage_mock.return_value = idx_storage
+
+    fill_obj_storage(objstorage)
+    fill_storage(storage)
+
+    # TODO: find a better way to make the ContentMetadataIndexer use
+    # the same objstorage instance.
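+    # The in-memory objstorage class registered below is swapped for a
+    # factory returning the pre-filled instance above, so the
+    # ContentMetadataIndexer created deeper in the stack shares it; the
+    # original class is restored in the finally clause.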
+    import swh.objstorage
+    old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
+    swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
+    try:
+        indexer = FullOriginMetadataIndexer()
+        indexer.storage = storage
+        indexer.idx_storage = idx_storage
+        indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+    finally:
+        swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
+
+    origin = storage.origin_get({
+        'type': 'git',
+        'url': 'https://github.com/librariesio/yarn-parser'})
+    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+    metadata = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'url':
+            'https://github.com/librariesio/yarn-parser#readme',
+        'codeRepository':
+            'git+git+https://github.com/librariesio/yarn-parser.git',
+        'author': [{
+            'type': 'Person',
+            'name': 'Andrew Nesbitt'
+        }],
+        'license': 'https://spdx.org/licenses/AGPL-3.0',
+        'version': '1.0.0',
+        'description':
+            'Tiny web service for parsing yarn.lock files',
+        'issueTracker':
+            'https://github.com/librariesio/yarn-parser/issues',
+        'name': 'yarn-parser',
+        'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
+    }
+    rev_metadata = {
+        'id': rev_id,
+        'translated_metadata': metadata,
+    }
+    origin_metadata = {
+        'origin_id': origin['id'],
+        'from_revision': rev_id,
+        'metadata': metadata,
+    }
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    for result in results:
+        del result['tool']
+    assert results == [rev_metadata]
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    for result in results:
+        del result['tool']
+    assert results == [origin_metadata]
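
Note: besides the Celery task added in tasks.py, the new indexer can be driven directly, the same way the test above exercises it. A minimal sketch, assuming storages are configured through the usual parse_config_file machinery (the policy_update keyword mirrors the value the journal client passes):

    from swh.indexer.metadata import FullOriginMetadataIndexer

    indexer = FullOriginMetadataIndexer()
    # Runs head-revision lookup, revision metadata translation, and
    # origin metadata persistence for each listed origin.
    indexer.run(['git+https://github.com/librariesio/yarn-parser'],
                policy_update='update-dups')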