diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py
--- a/swh/indexer/__init__.py
+++ b/swh/indexer/__init__.py
@@ -21,6 +21,8 @@
     'ctags': 'swh.indexer.tasks.SWHCtagsTask',
     'fossology_license': 'swh.indexer.tasks.SWHContentFossologyLicenseTask',
     'rehash': 'swh.indexer.tasks.SWHRecomputeChecksumsTask',
+    'revision_metadata_task': 'swh.indexer.tasks.SWHRevisionMetadataTask',
+    'origin_metadata_task': 'swh.indexer.tasks.SWHOriginMetadataTask',
 }
 
 
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -361,6 +361,7 @@
 
         """
         results = []
+        promises = []
         try:
             for sha1 in ids:
                 try:
@@ -374,7 +375,9 @@
                     results.append(res)
 
             self.persist_index_computations(results, policy_update)
-            self.next_step(results)
+            promise = self.next_step(results)
+            if promise is not None:
+                promises.append(promise)
         except Exception:
             self.log.exception(
                 'Problem when reading contents metadata.')
@@ -382,6 +385,8 @@
                 self.log.warn('Rescheduling batch')
                 self.rescheduling_task.delay(ids, policy_update)
 
+        return promises
+
 
 class OriginIndexer(BaseIndexer):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -5,7 +5,7 @@
 import click
 import logging
 
-from swh.indexer.indexer import ContentIndexer, RevisionIndexer
+from swh.indexer.indexer import ContentIndexer, RevisionIndexer, BaseIndexer
 from swh.indexer.metadata_dictionary import compute_metadata
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
@@ -266,6 +266,44 @@
         return min_metadata
 
 
+class OriginMetadataIndexer(BaseIndexer):
+    """Skeleton of the indexer run by ``SWHOriginMetadataTask``.
+
+    ``OriginHeadIndexer.next_step`` links that task as the callback of the
+    revision metadata task. Apart from :meth:`filter`, every method is
+    still a stub.
+
+    """
+    def filter(self, ids):
+        """Return ``ids`` unchanged."""
+        return ids
+
+    def run(self, ids, map_=None):
+        """Entry point called by the task; not implemented yet.
+
+        Args:
+            ids: first positional argument forwarded by the task.
+            map_ (dict): mapping from origin id to head revision id,
+                passed as a keyword argument by
+                ``OriginHeadIndexer.next_step``.
+
+        """
+        pass  # TODO
+
+    def index(self, ids):
+        """Not implemented yet."""
+        pass  # TODO
+
+    def persist_index_computations(self, results, policy_update=None):
+        """Not implemented yet.
+
+        ``policy_update`` is accepted because the ``run`` loop in
+        ``swh.indexer.indexer`` calls this hook with two arguments.
+
+        """
+        pass  # TODO
+
+
 @click.command()
 @click.option('--revs', '-i',
               help='Default sha1_git to lookup', multiple=True)
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -8,6 +8,7 @@
 import logging
 
 from swh.indexer.indexer import OriginIndexer
+from swh.indexer.tasks import SWHOriginMetadataTask, SWHRevisionMetadataTask
 
 
 class OriginHeadIndexer(OriginIndexer):
@@ -26,6 +27,11 @@
         }),
     }
 
+    CONFIG_BASE_FILENAME = 'indexer/origin_head'
+
+    RevisionMetadataTask = SWHRevisionMetadataTask
+    OriginMetadataTask = SWHOriginMetadataTask
+
     def filter(self, ids):
         yield from ids
 
@@ -34,6 +40,33 @@
         should only be piped to another indexer via the orchestrator."""
         pass
 
+    def next_step(self, results):
+        """Once the head is found, call the RevisionMetadataIndexer
+        on these revisions, then call the OriginMetadataIndexer with
+        both the origin_id and the revision metadata, so it can copy the
+        revision metadata to the origin's metadata.
+
+        Args:
+            results (Iterable[dict]): Iterable of return values from `index`.
+
+        Returns:
+            The value returned by ``apply_async`` on the revision
+            metadata task, or None if ``results`` is empty.
+
+        """
+        # `results` is traversed twice below: materialize it first so a
+        # one-shot iterable (e.g. a generator) still yields the revisions.
+        results = list(results)
+        if not results:
+            # No head was found: do not schedule tasks with empty input.
+            return None
+        origin_to_revision = {res['origin_id']: res['revision_id']
+                              for res in results}
+        return self.RevisionMetadataTask.apply_async(
+            args=([res['revision_id'] for res in results],),
+            link=self.OriginMetadataTask.s(map_=origin_to_revision),
+        )
+
     # Dispatch
 
     def index(self, origin):
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -14,6 +14,7 @@
 from .ctags import CtagsIndexer
 from .fossology_license import ContentFossologyLicenseIndexer
 from .rehash import RecomputeChecksums
+from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
 
 logging.basicConfig(level=logging.INFO)
 
@@ -40,6 +41,25 @@
         OrchestratorTextContentsIndexer().run(*args, **kwargs)
 
 
+class SWHRevisionMetadataTask(Task):
+    """Task which forwards its arguments to RevisionMetadataIndexer.run."""
+    task_queue = 'swh_indexer_revision_metadata'
+
+    def run_task(self, *args, **kwargs):
+        # NOTE(review): the indexer's return value is dropped here, so the
+        # task linked by OriginHeadIndexer.next_step presumably receives
+        # None as its first argument -- confirm this is intended.
+        RevisionMetadataIndexer().run(*args, **kwargs)
+
+
+class SWHOriginMetadataTask(Task):
+    """Task which forwards its arguments to OriginMetadataIndexer.run."""
+    task_queue = 'swh_indexer_origin_metadata'
+
+    def run_task(self, *args, **kwargs):
+        OriginMetadataIndexer().run(*args, **kwargs)
+
+
 class SWHContentMimetypeTask(Task):
     """Task which computes the mimetype, encoding from the sha1's content.
 
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -41,6 +41,12 @@
             'project': None,
             'type': 'svn',
             'url': 'http://0-512-md.googlecode.com/svn/'},
+        {
+            'id': 424242,
+            'lister': None,
+            'project': None,
+            'type': 'git',
+            'url': 'https://github.com/moranegg/metadata_test'},
 ]
 
 SNAPSHOTS = {
@@ -114,6 +120,11 @@
                     'target_type': 'revision'}},
         'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
               b'\x05\xea\xb8\x1f\xc4H\xf4s'},
+    424242: {
+        'branches': {
+            b'HEAD': {
+                'target': b'head_branch_metadata_test',
+                'target_type': 'revision'}}}
 }
 
 
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import unittest
+
+from celery import Task
+
+from swh.indexer.metadata import OriginMetadataIndexer
+from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
+from swh.indexer.tests.test_utils import MockIndexerStorage
+from swh.indexer.tests.test_origin_head import TestOriginHeadIndexer
+from swh.indexer.tests.test_metadata import TestRevisionMetadataIndexer
+
+from . import start_worker_thread
+
+
+class TestOriginMetadataIndexer(OriginMetadataIndexer):
+    """OriginMetadataIndexer set up with the mock storages of test_utils."""
+    def prepare(self):
+        """Set a static configuration and mock backends."""
+        self.config = {
+            'storage': {
+                'cls': 'remote',
+                'args': {
+                    'url': 'http://localhost:9999',
+                }
+            },
+            'tools': {
+                'name': 'origin-metadata',
+                'version': '0.0.1',
+                'configuration': {}
+            }
+        }
+        self.storage = MockStorage()
+        self.idx_storage = MockIndexerStorage()
+        self.log = logging.getLogger('swh.indexer')
+        self.objstorage = MockObjStorage()
+        self.destination_task = None
+        self.tools = self.register_tools(self.config['tools'])
+        self.tool = self.tools[0]
+        self.results = []
+
+
+class TestRevisionMetadataTask(Task):
+    """Task which forwards its arguments to TestRevisionMetadataIndexer."""
+    task_queue = 'swh_indexer_revision_metadata'
+
+    def run_task(self, *args, **kwargs):
+        TestRevisionMetadataIndexer().run(*args, **kwargs)
+
+
+class TestOriginMetadataTask(Task):
+    """Task which forwards its arguments to TestOriginMetadataIndexer."""
+    # Mirrors the queue name declared by SWHOriginMetadataTask.
+    task_queue = 'swh_indexer_origin_metadata'
+
+    def run_task(self, *args, **kwargs):
+        TestOriginMetadataIndexer().run(*args, **kwargs)
+
+
+class TestOriginMetadata(unittest.TestCase):
+    def test_pipeline(self):
+        """Run the origin head indexer on one origin and wait for every
+        promise it returns.
+
+        """
+        # NOTE(review): nothing in this module attaches
+        # TestRevisionMetadataTask / TestOriginMetadataTask to the indexer,
+        # so it presumably still schedules the production tasks -- confirm.
+        indexer = TestOriginHeadIndexer()
+        with start_worker_thread():
+            promises = indexer.run(
+                ["git+https://github.com/moranegg/metadata_test"],
+                policy_update='remove-dups',
+                parse_ids=True)
+            for promise in promises:
+                promise.get()