diff --git a/requirements-swh.txt b/requirements-swh.txt
index 394e977..00962c2 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,6 +1,6 @@
 swh.core[db,http] >= 0.0.61
 swh.model >= 0.0.15
 swh.objstorage >= 0.0.28
 swh.scheduler >= 0.0.47
 swh.storage >= 0.0.123
-swh.journal >= 0.0.6
+swh.journal >= 0.0.11
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
index 37e97f0..f4ecfc6 100644
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,188 +1,241 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import functools
+
 import click
 
 from swh.core import config
 from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
+from swh.journal.cli import get_journal_client
 from swh.scheduler import get_scheduler
 from swh.scheduler.cli_utils import schedule_origin_batches
 from swh.storage import get_storage
 
 from swh.indexer import metadata_dictionary
+from swh.indexer.journal_client import process_journal_objects
 from swh.indexer.storage import get_indexer_storage
 from swh.indexer.storage.api.server import load_and_check_config, app
 
 
 @click.group(name='indexer', context_settings=CONTEXT_SETTINGS,
              cls=AliasedGroup)
 @click.option('--config-file', '-C', default=None,
               type=click.Path(exists=True, dir_okay=False,),
               help="Configuration file.")
 @click.pass_context
 def cli(ctx, config_file):
     """Software Heritage Indexer tools.
 
     The Indexer is used to mine the content of the archive and extract
     derived information from archive source code artifacts.
 
     """
     ctx.ensure_object(dict)
 
     conf = config.read(config_file)
     ctx.obj['config'] = conf
 
 
 def _get_api(getter, config, config_key, url):
     if url:
         config[config_key] = {
             'cls': 'remote',
             'args': {'url': url}
         }
     elif config_key not in config:
         raise click.ClickException(
             'Missing configuration for {}'.format(config_key))
     return getter(**config[config_key])
 
 
 @cli.group('mapping')
 def mapping():
     '''Manage Software Heritage Indexer mappings.'''
     pass
 
 
 @mapping.command('list')
 def mapping_list():
     """Prints the list of known mappings."""
     mapping_names = [mapping.name
                      for mapping in metadata_dictionary.MAPPINGS.values()]
     mapping_names.sort()
     for mapping_name in mapping_names:
         click.echo(mapping_name)
 
 
 @mapping.command('list-terms')
 @click.option('--exclude-mapping', multiple=True,
               help='Exclude the given mapping from the output')
 @click.option('--concise', is_flag=True,
               default=False,
               help='Don\'t print the list of mappings supporting each term.')
 def mapping_list_terms(concise, exclude_mapping):
     """Prints the list of known CodeMeta terms, and which mappings
     support them."""
     properties = metadata_dictionary.list_terms()
     for (property_name, supported_mappings) in sorted(properties.items()):
         supported_mappings = {m.name for m in supported_mappings}
         supported_mappings -= set(exclude_mapping)
         if supported_mappings:
             if concise:
                 click.echo(property_name)
             else:
                 click.echo('{}:'.format(property_name))
                 click.echo('\t' + ', '.join(sorted(supported_mappings)))
 
 
 @cli.group('schedule')
 @click.option('--scheduler-url', '-s', default=None,
               help="URL of the scheduler API")
 @click.option('--indexer-storage-url', '-i', default=None,
               help="URL of the indexer storage API")
 @click.option('--storage-url', '-g', default=None,
               help="URL of the (graph) storage API")
 @click.option('--dry-run/--no-dry-run', is_flag=True,
               default=False,
               help='List only what would be scheduled.')
 @click.pass_context
 def schedule(ctx, scheduler_url, storage_url, indexer_storage_url,
              dry_run):
     """Manipulate Software Heritage Indexer tasks.
 
     Via SWH Scheduler's API."""
     ctx.obj['indexer_storage'] = _get_api(
         get_indexer_storage,
         ctx.obj['config'],
         'indexer_storage',
         indexer_storage_url
     )
     ctx.obj['storage'] = _get_api(
         get_storage,
         ctx.obj['config'],
         'storage',
         storage_url
     )
     ctx.obj['scheduler'] = _get_api(
         get_scheduler,
         ctx.obj['config'],
         'scheduler',
         scheduler_url
     )
     if dry_run:
         ctx.obj['scheduler'] = None
 
 
 def list_origins_by_producer(idx_storage, mappings, tool_ids):
     start = 0
     limit = 10000
     while True:
         origins = list(
             idx_storage.origin_intrinsic_metadata_search_by_producer(
                 start=start, limit=limit, ids_only=True,
                 mappings=mappings or None, tool_ids=tool_ids or None))
         if not origins:
             break
         start = origins[-1]+1
         yield from origins
 
 
 @schedule.command('reindex_origin_metadata')
 @click.option('--batch-size', '-b', 'origin_batch_size',
               default=10, show_default=True, type=int,
               help="Number of origins per task")
 @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True,
               help="Restrict search of old metadata to this/these tool ids.")
 @click.option('--mapping', '-m', 'mappings', multiple=True,
               help="Mapping(s) that should be re-scheduled (eg. 'npm', "
                    "'gemspec', 'maven')")
 @click.option('--task-type', default='index-origin-metadata',
               show_default=True,
               help="Name of the task type to schedule.")
 @click.pass_context
 def schedule_origin_metadata_reindex(
         ctx, origin_batch_size, tool_ids, mappings, task_type):
     """Schedules indexing tasks for origins that were already indexed."""
     idx_storage = ctx.obj['indexer_storage']
     scheduler = ctx.obj['scheduler']
 
     origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
 
-    kwargs = {"policy_update": "update-dups", "parse_ids": False}
+    kwargs = {"policy_update": "update-dups"}
     schedule_origin_batches(
         scheduler, task_type, origins, origin_batch_size, kwargs)
 
 
+@cli.command('journal-client')
+@click.option('--scheduler-url', '-s', default=None,
+              help="URL of the scheduler API")
+@click.option('--origin-metadata-task-type',
+              default='index-origin-metadata',
+              help='Name of the task running the origin metadata indexer.')
+@click.option('--broker', 'brokers', type=str, multiple=True,
+              help='Kafka broker to connect to.')
+@click.option('--prefix', type=str, default=None,
+              help='Prefix of Kafka topic names to read from.')
+@click.option('--group-id', '--consumer-id', type=str,
+              help='Name of the consumer/group id for reading from Kafka.')
+@click.option('--max-messages', '-m', default=None, type=int,
+              help='Maximum number of objects to replay. Default is to '
+                   'run forever.')
+@click.pass_context
+def journal_client(ctx, scheduler_url, origin_metadata_task_type,
+                   brokers, prefix, group_id, max_messages):
+    """Listens for new objects from the SWH Journal, and schedules tasks
+    to run relevant indexers (currently, only origin-intrinsic-metadata)
+    on these new objects."""
+    scheduler = _get_api(
+        get_scheduler,
+        ctx.obj['config'],
+        'scheduler',
+        scheduler_url
+    )
+
+    client = get_journal_client(
+        ctx, brokers, prefix, group_id, object_types=['origin_visit'])
+
+    worker_fn = functools.partial(
+        process_journal_objects,
+        scheduler=scheduler,
+        task_names={
+            'origin_metadata': origin_metadata_task_type,
+        }
+    )
+    nb_messages = 0
+    try:
+        while not max_messages or nb_messages < max_messages:
+            nb_messages += client.process(worker_fn)
+            print('Processed %d messages.' % nb_messages)
+    except KeyboardInterrupt:
+        ctx.exit(0)
+    else:
+        print('Done.')
+
+
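Note: the command above binds `scheduler` and `task_names` into `process_journal_objects` with `functools.partial`, so that `JournalClient.process()` only has to hand over decoded message batches. A minimal sketch of that wiring, with a hypothetical `StubJournalClient` and a `Mock` scheduler standing in for the real Kafka-backed client and scheduler API:

```python
import functools
from unittest.mock import Mock

from swh.indexer.journal_client import process_journal_objects


class StubJournalClient:
    """Hypothetical stand-in for swh.journal.client.JournalClient: hands
    one batch of decoded messages to worker_fn, returns the message count."""
    def __init__(self, batches):
        self.batches = batches

    def process(self, worker_fn):
        batch = self.batches.pop(0)
        worker_fn(batch)
        return sum(len(msgs) for msgs in batch.values())


scheduler = Mock()  # any swh.scheduler instance would do here
worker_fn = functools.partial(
    process_journal_objects,
    scheduler=scheduler,
    task_names={'origin_metadata': 'index-origin-metadata'},
)
client = StubJournalClient([
    # example message; 'https://example.org/repo' is a placeholder URL
    {'origin_visit': [
        {b'status': b'full', b'origin': {b'url': 'https://example.org/repo'}},
    ]},
])
print('Processed %d messages.' % client.process(worker_fn))
# -> Processed 1 messages.
```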
 @cli.command('rpc-serve')
 @click.argument('config-path', required=1)
 @click.option('--host', default='0.0.0.0', help="Host to run the server")
 @click.option('--port', default=5007, type=click.INT,
               help="Binding port of the server")
 @click.option('--debug/--nodebug', default=True,
               help="Indicates if the server should run in debug mode")
 def rpc_server(config_path, host, port, debug):
     """Starts a Software Heritage Indexer RPC HTTP server."""
     api_cfg = load_and_check_config(config_path, type='any')
     app.config.update(api_cfg)
     app.run(host, port=int(port), debug=bool(debug))
 
 
 cli.add_alias(rpc_server, 'api-server')
 cli.add_alias(rpc_server, 'serve')
 
 
 def main():
     return cli(auto_envvar_prefix='SWH_INDEXER')
 
 
 if __name__ == '__main__':
     main()
diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
index ac236cc..aec8ad7 100644
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -1,88 +1,34 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 
-from swh.journal.client import JournalClient
-from swh.scheduler import get_scheduler
 from swh.scheduler.utils import create_task_dict
 
 
-class IndexerJournalClient(JournalClient):
-    """Client in charge of listing new received origins and origin_visits
-    in the swh journal.
-
-    """
-    CONFIG_BASE_FILENAME = 'indexer/journal_client'
-
-    ADDITIONAL_CONFIG = {
-        'scheduler': ('dict', {
-            'cls': 'remote',
-            'args': {
-                'url': 'http://localhost:5008/',
-            }
-        }),
-        'origin_visit_tasks': ('List[dict]', [
-            {
-                'type': 'index-origin-metadata',
-                'kwargs': {
-                    'policy_update': 'update-dups',
-                    'parse_ids': False,
-                }
-            }
-        ]),
-    }
-
-    def __init__(self):
-        super().__init__(extra_configuration={
-            'object_types': ['origin_visit'],
-        })
-        self.scheduler = get_scheduler(**self.config['scheduler'])
-        logging.info(
-            'Starting indexer journal client with config %r',
-            self.config)
-
-    def process_objects(self, messages):
-        assert set(messages) == {'origin_visit'}, set(messages)
-        for origin_visit in messages['origin_visit']:
-            self.process_origin_visit(origin_visit)
-
-    def process_origin_visit(self, origin_visit):
-        task_dicts = []
-        logging.debug('processing origin visit %r', origin_visit)
-        if origin_visit[b'status'] == b'full':
-            for task_config in self.config['origin_visit_tasks']:
-                logging.info(
-                    'Scheduling %s for visit of origin %d',
-                    task_config['type'], origin_visit[b'origin'])
-                task_dicts.append(create_task_dict(
-                    task_config['type'],
-                    'oneshot',
-                    [origin_visit[b'origin']],
-                    **task_config['kwargs'],
-                ))
-        else:
-            logging.debug('status is not "full", ignoring.')
-
-        if task_dicts:
-            self.scheduler.create_tasks(task_dicts)
-
-
-if __name__ == '__main__':
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s %(process)d %(levelname)s %(message)s'
-    )
-
-    import click
-
-    @click.command()
-    def main():
-        """Log the new received origin and origin_visits.
-
-        """
-        IndexerJournalClient().process()
-
-    main()
+def process_journal_objects(messages, *, scheduler, task_names):
+    """Worker function for `JournalClient.process(worker_fn)`, after
+    binding `scheduler` and `task_names` via `functools.partial`."""
+    assert set(messages) == {'origin_visit'}, set(messages)
+    for origin_visit in messages['origin_visit']:
+        process_origin_visit(origin_visit, scheduler, task_names)
+
+
+def process_origin_visit(origin_visit, scheduler, task_names):
+    task_dicts = []
+    logging.debug('processing origin visit %r', origin_visit)
+    if origin_visit[b'status'] == b'full':
+        if task_names.get('origin_metadata'):
+            task_dicts.append(create_task_dict(
+                task_names['origin_metadata'],
+                'oneshot',
+                [origin_visit[b'origin'][b'url']],
+                policy_update='update-dups',
+            ))
+    else:
+        logging.debug('status is not "full", ignoring.')
+
+    if task_dicts:
+        scheduler.create_tasks(task_dicts)
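Note: compared to the removed class, the visit's b'origin' value is now a mapping carrying the origin URL rather than a numeric origin id. For reference, here is what the new worker produces for a single "full" visit; the task-dict shape matches the unit test added at the end of this patch (the URL is a placeholder):

```python
from unittest.mock import Mock

from swh.indexer.journal_client import process_origin_visit

scheduler = Mock()
visit = {b'status': b'full', b'origin': {b'url': 'https://example.org/repo'}}
process_origin_visit(
    visit, scheduler, task_names={'origin_metadata': 'index-origin-metadata'})

# create_tasks was called with a single task dict:
((task_dicts,), _kwargs) = scheduler.create_tasks.call_args
task = task_dicts[0]
assert task['type'] == 'index-origin-metadata'
assert task['policy'] == 'oneshot'
assert task['arguments']['args'] == (['https://example.org/repo'],)
assert task['arguments']['kwargs'] == {'policy_update': 'update-dups'}
```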
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
index 347104c..10b1490 100644
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -1,322 +1,371 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from collections import namedtuple
 from functools import reduce
 import re
 import tempfile
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 
 from click.testing import CliRunner
 
 from swh.model.hashutil import hash_to_bytes
 
 from swh.indexer.cli import cli
 
 
 CLI_CONFIG = '''
 scheduler:
     cls: foo
     args: {}
 storage:
     cls: memory
     args: {}
 indexer_storage:
     cls: memory
     args: {}
 '''
 
 
 def fill_idx_storage(idx_storage, nb_rows):
     tools = [
         {
             'tool_name': 'tool %d' % i,
             'tool_version': '0.0.1',
             'tool_configuration': {},
         }
         for i in range(2)
     ]
     tools = idx_storage.indexer_configuration_add(tools)
     origin_metadata = [
         {
             'id': origin_id,
             'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
             'indexer_configuration_id': tools[origin_id % 2]['id'],
             'metadata': {'name': 'origin %d' % origin_id},
             'mappings': ['mapping%d' % (origin_id % 10)]
         }
         for origin_id in range(nb_rows)
     ]
     revision_metadata = [
         {
             'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
             'indexer_configuration_id': tools[origin_id % 2]['id'],
             'metadata': {'name': 'origin %d' % origin_id},
             'mappings': ['mapping%d' % (origin_id % 10)]
         }
         for origin_id in range(nb_rows)
     ]
 
     idx_storage.revision_intrinsic_metadata_add(revision_metadata)
     idx_storage.origin_intrinsic_metadata_add(origin_metadata)
 
     return [tool['id'] for tool in tools]
 
 
 def _origins_in_task_args(tasks):
     """Returns the set of origins contained in the arguments of the
     provided tasks (assumed to be of type index-origin-metadata)."""
     return reduce(
         set.union,
         (set(task['arguments']['args'][0]) for task in tasks),
         set()
     )
 
 
 def _assert_tasks_for_origins(tasks, origins):
-    expected_kwargs = {"policy_update": "update-dups", "parse_ids": False}
+    expected_kwargs = {"policy_update": "update-dups"}
     assert {task['type'] for task in tasks} == {'index-origin-metadata'}
     assert all(len(task['arguments']['args']) == 1 for task in tasks)
-    assert all(task['arguments']['kwargs'] == expected_kwargs
-               for task in tasks)
+    for task in tasks:
+        assert task['arguments']['kwargs'] == expected_kwargs, task
     assert _origins_in_task_args(tasks) == set(origins)
 
 
 def invoke(scheduler, catch_exceptions, args):
     runner = CliRunner()
     with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \
             tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd:
         config_fd.write(CLI_CONFIG)
         config_fd.seek(0)
         get_scheduler_mock.return_value = scheduler
         result = runner.invoke(cli, ['-C' + config_fd.name] + args)
     if not catch_exceptions and result.exception:
         print(result.output)
         raise result.exception
     return result
 
 
 def test_mapping_list(indexer_scheduler):
     result = invoke(indexer_scheduler, False, [
         'mapping', 'list',
     ])
     expected_output = '\n'.join([
         'codemeta',
         'gemspec',
         'maven',
         'npm',
         'pkg-info',
         '',
     ])
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
 
 def test_mapping_list_terms(indexer_scheduler):
     result = invoke(indexer_scheduler, False, [
         'mapping', 'list-terms',
     ])
     assert result.exit_code == 0, result.output
     assert re.search(r'http://schema.org/url:\n.*npm', result.output)
     assert re.search(r'http://schema.org/url:\n.*codemeta', result.output)
     assert re.search(
         r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta',
         result.output)
 
 
 def test_mapping_list_terms_exclude(indexer_scheduler):
     result = invoke(indexer_scheduler, False, [
         'mapping', 'list-terms', '--exclude-mapping', 'codemeta'
     ])
     assert result.exit_code == 0, result.output
     assert re.search(r'http://schema.org/url:\n.*npm', result.output)
     assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output)
     assert not re.search(
         r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta',
         result.output)
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_empty_db(
         indexer_scheduler, idx_storage, storage):
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
     ])
     expected_output = (
         'Nothing to do (no origin metadata matched the criteria).\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 0
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_divisor(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing when origin_batch_size*task_batch_size is a
     divisor of nb_origins."""
     fill_idx_storage(idx_storage, 90)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 3 tasks (30 origins).\n'
         'Scheduled 6 tasks (60 origins).\n'
         'Scheduled 9 tasks (90 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 9
     _assert_tasks_for_origins(tasks, range(90))
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_dry_run(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing with --dry-run: tasks are only listed,
     not scheduled."""
     fill_idx_storage(idx_storage, 90)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', '--dry-run', 'reindex_origin_metadata',
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 3 tasks (30 origins).\n'
         'Scheduled 6 tasks (60 origins).\n'
         'Scheduled 9 tasks (90 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 0
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_nondivisor(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing when neither origin_batch_size nor
     task_batch_size is a divisor of nb_origins."""
     fill_idx_storage(idx_storage, 70)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
         '--batch-size', '20',
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 3 tasks (60 origins).\n'
         'Scheduled 4 tasks (70 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 4
     _assert_tasks_for_origins(tasks, range(70))
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_filter_one_mapping(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing restricted to a single mapping."""
     fill_idx_storage(idx_storage, 110)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
         '--mapping', 'mapping1',
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 2 tasks (11 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 2
     _assert_tasks_for_origins(
         tasks,
         [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101])
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_filter_two_mappings(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing restricted to two mappings."""
     fill_idx_storage(idx_storage, 110)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
         '--mapping', 'mapping1', '--mapping', 'mapping2',
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 3 tasks (22 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 3
     _assert_tasks_for_origins(
         tasks,
         [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101,
          2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102])
 
 
 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
 @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
 def test_origin_metadata_reindex_filter_one_tool(
         indexer_scheduler, idx_storage, storage):
     """Tests the re-indexing restricted to a single tool."""
     tool_ids = fill_idx_storage(idx_storage, 110)
 
     result = invoke(indexer_scheduler, False, [
         'schedule', 'reindex_origin_metadata',
         '--tool-id', str(tool_ids[0]),
     ])
 
     # Check the output
     expected_output = (
         'Scheduled 3 tasks (30 origins).\n'
         'Scheduled 6 tasks (55 origins).\n'
         'Done.\n'
     )
     assert result.exit_code == 0, result.output
     assert result.output == expected_output
 
     # Check scheduled tasks
     tasks = indexer_scheduler.search_tasks()
     assert len(tasks) == 6
     _assert_tasks_for_origins(
         tasks,
         [x*2 for x in range(55)])
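Note: the origin sets expected by the filter tests above follow directly from fill_idx_storage, which assigns mapping 'mapping%d' % (origin_id % 10) and tool tools[origin_id % 2] to each origin. A quick sanity check of the arithmetic, runnable as plain Python:

```python
# 'mapping1' is given to origins whose id ends in 1 -> 11 of 110 origins:
assert [o for o in range(110) if o % 10 == 1] \
    == [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]
# tools alternate, so tool_ids[0] covers the 55 even-numbered origins:
assert [o for o in range(110) if o % 2 == 0] == [x * 2 for x in range(55)]
```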
 
 
+def test_journal_client(storage, indexer_scheduler):
+    """Tests the journal client end-to-end: a consumed origin_visit
+    message with status 'full' schedules an origin-metadata task."""
+    mock_consumer = MagicMock()
+
+    partition = namedtuple('_partition', ['topic'])(
+        topic='swh.journal.objects.origin_visit')
+    message = namedtuple('_message', ['value'])(
+        value={
+            b'status': b'full',
+            b'origin': {
+                b'url': 'file:///dev/zero',
+            }
+        }
+    )
+    mock_consumer.poll.return_value = {partition: [message]}
+
+    with patch('swh.journal.client.KafkaConsumer',
+               return_value=mock_consumer):
+        result = invoke(indexer_scheduler, False, [
+            'journal-client',
+            '--max-messages', '1',
+            '--broker', '192.0.2.1',
+            '--prefix', 'swh.journal.objects',
+            '--group-id', 'test-consumer',
+        ])
+
+    mock_consumer.subscribe.assert_called_once_with(
+        topics=['swh.journal.objects.origin_visit'])
+    mock_consumer.poll.assert_called_once_with()
+    mock_consumer.commit.assert_called_once_with()
+
+    # Check the output
+    expected_output = (
+        'Processed 1 messages.\n'
+        'Done.\n'
+    )
+    assert result.exit_code == 0, result.output
+    assert result.output == expected_output
+
+    # Check scheduled tasks
+    tasks = indexer_scheduler.search_tasks()
+    assert len(tasks) == 1
+    _assert_tasks_for_origins(
+        tasks,
+        ['file:///dev/zero'])
diff --git a/swh/indexer/tests/test_journal_client.py b/swh/indexer/tests/test_journal_client.py
new file mode 100644
index 0000000..889b26a
--- /dev/null
+++ b/swh/indexer/tests/test_journal_client.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+from unittest.mock import Mock
+
+from swh.indexer.journal_client import process_journal_objects
+
+
+class JournalClientTest(unittest.TestCase):
+    def testOriginVisit(self):
+        mock_scheduler = Mock()
+        messages = {
+            'origin_visit': [
+                {
+                    b'status': b'full',
+                    b'origin': {
+                        b'url': 'file:///dev/zero',
+                    }
+                }
+            ]
+        }
+        process_journal_objects(
+            messages, scheduler=mock_scheduler,
+            task_names={'origin_metadata': 'task-name'})
+        self.assertTrue(mock_scheduler.create_tasks.called)
+        call_args = mock_scheduler.create_tasks.call_args
+        (args, kwargs) = call_args
+        self.assertEqual(kwargs, {})
+        del args[0][0]['next_run']
+        self.assertEqual(args, ([
+            {
+                'arguments': {
+                    'kwargs': {'policy_update': 'update-dups'},
+                    'args': (['file:///dev/zero'],)
+                },
+                'policy': 'oneshot',
+                'type': 'task-name'
+            }
+        ],))
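Note: outside the test suite, the new subcommand can be exercised the same way the tests drive it, via click's CliRunner. A sketch, assuming a config file at the placeholder path indexer.yml with a reachable scheduler (broker address and group id are placeholders too; point them at a real or mocked Kafka as in test_journal_client above):

```python
from click.testing import CliRunner

from swh.indexer.cli import cli

# Assumed contents of indexer.yml (only the scheduler section is read;
# all journal settings come from command-line flags):
#
#   scheduler:
#       cls: remote
#       args:
#           url: http://localhost:5008/
#
runner = CliRunner()
result = runner.invoke(cli, [
    '--config-file', 'indexer.yml',
    'journal-client',
    '--broker', 'kafka.example.org:9092',
    '--prefix', 'swh.journal.objects',
    '--group-id', 'swh.indexer.journal_client',
    '--max-messages', '1',
])
print(result.output)  # expected: 'Processed 1 messages.' then 'Done.'
```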