F9697555: swh-indexer patch (text/x-diff, 21 KB), attached to rDCIDX Metadata indexer

diff --git a/requirements-swh.txt b/requirements-swh.txt
index c49aa91..3e5516d 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,6 +1,6 @@
swh.core[db,http] >= 0.0.87
swh.model >= 0.0.15
swh.objstorage >= 0.0.28
swh.scheduler >= 0.0.47
swh.storage >= 0.0.156
-swh.journal >= 0.0.17
+swh.journal >= 0.0.27
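
The swh.journal floor moves from 0.0.17 to 0.0.27, tracking the rename of the journal client's max_messages parameter to stop_after_objects that the CLI changes below rely on. A minimal sanity check for an installed environment (a sketch, not part of the patch; pkg_resources is just one way to compare versions):

    # Sketch: verify the installed swh.journal meets the new floor
    # before using the renamed CLI option.
    import pkg_resources

    dist = pkg_resources.get_distribution('swh.journal')
    assert (pkg_resources.parse_version(dist.version)
            >= pkg_resources.parse_version('0.0.27')), (
        'swh.journal %s is too old; >= 0.0.27 is needed for '
        'stop_after_objects' % dist.version)
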
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
index 24ebb63..2c283e2 100644
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,258 +1,258 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import json
import time
import click
from swh.core import config
from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
from swh.journal.cli import get_journal_client
from swh.scheduler import get_scheduler
from swh.scheduler.cli_utils import schedule_origin_batches
from swh.storage import get_storage
from swh.indexer import metadata_dictionary
from swh.indexer.journal_client import process_journal_objects
from swh.indexer.storage import get_indexer_storage
from swh.indexer.storage.api.server import load_and_check_config, app
@click.group(name='indexer', context_settings=CONTEXT_SETTINGS,
cls=AliasedGroup)
@click.option('--config-file', '-C', default=None,
type=click.Path(exists=True, dir_okay=False,),
help="Configuration file.")
@click.pass_context
def cli(ctx, config_file):
"""Software Heritage Indexer tools.
The Indexer is used to mine the content of the archive and extract derived
information from archive source code artifacts.
"""
ctx.ensure_object(dict)
conf = config.read(config_file)
ctx.obj['config'] = conf
def _get_api(getter, config, config_key, url):
if url:
config[config_key] = {
'cls': 'remote',
'args': {'url': url}
}
elif config_key not in config:
raise click.ClickException(
'Missing configuration for {}'.format(config_key))
return getter(**config[config_key])
@cli.group('mapping')
def mapping():
'''Manage Software Heritage Indexer mappings.'''
pass
@mapping.command('list')
def mapping_list():
"""Prints the list of known mappings."""
mapping_names = [mapping.name
for mapping in metadata_dictionary.MAPPINGS.values()]
mapping_names.sort()
for mapping_name in mapping_names:
click.echo(mapping_name)
@mapping.command('list-terms')
@click.option('--exclude-mapping', multiple=True,
help='Exclude the given mapping from the output')
@click.option('--concise', is_flag=True,
default=False,
help='Don\'t print the list of mappings supporting each term.')
def mapping_list_terms(concise, exclude_mapping):
"""Prints the list of known CodeMeta terms, and which mappings
support them."""
properties = metadata_dictionary.list_terms()
for (property_name, supported_mappings) in sorted(properties.items()):
supported_mappings = {m.name for m in supported_mappings}
supported_mappings -= set(exclude_mapping)
if supported_mappings:
if concise:
click.echo(property_name)
else:
click.echo('{}:'.format(property_name))
click.echo('\t' + ', '.join(sorted(supported_mappings)))
@mapping.command('translate')
@click.argument('mapping-name')
@click.argument('file', type=click.File('rb'))
def mapping_translate(mapping_name, file):
"""Prints the list of known mappings."""
mapping_cls = [cls for cls in metadata_dictionary.MAPPINGS.values()
if cls.name == mapping_name]
if not mapping_cls:
raise click.ClickException('Unknown mapping {}'.format(mapping_name))
assert len(mapping_cls) == 1
mapping_cls = mapping_cls[0]
mapping = mapping_cls()
codemeta_doc = mapping.translate(file.read())
click.echo(json.dumps(codemeta_doc, indent=4))
@cli.group('schedule')
@click.option('--scheduler-url', '-s', default=None,
help="URL of the scheduler API")
@click.option('--indexer-storage-url', '-i', default=None,
help="URL of the indexer storage API")
@click.option('--storage-url', '-g', default=None,
help="URL of the (graph) storage API")
@click.option('--dry-run/--no-dry-run', is_flag=True,
default=False,
help='List only what would be scheduled.')
@click.pass_context
def schedule(ctx, scheduler_url, storage_url, indexer_storage_url,
dry_run):
"""Manipulate Software Heritage Indexer tasks.
Via SWH Scheduler's API."""
ctx.obj['indexer_storage'] = _get_api(
get_indexer_storage,
ctx.obj['config'],
'indexer_storage',
indexer_storage_url
)
ctx.obj['storage'] = _get_api(
get_storage,
ctx.obj['config'],
'storage',
storage_url
)
ctx.obj['scheduler'] = _get_api(
get_scheduler,
ctx.obj['config'],
'scheduler',
scheduler_url
)
if dry_run:
ctx.obj['scheduler'] = None
def list_origins_by_producer(idx_storage, mappings, tool_ids):
next_page_token = ''
limit = 10000
while next_page_token is not None:
result = idx_storage.origin_intrinsic_metadata_search_by_producer(
page_token=next_page_token, limit=limit, ids_only=True,
mappings=mappings or None, tool_ids=tool_ids or None)
next_page_token = result.get('next_page_token')
yield from result['origins']
@schedule.command('reindex_origin_metadata')
@click.option('--batch-size', '-b', 'origin_batch_size',
default=10, show_default=True, type=int,
help="Number of origins per task")
@click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True,
help="Restrict search of old metadata to this/these tool ids.")
@click.option('--mapping', '-m', 'mappings', multiple=True,
help="Mapping(s) that should be re-scheduled (eg. 'npm', "
"'gemspec', 'maven')")
@click.option('--task-type',
default='index-origin-metadata', show_default=True,
help="Name of the task type to schedule.")
@click.pass_context
def schedule_origin_metadata_reindex(
ctx, origin_batch_size, tool_ids, mappings, task_type):
"""Schedules indexing tasks for origins that were already indexed."""
idx_storage = ctx.obj['indexer_storage']
scheduler = ctx.obj['scheduler']
origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
kwargs = {"policy_update": "update-dups"}
schedule_origin_batches(
scheduler, task_type, origins, origin_batch_size, kwargs)
@cli.command('journal-client')
@click.option('--scheduler-url', '-s', default=None,
help="URL of the scheduler API")
@click.option('--origin-metadata-task-type',
default='index-origin-metadata',
help='Name of the task running the origin metadata indexer.')
@click.option('--broker', 'brokers', type=str, multiple=True,
help='Kafka broker to connect to.')
@click.option('--prefix', type=str, default=None,
help='Prefix of Kafka topic names to read from.')
@click.option('--group-id', type=str,
help='Consumer/group id for reading from Kafka.')
-@click.option('--max-messages', '-m', default=None, type=int,
+@click.option('--stop-after-objects', '-m', default=None, type=int,
help='Maximum number of objects to replay. Default is to '
'run forever.')
@click.pass_context
def journal_client(ctx, scheduler_url, origin_metadata_task_type,
- brokers, prefix, group_id, max_messages):
+ brokers, prefix, group_id, stop_after_objects):
"""Listens for new objects from the SWH Journal, and schedules tasks
to run relevant indexers (currently, only origin-intrinsic-metadata)
on these new objects."""
scheduler = _get_api(
get_scheduler,
ctx.obj['config'],
'scheduler',
scheduler_url
)
client = get_journal_client(
ctx, brokers=brokers, prefix=prefix, group_id=group_id,
object_types=['origin_visit'],
- max_messages=max_messages,
+ stop_after_objects=stop_after_objects,
)
worker_fn = functools.partial(
process_journal_objects,
scheduler=scheduler,
task_names={
'origin_metadata': origin_metadata_task_type,
}
)
nb_messages = 0
last_log_time = 0
try:
- while not max_messages or nb_messages < max_messages:
+ while not stop_after_objects or nb_messages < stop_after_objects:
nb_messages += client.process(worker_fn)
if time.monotonic() - last_log_time >= 60:
print('Processed %d messages.' % nb_messages)
last_log_time = time.monotonic()
except KeyboardInterrupt:
ctx.exit(0)
else:
print('Done.')
@cli.command('rpc-serve')
@click.argument('config-path', required=True)
@click.option('--host', default='0.0.0.0', help="Host to run the server")
@click.option('--port', default=5007, type=click.INT,
help="Binding port of the server")
@click.option('--debug/--nodebug', default=True,
help="Indicates if the server should run in debug mode")
def rpc_server(config_path, host, port, debug):
"""Starts a Software Heritage Indexer RPC HTTP server."""
api_cfg = load_and_check_config(config_path, type='any')
app.config.update(api_cfg)
app.run(host, port=int(port), debug=bool(debug))
def main():
return cli(auto_envvar_prefix='SWH_INDEXER')
if __name__ == '__main__':
main()
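
With the rename applied, the journal client is invoked with --stop-after-objects where --max-messages was used before. A usage sketch through Click's test runner ('conf.yml' is a hypothetical configuration file defining at least the scheduler section, and the broker address is a placeholder; a reachable Kafka broker is assumed for the command to make progress):

    # Sketch: drive the renamed option end to end via Click's test runner.
    from click.testing import CliRunner
    from swh.indexer.cli import cli

    runner = CliRunner()
    result = runner.invoke(cli, [
        '-C', 'conf.yml',                # hypothetical config file
        'journal-client',
        '--stop-after-objects', '1',     # formerly --max-messages
        '--broker', '192.0.2.1',
        '--prefix', 'swh.journal.objects',
        '--group-id', 'test-consumer',
    ])
    print(result.exit_code, result.output)

The same flags apply on the command line, e.g. "swh indexer -C conf.yml journal-client --stop-after-objects 1 ..." if the swh umbrella command is installed.
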
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
index 6a60799..16a97bc 100644
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -1,360 +1,360 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from functools import reduce
import re
import tempfile
from unittest.mock import patch
from click.testing import CliRunner
from swh.journal.tests.utils import FakeKafkaMessage, MockedKafkaConsumer
from swh.model.hashutil import hash_to_bytes
from swh.indexer.cli import cli
CLI_CONFIG = '''
scheduler:
cls: foo
args: {}
storage:
cls: memory
indexer_storage:
cls: memory
args: {}
'''
def fill_idx_storage(idx_storage, nb_rows):
tools = [
{
'tool_name': 'tool %d' % i,
'tool_version': '0.0.1',
'tool_configuration': {},
}
for i in range(2)
]
tools = idx_storage.indexer_configuration_add(tools)
origin_metadata = [
{
'id': 'file://dev/%04d' % origin_id,
'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
'indexer_configuration_id': tools[origin_id % 2]['id'],
'metadata': {'name': 'origin %d' % origin_id},
'mappings': ['mapping%d' % (origin_id % 10)]
}
for origin_id in range(nb_rows)
]
revision_metadata = [
{
'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
'indexer_configuration_id': tools[origin_id % 2]['id'],
'metadata': {'name': 'origin %d' % origin_id},
'mappings': ['mapping%d' % (origin_id % 10)]
}
for origin_id in range(nb_rows)
]
idx_storage.revision_intrinsic_metadata_add(revision_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool['id'] for tool in tools]
def _origins_in_task_args(tasks):
"""Returns the set of origins contained in the arguments of the
provided tasks (assumed to be of type index-origin-metadata)."""
return reduce(
set.union,
(set(task['arguments']['args'][0]) for task in tasks),
set()
)
def _assert_tasks_for_origins(tasks, origins):
expected_kwargs = {"policy_update": "update-dups"}
assert {task['type'] for task in tasks} == {'index-origin-metadata'}
assert all(len(task['arguments']['args']) == 1 for task in tasks)
for task in tasks:
assert task['arguments']['kwargs'] == expected_kwargs, task
assert _origins_in_task_args(tasks) == set([
'file://dev/%04d' % i for i in origins])
def invoke(scheduler, catch_exceptions, args):
runner = CliRunner()
with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \
tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd:
config_fd.write(CLI_CONFIG)
config_fd.seek(0)
get_scheduler_mock.return_value = scheduler
result = runner.invoke(cli, ['-C' + config_fd.name] + args)
if not catch_exceptions and result.exception:
print(result.output)
raise result.exception
return result
def test_mapping_list(indexer_scheduler):
result = invoke(indexer_scheduler, False, [
'mapping', 'list',
])
expected_output = '\n'.join([
'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '',
])
assert result.exit_code == 0, result.output
assert result.output == expected_output
def test_mapping_list_terms(indexer_scheduler):
result = invoke(indexer_scheduler, False, [
'mapping', 'list-terms',
])
assert result.exit_code == 0, result.output
assert re.search(r'http://schema.org/url:\n.*npm', result.output)
assert re.search(r'http://schema.org/url:\n.*codemeta', result.output)
assert re.search(
r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta',
result.output)
def test_mapping_list_terms_exclude(indexer_scheduler):
result = invoke(indexer_scheduler, False, [
'mapping', 'list-terms',
'--exclude-mapping', 'codemeta'
])
assert result.exit_code == 0, result.output
assert re.search(r'http://schema.org/url:\n.*npm', result.output)
assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output)
assert not re.search(
r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta',
result.output)
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_empty_db(
indexer_scheduler, idx_storage, storage):
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
])
expected_output = (
'Nothing to do (no origin metadata matched the criteria).\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 0
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_divisor(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 90)
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
])
# Check the output
expected_output = (
'Scheduled 3 tasks (30 origins).\n'
'Scheduled 6 tasks (60 origins).\n'
'Scheduled 9 tasks (90 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 9
_assert_tasks_for_origins(tasks, range(90))
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_dry_run(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 90)
result = invoke(indexer_scheduler, False, [
'schedule', '--dry-run', 'reindex_origin_metadata',
])
# Check the output
expected_output = (
'Scheduled 3 tasks (30 origins).\n'
'Scheduled 6 tasks (60 origins).\n'
'Scheduled 9 tasks (90 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 0
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_nondivisor(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when neither origin_batch_size or
task_batch_size is a divisor of nb_origins."""
fill_idx_storage(idx_storage, 70)
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
'--batch-size', '20',
])
# Check the output
expected_output = (
'Scheduled 3 tasks (60 origins).\n'
'Scheduled 4 tasks (70 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 4
_assert_tasks_for_origins(tasks, range(70))
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_filter_one_mapping(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 110)
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
'--mapping', 'mapping1',
])
# Check the output
expected_output = (
'Scheduled 2 tasks (11 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 2
_assert_tasks_for_origins(
tasks,
[1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101])
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_filter_two_mappings(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 110)
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
'--mapping', 'mapping1', '--mapping', 'mapping2',
])
# Check the output
expected_output = (
'Scheduled 3 tasks (22 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 3
_assert_tasks_for_origins(
tasks,
[1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101,
2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102])
@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3)
@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3)
def test_origin_metadata_reindex_filter_one_tool(
indexer_scheduler, idx_storage, storage):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
tool_ids = fill_idx_storage(idx_storage, 110)
result = invoke(indexer_scheduler, False, [
'schedule', 'reindex_origin_metadata',
'--tool-id', str(tool_ids[0]),
])
# Check the output
expected_output = (
'Scheduled 3 tasks (30 origins).\n'
'Scheduled 6 tasks (55 origins).\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 6
_assert_tasks_for_origins(
tasks,
[x*2 for x in range(55)])
def test_journal_client(storage, indexer_scheduler):
"""Test the 'swh indexer journal-client' cli tool."""
message = FakeKafkaMessage('swh.journal.objects.origin_visit', 'bogus', {
'status': 'full',
'origin': {
'url': 'file://dev/0000',
}
})
consumer = MockedKafkaConsumer([message])
with patch('swh.journal.client.Consumer',
return_value=consumer):
result = invoke(indexer_scheduler, False, [
'journal-client',
- '--max-messages', '1',
+ '--stop-after-objects', '1',
'--broker', '192.0.2.1',
'--prefix', 'swh.journal.objects',
'--group-id', 'test-consumer',
])
# Check the output
expected_output = (
'Processed 1 messages.\n'
'Done.\n'
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 1
_assert_tasks_for_origins(
tasks,
[0])
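
For reference, the worker that journal-client wires up receives batches keyed by object type, matching the fake message above. A standalone sketch of driving it without Kafka, assuming the scheduler object only needs to expose create_tasks() as in the SWH scheduler API (the recording stub is hypothetical; a real run would use swh.scheduler.get_scheduler()):

    from swh.indexer.journal_client import process_journal_objects

    class RecordingScheduler:
        """Hypothetical stand-in that records what would be scheduled."""
        def __init__(self):
            self.tasks = []

        def create_tasks(self, tasks):
            self.tasks.extend(tasks)
            return tasks

    scheduler = RecordingScheduler()
    process_journal_objects(
        {'origin_visit': [
            {'status': 'full', 'origin': {'url': 'file://dev/0000'}},
        ]},
        scheduler=scheduler,
        task_names={'origin_metadata': 'index-origin-metadata'},
    )
    print(scheduler.tasks)  # expect one 'index-origin-metadata' task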