diff --git a/PKG-INFO b/PKG-INFO index fb4e89f..e228fed 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.154 +Version: 0.0.155 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index fb4e89f..e228fed 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.154 +Version: 0.0.155 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 7640b31..5457431 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,93 +1,92 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql sql/upgrades/126.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py -swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index 3c463c6..ad94ba4 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,238 +1,259 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools +import json +import time import click from swh.core import config from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.journal.cli import get_journal_client from swh.scheduler import get_scheduler from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.journal_client import process_journal_objects from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app @click.group(name='indexer', context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): """Software Heritage Indexer tools. The Indexer is used to mine the content of the archive and extract derived information from archive source code artifacts. """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): '''Manage Software Heritage Indexer mappings.''' pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) +@mapping.command('translate') +@click.argument('mapping-name') +@click.argument('file', type=click.File('rb')) +def mapping_translate(mapping_name, file): + """Prints the list of known mappings.""" + mapping_cls = [cls for cls in metadata_dictionary.MAPPINGS.values() + if cls.name == mapping_name] + if not mapping_cls: + raise click.ClickException('Unknown mapping {}'.format(mapping_name)) + assert len(mapping_cls) == 1 + mapping_cls = mapping_cls[0] + mapping = mapping_cls() + codemeta_doc = mapping.translate(file.read()) + click.echo(json.dumps(codemeta_doc, indent=4)) + + @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate Software Heritage Indexer tasks. Via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', " "'gemspec', 'maven')") @click.option('--task-type', default='index-origin-metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"policy_update": "update-dups"} schedule_origin_batches( scheduler, task_type, origins, origin_batch_size, kwargs) @cli.command('journal-client') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--origin-metadata-task-type', default='index-origin-metadata', help='Name of the task running the origin metadata indexer.') @click.option('--broker', 'brokers', type=str, multiple=True, help='Kafka broker to connect to.') @click.option('--prefix', type=str, default=None, help='Prefix of Kafka topic names to read from.') @click.option('--group-id', type=str, help='Consumer/group id for reading from Kafka.') @click.option('--max-messages', '-m', default=None, type=int, help='Maximum number of objects to replay. Default is to ' 'run forever.') @click.pass_context def journal_client(ctx, scheduler_url, origin_metadata_task_type, brokers, prefix, group_id, max_messages): """Listens for new objects from the SWH Journal, and schedules tasks to run relevant indexers (currently, only origin-intrinsic-metadata) on these new objects.""" scheduler = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) client = get_journal_client( ctx, brokers=brokers, prefix=prefix, group_id=group_id, object_types=['origin_visit']) worker_fn = functools.partial( process_journal_objects, scheduler=scheduler, task_names={ 'origin_metadata': origin_metadata_task_type, } ) nb_messages = 0 + last_log_time = 0 try: while not max_messages or nb_messages < max_messages: nb_messages += client.process(worker_fn) - print('Processed %d messages.' % nb_messages) + if time.monotonic() - last_log_time >= 60: + print('Processed %d messages.' % nb_messages) + last_log_time = time.monotonic() except KeyboardInterrupt: ctx.exit(0) else: print('Done.') @cli.command('rpc-serve') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def rpc_server(config_path, host, port, debug): """Starts a Software Heritage Indexer RPC HTTP server.""" api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/storage/api/wsgi.py b/swh/indexer/storage/api/wsgi.py deleted file mode 100644 index 02c4901..0000000 --- a/swh/indexer/storage/api/wsgi.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (C) 2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from .server import make_app_from_configfile - -application = make_app_from_configfile() diff --git a/version.txt b/version.txt index 9f1125f..34fee3b 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.154-0-ga6ce599 \ No newline at end of file +v0.0.155-0-g1d625af \ No newline at end of file