diff --git a/PKG-INFO b/PKG-INFO index a126486..fb4e89f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.153 +Version: 0.0.154 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN +Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer -Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index 0ca12b4..4a8746e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ swh.core[db,http] >= 0.0.65 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 swh.storage >= 0.0.143 -swh.journal >= 0.0.11 +swh.journal >= 0.0.17 diff --git a/requirements.txt b/requirements.txt index 84e7278..925629e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ vcversioner click -file-magic +python-magic >= 0.4.13 pyld xmltodict diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index a126486..fb4e89f 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.153 +Version: 0.0.154 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN +Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer -Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 2880880..688cbee 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,16 +1,16 @@ vcversioner click -file-magic +python-magic>=0.4.13 pyld xmltodict swh.core[db,http]>=0.0.65 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.143 -swh.journal>=0.0.11 +swh.journal>=0.0.17 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index 141636e..3c463c6 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,242 +1,238 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import click from swh.core import config from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.journal.cli import get_journal_client from swh.scheduler import get_scheduler from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.journal_client import process_journal_objects from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app @click.group(name='indexer', context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): """Software Heritage Indexer tools. The Indexer is used to mine the content of the archive and extract derived information from archive source code artifacts. """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): '''Manage Software Heritage Indexer mappings.''' pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate Software Heritage Indexer tasks. Via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', " "'gemspec', 'maven')") @click.option('--task-type', default='index-origin-metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"policy_update": "update-dups"} schedule_origin_batches( scheduler, task_type, origins, origin_batch_size, kwargs) @cli.command('journal-client') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--origin-metadata-task-type', default='index-origin-metadata', help='Name of the task running the origin metadata indexer.') @click.option('--broker', 'brokers', type=str, multiple=True, help='Kafka broker to connect to.') @click.option('--prefix', type=str, default=None, help='Prefix of Kafka topic names to read from.') @click.option('--group-id', type=str, help='Consumer/group id for reading from Kafka.') @click.option('--max-messages', '-m', default=None, type=int, help='Maximum number of objects to replay. Default is to ' 'run forever.') @click.pass_context def journal_client(ctx, scheduler_url, origin_metadata_task_type, brokers, prefix, group_id, max_messages): """Listens for new objects from the SWH Journal, and schedules tasks to run relevant indexers (currently, only origin-intrinsic-metadata) on these new objects.""" scheduler = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) client = get_journal_client( ctx, brokers=brokers, prefix=prefix, group_id=group_id, object_types=['origin_visit']) worker_fn = functools.partial( process_journal_objects, scheduler=scheduler, task_names={ 'origin_metadata': origin_metadata_task_type, } ) nb_messages = 0 try: while not max_messages or nb_messages < max_messages: nb_messages += client.process(worker_fn) print('Processed %d messages.' % nb_messages) except KeyboardInterrupt: ctx.exit(0) else: print('Done.') @cli.command('rpc-serve') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def rpc_server(config_path, host, port, debug): """Starts a Software Heritage Indexer RPC HTTP server.""" api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) -cli.add_alias(rpc_server, 'api-server') -cli.add_alias(rpc_server, 'serve') - - def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index c5fa978..867e9ed 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,624 +1,620 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile import datetime from copy import deepcopy from contextlib import contextmanager from swh.scheduler import get_scheduler -try: - from swh.scheduler import CONFIG as SWH_CONFIG -except ImportError: - # for swh-scheduler < 0.0.47 bw compat - SWH_CONFIG = None +from swh.scheduler import CONFIG as SWH_CONFIG from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils @contextmanager def write_to_temp(filename, data, working_directory): """Write the sha1's content in a temporary file. Args: filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) yield content_path shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :meth:`~BaseIndexer.run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :meth:`~BaseIndexer.filter`: filter out data already indexed (in storage). :meth:`~BaseIndexer.index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :meth:`~BaseIndexer.persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :meth:`~BaseIndexer.prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :meth:`~BaseIndexer.check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :meth:`~BaseIndexer.register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} USE_TOOLS = True catch_exceptions = True """Prevents exceptions in `index()` from raising too high. Set to False in tests to properly catch all exceptions.""" def __init__(self, config=None, **kw): """Prepare and check that the indexer is ready to run. """ super().__init__() if config is not None: self.config = config elif SWH_CONFIG: self.config = SWH_CONFIG.copy() else: config_keys = ('base_filename', 'config_filename', 'additional_configs', 'global_config') config_args = {k: v for k, v in kw.items() if k in config_keys} if self.ADDITIONAL_CONFIG: config_args.setdefault('additional_configs', []).append( self.ADDITIONAL_CONFIG) self.config = self.parse_config_file(**config_args) self.prepare() self.check() self.log.debug('%s: config=%s', self, self.config) def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') if self.USE_TOOLS: self.tools = list(self.register_tools( self.config.get('tools', []))) self.results = [] @property def tool(self): return self.tools[0] def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if self.USE_TOOLS and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: list: List of dicts with additional id key. Raises: ValueError: if not a list nor a dict. """ if isinstance(tools, list): tools = list(map(self._prepare_tool, tools)) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') if tools: return self.idx_storage.indexer_configuration_add(tools) else: return [] def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: dict: a dict that makes sense for the :meth:`.persist_index_computations` method. """ raise NotImplementedError() def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ yield from ids @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name', None) task['next_run'] = datetime.datetime.now() if result_name: task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids (Iterable[Union[bytes, str]]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. **kwargs: passed to the `index` method """ ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: if not self.catch_exceptions: raise self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: bytes: Identifier of contents to index. """ if not isinstance(start, bytes) or not isinstance(end, bytes): raise TypeError('identifiers must be bytes, not %r and %r.' % (start, end)) while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = hashutil.hash_to_bytes(c['sha1']) if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: dict: Data indexed to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: if not isinstance(res['id'], bytes): raise TypeError( '%r.index should return ids as bytes, not %r' % (self.__class__.__name__, res['id'])) yield res def _index_with_skipping_already_done(self, start, end): """Index not already indexed contents in range [start, end]. Args: start** (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` which are not already indexed. """ while start: indexed_page = self.indexed_contents_in_range(start, end) contents = indexed_page['ids'] _end = contents[-1] if contents else end yield from self._index_contents( start, _end, contents) start = indexed_page['next'] def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). Args: start (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier skip_existing (bool): Skip existing indexed data (default) or not **kwargs: passed to the `index` method Returns: bool: True if data was indexed, False otherwise. """ with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: gen = self._index_with_skipping_already_done(start, end) else: gen = self._index_contents(start, end, indexed=[]) for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True except Exception: if not self.catch_exceptions: raise self.log.exception( 'Problem when computing metadata.') finally: return with_indexed_data class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, origin_urls, policy_update='update-dups', next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ results = self.index_list(origin_urls, **kwargs) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) def index_list(self, origins, **kwargs): results = [] for origin in origins: try: res = self.index(origin, **kwargs) if res: # If no results, skip it results.append(res) except Exception: if not self.catch_exceptions: raise self.log.exception( 'Problem when processing origin %s', origin['id']) return results class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: if not self.catch_exceptions: raise self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index bcdac02..afe700e 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,146 +1,145 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import magic -from swh.model import hashutil - from .indexer import ContentIndexer, ContentRangeIndexer +if not hasattr(magic.Magic, 'from_buffer'): + raise ImportError( + 'Expected "import magic" to import python-magic, but file_magic ' + 'was imported instead.') + def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: dict: mimetype and encoding key and corresponding values (as bytes). """ - r = magic.detect_from_content(raw_content) + m = magic.Magic(mime=True, mime_encoding=True) + res = m.from_buffer(raw_content) + (mimetype, encoding) = res.split('; charset=') return { - 'mimetype': r.mime_type, - 'encoding': r.encoding, + 'mimetype': mimetype, + 'encoding': encoding, } class MixinMimetypeIndexer: """Mixin mimetype indexer. See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: content's mimetype; dict keys being - **id** (bytes): content's identifier (sha1) - **mimetype** (bytes): mimetype in bytes - **encoding** (bytes): encoding in bytes """ - try: - properties = compute_mimetype_encoding(data) - properties.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - except TypeError: - self.log.error('Detecting mimetype error for id %s' % ( - hashutil.hash_to_hex(id), )) - return None - + properties = compute_mimetype_encoding(data) + properties.update({ + 'id': id, + 'indexer_configuration_id': self.tool['id'], + }) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content's mimetype dicts (see :meth:`.index`) policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): """Mimetype Indexer working on list of content identifiers. It: - (optionally) filters out content already indexed (cf. :meth:`.filter`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): """Mimetype Range Indexer working on range of content identifiers. It: - (optionally) filters out content already indexed (cf :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Returns: dict: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self.idx_storage.content_mimetype_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index 6f93061..70bd442 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,372 +1,362 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from collections import namedtuple from functools import reduce import re import tempfile -from unittest.mock import patch, MagicMock +from unittest.mock import patch from click.testing import CliRunner +from swh.journal.tests.utils import FakeKafkaMessage, MockedKafkaConsumer from swh.model.hashutil import hash_to_bytes from swh.indexer.cli import cli CLI_CONFIG = ''' scheduler: cls: foo args: {} storage: cls: memory args: {} indexer_storage: cls: memory args: {} ''' def fill_idx_storage(idx_storage, nb_rows): tools = [ { 'tool_name': 'tool %d' % i, 'tool_version': '0.0.1', 'tool_configuration': {}, } for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ { 'id': origin_id, 'origin_url': 'file:///dev/zero', 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] revision_metadata = [ { 'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the provided tasks (assumed to be of type index-origin-metadata).""" return reduce( set.union, (set(task['arguments']['args'][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {"policy_update": "update-dups"} assert {task['type'] for task in tasks} == {'index-origin-metadata'} assert all(len(task['arguments']['args']) == 1 for task in tasks) for task in tasks: assert task['arguments']['kwargs'] == expected_kwargs, task assert _origins_in_task_args(tasks) == set(origins) def invoke(scheduler, catch_exceptions, args): runner = CliRunner() with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \ tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: config_fd.write(CLI_CONFIG) config_fd.seek(0) get_scheduler_mock.return_value = scheduler result = runner.invoke(cli, ['-C' + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_mapping_list(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list', ]) expected_output = '\n'.join([ 'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '', ]) assert result.exit_code == 0, result.output assert result.output == expected_output def test_mapping_list_terms(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) def test_mapping_list_terms_exclude(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', '--exclude-mapping', 'codemeta' ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert not re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) expected_output = ( 'Nothing to do (no origin metadata matched the criteria).\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_divisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_dry_run( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', '--dry-run', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_nondivisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when neither origin_batch_size or task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--batch-size', '20', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (60 origins).\n' 'Scheduled 4 tasks (70 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_mapping( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', ]) # Check the output expected_output = ( 'Scheduled 2 tasks (11 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 2 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_two_mappings( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', '--mapping', 'mapping2', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (22 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102]) @patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_tool( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" tool_ids = fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--tool-id', str(tool_ids[0]), ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (55 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins( tasks, [x*2 for x in range(55)]) def test_journal_client(storage, indexer_scheduler): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" - mock_consumer = MagicMock() - - partition = namedtuple('_partition', ['topic'])( - topic='swh.journal.objects.origin_visit') - message = namedtuple('_message', ['value'])( - value={ - 'status': 'full', - 'origin': { - 'url': 'file:///dev/zero', - } + message = FakeKafkaMessage('swh.journal.objects.origin_visit', 'bogus', { + 'status': 'full', + 'origin': { + 'url': 'file:///dev/zero', } - ) - mock_consumer.poll.return_value = {partition: [message]} + }) + + consumer = MockedKafkaConsumer([message]) - with patch('swh.journal.client.KafkaConsumer', - return_value=mock_consumer): + with patch('swh.journal.client.Consumer', + return_value=consumer): result = invoke(indexer_scheduler, False, [ 'journal-client', '--max-messages', '1', '--broker', '192.0.2.1', '--prefix', 'swh.journal.objects', '--group-id', 'test-consumer', ]) - mock_consumer.subscribe.assert_called_once_with( - topics=['swh.journal.objects.origin_visit']) - mock_consumer.poll.assert_called_once_with() - mock_consumer.commit.assert_called_once_with() - # Check the output expected_output = ( 'Processed 1 messages.\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 1 _assert_tasks_for_origins( tasks, ['file:///dev/zero']) diff --git a/version.txt b/version.txt index bfd45df..9f1125f 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.153-0-g12801f8 \ No newline at end of file +v0.0.154-0-ga6ce599 \ No newline at end of file