diff --git a/PKG-INFO b/PKG-INFO index 68bb5a5..7cff760 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.146 +Version: 0.0.147 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexing procedure: - receive a batch of ids - retrieve the associated data, depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tag information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves their translated_metadata from the content_metadata table in storage, or runs the content indexer to translate the files. 
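The indexing procedure described above boils down to a small fetch/compute/store loop. The sketch below is illustrative only; every name in it (compute_index, index_batch, the storage objects and their methods) is a hypothetical stand-in, not the actual swh.indexer API, which lives in swh/indexer/indexer.py:

```python
# Minimal sketch of the generic indexing procedure, assuming simplified,
# hypothetical storage interfaces (objstorage.get and idx_storage.add).

def compute_index(data):
    """Toy index: record the content length, standing in for mimetype,
    ctags, or license detection."""
    return {'length': len(data)}


def index_batch(ids, objstorage, idx_storage, tool_id):
    """Receive a batch of ids, retrieve the associated data, compute an
    index for each object, then store the results."""
    results = []
    for content_id in ids:
        data = objstorage.get(content_id)  # retrieve the associated data
        result = compute_index(data)       # compute some index for that object
        if result is not None:
            results.append({
                'id': content_id,
                **result,
                'indexer_configuration_id': tool_id,
            })
    idx_storage.add(results)               # store the result in swh's storage
    return results
```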
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index be61102..394e977 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ -swh.core >= 0.0.53 +swh.core[db,http] >= 0.0.61 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 swh.storage >= 0.0.123 swh.journal >= 0.0.6 diff --git a/requirements.txt b/requirements.txt index a578b91..84e7278 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ vcversioner -pygments click -chardet file-magic pyld xmltodict diff --git a/setup.py b/setup.py index d2b2a85..299b040 100755 --- a/setup.py +++ b/setup.py @@ -1,69 +1,71 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.indexer', description='Software Heritage Content Indexer', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/78/', packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, entry_points=''' [console_scripts] swh-indexer=swh.indexer.cli:main + [swh.cli.subcommands] + indexer=swh.indexer.cli:cli ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-indexer', }, ) diff --git a/sql/upgrades/124.sql b/sql/upgrades/124.sql new file mode 100644 index 0000000..a773c32 --- /dev/null +++ b/sql/upgrades/124.sql @@ -0,0 +1,9 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 123 +-- to_version: 124 +-- description: drop constraint that origin_intrinsic_metadata references an existing revision_intrinsic_metadata. 
+ +insert into dbversion(version, release, description) +values(124, now(), 'Work In Progress'); + +alter table origin_intrinsic_metadata drop constraint origin_intrinsic_metadata_revision_metadata_fkey; diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 68bb5a5..7cff760 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.146 +Version: 0.0.147 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexing procedure: - receive a batch of ids - retrieve the associated data, depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tag information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves their translated_metadata from the content_metadata table in storage, or runs the content indexer to translate the files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 80cb6e8..8595712 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,91 +1,90 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql +sql/upgrades/124.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py -swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py -swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/entry_points.txt b/swh.indexer.egg-info/entry_points.txt index 43042e1..49be9af 
100644 --- a/swh.indexer.egg-info/entry_points.txt +++ b/swh.indexer.egg-info/entry_points.txt @@ -1,4 +1,6 @@ [console_scripts] swh-indexer=swh.indexer.cli:main + [swh.cli.subcommands] + indexer=swh.indexer.cli:cli \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index cc485e1..7fc2845 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,16 @@ vcversioner -pygments click -chardet file-magic pyld xmltodict -swh.core>=0.0.53 +swh.core[db,http]>=0.0.61 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.123 swh.journal>=0.0.6 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index c5244be..37e97f0 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,177 +1,188 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click from swh.core import config +from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.scheduler import get_scheduler from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app -CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) - - -@click.group(context_settings=CONTEXT_SETTINGS) +@click.group(name='indexer', context_settings=CONTEXT_SETTINGS, + cls=AliasedGroup) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): - """Software Heritage Indexer CLI interface + """Software Heritage Indexer tools. + + The Indexer is used to mine the content of the archive and extract derived + information from archive source code artifacts. 
+ """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): + '''Manage Software Heritage Indexer mappings.''' pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): - """Manipulate indexer tasks via SWH Scheduler's API.""" + """Manipulate Software Heritage Indexer tasks. + + Via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 
'npm', " "'gemspec', 'maven')") @click.option('--task-type', - default='indexer_origin_metadata', show_default=True, + default='index-origin-metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"policy_update": "update-dups", "parse_ids": False} schedule_origin_batches( scheduler, task_type, origins, origin_batch_size, kwargs) -@cli.command('api-server') +@cli.command('rpc-serve') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") -def api_server(config_path, host, port, debug): +def rpc_server(config_path, host, port, debug): + """Starts a Software Heritage Indexer RPC HTTP server.""" api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) +cli.add_alias(rpc_server, 'api-server') +cli.add_alias(rpc_server, 'serve') + + def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index dbf7e15..b29e4c7 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,146 +1,151 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil -from .language import compute_language from .indexer import ContentIndexer, write_to_temp # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] +def compute_language(content, log=None): + raise NotImplementedError( + 'Language detection was unreliable, so it is currently disabled. ' + 'See https://forge.softwareheritage.org/D1455') + + def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. 
Args: path: path to the file lang: language for that path (optional) Yields: dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: a dict representing a content_ctags with keys: - **id** (bytes): content's identifier (sha1) - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) with write_to_temp( filename=filename, data=data, working_directory=self.working_directory) as content_path: result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_ctags, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py index c8f27ea..ac236cc 100644 --- a/swh/indexer/journal_client.py +++ b/swh/indexer/journal_client.py @@ -1,88 +1,88 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.journal.client import JournalClient from swh.scheduler import get_scheduler from swh.scheduler.utils import create_task_dict class IndexerJournalClient(JournalClient): """Client in charge of listing newly received origins and origin_visits in the swh journal. 
""" CONFIG_BASE_FILENAME = 'indexer/journal_client' ADDITIONAL_CONFIG = { 'scheduler': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5008/', } }), 'origin_visit_tasks': ('List[dict]', [ { - 'type': 'indexer_origin_metadata', + 'type': 'index-origin-metadata', 'kwargs': { 'policy_update': 'update-dups', 'parse_ids': False, } } ]), } def __init__(self): super().__init__(extra_configuration={ 'object_types': ['origin_visit'], }) self.scheduler = get_scheduler(**self.config['scheduler']) logging.info( 'Starting indexer journal client with config %r', self.config) def process_objects(self, messages): assert set(messages) == {'origin_visit'}, set(messages) for origin_visit in messages['origin_visit']: self.process_origin_visit(origin_visit) def process_origin_visit(self, origin_visit): task_dicts = [] logging.debug('processing origin visit %r', origin_visit) if origin_visit[b'status'] == b'full': for task_config in self.config['origin_visit_tasks']: logging.info( 'Scheduling %s for visit of origin %d', task_config['type'], origin_visit[b'origin']) task_dicts.append(create_task_dict( task_config['type'], 'oneshot', [origin_visit[b'origin']], **task_config['kwargs'], )) else: logging.debug('status is not "full", ignoring.') if task_dicts: self.scheduler.create_tasks(task_dicts) if __name__ == '__main__': logging.basicConfig( level=logging.INFO, format='%(asctime)s %(process)d %(levelname)s %(message)s' ) import click @click.command() def main(): """Log the new received origin and origin_visits. """ IndexerJournalClient().process() main() diff --git a/swh/indexer/language.py b/swh/indexer/language.py deleted file mode 100644 index c69b1dc..0000000 --- a/swh/indexer/language.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (C) 2016-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import io - -from pygments.lexers import guess_lexer -from pygments.util import ClassNotFound -from chardet.universaldetector import UniversalDetector - -from .indexer import ContentIndexer - - -def _cleanup_classname(classname): - """Determine the language from the pygments' lexer names. - - """ - return classname.lower().replace(' ', '-') - - -def _read_raw(raw_content, size=2048): - """Read raw content in chunk. - - """ - bs = io.BytesIO(raw_content) - while True: - chunk = bs.read(size) - if not chunk: - break - yield chunk - - -def _detect_encoding(raw_content): - """Given a raw content, try and detect its encoding. - - """ - detector = UniversalDetector() - for chunk in _read_raw(raw_content): - detector.feed(chunk) - if detector.done: - break - detector.close() - return detector.result['encoding'] - - -def compute_language_from_chunk(encoding, length, raw_content, max_size, - log=None): - """Determine the raw content's language. 
- - Args: - encoding (str): Encoding to use to decode the content - length (int): raw_content's length - raw_content (bytes): raw content to work with - max_size (int): max size to split the raw content at - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - if max_size <= length: - raw_content = raw_content[0:max_size] - - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except UnicodeDecodeError: - raise - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -def compute_language(raw_content, encoding=None, log=None): - """Determine the raw content's language. - - Args: - raw_content (bytes): raw content to work with - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - encoding = _detect_encoding(raw_content) - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -class LanguageIndexer(ContentIndexer): - """Indexer in charge of: - - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {mimetype, encoding} from that content - - store result in storage - - """ - CONFIG_BASE_FILENAME = 'indexer/language' - - ADDITIONAL_CONFIG = { - 'tools': ('dict', { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - }), - } - - @property - def max_content_size(self): - return self.tool['tool_configuration']['max_content_size'] - - def filter(self, ids): - """Filter out known sha1s and return only missing ones. - - """ - yield from self.idx_storage.content_language_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'] - } for sha1 in ids - )) - - def index(self, id, data): - """Index sha1s' content and store result. - - Args: - id (bytes): content's identifier - data (bytes): raw content in bytes - - Returns: - dict: Dict that represents a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - - """ - result = { - 'id': id, - 'indexer_configuration_id': self.tool['id'], - 'lang': None, - } - - encoding = _detect_encoding(data) - - if not encoding: - return result - - _len = len(data) - for i in range(0, 9): - max_size = self.max_content_size + i - - try: - result = compute_language_from_chunk( - encoding, _len, data, max_size, log=self.log) - except UnicodeDecodeError: - self.log.warning( - 'Decoding failed on wrong byte chunk at [0-%s]' - ', trying again at next ending byte.' % max_size) - continue - - # we found something, so we return it - result.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - break - - return result - - def persist_index_computations(self, results, policy_update): - """Persist the results in storage. 
- - Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them - - """ - self.idx_storage.content_language_add( - results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index e9fe41e..c3a7e5e 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,335 +1,336 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id) result['metadata'] = \ MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id)) if result['metadata'] is None: return None return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { }, }), } def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_intrinsic_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (dict): revision artifact from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'mappings': None, 'metadata': None } try: root_dir = rev['directory'] dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry['type'] for entry in dir_ls] == ['dir']: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]['target'] dir_ls = self.storage.directory_ls(subdir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) result['mappings'] = mappings result['metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of revision_intrinsic_metadata, dict with the following keys: - id (bytes): revision's identifier (sha1_git) - mappings ([str]): mappings used to translate the metadata - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_intrinsic_metadata( self, detected_files, log_suffix): """ Determine the plan of action for translating metadata when one or multiple metadata files are detected: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to lists of sha1s Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } config['tools'] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg['tools'][0]['configuration']['context'] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['metadata'] # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups', log_suffix=log_suffix) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['metadata'] metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(metadata) return (used_mappings, min_metadata) class OriginMetadataIndexer(OriginIndexer): ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG USE_TOOLS = False def __init__(self, config=None, **kwargs): super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list(self, origins): head_rev_ids = [] origins_with_head = [] for origin in origins: head_result = self.origin_head_indexer.index(origin) if head_result: origins_with_head.append(origin) head_rev_ids.append(head_result['revision_id']) head_revs = list(self.storage.revision_get(head_rev_ids)) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning('Missing head revision of origin %r', origin) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], 'id': origin['id'], 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], } 
results.append((orig_metadata, rev_metadata)) return results def persist_index_computations(self, results, policy_update): conflict_update = (policy_update == 'update-dups') # Deduplicate revisions rev_metadata = [] orig_metadata = [] revs_to_delete = [] origs_to_delete = [] for (orig_item, rev_item) in results: - assert rev_item['mappings'] == orig_item['mappings'] - if rev_item['mappings']: - # Only store translated metadata if we found a metadata file. - # Otherwise it's just an empty dict with a "@context" key. - if rev_item not in rev_metadata: - rev_metadata.append(rev_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) - else: + assert rev_item['metadata'] == orig_item['metadata'] + if not rev_item['metadata'] or \ + rev_item['metadata'].keys() <= {'@context'}: + # If we didn't find any metadata, don't store a DB record + # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) + else: + if rev_item not in rev_metadata: + rev_metadata.append(rev_item) + if orig_item not in orig_metadata: + orig_metadata.append(orig_item) if rev_metadata: self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update) if orig_metadata: self.idx_storage.origin_intrinsic_metadata_add( orig_metadata, conflict_update=conflict_update) # revs_to_delete should always be empty unless we changed a mapping - # to detect less files. + # to detect fewer files or less content. # However, origs_to_delete may be non-empty whenever an upstream deletes # a metadata file. if origs_to_delete: self.idx_storage.origin_intrinsic_metadata_delete(origs_to_delete) if revs_to_delete: self.idx_storage.revision_intrinsic_metadata_delete(revs_to_delete) diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql index 9fcc66c..2bd0bd5 100644 --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -1,145 +1,145 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(122, now(), 'Work In Progress'); + values(124, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_name is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) 
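The rewritten branch in the persist_index_computations hunk of swh/indexer/metadata.py above relies on Python's set-like dict key views: `d.keys() <= {'@context'}` holds when the dict contains nothing besides the `@context` key that the CodeMeta translation always injects. A small self-contained illustration, using hypothetical data rather than real indexer output:

```python
# A metadata dict whose keys are a subset of {'@context'} carries no actual
# metadata, so the indexer now deletes such records instead of storing them.
def is_empty_metadata(metadata):
    return not metadata or metadata.keys() <= {'@context'}

assert is_empty_metadata(None)
assert is_empty_metadata({})
assert is_empty_metadata(
    {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0'})
assert not is_empty_metadata({
    '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
    'name': 'example-package',  # hypothetical metadata field
})
```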
create table content_mimetype ( id sha1 not null, mimetype text not null, encoding text not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containing metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_intrinsic_metadata provides a minimal set of intrinsic -- metadata detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. 
create table revision_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql index 80ffe8e..c12622c 100644 --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -1,69 +1,67 @@ -- fossology_license create unique index fossology_license_pkey on fossology_license(id); alter table fossology_license add primary key using index fossology_license_pkey; create unique index on fossology_license(name); -- indexer_configuration create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); alter table indexer_configuration add primary key using index indexer_configuration_pkey; create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); -- content_ctags create index on content_ctags(id); create index on content_ctags(hash_sha1(name)); create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; -- content_metadata create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); alter table content_metadata add primary key using index content_metadata_pkey; alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; -- revision_intrinsic_metadata create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; alter table revision_intrinsic_metadata add constraint 
revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); alter table content_mimetype add primary key using index content_mimetype_pkey; alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; -- content_language create unique index content_language_pkey on content_language(id, indexer_configuration_id); alter table content_language add primary key using index content_language_pkey; alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_language validate constraint content_language_indexer_configuration_id_fkey; -- content_fossology_license create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); alter table content_fossology_license add primary key using index content_fossology_license_pkey; alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; -- origin_intrinsic_metadata create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; -alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_intrinsic_metadata(id, indexer_configuration_id) not valid; -alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index 97f921c..dc47146 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,64 +1,57 @@ # Copyright (C) 2016-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from 
celery import current_app as app from .mimetype import MimetypeIndexer, MimetypeRangeIndexer -from .language import LanguageIndexer from .ctags import CtagsIndexer from .fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer ) from .rehash import RecomputeChecksums from .metadata import OriginMetadataIndexer @app.task(name=__name__ + '.OriginMetadata') def origin_metadata(*args, **kwargs): results = OriginMetadataIndexer().run(*args, **kwargs) return getattr(results, 'results', results) -@app.task(name=__name__ + '.ContentLanguage') -def content_language(*args, **kwargs): - results = LanguageIndexer().run(*args, **kwargs) - return getattr(results, 'results', results) - - @app.task(name=__name__ + '.Ctags') def ctags(*args, **kwargs): results = CtagsIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentFossologyLicense') def fossology_license(*args, **kwargs): results = FossologyLicenseIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.RecomputeChecksums') def recompute_checksums(*args, **kwargs): results = RecomputeChecksums().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentMimetype') def mimetype(*args, **kwargs): results = MimetypeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeMimetype') def range_mimetype(*args, **kwargs): results = MimetypeRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeFossologyLicense') def range_license(*args, **kwargs): results = FossologyLicenseRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} diff --git a/swh/indexer/tests/storage/test_api_client.py b/swh/indexer/tests/storage/test_api_client.py index 4b51858..a139099 100644 --- a/swh/indexer/tests/storage/test_api_client.py +++ b/swh/indexer/tests/storage/test_api_client.py @@ -1,38 +1,38 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest -from swh.core.tests.server_testing import ServerTestFixture +from swh.core.api.tests.server_testing import ServerTestFixture from swh.indexer.storage import INDEXER_CFG_KEY from swh.indexer.storage.api.client import RemoteStorage from swh.indexer.storage.api.server import app from .test_storage import CommonTestStorage, BasePgTestStorage class TestRemoteStorage(CommonTestStorage, ServerTestFixture, BasePgTestStorage, unittest.TestCase): """Test the indexer's remote storage API. This class doesn't define any tests as we want identical functionality between local and remote storage. All the tests are therefore defined in `class`:swh.indexer.storage.test_storage.CommonTestStorage. 
""" def setUp(self): self.config = { INDEXER_CFG_KEY: { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, } } } self.app = app super().setUp() self.storage = RemoteStorage(self.url()) diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index f4f78eb..2843f38 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1964 +1,1964 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import threading import unittest import pytest from hypothesis import given from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import get_indexer_storage, MAPPING_NAMES -from swh.core.tests.db_testing import SingleDbTestFixture +from swh.core.db.tests.db_testing import SingleDbTestFixture from swh.indexer.tests.storage.generate_data_test import ( gen_content_mimetypes, gen_content_fossology_licenses ) from swh.indexer.tests.storage import SQL_DIR from swh.indexer.metadata_dictionary import MAPPINGS TOOLS = [ { 'tool_name': 'universal-ctags', 'tool_version': '~git7859817b', 'tool_configuration': { "command_line": "ctags --fields=+lnz --sort=no --links=no " "--output-format=json "} }, { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, }, { 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'swh-metadata-detector2', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'file', 'tool_version': '5.22', 'tool_configuration': {"command_line": "file --mime "}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments"}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments", "max_content_size": 10240 }, }, { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } ] @pytest.mark.db class BasePgTestStorage(SingleDbTestFixture): """Base test class for most indexer tests. It adds support for Storage testing to the SingleDbTestFixture class. It will also build the database from the swh-indexed/sql/*.sql files. 
""" TEST_DB_NAME = 'softwareheritage-test-indexer' TEST_DB_DUMP = os.path.join(SQL_DIR, '*.sql') def setUp(self): super().setUp() self.storage_config = { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, }, } def tearDown(self): self.reset_storage_tables() self.storage = None super().tearDown() def reset_storage_tables(self): excluded = {'indexer_configuration'} self.reset_db_tables(self.TEST_DB_NAME, excluded=excluded) db = self.test_db[self.TEST_DB_NAME] db.conn.commit() def gen_generic_endpoint_tests(endpoint_type, tool_name, example_data1, example_data2): def rename(f): f.__name__ = 'test_' + endpoint_type + f.__name__ return f def endpoint(self, endpoint_name): return getattr(self.storage, endpoint_type + '_' + endpoint_name) @rename def missing(self): # given tool_id = self.tools[tool_name]['id'] query = [ { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }] # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [ self.sha1_1, self.sha1_2, ]) # given endpoint(self, 'add')([{ 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, }]) # when actual_missing = endpoint(self, 'missing')(query) # then self.assertEqual(list(actual_missing), [self.sha1_1]) @rename def add__drop_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) # then expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2]) actual_data = list(endpoint(self, 'get')([self.sha1_2])) # data did not change as the v2 was dropped. self.assertEqual(actual_data, expected_data_v1) @rename def add__update_in_place_duplicate(self): # given tool_id = self.tools[tool_name]['id'] data_v1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # given endpoint(self, 'add')([data_v1]) # when actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v1 = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name], }] # then self.assertEqual(actual_data, expected_data_v1) # given data_v2 = data_v1.copy() data_v2.update(example_data2) endpoint(self, 'add')([data_v2], conflict_update=True) actual_data = list(endpoint(self, 'get')([self.sha1_2])) expected_data_v2 = [{ 'id': self.sha1_2, **example_data2, 'tool': self.tools[tool_name], }] # data did change as the v2 was used to overwrite v1 self.assertEqual(actual_data, expected_data_v2) @rename def add__update_in_place_deadlock(self): # given tool_id = self.tools[tool_name]['id'] hashes = [ hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i)) for i in range(1000)] data_v1 = [ { 'id': hash_, **example_data1, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] data_v2 = [ { 'id': hash_, **example_data2, 'indexer_configuration_id': tool_id, } for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(self, 'add')(data_v1) # when actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v1 = [ { 'id': hash_, **example_data1, 'tool': self.tools[tool_name], } for hash_ in hashes ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): endpoint(self, 'add')(data_v2a, conflict_update=True) def f2(): endpoint(self, 'add')(data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(endpoint(self, 'get')(hashes)) expected_data_v2 = [ { 'id': hash_, **example_data2, 'tool': self.tools[tool_name], } for hash_ in hashes ] self.assertCountEqual(actual_data, expected_data_v2) @rename def add__duplicate_twice(self): # given tool_id = self.tools[tool_name]['id'] data_rev1 = { 'id': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id } data_rev2 = { 'id': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id } # when endpoint(self, 'add')([data_rev1]) with self.assertRaises(ValueError): endpoint(self, 'add')( [data_rev2, data_rev2], conflict_update=True) # then actual_data = list(endpoint(self, 'get')( [self.revision_id_2, self.revision_id_1])) expected_data = [{ 'id': self.revision_id_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def get(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) # then actual_data = list(endpoint(self, 'get')(query)) # then expected_data = [{ 'id': self.sha1_2, **example_data1, 'tool': self.tools[tool_name] }] self.assertEqual(actual_data, expected_data) @rename def delete(self): # given tool_id = self.tools[tool_name]['id'] query = [self.sha1_2, self.sha1_1] data1 = { 'id': self.sha1_2, **example_data1, 'indexer_configuration_id': tool_id, } # when endpoint(self, 'add')([data1]) endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) # then actual_data = list(endpoint(self, 'get')(query)) # then self.assertEqual(actual_data, []) @rename def delete_nonexisting(self): tool_id = self.tools[tool_name]['id'] endpoint(self, 'delete')([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, } ]) return ( missing, add__drop_duplicate, add__update_in_place_duplicate, add__update_in_place_deadlock, add__duplicate_twice, get, delete, delete_nonexisting, ) class CommonTestStorage: """Base class for Indexer Storage testing.
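The per-endpoint tests bound below are generated by gen_generic_endpoint_tests above: the generator returns plain closures, renamed through its @rename helper so that test reports carry the endpoint name, and unpacking the returned tuple into class attributes is what lets unittest collect them. Reduced to a sketch (illustrative names, not the real generator):

    def gen_tests(endpoint_type):
        def missing(self):
            ...  # would exercise the '<endpoint_type>_missing' endpoint
        missing.__name__ = 'test_%s_missing' % endpoint_type
        return (missing,)

    class SomeEndpointTests:
        (test_foo_missing,) = gen_tests('foo')

This class is deliberately not a TestCase itself; a concrete runner mixes it with a database fixture, as IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase) does at the end of this module.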
""" def setUp(self): super().setUp() self.storage = get_indexer_storage(**self.storage_config) tools = self.storage.indexer_configuration_add(TOOLS) self.tools = {} for tool in tools: tool_name = tool['tool_name'] while tool_name in self.tools: tool_name += '_' self.tools[tool_name] = { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') self.revision_id_1 = hash_to_bytes( '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.revision_id_3 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904320') self.origin_id_1 = 44434341 self.origin_id_2 = 44434342 self.origin_id_3 = 54974445 def test_check_config(self): self.assertTrue(self.storage.check_config(check_write=True)) self.assertTrue(self.storage.check_config(check_write=False)) # generate content_mimetype tests ( test_content_mimetype_missing, test_content_mimetype_add__drop_duplicate, test_content_mimetype_add__update_in_place_duplicate, test_content_mimetype_add__update_in_place_deadlock, test_content_mimetype_add__duplicate_twice, test_content_mimetype_get, _, # content_mimetype_detete, _, # content_mimetype_detete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_mimetype', tool_name='file', example_data1={ 'mimetype': 'text/plain', 'encoding': 'utf-8', }, example_data2={ 'mimetype': 'text/html', 'encoding': 'us-ascii', }, ) # content_language tests ( test_content_language_missing, test_content_language_add__drop_duplicate, test_content_language_add__update_in_place_duplicate, test_content_language_add__update_in_place_deadlock, test_content_language_add__duplicate_twice, test_content_language_get, _, # test_content_language_delete, _, # test_content_language_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_language', tool_name='pygments', example_data1={ 'lang': 'haskell', }, example_data2={ 'lang': 'common-lisp', }, ) # content_ctags tests ( test_content_ctags_missing, # the following tests are disabled because CTAGS behave differently _, # test_content_ctags_add__drop_duplicate, _, # test_content_ctags_add__update_in_place_duplicate, _, # test_content_ctags_add__update_in_place_deadlock, _, # test_content_ctags_add__duplicate_twice, _, # test_content_ctags_get, _, # test_content_ctags_delete, _, # test_content_ctags_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_ctags', tool_name='universal-ctags', example_data1={ 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, example_data2={ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] }, ) def test_content_ctags_search(self): # 1. 
given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, { 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, ] } ctag2 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, { 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }, ] } self.storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(self.storage.content_ctags_search('hello', limit=1)) # 1. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ]) # 2. when actual_ctags = list(self.storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then self.assertEqual(actual_ctags, [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ]) # 3. when actual_ctags = list(self.storage.content_ctags_search('hello')) # 3. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ]) # 4. when actual_ctags = list(self.storage.content_ctags_search('counter')) # then self.assertEqual(actual_ctags, [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }]) # 5. 
when actual_ctags = list(self.storage.content_ctags_search('result', limit=1)) # then self.assertEqual(actual_ctags, [{ 'id': ctag2['id'], 'tool': tool, 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }]) def test_content_ctags_search_no_result(self): actual_ctags = list(self.storage.content_ctags_search('counter')) self.assertEqual(actual_ctags, []) def test_content_ctags_add__add_new_ctags_added(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) self.storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [{ 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) self.assertEqual(actual_ctags, expected_ctags) def test_content_ctags_add__update_in_place(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] self.assertEqual(actual_ctags, expected_ctags) # content_fossology_license tests ( _, # The endpoint content_fossology_license_missing does not exist # the following tests are disabled because fossology_license tests # behave differently _, # test_content_fossology_license_add__drop_duplicate, _, # test_content_fossology_license_add__update_in_place_duplicate, _, # test_content_fossology_license_add__update_in_place_deadlock, _, # test_content_fossology_license_add__duplicate_twice, _, # test_content_fossology_license_get, _, # test_content_fossology_license_delete, _, # test_content_fossology_license_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_fossology_license', tool_name='nomos', example_data1={ 'licenses': ['Apache-2.0'], }, example_data2={ 'licenses': ['BSD-2-Clause'], }, ) def
test_content_fossology_license_add__new_license_added(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) self.storage.content_fossology_license_add([license_v2]) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # the licenses were merged: the v2 licenses were added to the v1 ones instead of replacing them. self.assertEqual(actual_licenses, [expected_license]) # content_metadata tests ( test_content_metadata_missing, test_content_metadata_add__drop_duplicate, test_content_metadata_add__update_in_place_duplicate, test_content_metadata_add__update_in_place_deadlock, test_content_metadata_add__duplicate_twice, test_content_metadata_get, _, # test_content_metadata_delete, _, # test_content_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='content_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, }, ) # revision_intrinsic_metadata tests ( test_revision_intrinsic_metadata_missing, test_revision_intrinsic_metadata_add__drop_duplicate, test_revision_intrinsic_metadata_add__update_in_place_duplicate, test_revision_intrinsic_metadata_add__update_in_place_deadlock, test_revision_intrinsic_metadata_add__duplicate_twice, test_revision_intrinsic_metadata_get, test_revision_intrinsic_metadata_delete, test_revision_intrinsic_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( endpoint_type='revision_intrinsic_metadata', tool_name='swh-metadata-detector', example_data1={ 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping1'], }, example_data2={ 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping2'], }, ) def test_origin_intrinsic_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata = [{ 'id': self.origin_id_1, 'metadata': metadata, 'tool':
self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': ['mapping1'], }] self.assertEqual(actual_metadata, expected_metadata) def test_origin_intrinsic_metadata_delete(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } metadata_origin2 = metadata_origin.copy() metadata_origin2['id'] = self.origin_id_2 # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, self.origin_id_2, 42])) for item in actual_metadata: item['indexer_configuration_id'] = item.pop('tool')['id'] self.assertEqual(actual_metadata, [metadata_origin2]) def test_origin_intrinsic_metadata_delete_nonexisting(self): tool_id = self.tools['swh-metadata-detector']['id'] self.storage.origin_intrinsic_metadata_delete([ { 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) def test_origin_intrinsic_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_1, 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_1, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata_v1 = [{ 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_intrinsic_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # metadata did not change as the v2 was dropped. 
self.assertEqual(actual_metadata, expected_metadata_v1) def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': self.revision_id_2, } # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # then expected_metadata_v1 = [{ 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': [], }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_intrinsic_metadata_add( [metadata_rev_v2], conflict_update=True) self.storage.origin_intrinsic_metadata_add( [metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ 'id': self.origin_id_1, 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, 'mappings': [], }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_origin_intrinsic_metadata_add__update_in_place_deadlock(self): # given tool_id = self.tools['swh-metadata-detector']['id'] ids = list(range(1000)) example_data1 = { 'metadata': { 'version': None, 'name': None, }, 'mappings': [], } example_data2 = { 'metadata': { 'version': 'v1.1.1', 'name': 'foo', }, 'mappings': [], } metadata_rev_v1 = { 'id': self.revision_id_2, 'metadata': { 'version': None, 'name': None, }, 'mappings': [], 'indexer_configuration_id': tool_id, } data_v1 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, } for id_ in ids ] data_v2 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, } for id_ in ids ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add(data_v1) # when actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v1 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] # then self.assertEqual(actual_data, expected_data_v1) # given def f1(): self.storage.origin_intrinsic_metadata_add( data_v2a, conflict_update=True) def f2(): self.storage.origin_intrinsic_metadata_add( data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(self.storage.origin_intrinsic_metadata_get(ids)) expected_data_v2 = [ { 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'tool': self.tools['swh-metadata-detector'], } for id_ in ids ] self.maxDiff = None self.assertCountEqual(actual_data, expected_data_v2) def test_origin_intrinsic_metadata_add__duplicate_twice(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'name': None, } metadata_rev = { 'id': self.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata_rev]) with self.assertRaises(ValueError): self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): # given tool_id = self.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). 
metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( [res['id'] for res in search(['John', 'Jane'])], [self.origin_id_1]) def _fill_origin_intrinsic_metadata(self): tool1_id = self.tools['swh-metadata-detector']['id'] tool2_id = self.tools['swh-metadata-detector2']['id'] metadata1 = { '@context': 'foo', 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, 'from_revision': self.revision_id_1, } metadata2 = { '@context': 'foo', 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_2, } metadata3 = { '@context': 'foo', } metadata3_rev = { 'id': self.revision_id_3, 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { 'id': self.origin_id_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_3, } self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) self.storage.revision_intrinsic_metadata_add([metadata3_rev]) self.storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer(self): self._fill_origin_intrinsic_metadata() tool1 = self.tools['swh-metadata-detector'] tool2 = self.tools['swh-metadata-detector2'] endpoint = self.storage.origin_intrinsic_metadata_search_by_producer # test pagination self.assertCountEqual( endpoint(ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=0, ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) 
self.assertCountEqual( endpoint(start=0, limit=2, ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1, ids_only=True), [self.origin_id_2]) # test mappings filtering self.assertCountEqual( endpoint(mappings=['npm'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['npm', 'gemspec'], ids_only=True), [self.origin_id_1, self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['gemspec'], ids_only=True), [self.origin_id_2]) self.assertCountEqual( endpoint(mappings=['pkg-info'], ids_only=True), [self.origin_id_3]) self.assertCountEqual( endpoint(mappings=['foobar'], ids_only=True), []) # test pagination + mappings self.assertCountEqual( endpoint(mappings=['npm'], limit=1, ids_only=True), [self.origin_id_1]) # test tool filtering self.assertCountEqual( endpoint(tool_ids=[tool1['id']], ids_only=True), [self.origin_id_1]) self.assertCountEqual( endpoint(tool_ids=[tool2['id']], ids_only=True), [self.origin_id_2, self.origin_id_3]) self.assertCountEqual( endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True), [self.origin_id_1, self.origin_id_2, self.origin_id_3]) # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ 'id': self.origin_id_2, 'metadata': { '@context': 'foo', 'author': 'Jane Doe', }, 'mappings': ['npm', 'gemspec'], 'tool': tool2, 'from_revision': self.revision_id_2, }]) def test_origin_intrinsic_metadata_stats(self): self._fill_origin_intrinsic_metadata() result = self.storage.origin_intrinsic_metadata_stats() self.assertEqual(result, { 'per_mapping': { 'gemspec': 1, 'npm': 2, 'pkg-info': 1, 'codemeta': 0, 'maven': 0, }, 'total': 3, 'non_empty': 2, }) def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) # does not exist # add it actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) actual_tool = actual_tools[0] self.assertIsNotNone(actual_tool) # now it exists new_id = actual_tool.pop('id') self.assertEqual(actual_tool, tool) actual_tools2 = list(self.storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] self.assertIsNotNone(actual_tool2) # now it exists new_id2 = actual_tool2.pop('id') self.assertEqual(new_id, new_id2) self.assertEqual(actual_tool, actual_tool2) def test_indexer_configuration_add_multiple(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(self.storage.indexer_configuration_add(new_tools)) self.assertEqual(len(actual_tools), 2) # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') self.assertIsNotNone(_id) self.assertIn(tool, new_tools) def test_indexer_configuration_get_missing(self): tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = 
self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_get(self): tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() del actual_tool['id'] self.assertEqual(expected_tool, actual_tool) def test_indexer_configuration_metadata_get_missing_context(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_metadata_get(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] self.assertEqual(expected_tool, actual_tool) @pytest.mark.property_based def test_generate_content_mimetype_get_range_limit_none(self): """mimetype_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=1, max_size=4)) def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): """mimetype_get_range returns mimetypes within range provided""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(mimetypes), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=4, max_size=4)) def test_generate_content_mimetype_get_range_limit(self, mimetypes): """mimetype_get_range paginates results if limit exceeded""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 3 results limited_results = len(mimetypes) - 1 tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_mimetypes = content_ids[:-1] self.assertEqual(expected_mimetypes, actual_ids) # retrieve next part actual_results2 = self.storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] 
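# The two calls above spell out the cursor contract of the *_get_range
# endpoints: each call returns at most `limit` ids within [start, end],
# plus a `next` id to resume from (None once the range is exhausted).
# A minimal sketch of a client-side loop draining a range under that
# contract; the helper name is illustrative, not part of the storage API:
def _iter_content_mimetype_range(storage, start, end, tool_id, batch=1000):
    while start is not None:
        result = storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=batch)
        yield from result['ids']
        # `next` is the first id not yet returned; resuming from it
        # (inclusively) continues the scan, and None terminates it.
        start = result['next']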
self.assertIsNone(actual_next2) expected_mimetypes2 = [content_ids[-1]] self.assertEqual(expected_mimetypes2, actual_ids2) @pytest.mark.property_based def test_generate_content_fossology_license_get_range_limit_none(self): """license_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based def prepare_mimetypes_from(self, fossology_licenses): """Fossology license needs some consistent data in db to run. """ mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4)) def test_generate_content_fossology_license_get_range_no_limit( self, fossology_licenses): """license_get_range returns licenses within range provided""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) self.storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4), gen_content_mimetypes(min_size=1, max_size=1)) def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, fossology_licenses, mimetypes): """This filters non textual, then returns results within range""" self.reset_storage_tables() # craft some consistent mimetypes _mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) self.storage.content_mimetype_add(_mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=4, max_size=4)) def test_generate_fossology_license_get_range_limit( self, fossology_licenses): """fossology_license_get_range paginates results if limit exceeded""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add 
fossology_licenses to storage self.storage.content_mimetype_add(mimetypes) self.storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_fossology_licenses = content_ids[:-1] self.assertEqual(expected_fossology_licenses, actual_ids) # retrieve next part actual_results2 = self.storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] self.assertIsNone(actual_next2) expected_fossology_licenses2 = [content_ids[-1]] self.assertEqual(expected_fossology_licenses2, actual_ids2) @pytest.mark.db class IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase): """Running the tests locally. For the client api tests (remote storage), see the :class:`swh.indexer.storage.test_api_client.TestRemoteStorage` class. """ pass def test_mapping_names(): assert set(MAPPING_NAMES) == {m.name for m in MAPPINGS.values()} diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index d6aaf02..347104c 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,315 +1,322 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import reduce import re import tempfile from unittest.mock import patch from click.testing import CliRunner from swh.model.hashutil import hash_to_bytes from swh.indexer.cli import cli CLI_CONFIG = ''' scheduler: cls: foo args: {} storage: cls: memory args: {} indexer_storage: cls: memory args: {} ''' def fill_idx_storage(idx_storage, nb_rows): tools = [ { 'tool_name': 'tool %d' % i, 'tool_version': '0.0.1', 'tool_configuration': {}, } for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ { 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] revision_metadata = [ { 'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the - provided tasks (assumed to be of type indexer_origin_metadata).""" + provided tasks (assumed to be of type index-origin-metadata).""" return reduce(
(set(task['arguments']['args'][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {"policy_update": "update-dups", "parse_ids": False} - assert {task['type'] for task in tasks} == {'indexer_origin_metadata'} + assert {task['type'] for task in tasks} == {'index-origin-metadata'} assert all(len(task['arguments']['args']) == 1 for task in tasks) assert all(task['arguments']['kwargs'] == expected_kwargs for task in tasks) assert _origins_in_task_args(tasks) == set(origins) def invoke(scheduler, catch_exceptions, args): runner = CliRunner() with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \ tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: config_fd.write(CLI_CONFIG) config_fd.seek(0) get_scheduler_mock.return_value = scheduler result = runner.invoke(cli, ['-C' + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_mapping_list(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list', ]) expected_output = '\n'.join([ 'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '', ]) assert result.exit_code == 0, result.output assert result.output == expected_output def test_mapping_list_terms(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) def test_mapping_list_terms_exclude(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', '--exclude-mapping', 'codemeta' ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert not re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) expected_output = ( 'Nothing to do (no origin metadata matched the criteria).\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_divisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_dry_run( 
indexer_scheduler, idx_storage, storage): """Tests the re-indexing in dry-run mode: the expected tasks are reported, but none is actually scheduled.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', '--dry-run', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_nondivisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when neither origin_batch_size nor task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--batch-size', '20', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (60 origins).\n' 'Scheduled 4 tasks (70 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_mapping( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on a single mapping.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', ]) # Check the output expected_output = ( 'Scheduled 2 tasks (11 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 2 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_two_mappings( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on two mappings.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', '--mapping', 'mapping2', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (22 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102]) +@patch('swh.scheduler.cli.utils.TASK_BATCH_SIZE', 3) @patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_tool( indexer_scheduler, idx_storage, storage): """Tests the re-indexing filtered on a single tool.""" tool_ids = fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--tool-id', str(tool_ids[0]), ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n'
'Scheduled 6 tasks (55 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins( tasks, [x*2 for x in range(55)]) diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py deleted file mode 100644 index dc4e0c0..0000000 --- a/swh/indexer/tests/test_language.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import pytest - -from swh.indexer import language -from swh.indexer.language import LanguageIndexer -from swh.indexer.tests.utils import ( - CommonContentIndexerTest, - BASE_TEST_CONFIG, fill_storage, fill_obj_storage, filter_dict, -) - - -CONFIG = { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - } -} - - -class Language(unittest.TestCase): - """Tests pygments tool for language detection - - """ - def test_compute_language_none(self): - # given - self.content = "" - self.declared_language = { - 'lang': None - } - # when - result = language.compute_language(self.content) - # then - self.assertEqual(self.declared_language, result) - - -class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase): - """Language indexer test scenarios: - - - Known sha1s in the input list have their data indexed - - Unknown sha1 in the input list are not indexed - - """ - - legacy_get_format = True - - def get_indexer_results(self, ids): - yield from self.indexer.idx_storage.content_language_get(ids) - - def setUp(self): - self.indexer = LanguageIndexer(config=CONFIG) - self.indexer.catch_exceptions = False - fill_storage(self.indexer.storage) - fill_obj_storage(self.indexer.objstorage) - - self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' - self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' - self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' - - tool = {k.replace('tool_', ''): v - for (k, v) in self.indexer.tool.items()} - - self.expected_results = { - self.id0: { - 'id': self.id0, - 'tool': tool, - 'lang': 'python', - }, - self.id1: { - 'id': self.id1, - 'tool': tool, - 'lang': 'c' - }, - self.id2: { - 'id': self.id2, - 'tool': tool, - 'lang': 'text-only' - } - } - - -def test_language_w_no_tool(): - with pytest.raises(ValueError): - LanguageIndexer(config=filter_dict(CONFIG, 'tools')) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index a5be367..321cc1f 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,217 +1,263 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata import OriginMetadataIndexer from .utils import YARN_PARSER_METADATA from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer( idx_storage, storage, obj_storage): indexer = 
OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser"]) indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com", "git+https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_revision( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser", "git+https://github.com/librariesio/yarn-parser.git"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) origin2 = storage.origin_get({ 'type': 'git', 
'url': 'https://github.com/librariesio/yarn-parser.git'}) assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) assert len(results) == 2 -def test_origin_metadata_indexer_no_metadata( +def test_origin_metadata_indexer_no_metadata_file( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] +def test_origin_metadata_indexer_no_metadata( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=(['npm'], {'@context': 'foo'})): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert results == [] + + +def test_origin_metadata_indexer_error( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=None): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert results == [] + + def test_origin_metadata_indexer_delete_metadata( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results != [] with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == 
[] diff --git a/version.txt b/version.txt index 94717a4..f4e725e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.146-0-g669998e \ No newline at end of file +v0.0.147-0-gde4744f \ No newline at end of file
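The add__update_in_place_deadlock tests above (both the generic endpoint variants and the origin_intrinsic_metadata one) rely on a single technique: two threads upsert overlapping batches in opposite orders with conflict_update=True, which deadlocks in Postgres unless the storage orders rows consistently before updating. A standalone sketch of that harness, assuming only some add(batch, conflict_update=True) callable with upsert semantics:

    import threading

    def check_concurrent_upserts(add, rows):
        # Drop one distinct row from each batch, so that both writers must
        # succeed for every row to end up in the database, and reverse one
        # batch so the two lock-acquisition orders collide.
        batch_a = rows[1:]
        batch_b = list(reversed(rows[:-1]))
        threads = [
            threading.Thread(target=add, args=(batch,),
                             kwargs={'conflict_update': True})
            for batch in (batch_a, batch_b)
        ]
        for t in threads:
            t.start()
        for t in threads:
            # join() does not re-raise; a deadlock surfaces as an exception
            # printed from the worker thread, or as a hang/timeout.
            t.join()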