diff --git a/PKG-INFO b/PKG-INFO index e279eb8..68bb5a5 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.145 +Version: 0.0.146 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute ctags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate the file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata, and either retrieves their translated_metadata from the content_metadata table in storage or runs the content indexer to translate the files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements.txt b/requirements.txt index 3a7428c..a578b91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ vcversioner pygments click chardet -file_magic +file-magic pyld xmltodict diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index e279eb8..68bb5a5 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.145 +Version: 0.0.146 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute an index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute ctags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate the file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata, and either retrieves their translated_metadata from the content_metadata table in storage or runs the content indexer to translate the files. 
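The four-step indexation procedure described above amounts to a small driver loop. A minimal sketch, assuming hypothetical `retrieve`, `compute` and `store` callables (illustrative names only, not part of the actual swh.indexer API):

```python
def index_batch(ids, retrieve, compute, store):
    """Receive a batch of ids, fetch each object, index it, store the results."""
    results = []
    for id_ in ids:
        data = retrieve(id_)            # look the object up (content/revision/origin)
        results.append(compute(data))   # compute the index (mimetype, license, ...)
    store(results)                      # persist into the indexer storage
```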
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 2bc85f8..80cb6e8 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,85 +1,91 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py -swh/indexer/metadata_dictionary.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv +swh/indexer/metadata_dictionary/__init__.py +swh/indexer/metadata_dictionary/base.py +swh/indexer/metadata_dictionary/codemeta.py +swh/indexer/metadata_dictionary/maven.py +swh/indexer/metadata_dictionary/npm.py +swh/indexer/metadata_dictionary/python.py +swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/storage/api/wsgi.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 
9d81572..cc485e1 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,18 @@ vcversioner pygments click chardet -file_magic +file-magic pyld xmltodict swh.core>=0.0.53 swh.model>=0.0.15 swh.objstorage>=0.0.28 swh.scheduler>=0.0.47 swh.storage>=0.0.123 swh.journal>=0.0.6 [testing] pytest<4 pytest-postgresql hypothesis>=3.11.0 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index 56c7f88..c5244be 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,208 +1,177 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click from swh.core import config from swh.scheduler import get_scheduler -from swh.scheduler.utils import create_task_dict +from swh.scheduler.cli_utils import schedule_origin_batches from swh.storage import get_storage from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) -TASK_BATCH_SIZE = 1000 # Number of tasks per query to the scheduler - @click.group(context_settings=CONTEXT_SETTINGS) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.pass_context def cli(ctx, config_file): """Software Heritage Indexer CLI interface """ ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj['config'] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = { 'cls': 'remote', 'args': {'url': url} } elif config_key not in config: raise click.ClickException( 'Missing configuration for {}'.format(config_key)) return getter(**config[config_key]) @cli.group('mapping') def mapping(): pass @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command('list-terms') @click.option('--exclude-mapping', multiple=True, help='Exclude the given mapping from the output') @click.option('--concise', is_flag=True, default=False, help='Don\'t print the list of mappings supporting each term.') def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo('{}:'.format(property_name)) click.echo('\t' + ', '.join(sorted(supported_mappings))) @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") @click.option('--indexer-storage-url', '-i', default=None, help="URL of the indexer storage API") @click.option('--storage-url', '-g', default=None, help="URL of the (graph) storage API") @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, - help='Default to list only what would be scheduled.') + help='List only what would be scheduled.') @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate indexer 
tasks via SWH Scheduler's API.""" ctx.obj['indexer_storage'] = _get_api( get_indexer_storage, ctx.obj['config'], 'indexer_storage', indexer_storage_url ) ctx.obj['storage'] = _get_api( get_storage, ctx.obj['config'], 'storage', storage_url ) ctx.obj['scheduler'] = _get_api( get_scheduler, ctx.obj['config'], 'scheduler', scheduler_url ) if dry_run: ctx.obj['scheduler'] = None def list_origins_by_producer(idx_storage, mappings, tool_ids): start = 0 limit = 10000 while True: origins = list( idx_storage.origin_intrinsic_metadata_search_by_producer( start=start, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None)) if not origins: break start = origins[-1]+1 yield from origins @schedule.command('reindex_origin_metadata') @click.option('--batch-size', '-b', 'origin_batch_size', default=10, show_default=True, type=int, help="Number of origins per task") @click.option('--tool-id', '-t', 'tool_ids', type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.") @click.option('--mapping', '-m', 'mappings', multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', " "'gemspec', 'maven')") @click.option('--task-type', default='indexer_origin_metadata', show_default=True, help="Name of the task type to schedule.") @click.pass_context def schedule_origin_metadata_reindex( - ctx, origin_batch_size, mappings, tool_ids, task_type): + ctx, origin_batch_size, tool_ids, mappings, task_type): """Schedules indexing tasks for origins that were already indexed.""" idx_storage = ctx.obj['indexer_storage'] scheduler = ctx.obj['scheduler'] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) - kwargs = {"policy_update": "update-dups", "parse_ids": False} - nb_origins = 0 - nb_tasks = 0 - while True: - task_batch = [] - for _ in range(TASK_BATCH_SIZE): - # Group origins - origin_batch = [] - for (_, origin) in zip(range(origin_batch_size), origins): - origin_batch.append(origin) - nb_origins += len(origin_batch) - if not origin_batch: - break - - # Create a task for these origins - args = [origin_batch] - task_dict = create_task_dict(task_type, 'oneshot', *args, **kwargs) - task_batch.append(task_dict) - - # Schedule a batch of tasks - if not task_batch: - break - nb_tasks += len(task_batch) - if scheduler: - scheduler.create_tasks(task_batch) - click.echo('Scheduled %d tasks (%d origins).' % (nb_tasks, nb_origins)) - - # Print final status. 
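`list_origins_by_producer` above pages through results keyset-style: each query resumes just past the last id returned, rather than tracking an offset. The same pattern in isolation, assuming a hypothetical `fetch_page(start, limit)` returning a sorted list of integer ids:

```python
def iter_all_ids(fetch_page, limit=10000):
    # Keyset pagination: restart each query after the largest id seen so far.
    start = 0
    while True:
        page = fetch_page(start=start, limit=limit)
        if not page:
            break
        start = page[-1] + 1  # next page begins after the last id returned
        yield from page
```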
- if nb_tasks: - click.echo('Done.') - else: - click.echo('Nothing to do (no origin metadata matched the criteria).') + kwargs = {"policy_update": "update-dups", "parse_ids": False} + schedule_origin_batches( + scheduler, task_type, origins, origin_batch_size, kwargs) @cli.command('api-server') @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def api_server(config_path, host, port, debug): api_cfg = load_and_check_config(config_path, type='any') app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) def main(): return cli(auto_envvar_prefix='SWH_INDEXER') if __name__ == '__main__': main() diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py deleted file mode 100644 index 02b4626..0000000 --- a/swh/indexer/metadata_dictionary.py +++ /dev/null @@ -1,733 +0,0 @@ -# Copyright (C) 2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import re -import abc -import ast -import json -import logging -import itertools -import collections -import email.parser -import email.policy -import xml.parsers.expat - -import click -import xmltodict - -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI, CODEMETA_TERMS -from swh.indexer.codemeta import compact, expand - - -MAPPINGS = {} - - -def register_mapping(cls): - MAPPINGS[cls.__name__] = cls - return cls - - -def list_terms(): - """Returns a dictionary with all supported CodeMeta terms as keys, - and the mappings that support each of them as values.""" - d = collections.defaultdict(set) - for mapping in MAPPINGS.values(): - for term in mapping.supported_terms(): - d[term].add(mapping) - return d - - -def merge_values(v1, v2): - """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, - returns `{"@list": l1 + l2}`. - Otherwise, make them lists (if they are not already) and concatenate - them. 
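The hand-rolled batching loop removed above from cli.py is now delegated to `swh.scheduler.cli_utils.schedule_origin_batches`. The chunking it replaces is ordinary iterator grouping; a rough sketch of the equivalent logic, based on the deleted code rather than on the helper's actual implementation:

```python
import itertools

def iter_chunks(iterable, size):
    """Yield successive lists of at most `size` items from an iterator."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

# The deleted loop then created one oneshot task per chunk, roughly:
# for origin_batch in iter_chunks(origins, origin_batch_size):
#     scheduler.create_tasks(
#         [create_task_dict(task_type, 'oneshot', origin_batch, **kwargs)])
```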
- - >>> merge_values('a', 'b') - ['a', 'b'] - >>> merge_values(['a', 'b'], 'c') - ['a', 'b', 'c'] - >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) - {'@list': ['a', 'b', 'c']} - """ - if v1 is None: - return v2 - elif v2 is None: - return v1 - elif isinstance(v1, dict) and set(v1) == {'@list'}: - assert isinstance(v1['@list'], list) - if isinstance(v2, dict) and set(v2) == {'@list'}: - assert isinstance(v2['@list'], list) - return {'@list': v1['@list'] + v2['@list']} - else: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - else: - if isinstance(v2, dict) and '@list' in v2: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - if not isinstance(v1, list): - v1 = [v1] - if not isinstance(v2, list): - v2 = [v2] - return v1 + v2 - - -class BaseMapping(metaclass=abc.ABCMeta): - """Base class for mappings to inherit from - - To implement a new mapping: - - - inherit this class - - override translate function - """ - def __init__(self, log_suffix=''): - self.log_suffix = log_suffix - self.log = logging.getLogger('%s.%s' % ( - self.__class__.__module__, - self.__class__.__name__)) - - @property - @abc.abstractmethod - def name(self): - """A name of this mapping, used as an identifier in the - indexer storage.""" - pass - - @classmethod - @abc.abstractmethod - def detect_metadata_files(cls, files): - """ - Detects files potentially containing metadata - - Args: - file_entries (list): list of files - - Returns: - list: list of sha1 (possibly empty) - """ - pass - - @abc.abstractmethod - def translate(self, file_content): - pass - - def normalize_translation(self, metadata): - return compact(metadata) - - -class SingleFileMapping(BaseMapping): - """Base class for all mappings that use a single file as input.""" - - @property - @abc.abstractmethod - def filename(self): - """The .json file to extract metadata from.""" - pass - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'] == cls.filename: - return [entry['sha1']] - return [] - - -class DictMapping(BaseMapping): - """Base class for mappings that take as input a file that is mostly - a key-value store (eg. 
a shallow JSON dict).""" - - string_fields = [] - '''List of fields that are simple strings, and don't need any - normalization.''' - - @property - @abc.abstractmethod - def mapping(self): - """A translation dict to map dict keys into a canonical name.""" - pass - - @staticmethod - def _normalize_method_name(name): - return name.replace('-', '_') - - @classmethod - def supported_terms(cls): - return { - term for (key, term) in cls.mapping.items() - if key in cls.string_fields - or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) - or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} - - def _translate_dict(self, content_dict, *, normalize=True): - """ - Translates content by parsing content from a dict object - and translating with the appropriate mapping - - Args: - content_dict (dict): content dict to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} - for k, v in content_dict.items(): - # First, check if there is a specific translation - # method for this key - translation_method = getattr( - self, 'translate_' + self._normalize_method_name(k), None) - if translation_method: - translation_method(translated_metadata, v) - elif k in self.mapping: - # if there is no method, but the key is known from the - # crosswalk table - codemeta_key = self.mapping[k] - - # if there is a normalization method, use it on the value - normalization_method = getattr( - self, 'normalize_' + self._normalize_method_name(k), None) - if normalization_method: - v = normalization_method(v) - elif k in self.string_fields and isinstance(v, str): - pass - elif k in self.string_fields and isinstance(v, list): - v = [x for x in v if isinstance(x, str)] - else: - continue - - # set the translation metadata with the normalized value - if codemeta_key in translated_metadata: - translated_metadata[codemeta_key] = merge_values( - translated_metadata[codemeta_key], v) - else: - translated_metadata[codemeta_key] = v - if normalize: - return self.normalize_translation(translated_metadata) - else: - return translated_metadata - - -class JsonMapping(DictMapping, SingleFileMapping): - """Base class for all mappings that use a JSON file as input.""" - - def translate(self, raw_content): - """ - Translates content by parsing content from a bytestring containing - json data and translating with the appropriate mapping - - Args: - raw_content (bytes): raw content to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - try: - content_dict = json.loads(raw_content) - except json.JSONDecodeError: - self.log.warning('Error unjsoning from %s', self.log_suffix) - return - if isinstance(content_dict, dict): - return self._translate_dict(content_dict) - - -@register_mapping -class NpmMapping(JsonMapping): - """ - dedicated class for NPM (package.json) mapping and translation - """ - name = 'npm' - mapping = CROSSWALK_TABLE['NodeJS'] - filename = b'package.json' - string_fields = ['name', 'version', 'homepage', 'description', 'email'] - - _schema_shortcuts = { - 'github': 'git+https://github.com/%s.git', - 'gist': 'git+https://gist.github.com/%s.git', - 'gitlab': 'git+https://gitlab.com/%s.git', - # Bitbucket supports both hg and git, and the shortcut does not - # tell which one to use. 
- # 'bitbucket': 'https://bitbucket.org/', - } - - def normalize_repository(self, d): - """https://docs.npmjs.com/files/package.json#repository - - >>> NpmMapping().normalize_repository({ - ... 'type': 'git', - ... 'url': 'https://example.org/foo.git' - ... }) - {'@id': 'git+https://example.org/foo.git'} - >>> NpmMapping().normalize_repository( - ... 'gitlab:foo/bar') - {'@id': 'git+https://gitlab.com/foo/bar.git'} - >>> NpmMapping().normalize_repository( - ... 'foo/bar') - {'@id': 'git+https://github.com/foo/bar.git'} - """ - if isinstance(d, dict) and isinstance(d.get('type'), str) \ - and isinstance(d.get('url'), str): - url = '{type}+{url}'.format(**d) - elif isinstance(d, str): - if '://' in d: - url = d - elif ':' in d: - (schema, rest) = d.split(':', 1) - if schema in self._schema_shortcuts: - url = self._schema_shortcuts[schema] % rest - else: - return None - else: - url = self._schema_shortcuts['github'] % d - - else: - return None - - return {'@id': url} - - def normalize_bugs(self, d): - """https://docs.npmjs.com/files/package.json#bugs - - >>> NpmMapping().normalize_bugs({ - ... 'url': 'https://example.org/bugs/', - ... 'email': 'bugs@example.org' - ... }) - {'@id': 'https://example.org/bugs/'} - >>> NpmMapping().normalize_bugs( - ... 'https://example.org/bugs/') - {'@id': 'https://example.org/bugs/'} - """ - if isinstance(d, dict) and isinstance(d.get('url'), str): - return {'@id': d['url']} - elif isinstance(d, str): - return {'@id': d} - else: - return None - - _parse_author = re.compile(r'^ *' - r'(?P.*?)' - r'( +<(?P.*)>)?' - r'( +\((?P.*)\))?' - r' *$') - - def normalize_author(self, d): - """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' - - >>> from pprint import pprint - >>> pprint(NpmMapping().normalize_author({ - ... 'name': 'John Doe', - ... 'email': 'john.doe@example.org', - ... 'url': 'https://example.org/~john.doe', - ... })) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - >>> pprint(NpmMapping().normalize_author( - ... 'John Doe (https://example.org/~john.doe)' - ... 
)) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - """ # noqa - author = {'@type': SCHEMA_URI+'Person'} - if isinstance(d, dict): - name = d.get('name', None) - email = d.get('email', None) - url = d.get('url', None) - elif isinstance(d, str): - match = self._parse_author.match(d) - name = match.group('name') - email = match.group('email') - url = match.group('url') - else: - return None - if name and isinstance(name, str): - author[SCHEMA_URI+'name'] = name - if email and isinstance(email, str): - author[SCHEMA_URI+'email'] = email - if url and isinstance(url, str): - author[SCHEMA_URI+'url'] = {'@id': url} - return {"@list": [author]} - - def normalize_license(self, s): - """https://docs.npmjs.com/files/package.json#license - - >>> NpmMapping().normalize_license('MIT') - {'@id': 'https://spdx.org/licenses/MIT'} - """ - if isinstance(s, str): - return {"@id": "https://spdx.org/licenses/" + s} - else: - return None - - def normalize_homepage(self, s): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') - {'@id': 'https://example.org/~john.doe'} - """ - if isinstance(s, str): - return {"@id": s} - - def normalize_keywords(self, l): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_keywords(['foo', 'bar']) - ['foo', 'bar'] - """ - if isinstance(l, list): - return [x for x in l if isinstance(x, str)] - - -@register_mapping -class CodemetaMapping(SingleFileMapping): - """ - dedicated class for CodeMeta (codemeta.json) mapping and translation - """ - name = 'codemeta' - filename = b'codemeta.json' - string_fields = None - - @classmethod - def supported_terms(cls): - return [term for term in CODEMETA_TERMS if not term.startswith('@')] - - def translate(self, content): - try: - return self.normalize_translation(expand( - json.loads(content.decode()))) - except Exception: - return None - - -@register_mapping -class MavenMapping(DictMapping, SingleFileMapping): - """ - dedicated class for Maven (pom.xml) mapping and translation - """ - name = 'maven' - filename = b'pom.xml' - mapping = CROSSWALK_TABLE['Java (Maven)'] - string_fields = ['name', 'version', 'description', 'email'] - - def translate(self, content): - try: - d = xmltodict.parse(content).get('project') or {} - except xml.parsers.expat.ExpatError: - self.log.warning('Error parsing XML from %s', self.log_suffix) - return None - except UnicodeDecodeError: - self.log.warning('Error unidecoding XML from %s', self.log_suffix) - return None - except (LookupError, ValueError): - # unknown encoding or multi-byte encoding - self.log.warning('Error detecting XML encoding from %s', - self.log_suffix) - return None - metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) - metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) - return self.normalize_translation(metadata) - - _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} - - def parse_repositories(self, d): - """https://maven.apache.org/pom.html#Repositories - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... codehausSnapshots - ... Codehaus Snapshots - ... http://snapshots.maven.codehaus.org/maven2 - ... default - ... - ... - ... 
''') - >>> MavenMapping().parse_repositories(d) - """ - repositories = d.get('repositories') - if not repositories: - results = [self.parse_repository(d, self._default_repository)] - elif isinstance(repositories, dict): - repositories = repositories.get('repository') or [] - if not isinstance(repositories, list): - repositories = [repositories] - results = [self.parse_repository(d, repo) - for repo in repositories] - else: - results = [] - return [res for res in results if res] or None - - def parse_repository(self, d, repo): - if not isinstance(repo, dict): - return - if repo.get('layout', 'default') != 'default': - return # TODO ? - url = repo.get('url') - group_id = d.get('groupId') - artifact_id = d.get('artifactId') - if (isinstance(url, str) and isinstance(group_id, str) - and isinstance(artifact_id, str)): - repo = os.path.join(url, *group_id.split('.'), artifact_id) - return {"@id": repo} - - def normalize_groupId(self, id_): - """https://maven.apache.org/pom.html#Maven_Coordinates - - >>> MavenMapping().normalize_groupId('org.example') - {'@id': 'org.example'} - """ - if isinstance(id_, str): - return {"@id": id_} - - def parse_licenses(self, d): - """https://maven.apache.org/pom.html#Licenses - - >>> import xmltodict - >>> import json - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... ''') - >>> print(json.dumps(d, indent=4)) - { - "licenses": { - "license": { - "name": "Apache License, Version 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" - } - } - } - >>> MavenMapping().parse_licenses(d) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] - - or, if there are more than one license: - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... MIT License - ... https://opensource.org/licenses/MIT - ... - ... - ... ''') - >>> pprint(MavenMapping().parse_licenses(d)) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, - {'@id': 'https://opensource.org/licenses/MIT'}] - """ - - licenses = d.get('licenses') - if not isinstance(licenses, dict): - return - licenses = licenses.get('license') - if isinstance(licenses, dict): - licenses = [licenses] - elif not isinstance(licenses, list): - return - return [{"@id": license['url']} - for license in licenses - if isinstance(license, dict) - and isinstance(license.get('url'), str)] or None - - -_normalize_pkginfo_key = str.lower - - -class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): - def header_fetch_parse(self, name, value): - if hasattr(value, 'name'): - return value - value = value.replace('\n ', '\n') - return self.header_factory(name, value) - - -@register_mapping -class PythonPkginfoMapping(DictMapping, SingleFileMapping): - """Dedicated class for Python's PKG-INFO mapping and translation. 
- - https://www.python.org/dev/peps/pep-0314/""" - name = 'pkg-info' - filename = b'PKG-INFO' - mapping = {_normalize_pkginfo_key(k): v - for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} - string_fields = ['name', 'version', 'description', 'summary', - 'author', 'author-email'] - - _parser = email.parser.BytesHeaderParser( - policy=LinebreakPreservingEmailPolicy()) - - def translate(self, content): - msg = self._parser.parsebytes(content) - d = {} - for (key, value) in msg.items(): - key = _normalize_pkginfo_key(key) - if value != 'UNKNOWN': - d.setdefault(key, []).append(value) - metadata = self._translate_dict(d, normalize=False) - if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: - metadata[SCHEMA_URI+'author'] = { - '@list': [{ - '@type': SCHEMA_URI+'Person', - SCHEMA_URI+'name': - metadata.pop(SCHEMA_URI+'author', [None])[0], - SCHEMA_URI+'email': - metadata.pop(SCHEMA_URI+'email', [None])[0], - }] - } - return self.normalize_translation(metadata) - - def normalize_home_page(self, urls): - return [{'@id': url} for url in urls] - - def normalize_keywords(self, keywords): - return list(itertools.chain.from_iterable( - s.split(' ') for s in keywords)) - - def normalize_license(self, licenses): - return [{'@id': license} for license in licenses] - - -@register_mapping -class GemspecMapping(DictMapping): - name = 'gemspec' - mapping = CROSSWALK_TABLE['Ruby Gem'] - string_fields = ['name', 'version', 'description', 'summary', 'email'] - - _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') - _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'].endswith(b'.gemspec'): - return [entry['sha1']] - return [] - - def translate(self, raw_content): - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - - # Skip lines before 'Gem::Specification.new' - lines = itertools.dropwhile( - lambda x: not self._re_spec_new.match(x), - raw_content.split('\n')) - - try: - next(lines) # Consume 'Gem::Specification.new' - except StopIteration: - self.log.warning('Could not find Gem::Specification in %s', - self.log_suffix) - return - - content_dict = {} - for line in lines: - match = self._re_spec_entry.match(line) - if match: - value = self.eval_ruby_expression(match.group('expr')) - if value: - content_dict[match.group('key')] = value - return self._translate_dict(content_dict) - - def eval_ruby_expression(self, expr): - """Very simple evaluator of Ruby expressions. - - >>> GemspecMapping().eval_ruby_expression('"Foo bar"') - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("'Foo bar'") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']") - ['Foo', 'bar'] - >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression( \ - "['Foo'.freeze, 'bar'.freeze]") - ['Foo', 'bar'] - """ - def evaluator(node): - if isinstance(node, ast.Str): - return node.s - elif isinstance(node, ast.List): - res = [] - for element in node.elts: - val = evaluator(element) - if not val: - return - res.append(val) - return res - - expr = expr.replace('.freeze', '') - try: - # We're parsing Ruby expressions here, but Python's - # ast.parse works for very simple Ruby expressions - # (mainly strings delimited with " or ', and lists - # of such strings). 
- tree = ast.parse(expr, mode='eval') - except (SyntaxError, ValueError): - return - if isinstance(tree, ast.Expression): - return evaluator(tree.body) - - def normalize_homepage(self, s): - if isinstance(s, str): - return {"@id": s} - - def normalize_license(self, s): - if isinstance(s, str): - return [{"@id": "https://spdx.org/licenses/" + s}] - - def normalize_licenses(self, licenses): - if isinstance(licenses, list): - return [{"@id": "https://spdx.org/licenses/" + license} - for license in licenses - if isinstance(license, str)] - - def normalize_author(self, author): - if isinstance(author, str): - return {"@list": [author]} - - def normalize_authors(self, authors): - if isinstance(authors, list): - return {"@list": [author for author in authors - if isinstance(author, str)]} - - -@click.command() -@click.argument('mapping_name') -@click.argument('file_name') -def main(mapping_name, file_name): - from pprint import pprint - with open(file_name, 'rb') as fd: - file_content = fd.read() - res = MAPPINGS[mapping_name]().translate(file_content) - pprint(res) - - -if __name__ == '__main__': - main() diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py new file mode 100644 index 0000000..107a8b3 --- /dev/null +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -0,0 +1,38 @@ +import collections + +import click + +from . import maven, npm, codemeta, python, ruby + +MAPPINGS = { + 'CodemetaMapping': codemeta.CodemetaMapping, + 'MavenMapping': maven.MavenMapping, + 'NpmMapping': npm.NpmMapping, + 'PythonPkginfoMapping': python.PythonPkginfoMapping, + 'GemspecMapping': ruby.GemspecMapping, +} + + +def list_terms(): + """Returns a dictionary with all supported CodeMeta terms as keys, + and the mappings that support each of them as values.""" + d = collections.defaultdict(set) + for mapping in MAPPINGS.values(): + for term in mapping.supported_terms(): + d[term].add(mapping) + return d + + +@click.command() +@click.argument('mapping_name') +@click.argument('file_name') +def main(mapping_name, file_name): + from pprint import pprint + with open(file_name, 'rb') as fd: + file_content = fd.read() + res = MAPPINGS[mapping_name]().translate(file_content) + pprint(res) + + +if __name__ == '__main__': + main() diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py new file mode 100644 index 0000000..9bc0ef5 --- /dev/null +++ b/swh/indexer/metadata_dictionary/base.py @@ -0,0 +1,211 @@ +# Copyright (C) 2017-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import json +import logging + +from swh.indexer.codemeta import SCHEMA_URI +from swh.indexer.codemeta import compact + + +def merge_values(v1, v2): + """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, + returns `{"@list": l1 + l2}`. + Otherwise, make them lists (if they are not already) and concatenate + them. 
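Besides the doctests that follow, `merge_values` has an error path worth knowing: mixing a `{'@list': ...}` wrapper with a plain value is rejected by the `ValueError` branches in the body below. For example:

```python
from swh.indexer.metadata_dictionary.base import merge_values

merge_values({'@list': ['a']}, 'b')
# ValueError: Cannot merge {'@list': ['a']} and 'b'
```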
+ + >>> merge_values('a', 'b') + ['a', 'b'] + >>> merge_values(['a', 'b'], 'c') + ['a', 'b', 'c'] + >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) + {'@list': ['a', 'b', 'c']} + """ + if v1 is None: + return v2 + elif v2 is None: + return v1 + elif isinstance(v1, dict) and set(v1) == {'@list'}: + assert isinstance(v1['@list'], list) + if isinstance(v2, dict) and set(v2) == {'@list'}: + assert isinstance(v2['@list'], list) + return {'@list': v1['@list'] + v2['@list']} + else: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + else: + if isinstance(v2, dict) and '@list' in v2: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + if not isinstance(v1, list): + v1 = [v1] + if not isinstance(v2, list): + v2 = [v2] + return v1 + v2 + + +class BaseMapping(metaclass=abc.ABCMeta): + """Base class for mappings to inherit from + + To implement a new mapping: + + - inherit this class + - override translate function + """ + def __init__(self, log_suffix=''): + self.log_suffix = log_suffix + self.log = logging.getLogger('%s.%s' % ( + self.__class__.__module__, + self.__class__.__name__)) + + @property + @abc.abstractmethod + def name(self): + """A name of this mapping, used as an identifier in the + indexer storage.""" + pass + + @classmethod + @abc.abstractmethod + def detect_metadata_files(cls, files): + """ + Detects files potentially containing metadata + + Args: + file_entries (list): list of files + + Returns: + list: list of sha1 (possibly empty) + """ + pass + + @abc.abstractmethod + def translate(self, file_content): + pass + + def normalize_translation(self, metadata): + return compact(metadata) + + +class SingleFileMapping(BaseMapping): + """Base class for all mappings that use a single file as input.""" + + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass + + @classmethod + def detect_metadata_files(cls, file_entries): + for entry in file_entries: + if entry['name'] == cls.filename: + return [entry['sha1']] + return [] + + +class DictMapping(BaseMapping): + """Base class for mappings that take as input a file that is mostly + a key-value store (eg. 
a shallow JSON dict).""" + + string_fields = [] + '''List of fields that are simple strings, and don't need any + normalization.''' + + @property + @abc.abstractmethod + def mapping(self): + """A translation dict to map dict keys into a canonical name.""" + pass + + @staticmethod + def _normalize_method_name(name): + return name.replace('-', '_') + + @classmethod + def supported_terms(cls): + return { + term for (key, term) in cls.mapping.items() + if key in cls.string_fields + or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) + or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} + + def _translate_dict(self, content_dict, *, normalize=True): + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping + + Args: + content_dict (dict): content dict to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} + for k, v in content_dict.items(): + # First, check if there is a specific translation + # method for this key + translation_method = getattr( + self, 'translate_' + self._normalize_method_name(k), None) + if translation_method: + translation_method(translated_metadata, v) + elif k in self.mapping: + # if there is no method, but the key is known from the + # crosswalk table + codemeta_key = self.mapping[k] + + # if there is a normalization method, use it on the value + normalization_method = getattr( + self, 'normalize_' + self._normalize_method_name(k), None) + if normalization_method: + v = normalization_method(v) + elif k in self.string_fields and isinstance(v, str): + pass + elif k in self.string_fields and isinstance(v, list): + v = [x for x in v if isinstance(x, str)] + else: + continue + + # set the translation metadata with the normalized value + if codemeta_key in translated_metadata: + translated_metadata[codemeta_key] = merge_values( + translated_metadata[codemeta_key], v) + else: + translated_metadata[codemeta_key] = v + if normalize: + return self.normalize_translation(translated_metadata) + else: + return translated_metadata + + +class JsonMapping(DictMapping, SingleFileMapping): + """Base class for all mappings that use a JSON file as input.""" + + def translate(self, raw_content): + """ + Translates content by parsing content from a bytestring containing + json data and translating with the appropriate mapping + + Args: + raw_content (bytes): raw content to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding from %s', self.log_suffix) + return + try: + content_dict = json.loads(raw_content) + except json.JSONDecodeError: + self.log.warning('Error unjsoning from %s', self.log_suffix) + return + if isinstance(content_dict, dict): + return self._translate_dict(content_dict) diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py new file mode 100644 index 0000000..bfb336c --- /dev/null +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -0,0 +1,30 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from swh.indexer.codemeta import CODEMETA_TERMS +from 
swh.indexer.codemeta import expand +from .base import SingleFileMapping + + +class CodemetaMapping(SingleFileMapping): + """ + dedicated class for CodeMeta (codemeta.json) mapping and translation + """ + name = 'codemeta' + filename = b'codemeta.json' + string_fields = None + + @classmethod + def supported_terms(cls): + return [term for term in CODEMETA_TERMS if not term.startswith('@')] + + def translate(self, content): + try: + return self.normalize_translation(expand( + json.loads(content.decode()))) + except Exception: + return None diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py new file mode 100644 index 0000000..38592ba --- /dev/null +++ b/swh/indexer/metadata_dictionary/maven.py @@ -0,0 +1,154 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import xml.parsers.expat + +import xmltodict + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import DictMapping, SingleFileMapping + + +class MavenMapping(DictMapping, SingleFileMapping): + """ + dedicated class for Maven (pom.xml) mapping and translation + """ + name = 'maven' + filename = b'pom.xml' + mapping = CROSSWALK_TABLE['Java (Maven)'] + string_fields = ['name', 'version', 'description', 'email'] + + def translate(self, content): + try: + d = xmltodict.parse(content).get('project') or {} + except xml.parsers.expat.ExpatError: + self.log.warning('Error parsing XML from %s', self.log_suffix) + return None + except UnicodeDecodeError: + self.log.warning('Error unidecoding XML from %s', self.log_suffix) + return None + except (LookupError, ValueError): + # unknown encoding or multi-byte encoding + self.log.warning('Error detecting XML encoding from %s', + self.log_suffix) + return None + metadata = self._translate_dict(d, normalize=False) + metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) + metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) + return self.normalize_translation(metadata) + + _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} + + def parse_repositories(self, d): + """https://maven.apache.org/pom.html#Repositories + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... codehausSnapshots + ... Codehaus Snapshots + ... http://snapshots.maven.codehaus.org/maven2 + ... default + ... + ... + ... ''') + >>> MavenMapping().parse_repositories(d) + """ + repositories = d.get('repositories') + if not repositories: + results = [self.parse_repository(d, self._default_repository)] + elif isinstance(repositories, dict): + repositories = repositories.get('repository') or [] + if not isinstance(repositories, list): + repositories = [repositories] + results = [self.parse_repository(d, repo) + for repo in repositories] + else: + results = [] + return [res for res in results if res] or None + + def parse_repository(self, d, repo): + if not isinstance(repo, dict): + return + if repo.get('layout', 'default') != 'default': + return # TODO ? 
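# Illustration with hypothetical Maven coordinates: given the default repository
# url 'https://repo.maven.apache.org/maven2/', a groupId of 'org.example' and an
# artifactId of 'foo', the os.path.join below derives
# {'@id': 'https://repo.maven.apache.org/maven2/org/example/foo'}.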
+ url = repo.get('url') + group_id = d.get('groupId') + artifact_id = d.get('artifactId') + if (isinstance(url, str) and isinstance(group_id, str) + and isinstance(artifact_id, str)): + repo = os.path.join(url, *group_id.split('.'), artifact_id) + return {"@id": repo} + + def normalize_groupId(self, id_): + """https://maven.apache.org/pom.html#Maven_Coordinates + + >>> MavenMapping().normalize_groupId('org.example') + {'@id': 'org.example'} + """ + if isinstance(id_, str): + return {"@id": id_} + + def parse_licenses(self, d): + """https://maven.apache.org/pom.html#Licenses + + >>> import xmltodict + >>> import json + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... ''') + >>> print(json.dumps(d, indent=4)) + { + "licenses": { + "license": { + "name": "Apache License, Version 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" + } + } + } + >>> MavenMapping().parse_licenses(d) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] + + or, if there are more than one license: + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... MIT License + ... https://opensource.org/licenses/MIT + ... + ... + ... ''') + >>> pprint(MavenMapping().parse_licenses(d)) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, + {'@id': 'https://opensource.org/licenses/MIT'}] + """ + + licenses = d.get('licenses') + if not isinstance(licenses, dict): + return + licenses = licenses.get('license') + if isinstance(licenses, dict): + licenses = [licenses] + elif not isinstance(licenses, list): + return + return [{"@id": license['url']} + for license in licenses + if isinstance(license, dict) + and isinstance(license.get('url'), str)] or None diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py new file mode 100644 index 0000000..659fe77 --- /dev/null +++ b/swh/indexer/metadata_dictionary/npm.py @@ -0,0 +1,156 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import JsonMapping + + +class NpmMapping(JsonMapping): + """ + dedicated class for NPM (package.json) mapping and translation + """ + name = 'npm' + mapping = CROSSWALK_TABLE['NodeJS'] + filename = b'package.json' + string_fields = ['name', 'version', 'homepage', 'description', 'email'] + + _schema_shortcuts = { + 'github': 'git+https://github.com/%s.git', + 'gist': 'git+https://gist.github.com/%s.git', + 'gitlab': 'git+https://gitlab.com/%s.git', + # Bitbucket supports both hg and git, and the shortcut does not + # tell which one to use. + # 'bitbucket': 'https://bitbucket.org/', + } + + def normalize_repository(self, d): + """https://docs.npmjs.com/files/package.json#repository + + >>> NpmMapping().normalize_repository({ + ... 'type': 'git', + ... 'url': 'https://example.org/foo.git' + ... }) + {'@id': 'git+https://example.org/foo.git'} + >>> NpmMapping().normalize_repository( + ... 'gitlab:foo/bar') + {'@id': 'git+https://gitlab.com/foo/bar.git'} + >>> NpmMapping().normalize_repository( + ... 
'foo/bar') + {'@id': 'git+https://github.com/foo/bar.git'} + """ + if isinstance(d, dict) and isinstance(d.get('type'), str) \ + and isinstance(d.get('url'), str): + url = '{type}+{url}'.format(**d) + elif isinstance(d, str): + if '://' in d: + url = d + elif ':' in d: + (schema, rest) = d.split(':', 1) + if schema in self._schema_shortcuts: + url = self._schema_shortcuts[schema] % rest + else: + return None + else: + url = self._schema_shortcuts['github'] % d + + else: + return None + + return {'@id': url} + + def normalize_bugs(self, d): + """https://docs.npmjs.com/files/package.json#bugs + + >>> NpmMapping().normalize_bugs({ + ... 'url': 'https://example.org/bugs/', + ... 'email': 'bugs@example.org' + ... }) + {'@id': 'https://example.org/bugs/'} + >>> NpmMapping().normalize_bugs( + ... 'https://example.org/bugs/') + {'@id': 'https://example.org/bugs/'} + """ + if isinstance(d, dict) and isinstance(d.get('url'), str): + return {'@id': d['url']} + elif isinstance(d, str): + return {'@id': d} + else: + return None + + _parse_author = re.compile(r'^ *' + r'(?P.*?)' + r'( +<(?P.*)>)?' + r'( +\((?P.*)\))?' + r' *$') + + def normalize_author(self, d): + """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' + + >>> from pprint import pprint + >>> pprint(NpmMapping().normalize_author({ + ... 'name': 'John Doe', + ... 'email': 'john.doe@example.org', + ... 'url': 'https://example.org/~john.doe', + ... })) + {'@list': [{'@type': 'http://schema.org/Person', + 'http://schema.org/email': 'john.doe@example.org', + 'http://schema.org/name': 'John Doe', + 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} + >>> pprint(NpmMapping().normalize_author( + ... 'John Doe (https://example.org/~john.doe)' + ... )) + {'@list': [{'@type': 'http://schema.org/Person', + 'http://schema.org/email': 'john.doe@example.org', + 'http://schema.org/name': 'John Doe', + 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} + """ # noqa + author = {'@type': SCHEMA_URI+'Person'} + if isinstance(d, dict): + name = d.get('name', None) + email = d.get('email', None) + url = d.get('url', None) + elif isinstance(d, str): + match = self._parse_author.match(d) + name = match.group('name') + email = match.group('email') + url = match.group('url') + else: + return None + if name and isinstance(name, str): + author[SCHEMA_URI+'name'] = name + if email and isinstance(email, str): + author[SCHEMA_URI+'email'] = email + if url and isinstance(url, str): + author[SCHEMA_URI+'url'] = {'@id': url} + return {"@list": [author]} + + def normalize_license(self, s): + """https://docs.npmjs.com/files/package.json#license + + >>> NpmMapping().normalize_license('MIT') + {'@id': 'https://spdx.org/licenses/MIT'} + """ + if isinstance(s, str): + return {"@id": "https://spdx.org/licenses/" + s} + + def normalize_homepage(self, s): + """https://docs.npmjs.com/files/package.json#homepage + + >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') + {'@id': 'https://example.org/~john.doe'} + """ + if isinstance(s, str): + return {"@id": s} + + def normalize_keywords(self, l): + """https://docs.npmjs.com/files/package.json#homepage + + >>> NpmMapping().normalize_keywords(['foo', 'bar']) + ['foo', 'bar'] + """ + if isinstance(l, list): + return [x for x in l if isinstance(x, str)] diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py new file mode 100644 index 0000000..4bcb81b --- /dev/null +++ b/swh/indexer/metadata_dictionary/python.py @@ 
-0,0 +1,67 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import email.parser +import email.policy +import itertools + +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from .base import DictMapping, SingleFileMapping + + +_normalize_pkginfo_key = str.lower + + +class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): + def header_fetch_parse(self, name, value): + if hasattr(value, 'name'): + return value + value = value.replace('\n ', '\n') + return self.header_factory(name, value) + + +class PythonPkginfoMapping(DictMapping, SingleFileMapping): + """Dedicated class for Python's PKG-INFO mapping and translation. + + https://www.python.org/dev/peps/pep-0314/""" + name = 'pkg-info' + filename = b'PKG-INFO' + mapping = {_normalize_pkginfo_key(k): v + for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} + string_fields = ['name', 'version', 'description', 'summary', + 'author', 'author-email'] + + _parser = email.parser.BytesHeaderParser( + policy=LinebreakPreservingEmailPolicy()) + + def translate(self, content): + msg = self._parser.parsebytes(content) + d = {} + for (key, value) in msg.items(): + key = _normalize_pkginfo_key(key) + if value != 'UNKNOWN': + d.setdefault(key, []).append(value) + metadata = self._translate_dict(d, normalize=False) + if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: + metadata[SCHEMA_URI+'author'] = { + '@list': [{ + '@type': SCHEMA_URI+'Person', + SCHEMA_URI+'name': + metadata.pop(SCHEMA_URI+'author', [None])[0], + SCHEMA_URI+'email': + metadata.pop(SCHEMA_URI+'email', [None])[0], + }] + } + return self.normalize_translation(metadata) + + def normalize_home_page(self, urls): + return [{'@id': url} for url in urls] + + def normalize_keywords(self, keywords): + return list(itertools.chain.from_iterable( + s.split(' ') for s in keywords)) + + def normalize_license(self, licenses): + return [{'@id': license} for license in licenses] diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py new file mode 100644 index 0000000..8d5b4a7 --- /dev/null +++ b/swh/indexer/metadata_dictionary/ruby.py @@ -0,0 +1,117 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import ast +import itertools +import re + +from swh.indexer.codemeta import CROSSWALK_TABLE +from .base import DictMapping + + +class GemspecMapping(DictMapping): + name = 'gemspec' + mapping = CROSSWALK_TABLE['Ruby Gem'] + string_fields = ['name', 'version', 'description', 'summary', 'email'] + + _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') + _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') + + @classmethod + def detect_metadata_files(cls, file_entries): + for entry in file_entries: + if entry['name'].endswith(b'.gemspec'): + return [entry['sha1']] + return [] + + def translate(self, raw_content): + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding from %s', self.log_suffix) + return + + # Skip lines before 'Gem::Specification.new' + lines = itertools.dropwhile( + lambda x: not self._re_spec_new.match(x), + 
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
new file mode 100644
index 0000000..8d5b4a7
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2018-2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import ast
+import itertools
+import re
+
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from .base import DictMapping
+
+
+class GemspecMapping(DictMapping):
+    name = 'gemspec'
+    mapping = CROSSWALK_TABLE['Ruby Gem']
+    string_fields = ['name', 'version', 'description', 'summary', 'email']
+
+    _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*')
+    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')
+
+    @classmethod
+    def detect_metadata_files(cls, file_entries):
+        for entry in file_entries:
+            if entry['name'].endswith(b'.gemspec'):
+                return [entry['sha1']]
+        return []
+
+    def translate(self, raw_content):
+        try:
+            raw_content = raw_content.decode()
+        except UnicodeDecodeError:
+            self.log.warning('Error unidecoding from %s', self.log_suffix)
+            return
+
+        # Skip lines before 'Gem::Specification.new'
+        lines = itertools.dropwhile(
+            lambda x: not self._re_spec_new.match(x),
+            raw_content.split('\n'))
+
+        try:
+            next(lines)  # Consume 'Gem::Specification.new'
+        except StopIteration:
+            self.log.warning('Could not find Gem::Specification in %s',
+                             self.log_suffix)
+            return
+
+        content_dict = {}
+        for line in lines:
+            match = self._re_spec_entry.match(line)
+            if match:
+                value = self.eval_ruby_expression(match.group('expr'))
+                if value:
+                    content_dict[match.group('key')] = value
+        return self._translate_dict(content_dict)
+
+    def eval_ruby_expression(self, expr):
+        """Very simple evaluator of Ruby expressions.
+
+        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
+        ['Foo', 'bar']
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression( \
+                "['Foo'.freeze, 'bar'.freeze]")
+        ['Foo', 'bar']
+        """
+        def evaluator(node):
+            if isinstance(node, ast.Str):
+                return node.s
+            elif isinstance(node, ast.List):
+                res = []
+                for element in node.elts:
+                    val = evaluator(element)
+                    if not val:
+                        return
+                    res.append(val)
+                return res
+
+        expr = expr.replace('.freeze', '')
+        try:
+            # We're parsing Ruby expressions here, but Python's
+            # ast.parse works for very simple Ruby expressions
+            # (mainly strings delimited with " or ', and lists
+            # of such strings).
+            tree = ast.parse(expr, mode='eval')
+        except (SyntaxError, ValueError):
+            return
+        if isinstance(tree, ast.Expression):
+            return evaluator(tree.body)
+
+    def normalize_homepage(self, s):
+        if isinstance(s, str):
+            return {"@id": s}
+
+    def normalize_license(self, s):
+        if isinstance(s, str):
+            return [{"@id": "https://spdx.org/licenses/" + s}]
+
+    def normalize_licenses(self, licenses):
+        if isinstance(licenses, list):
+            return [{"@id": "https://spdx.org/licenses/" + license}
+                    for license in licenses
+                    if isinstance(license, str)]
+
+    def normalize_author(self, author):
+        if isinstance(author, str):
+            return {"@list": [author]}
+
+    def normalize_authors(self, authors):
+        if isinstance(authors, list):
+            return {"@list": [author for author in authors
+                              if isinstance(author, str)]}
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
index cd8c2f5..841c17e 100644
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,915 +1,917 @@
 # Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import psycopg2
 
 from collections import defaultdict
 
 from swh.core.api import remote_api_endpoint
 from swh.storage.common import db_transaction_generator, db_transaction
 from swh.storage.exc import StorageDBError
 
 from .db import Db
 from . import converters
 
 
 INDEXER_CFG_KEY = 'indexer_storage'
 
 MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']
 
 
 def get_indexer_storage(cls, args):
     """Get an indexer storage object of class `cls` with arguments `args`.
 
     Args:
         cls (str): storage's class, either 'local' or 'remote'
         args (dict): dictionary of arguments passed to the
             storage class constructor
 
     Returns:
         an instance of swh.indexer's storage (either local or remote)
 
     Raises:
         ValueError if passed an unknown storage class.
 
     """
     if cls == 'remote':
         from .api.client import RemoteStorage as IndexerStorage
     elif cls == 'local':
         from .
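    # Illustrative usage of this factory, kept as comments since it sits
    # inside the module source. A minimal sketch; only the classes
    # dispatched here ('remote', 'local', 'memory') are valid inputs:
    #
    #     from swh.indexer.storage import get_indexer_storage
    #     idx_storage = get_indexer_storage('memory', {})
    #     get_indexer_storage('foo', {})   # raises ValueError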
import IndexerStorage elif cls == 'memory': from .in_memory import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) def _check_id_duplicates(data): """ If any two dictionaries in `data` have the same id, raises a `ValueError`. Values associated to the key must be hashable. Args: data (List[dict]): List of dictionaries to be inserted >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'bar', 'data': 'egg'}, ... ]) >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'foo', 'data': 'egg'}, ... ]) Traceback (most recent call last): ... ValueError: The same id is present more than once. """ if len({item['id'] for item in data}) < len(data): raise ValueError('The same id is present more than once.') class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) + def put_db(self, db): + if db is not self._db: + db.put_conn() + @remote_api_endpoint('check_config') - def check_config(self, *, check_write): + @db_transaction() + def check_config(self, *, check_write, db=None, cur=None): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables - with self.get_db().transaction() as cur: - if check_write: - check = 'INSERT' - else: - check = 'SELECT' - - cur.execute( - "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa - (check,) - ) - return cur.fetchone()[0] + if check_write: + check = 'INSERT' + else: + check = 'SELECT' - return True + cur.execute( + "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa + (check,) + ) + return cur.fetchone()[0] @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """Generate mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: tuple (id, indexer_configuration_id): missing id """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): """Retrieve ids of type content_type within range [start, end] bound by limit. Args: **content_type** (str): content's type (mimetype, language, etc...) **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: ValueError for; - limit to None - wrong content_type provided Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: err = 'Development error: Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) raise ValueError(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve mimetypes within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **indexer_configuration_id** (int): tool's id used to compute the results - **conflict_update** (bool): Flag to determine if we want to overwrite (``True``) or skip duplicates (``False``, the default) """ _check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): """Retrieve full content mimetype per ids. Args: ids (iterable): sha1 identifier Yields: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **tool** (dict): Tool used to compute the language """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): """Retrieve full content language per ids. 
Args: ids (iterable): sha1 identifier Yields: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **lang** (bytes): raw content's language - **tool** (dict): Tool used to compute the language """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 - **lang** (bytes): language detected conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/missing') @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @remote_api_endpoint('content/ctags') @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @remote_api_endpoint('content/ctags/add') @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, line, lang """ _check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/search') @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... 
""" for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @remote_api_endpoint('content/fossology_license') @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: dict: ``{id: facts}`` where ``facts`` is a dict with the following keys: - **licenses** ([str]): associated licenses for that content - **tool** (dict): Tool used to compute the license """ d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) id_ = license['id'] d[id_].append(converters.db_to_fossology_license(license)) for id_, facts in d.items(): yield {id_: facts} @remote_api_endpoint('content/fossology_license/add') @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): """Add licenses not present in storage. Args: licenses (iterable): dictionaries with keys: - **id**: sha1 - **licenses** ([bytes]): List of licenses associated to sha1 - **tool** (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ _check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/fossology_license/range') @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve licenses within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) @remote_api_endpoint('content_metadata/missing') @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @remote_api_endpoint('content_metadata') @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): """Retrieve metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: id (bytes) metadata (str): associated metadata tool (dict): tool used to compute metadata """ for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @remote_api_endpoint('content_metadata/add') @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **metadata**: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', ['id', 'metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ for obj in db.revision_intrinsic_metadata_missing_from_list( metadata, cur): yield obj[0] @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c))) @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): """Remove revision metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (bytes): revision identifier - **indexer_configuration_id** (int): tool used to compute metadata """ db.revision_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve origin metadata per id. 
Args: ids (iterable): origin identifiers Yields: list: dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/add') @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add origin metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('origin_intrinsic_metadata/delete') @db_transaction() def origin_intrinsic_metadata_delete( self, entries, db=None, cur=None): """Remove origin metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (int): origin identifier - **indexer_configuration_id** (int): tool used to compute metadata """ db.origin_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: conjunction (List[str]): List of terms to be searched for. limit (int): The maximum number of results to return Yields: list: dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') @db_transaction_generator() def origin_intrinsic_metadata_search_by_producer( self, start=0, end=None, limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: start (int): The minimum origin id to return end (int): The maximum origin id to return limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin ids are returned or the content as well mappings (List[str]): Returns origins whose intrinsic metadata were generated using at least one of these mappings. 
Yields: list: list of origin ids (int) if `ids_only=True`, else dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ res = db.origin_intrinsic_metadata_search_by_producer( start, end, limit, ids_only, mappings, tool_ids, cur) if ids_only: for (origin_id,) in res: yield origin_id else: for c in res: yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/stats') @db_transaction() def origin_intrinsic_metadata_stats( self, db=None, cur=None): """Returns counts of indexed metadata per origins, broken down into metadata types. Returns: dict: dictionary with keys: - total (int): total number of origins that were indexed (possibly yielding an empty metadata dictionary) - non_empty (int): total number of origins that we extracted a non-empty metadata dictionary from - per_mapping (dict): a dictionary with mapping names as keys and number of origins whose indexing used this mapping. Note that indexing a given origin may use 0, 1, or many mappings. """ mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append(( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name) # Total select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)") cur.execute('select ' + ', '.join(select_parts) + ' from origin_intrinsic_metadata') results = dict(zip(mapping_names + ['total', 'non_empty'], cur.fetchone())) return { 'total': results.pop('total'), 'non_empty': results.pop('non_empty'), 'per_mapping': results, } @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. 
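To make the dynamically built statistics query of origin_intrinsic_metadata_stats (earlier in this file) concrete: for the five MAPPING_NAMES, the string assembly expands as sketched below; `sql` is a name introduced for the example.

    MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']

    # One aggregate column per mapping name...
    select_parts = [
        "sum(case when (mappings @> ARRAY['%s']) then 1 else 0 end)" % name
        for name in MAPPING_NAMES
    ]
    select_parts.append("sum(1)")  # ...then the total row count...
    select_parts.append(           # ...then rows with non-empty metadata.
        "sum(case when ('{}'::jsonb @> (metadata - '@context')) "
        "then 0 else 1 end)")

    sql = ('select ' + ', '.join(select_parts)
           + ' from origin_intrinsic_metadata')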
""" tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index 7dc616d..085c8cd 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,20 +1,17 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import SWHRemoteAPI from swh.storage.exc import StorageAPIError from .. import IndexerStorage class RemoteStorage(SWHRemoteAPI): """Proxy to a remote storage API""" backend_class = IndexerStorage - - def __init__(self, url, timeout=None): - super().__init__( - api_exception=StorageAPIError, url=url, timeout=timeout) + api_exception = StorageAPIError diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index e2ae3eb..5f651bf 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,71 +1,71 @@ from datetime import timedelta from unittest.mock import patch import pytest -from swh.objstorage.objstorage_in_memory import InMemoryObjStorage +from swh.objstorage import get_objstorage from swh.scheduler.tests.conftest import * # noqa from swh.storage.in_memory import Storage from swh.indexer.storage.in_memory import IndexerStorage from .utils import fill_storage, fill_obj_storage TASK_NAMES = ['revision_intrinsic_metadata', 'origin_intrinsic_metadata'] @pytest.fixture def indexer_scheduler(swh_scheduler): for taskname in TASK_NAMES: swh_scheduler.create_task_type({ 'type': taskname, 'description': 'The {} indexer testing task'.format(taskname), 'backend_name': 'swh.indexer.tests.tasks.{}'.format(taskname), 'default_interval': timedelta(days=1), 'min_interval': timedelta(hours=6), 'max_interval': timedelta(days=12), 'num_retries': 3, }) return swh_scheduler @pytest.fixture def idx_storage(): """An instance of swh.indexer.storage.in_memory.IndexerStorage that gets injected into all indexers classes.""" idx_storage = IndexerStorage() with patch('swh.indexer.storage.in_memory.IndexerStorage') \ as idx_storage_mock: idx_storage_mock.return_value = idx_storage yield idx_storage @pytest.fixture def storage(): """An instance of swh.storage.in_memory.Storage that gets injected into all indexers classes.""" storage = Storage() fill_storage(storage) with patch('swh.storage.in_memory.Storage') as storage_mock: storage_mock.return_value = storage yield storage @pytest.fixture def obj_storage(): """An instance of swh.objstorage.objstorage_in_memory.InMemoryObjStorage that gets injected into all indexers classes.""" - objstorage = InMemoryObjStorage() + objstorage = get_objstorage('memory', {}) fill_obj_storage(objstorage) with patch.dict('swh.objstorage._STORAGE_CLASSES', {'memory': lambda: objstorage}): yield objstorage @pytest.fixture(scope='session') def celery_includes(): return [ 'swh.indexer.tests.tasks', 'swh.indexer.tasks', ] diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index d58dd3d..d6aaf02 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,315 +1,315 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import reduce import re import tempfile from unittest.mock import patch from click.testing import CliRunner from swh.model.hashutil import hash_to_bytes from swh.indexer.cli import cli CLI_CONFIG = ''' scheduler: cls: foo args: {} storage: cls: memory args: {} indexer_storage: cls: memory args: {} ''' def fill_idx_storage(idx_storage, nb_rows): tools = [ { 'tool_name': 'tool %d' % i, 'tool_version': '0.0.1', 'tool_configuration': {}, } for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ { 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] revision_metadata = [ { 'id': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, 'mappings': ['mapping%d' % (origin_id % 10)] } for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the provided tasks (assumed to be of type indexer_origin_metadata).""" return reduce( set.union, (set(task['arguments']['args'][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {"policy_update": "update-dups", "parse_ids": False} assert {task['type'] for task in tasks} == {'indexer_origin_metadata'} assert all(len(task['arguments']['args']) == 1 for task in tasks) assert all(task['arguments']['kwargs'] == expected_kwargs for task in tasks) assert _origins_in_task_args(tasks) == set(origins) def invoke(scheduler, catch_exceptions, args): runner = CliRunner() with patch('swh.indexer.cli.get_scheduler') as get_scheduler_mock, \ tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: config_fd.write(CLI_CONFIG) config_fd.seek(0) get_scheduler_mock.return_value = scheduler result = runner.invoke(cli, ['-C' + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_mapping_list(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list', ]) expected_output = '\n'.join([ 'codemeta', 'gemspec', 'maven', 'npm', 'pkg-info', '', ]) assert result.exit_code == 0, result.output assert result.output == expected_output def test_mapping_list_terms(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert re.search( r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) def test_mapping_list_terms_exclude(indexer_scheduler): result = invoke(indexer_scheduler, False, [ 'mapping', 'list-terms', '--exclude-mapping', 'codemeta' ]) assert result.exit_code == 0, result.output assert re.search(r'http://schema.org/url:\n.*npm', result.output) assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) assert not re.search( 
r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', result.output) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) expected_output = ( 'Nothing to do (no origin metadata matched the criteria).\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_divisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_dry_run( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = invoke(indexer_scheduler, False, [ 'schedule', '--dry-run', 'reindex_origin_metadata', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (60 origins).\n' 'Scheduled 9 tasks (90 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_nondivisor( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when neither origin_batch_size or task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--batch-size', '20', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (60 origins).\n' 'Scheduled 4 tasks (70 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_mapping( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', ]) # Check the output expected_output = ( 'Scheduled 2 tasks (11 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert 
len(tasks) == 2 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_two_mappings( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--mapping', 'mapping1', '--mapping', 'mapping2', ]) # Check the output expected_output = ( 'Scheduled 3 tasks (22 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102]) -@patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) +@patch('swh.scheduler.cli_utils.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_filter_one_tool( indexer_scheduler, idx_storage, storage): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" tool_ids = fill_idx_storage(idx_storage, 110) result = invoke(indexer_scheduler, False, [ 'schedule', 'reindex_origin_metadata', '--tool-id', str(tool_ids[0]), ]) # Check the output expected_output = ( 'Scheduled 3 tasks (30 origins).\n' 'Scheduled 6 tasks (55 origins).\n' 'Done.\n' ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins( tasks, [x*2 for x in range(55)]) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 2b8d651..e20e1e9 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1209 +1,1209 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from hypothesis import given, strategies, settings, HealthCheck import xmltodict from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_TERMS -from swh.indexer.metadata_dictionary import ( - CROSSWALK_TABLE, MAPPINGS, merge_values) +from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE +from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' 
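The metadata tests that follow all drive the mappings through their translate() entry point: pick a mapping class from MAPPINGS by name, instantiate it, and hand it raw file bytes. A minimal sketch of that call pattern (the package.json input is invented; the output shape follows the npm test fixtures below):

    from swh.indexer.metadata_dictionary import MAPPINGS

    npm = MAPPINGS['NpmMapping']()
    doc = npm.translate(b'{"name": "foo", "license": "MIT"}')
    # -> a compacted codemeta document along the lines of:
    # {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
    #  'type': 'SoftwareSourceCode',
    #  'name': 'foo',
    #  'license': 'https://spdx.org/licenses/MIT'}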
REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS['NpmMapping']() self.codemeta_mapping = MAPPINGS['CodemetaMapping']() self.maven_mapping = MAPPINGS['MavenMapping']() self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() self.gemspec_mapping = MAPPINGS['GemspecMapping']() def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_merge_values(self): self.assertEqual( merge_values('a', 'b'), ['a', 'b']) self.assertEqual( merge_values(['a', 'b'], 'c'), ['a', 'b', 'c']) self.assertEqual( merge_values('a', ['b', 'c']), ['a', 'b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, {'@list': ['b']}), {'@list': ['a', 'b']}) self.assertEqual( merge_values({'@list': ['a', 'b']}, {'@list': ['c']}), {'@list': ['a', 'b', 'c']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, 'b') with self.assertRaises(ValueError): merge_values('a', {'@list': ['b']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, ['b']) with self.assertRaises(ValueError): merge_values(['a'], {'@list': ['b']}) self.assertEqual( merge_values('a', None), 'a') self.assertEqual( merge_values(['a', 'b'], None), ['a', 'b']) self.assertEqual( merge_values(None, ['b', 'c']), ['b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, None), {'@list': ['a']}) self.assertEqual( merge_values(None, {'@list': ['a']}), {'@list': ['a']}) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given 
metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config['tools'] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. 
Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 
'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): 
raw_content = ( b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error parsing XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error detecting XML encoding from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error unidecoding XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def 
    def test_compute_metadata_maven_minimal(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

    def test_compute_metadata_maven_empty_nodes(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <repositories>
          </repositories>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version></version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name></name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <licenses>
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

        raw_content = b"""
        <project>
          <version>1.2.3</version>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'version': '1.2.3',
        })

    def test_compute_metadata_maven_invalid_licenses(self):
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <licenses>
            foo
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository':
                'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
        })

    def test_compute_metadata_maven_multiple(self):
        '''Tests when there are multiple code repos and licenses.'''
        raw_content = b"""
        <project>
          <name>Maven Default Project</name>
          <modelVersion>4.0.0</modelVersion>
          <groupId>com.mycompany.app</groupId>
          <artifactId>my-app</artifactId>
          <version>1.2.3</version>
          <repositories>
            <repository>
              <id>central</id>
              <name>Maven Repository Switchboard</name>
              <layout>default</layout>
              <url>http://repo1.maven.org/maven2</url>
              <snapshots>
                <enabled>false</enabled>
              </snapshots>
            </repository>
            <repository>
              <id>example</id>
              <name>Example Maven Repo</name>
              <layout>default</layout>
              <url>http://example.org/maven2</url>
            </repository>
          </repositories>
          <licenses>
            <license>
              <name>Apache License, Version 2.0</name>
              <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
              <distribution>repo</distribution>
              <comments>A business-friendly OSS license</comments>
            </license>
            <license>
              <name>MIT license</name>
              <url>https://opensource.org/licenses/MIT</url>
            </license>
          </licenses>
        </project>"""
        result = self.maven_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier':
                'com.mycompany.app',
            'version': '1.2.3',
            'license': [
                'https://www.apache.org/licenses/LICENSE-2.0.txt',
                'https://opensource.org/licenses/MIT',
            ],
            'codeRepository': [
                'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
                'http://example.org/maven2/com/mycompany/app/my-app',
            ]
        })

    def test_compute_metadata_pkginfo(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
        ========
       \x20
        core library for swh's modules:
        - config parser
        - hash computations
        - serialization
        - logging mechanism
       \x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertCountEqual(result['description'], [
            'Software Heritage core utilities',  # note the comma here
            'swh-core\n'
            '========\n'
            '\n'
            "core library for swh's modules:\n"
            '- config parser\n'
            '- hash computations\n'
            '- serialization\n'
            '- logging mechanism\n'
            ''],
            result)
        del result['description']
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'url': 'https://forge.softwareheritage.org/diffusion/DCORE/',
            'name': 'swh.core',
            'author': [{
                'type': 'Person',
                'name': 'Software Heritage developers',
                'email': 'swh-devel@inria.fr',
            }],
            'version': '0.0.49',
        })

    def test_compute_metadata_pkginfo_utf8(self):
        raw_content = (b'''\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
        Hydrology N\xc2\xb083
''')  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'snowpyt',
            'description': 'foo\nHydrology N°83',
        })

    def test_compute_metadata_pkginfo_keywords(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'foo',
            'keywords': ['foo', 'bar', 'baz'],
        })

    def test_compute_metadata_pkginfo_license(self):
        raw_content = (b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""")  # noqa
        result = self.pkginfo_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'foo',
            'license': 'MIT',
        })

    def test_gemspec_base(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.name = 'example'
  s.version = '0.1.0'
  s.licenses = ['MIT']
  s.summary = "This is an example!"
  s.description = "Much longer explanation of the example!"
  s.authors = ["Ruby Coder"]
  s.email = 'rubycoder@example.com'
  s.files = ["lib/example.rb"]
  s.homepage = 'https://rubygems.org/gems/example'
  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(result.pop('description'), [
            "This is an example!",
            "Much longer explanation of the example!"
        ])
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'author': ['Ruby Coder'],
            'name': 'example',
            'license': 'https://spdx.org/licenses/MIT',
            'codeRepository': 'https://rubygems.org/gems/example',
            'email': 'rubycoder@example.com',
            'version': '0.1.0',
        })

    def test_gemspec_two_author_fields(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(result.pop('author'), [
            'Ruby Coder1', 'Ruby Coder2'])
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

    def test_gemspec_invalid_author(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.author = ["Ruby Coder"]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

        raw_content = b"""
Gem::Specification.new do |s|
  s.author = "Ruby Coder1",
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        })

        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'author': ['Ruby Coder1'],
        })

    def test_gemspec_alternative_header(self):
        raw_content = b"""
require './lib/version'

Gem::Specification.new { |s|
  s.name = 'rb-system-with-aliases'
  s.summary = 'execute system commands with aliases'
}
"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(result, {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'rb-system-with-aliases',
            'description': 'execute system commands with aliases',
        })

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(
        keys=list(MAPPINGS['NpmMapping'].mapping)))
    def test_npm_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.npm_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(keys=CODEMETA_TERMS))
    def test_codemeta_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.codemeta_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(
        keys=list(MAPPINGS['MavenMapping'].mapping)))
    def test_maven_adversarial(self, doc):
        raw = xmltodict.unparse({'project': doc}, pretty=True)
        self.maven_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(strategies.dictionaries(
        # keys
        strategies.one_of(
            strategies.text(),
            *map(strategies.just, MAPPINGS['GemspecMapping'].mapping)
        ),
        # values
        strategies.recursive(
            strategies.characters(),
            lambda children: strategies.lists(children, 1)
        )
    ))
    def test_gemspec_adversarial(self, doc):
        parts = [b'Gem::Specification.new do |s|\n']
        for (k, v) in doc.items():
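            # emit one "s.<key> = <value>" assignment line per generated item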
            parts.append(' s.{} = {}\n'.format(k, repr(v)).encode())
        parts.append(b'end\n')
        self.gemspec_mapping.translate(b''.join(parts))

    def test_revision_metadata_indexer(self):
        metadata_indexer = RevisionMetadataIndexer(
            config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add([{
            'indexer_configuration_id': tool['id'],
            'id': b'cde',
            'metadata': YARN_PARSER_METADATA,
        }])

        sha1_gits = [
            hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        ]
        metadata_indexer.run(sha1_gits, 'update-dups')

        results = list(
            metadata_indexer.idx_storage.
            revision_intrinsic_metadata_get(sha1_gits))

        expected_results = [{
            'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
            'tool': TRANSLATOR_TOOL,
            'metadata': YARN_PARSER_METADATA,
            'mappings': ['npm'],
        }]

        for result in results:
            del result['tool']['id']

        # then
        self.assertEqual(expected_results, results)

    def test_revision_metadata_indexer_single_root_dir(self):
        metadata_indexer = RevisionMetadataIndexer(
            config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        # Add a parent directory, that is the only directory at the root
        # of the revision
        rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
        subdir_id = metadata_indexer.storage._revisions[rev_id]['directory']
        metadata_indexer.storage._revisions[rev_id]['directory'] = b'123456'
        metadata_indexer.storage.directory_add([{
            'id': b'123456',
            'entries': [{
                'target': subdir_id,
                'type': 'dir',
                'length': None,
                'name': b'foobar-1.0.0',
                'sha1': None,
                'perms': 16384,
                'sha1_git': None,
                'status': None,
                'sha256': None
            }],
        }])

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add([{
            'indexer_configuration_id': tool['id'],
            'id': b'cde',
            'metadata': YARN_PARSER_METADATA,
        }])

        sha1_gits = [
            hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        ]
        metadata_indexer.run(sha1_gits, 'update-dups')

        results = list(
            metadata_indexer.idx_storage.
            revision_intrinsic_metadata_get(sha1_gits))

        expected_results = [{
            'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
            'tool': TRANSLATOR_TOOL,
            'metadata': YARN_PARSER_METADATA,
            'mappings': ['npm'],
        }]

        for result in results:
            del result['tool']['id']

        # then
        self.assertEqual(expected_results, results)

diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 3971fdb..a5be367 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,217 +1,217 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from unittest.mock import patch

from swh.model.hashutil import hash_to_bytes

from swh.indexer.metadata import OriginMetadataIndexer

from .utils import YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG


def test_origin_metadata_indexer(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    rev_metadata = {
        'id': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }
    origin_metadata = {
        'id': origin['id'],
        'from_revision': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    for result in results:
        del result['tool']
    assert results == [rev_metadata]

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    for result in results:
        del result['tool']
    assert results == [origin_metadata]


def test_origin_metadata_indexer_duplicate_origin(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.storage = storage
    indexer.idx_storage = idx_storage
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])
    indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert len(results) == 1

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert len(results) == 1


def test_origin_metadata_indexer_missing_head(
        idx_storage, storage, obj_storage):
    storage.origin_add([{
        'type': 'git',
        'url': 'https://example.com'
    }])

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://example.com"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://example.com'})

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []


def test_origin_metadata_indexer_partial_missing_head(
        idx_storage, storage, obj_storage):
    storage.origin_add([{
        'type': 'git',
        'url': 'https://example.com'
    }])

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://example.com",
                 "git+https://github.com/librariesio/yarn-parser"])

    origin1 = storage.origin_get({
        'type': 'git',
        'url': 'https://example.com'})
    origin2 = storage.origin_get({
        'type': 'git',
        'url':
            'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    rev_metadata = {
        'id': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }
    origin_metadata = {
        'id': origin2['id'],
        'from_revision': rev_id,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    for result in results:
        del result['tool']
    assert results == [rev_metadata]

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin1['id'], origin2['id']]))
    for result in results:
        del result['tool']
    assert results == [origin_metadata]


def test_origin_metadata_indexer_duplicate_revision(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.storage = storage
    indexer.idx_storage = idx_storage
    indexer.run(["git+https://github.com/librariesio/yarn-parser",
                 "git+https://github.com/librariesio/yarn-parser.git"])

    origin1 = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    origin2 = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser.git'})
    assert origin1['id'] != origin2['id']
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert len(results) == 1

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin1['id'], origin2['id']]))
    assert len(results) == 2


def test_origin_metadata_indexer_no_metadata(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
-    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+    with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
               b'foo.json'):
        indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results == []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []


def test_origin_metadata_indexer_delete_metadata(
        idx_storage, storage, obj_storage):
    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    origin = storage.origin_get({
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'})
    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results != []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results != []

-    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+    with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
               b'foo.json'):
        indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert results == []

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
        origin['id']]))
    assert results == []

diff --git a/version.txt b/version.txt
index 0f33698..94717a4 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.145-0-g645f08e
\ No newline at end of file
+v0.0.146-0-g669998e
\ No newline at end of file